├── .gitignore ├── README.md ├── mlcsseg-ansj ├── lib │ ├── ansj_seg-1.4-min.jar │ └── tree_split-1.3.jar ├── pom.xml └── src │ └── main │ ├── assembly │ └── zip.xml │ └── java │ └── org │ └── ansj │ └── solr │ ├── AnsjTokenizer.java │ ├── AnsjTokenizerFactory.java │ └── TestAnsj.java ├── mlcsseg-common ├── pom.xml └── src │ └── main │ └── java │ └── com │ └── mlcs │ └── search │ └── mlcsseg │ ├── common │ └── ScheduledExecutor.java │ └── lucene │ ├── CnTokenizer.java │ ├── ReloadableTokenizerFactory.java │ └── ReloaderRegister.java ├── mlcsseg-filter ├── pom.xml └── src │ ├── main │ └── java │ │ └── org │ │ └── apache │ │ └── solr │ │ └── analysis │ │ ├── DStopFilter.java │ │ ├── DStopFilterFactory.java │ │ └── DSynonymFilterFactory.java │ └── test │ └── java │ └── org │ └── mlcsseg │ └── filter │ └── AppTest.java ├── mlcsseg-ik ├── pom.xml └── src │ ├── main │ ├── java │ │ └── org │ │ │ └── wltea │ │ │ └── analyzer │ │ │ ├── cfg │ │ │ ├── Configuration.java │ │ │ └── DefaultConfig.java │ │ │ ├── core │ │ │ ├── AnalyzeContext.java │ │ │ ├── CJKSegmenter.java │ │ │ ├── CN_QuantifierSegmenter.java │ │ │ ├── CharacterUtil.java │ │ │ ├── IKArbitrator.java │ │ │ ├── IKSegmenter.java │ │ │ ├── ISegmenter.java │ │ │ ├── LetterSegmenter.java │ │ │ ├── Lexeme.java │ │ │ ├── LexemePath.java │ │ │ └── QuickSortSet.java │ │ │ ├── dic │ │ │ ├── DictCharNode.java │ │ │ ├── DictSegment.java │ │ │ ├── Dictionary.java │ │ │ └── Hit.java │ │ │ └── lucene │ │ │ ├── IKTokenizer.java │ │ │ └── IKTokenizerFactory.java │ └── resources │ │ ├── chars.dic │ │ ├── main2012.dic │ │ └── quantifier.dic │ └── test │ └── java │ └── org │ └── wltea │ └── analyzer │ └── test │ └── TestIk.java ├── pom.xml ├── test1 └── conf │ ├── admin-extra.html │ ├── admin-extra.menu-bottom.html │ ├── admin-extra.menu-top.html │ ├── extDic.txt │ ├── extDic1.txt │ ├── ik.conf │ ├── isynonyms.txt │ ├── schema.xml │ ├── solrconfig.xml │ ├── stop.conf │ ├── stopwords.txt │ ├── synonym.conf │ ├── synonym2.conf │ ├── synonyms.txt │ └── update-script.js └── test2 └── conf ├── admin-extra.html ├── admin-extra.menu-bottom.html ├── admin-extra.menu-top.html ├── ansj.conf ├── extDic.txt ├── extDic1.txt ├── isynonyms.txt ├── schema.xml ├── solrconfig.xml ├── stop.conf ├── stopwords.txt ├── synonym.conf ├── synonym2.conf ├── synonyms.txt └── update-script.js /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Package Files # 4 | *.war 5 | *.ear 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | mlcsseg :solr分词器大补贴 2 | ======= 3 | 4 | 包括IK, ANSJ,过滤器。支持动态加载solr配置路径下的自定义词库。 5 | 6 | 支持最新的`4.6`版本。master分支是4.6的,其他分支支持对应的solr版本 7 | 8 | 配置和说明都在:http://mlcsdev.iteye.com/blog/2037109 9 | 10 | 欢迎使用,并以任何方式提供意见和建议。 11 | -------------------------------------------------------------------------------- /mlcsseg-ansj/lib/ansj_seg-1.4-min.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcsdev/mlcsseg/942bc12ee27e5297bf6a042952299a82c490ca19/mlcsseg-ansj/lib/ansj_seg-1.4-min.jar -------------------------------------------------------------------------------- /mlcsseg-ansj/lib/tree_split-1.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcsdev/mlcsseg/942bc12ee27e5297bf6a042952299a82c490ca19/mlcsseg-ansj/lib/tree_split-1.3.jar 
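The dynamic dictionary loading mentioned in the README is driven by small properties files kept in the core's conf directory (ik.conf, stop.conf, synonym.conf and ansj.conf under test1/conf and test2/conf). Judging from how ReloaderRegister.ConfigChecker, DStopFilterFactory and DSynonymFilterFactory parse these files later in this dump, each one carries a lastupdate marker and a files list split on commas or whitespace, and the factories reload the listed dictionaries only when lastupdate increases. The values below are an illustrative sketch, not shipped defaults:

# example ik.conf / stop.conf (hypothetical values)
# bump lastupdate whenever the dictionaries change; otherwise nothing is reloaded
lastupdate=1
files=extDic.txt,extDic1.txt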
-------------------------------------------------------------------------------- /mlcsseg-ansj/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.mlcs.search 6 | mlcsseg 7 | 4.6.0-SNAPSHOT 8 | 9 | mlcsseg-ansj 10 | 11 | 12 | 13 | com.mlcs.search 14 | mlcsseg-common 15 | 4.6.0-SNAPSHOT 16 | 17 | 18 | org.ansj 19 | ansj_seg 20 | 1.4 21 | system 22 | ${project.basedir}/lib/ansj_seg-1.4-min.jar 23 | 24 | 25 | org.ansj 26 | tree_split 27 | 1.3 28 | system 29 | ${project.basedir}/lib/tree_split-1.3.jar 30 | 31 | 32 | 33 | 34 | 35 | 36 | org.apache.maven.plugins 37 | maven-compiler-plugin 38 | 39 | 1.6 40 | 1.6 41 | utf8 42 | 43 | 44 | 45 | org.apache.maven.plugins 46 | maven-resources-plugin 47 | 2.5 48 | 49 | UTF-8 50 | 51 | 52 | 53 | maven-assembly-plugin 54 | 2.2.1 55 | 56 | 57 | src/main/assembly/zip.xml 58 | 59 | 60 | 61 | 62 | org.apache.maven.plugins 63 | maven-jar-plugin 64 | 2.4 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /mlcsseg-ansj/src/main/assembly/zip.xml: -------------------------------------------------------------------------------- 1 | 5 | bin 6 | 7 | zip 8 | 9 | 10 | 11 | true 12 | jar 13 | 14 | 15 | org.apache.solr:solr* 16 | com.spatial4j:spatial4j 17 | org.apache.lucene:lucene* 18 | com.google.guava:guava* 19 | commons*:commons* 20 | org.restlet.jee:org.restlet* 21 | org.apache.zookeeper:zookeeper* 22 | org.noggit:noggit* 23 | org.slf4j*:slf4j* 24 | org.codehaus.woodstox:wstx-asl* 25 | org.apache.httpcomponents:http* 26 | 27 | 28 | 29 | 30 | 31 | lib 32 | jar 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /mlcsseg-ansj/src/main/java/org/ansj/solr/AnsjTokenizer.java: -------------------------------------------------------------------------------- 1 | package org.ansj.solr; 2 | 3 | import java.io.IOException; 4 | import java.io.Reader; 5 | import java.util.ArrayList; 6 | import java.util.Iterator; 7 | import java.util.List; 8 | import org.ansj.domain.Term; 9 | import org.ansj.splitWord.analysis.IndexAnalysis; 10 | import org.ansj.splitWord.analysis.ToAnalysis; 11 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 12 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 13 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 14 | 15 | import com.mlcs.search.mlcsseg.lucene.CnTokenizer; 16 | 17 | 18 | public class AnsjTokenizer extends CnTokenizer{ 19 | private int analysisType ; 20 | private boolean removePunc; 21 | 22 | private CharTermAttribute termAtt; 23 | private OffsetAttribute offsetAtt; 24 | private TypeAttribute typeAtt; 25 | int lastOffset = 0; 26 | int endPosition =0; 27 | private Iterator tokenIter; 28 | private List tokenBuffer; 29 | static 30 | { 31 | ToAnalysis.parse(""); 32 | } 33 | 34 | public AnsjTokenizer(Reader input, int analysisType, boolean removePunc) { 35 | super(input); 36 | offsetAtt = addAttribute(OffsetAttribute.class); 37 | termAtt = addAttribute(CharTermAttribute.class); 38 | typeAtt = addAttribute(TypeAttribute.class); 39 | this.analysisType = analysisType; 40 | this.removePunc = removePunc; 41 | } 42 | 43 | @Override 44 | public boolean incrementToken() throws IOException { 45 | if (tokenIter == null || !tokenIter.hasNext()){ 46 | String currentSentence = checkSentences(); 47 | if (currentSentence!= null){ 48 | tokenBuffer = new ArrayList(); 49 | if (analysisType == 1){ 50 | for(Term term : 
ToAnalysis.parse(currentSentence)){ 51 | if (removePunc && stopwords.contains(term.getName())) 52 | continue; 53 | tokenBuffer.add(term); 54 | } 55 | 56 | }else { 57 | for(Term term : IndexAnalysis.parse(currentSentence)){ 58 | if (removePunc && stopwords.contains(term.getName())) 59 | continue; 60 | tokenBuffer.add(term); 61 | } 62 | } 63 | tokenIter = tokenBuffer.iterator(); 64 | if (!tokenIter.hasNext()){ 65 | return false; 66 | } 67 | } else { 68 | return false; // no more sentences, end of stream! 69 | } 70 | } 71 | clearAttributes(); 72 | 73 | Term term = tokenIter.next(); 74 | if (removePunc){ 75 | while(stopwords.contains(term.getName())){ 76 | if (!tokenIter.hasNext()){ 77 | }else{ 78 | term = tokenIter.next(); 79 | } 80 | } 81 | } 82 | termAtt.append(term.getName()); 83 | termAtt.setLength(term.getName().length()); 84 | 85 | int currentStart = tokenStart + term.getOffe(); 86 | int currentEnd = tokenStart + term.getToValue(); 87 | offsetAtt.setOffset(currentStart,currentEnd); 88 | typeAtt.setType("word"); 89 | 90 | // int pi = currentStart - lastOffset; 91 | // if(term.getOffe() <= 0) { 92 | // pi = 1; 93 | // } 94 | // positionIncrementAtt.setPositionIncrement( pi ); 95 | lastOffset = currentStart; 96 | endPosition = currentEnd; 97 | return true; 98 | } 99 | 100 | 101 | 102 | @Override 103 | public void reset() throws IOException { 104 | super.reset(); 105 | } 106 | 107 | public final void end() { 108 | // set final offset 109 | int finalOffset = correctOffset(this.endPosition); 110 | offsetAtt.setOffset(finalOffset, finalOffset); 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /mlcsseg-ansj/src/main/java/org/ansj/solr/AnsjTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.ansj.solr; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.Reader; 6 | import java.util.List; 7 | import java.util.Map; 8 | import org.ansj.library.UserDefineLibrary; 9 | import org.apache.lucene.analysis.Tokenizer; 10 | import org.apache.lucene.analysis.util.ResourceLoader; 11 | import org.apache.lucene.util.AttributeSource.AttributeFactory; 12 | 13 | import com.mlcs.search.mlcsseg.lucene.ReloadableTokenizerFactory; 14 | import com.mlcs.search.mlcsseg.lucene.ReloaderRegister; 15 | 16 | 17 | public class AnsjTokenizerFactory extends ReloadableTokenizerFactory { 18 | 19 | private int analysisType = 0; 20 | private boolean rmPunc = true; 21 | 22 | public AnsjTokenizerFactory(Map args) { 23 | super(args); 24 | analysisType = getInt(args, "analysisType", 0); 25 | rmPunc = getBoolean(args, "rmPunc", true); 26 | System.out.println(":::ansj:construction::::::::::::::::::::::::::" + conf); 27 | } 28 | 29 | 30 | 31 | public void inform(ResourceLoader loader) throws IOException { 32 | System.out.println(":::ansj:::inform::::::::::::::::::::::::" + conf); 33 | ReloaderRegister.register(this, loader, conf); 34 | } 35 | 36 | @Override 37 | public Tokenizer create(AttributeFactory factory, Reader input) { 38 | return new AnsjTokenizer(input, analysisType, rmPunc); 39 | } 40 | 41 | 42 | 43 | @Override 44 | public void update(List inputStreams) { 45 | if (inputStreams!= null){ 46 | UserDefineLibrary.reloadMainAndAdd(inputStreams); 47 | } 48 | } 49 | 50 | 51 | 52 | 53 | } 54 | -------------------------------------------------------------------------------- /mlcsseg-ansj/src/main/java/org/ansj/solr/TestAnsj.java: 
-------------------------------------------------------------------------------- 1 | package org.ansj.solr; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | import java.util.List; 6 | 7 | import org.ansj.domain.Term; 8 | import org.ansj.splitWord.analysis.IndexAnalysis; 9 | import org.ansj.splitWord.analysis.ToAnalysis; 10 | import org.apache.lucene.analysis.Tokenizer; 11 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 12 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 13 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 14 | 15 | 16 | 17 | public class TestAnsj { 18 | 19 | public static void main(String[] args) throws IOException { 20 | List parse = ToAnalysis.parse("天天向上,媒体打打。《回家真好》"); 21 | System.out.println(parse); 22 | Tokenizer tokenizer = new AnsjTokenizer(new StringReader("天天向上,媒体打打。《回家真好》"), 0, true); 23 | CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class); 24 | OffsetAttribute offsetAtt = 25 | tokenizer.addAttribute(OffsetAttribute.class); 26 | PositionIncrementAttribute positionIncrementAtt = 27 | tokenizer.addAttribute(PositionIncrementAttribute.class); 28 | 29 | 30 | while (tokenizer.incrementToken()){ 31 | 32 | System.out.print(new String(termAtt.toString()) ); 33 | System.out.print( offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-" ); 34 | System.out.print( positionIncrementAtt.getPositionIncrement() +"/"); 35 | 36 | } 37 | tokenizer.close(); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /mlcsseg-common/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | com.mlcs.search 7 | mlcsseg 8 | 4.6.0-SNAPSHOT 9 | 10 | mlcsseg-common 11 | mlcsseg-common 12 | http://maven.apache.org 13 | 14 | UTF-8 15 | 16 | 17 | -------------------------------------------------------------------------------- /mlcsseg-common/src/main/java/com/mlcs/search/mlcsseg/common/ScheduledExecutor.java: -------------------------------------------------------------------------------- 1 | package com.mlcs.search.mlcsseg.common; 2 | import java.util.concurrent.Executors; 3 | import java.util.concurrent.ScheduledExecutorService; 4 | import java.util.concurrent.ThreadFactory; 5 | import java.util.concurrent.TimeUnit; 6 | 7 | 8 | 9 | public class ScheduledExecutor { 10 | 11 | static class SegTF implements ThreadFactory{ 12 | 13 | public Thread newThread(Runnable r) { 14 | Thread t = new Thread(r, "SegmentScheduledExecutorThread"); 15 | t.setDaemon(true); 16 | return t; 17 | } 18 | 19 | } 20 | 21 | final public static ScheduledExecutorService ScheduledService = Executors.newSingleThreadScheduledExecutor(new SegTF()); 22 | 23 | 24 | public static void submit(Runnable cmd, long periodMilliSenconds){ 25 | ScheduledService.scheduleAtFixedRate(cmd, 10l, periodMilliSenconds, TimeUnit.MILLISECONDS); 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /mlcsseg-common/src/main/java/com/mlcs/search/mlcsseg/lucene/CnTokenizer.java: -------------------------------------------------------------------------------- 1 | package com.mlcs.search.mlcsseg.lucene; 2 | 3 | import java.io.IOException; 4 | import java.io.Reader; 5 | import java.io.StringReader; 6 | import java.util.HashSet; 7 | import java.util.Set; 8 | 9 | import org.apache.lucene.analysis.Tokenizer; 10 | 11 | /** 12 | * 增加基础的停用词过滤,切长句的能力。分词细节没做. 
13 | * @Description TODO 14 | * @author shanbo.liang 15 | */ 16 | public abstract class CnTokenizer extends Tokenizer{ 17 | public final static String SPACES = "  \t\r\n"; 18 | public final static String PUNCTUATION = "。,!?;,!?;"; 19 | public final static String stop = "',.`-_=?\'|\"(){}[]<>*#&^$@!~:;+/《》—-,。、:;!·?“”)(【】[]●'"; 20 | public static Set stopwords = new HashSet(); 21 | 22 | protected final StringBuilder buffer = new StringBuilder(); 23 | protected int tokenStart = 0, tokenEnd = 0; 24 | 25 | 26 | static 27 | { 28 | for(String c : stop.split("")){ 29 | stopwords.add(c); 30 | } 31 | } 32 | 33 | protected CnTokenizer(Reader input) { 34 | super(input); 35 | } 36 | 37 | protected String checkSentences() throws IOException{ 38 | buffer.setLength(0); 39 | int ci; 40 | char ch, pch; 41 | boolean atBegin = true; 42 | tokenStart = tokenEnd; 43 | ci = input.read(); 44 | ch = (char) ci; 45 | 46 | while (true) { 47 | if (ci == -1) { 48 | break; 49 | } else if (PUNCTUATION.indexOf(ch) != -1) { 50 | // End of a sentence 51 | buffer.append(ch); 52 | tokenEnd++; 53 | break; 54 | } else if (atBegin && SPACES.indexOf(ch) != -1) { 55 | tokenStart++; 56 | tokenEnd++; 57 | ci = input.read(); 58 | ch = (char) ci; 59 | } else { 60 | buffer.append(ch); 61 | atBegin = false; 62 | tokenEnd++; 63 | pch = ch; 64 | ci = input.read(); 65 | ch = (char) ci; 66 | // Two spaces, such as CR, LF 67 | if (SPACES.indexOf(ch) != -1 68 | && SPACES.indexOf(pch) != -1) { 69 | // buffer.append(ch); 70 | tokenEnd++; 71 | break; 72 | } 73 | } 74 | } 75 | if (buffer.length() == 0){ 76 | //sentences finished~ 77 | return null; 78 | }else { 79 | return buffer.toString(); 80 | } 81 | 82 | } 83 | 84 | public void reset() throws IOException { 85 | super.reset(); 86 | tokenStart = tokenEnd = 0; 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /mlcsseg-common/src/main/java/com/mlcs/search/mlcsseg/lucene/ReloadableTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package com.mlcs.search.mlcsseg.lucene; 2 | 3 | import java.io.InputStream; 4 | import java.util.List; 5 | import java.util.Map; 6 | 7 | import org.apache.lucene.analysis.util.ResourceLoaderAware; 8 | import org.apache.lucene.analysis.util.TokenizerFactory; 9 | 10 | 11 | public abstract class ReloadableTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware{ 12 | 13 | protected String conf; 14 | 15 | protected ReloadableTokenizerFactory(Map args) { 16 | super(args); 17 | assureMatchVersion(); 18 | conf = get(args, "conf"); 19 | } 20 | 21 | public abstract void update(List inputStreams); 22 | 23 | public String getBeanName(){ 24 | return this.getClass().toString(); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /mlcsseg-common/src/main/java/com/mlcs/search/mlcsseg/lucene/ReloaderRegister.java: -------------------------------------------------------------------------------- 1 | package com.mlcs.search.mlcsseg.lucene; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.util.ArrayList; 6 | import java.util.Collections; 7 | import java.util.HashMap; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Properties; 11 | 12 | import org.apache.lucene.analysis.util.ResourceLoader; 13 | 14 | import com.mlcs.search.mlcsseg.common.ScheduledExecutor; 15 | 16 | /** 17 | * register it in 'inform(ResourceLoader loader)' 18 | * @Description TODO 19 | * @author 
shanbo.liang 20 | */ 21 | public class ReloaderRegister{ 22 | 23 | 24 | 25 | 26 | private static Map reloadAwares = new HashMap(); 27 | 28 | 29 | public static class ConfigChecker { 30 | 31 | private long lastUpdateTime = Long.MIN_VALUE; 32 | 33 | 34 | 35 | 36 | public static List SplitFileNames(String fileNames) { 37 | if (fileNames == null || fileNames.isEmpty()) 38 | return Collections.emptyList(); 39 | 40 | List result = new ArrayList(); 41 | for (String file : fileNames.split("[,\\s]+")) { 42 | result.add(file); 43 | } 44 | 45 | return result; 46 | } 47 | 48 | public List currentToReload(InputStream confStream){ 49 | try{ 50 | Properties p = new Properties(); 51 | p.load(confStream); 52 | confStream.close(); 53 | String lastupdate = p.getProperty("lastupdate", "0"); 54 | Long t = new Long(lastupdate); 55 | // System.out.println(" => " + toString() + "=========loading conf========= : " + p.toString() ); 56 | if (t > this.lastUpdateTime){ 57 | System.out.println("lastUpdateTime is new, files will be loaded!" ); 58 | this.lastUpdateTime = t.longValue(); 59 | String paths = p.getProperty("files"); 60 | if (paths==null || paths.trim().isEmpty()) // 必须有地址 61 | return Collections.emptyList(); 62 | 63 | List dicPaths = SplitFileNames(p.getProperty("files")); 64 | return dicPaths; 65 | }else{ 66 | this.lastUpdateTime = t.longValue(); 67 | return Collections.emptyList(); 68 | } 69 | }catch(IOException e){ 70 | return Collections.emptyList(); 71 | } 72 | } 73 | 74 | public String toString(){ 75 | return "configchecker@" + lastUpdateTime; 76 | } 77 | 78 | } 79 | 80 | 81 | /** 82 | * 向注册机注册一个可定时更新的tokenfactory;register it in 'inform(ResourceLoader loader)' 83 | * @param reloadFactory 84 | * @param loader 85 | * @param confName 86 | * @return 87 | */ 88 | public static synchronized String register(final ReloadableTokenizerFactory reloadFactory, final ResourceLoader loader, final String confName){ 89 | if ( reloadAwares.containsKey(reloadFactory.getBeanName())){ 90 | return "already"; 91 | }else{ 92 | if(confName != null && !confName.trim().isEmpty()){ //存在conf才注册进来 93 | final ConfigChecker cc = new ConfigChecker(); 94 | reloadAwares.put(reloadFactory.getBeanName(), cc); 95 | loadAndUpdate(cc, reloadFactory, loader, confName); 96 | ScheduledExecutor.submit(new Runnable() { 97 | public void run() { 98 | loadAndUpdate(cc, reloadFactory, loader, confName); 99 | } 100 | }, 30 * 1000); 101 | return "ok"; 102 | } 103 | return "conf is empty"; 104 | } 105 | } 106 | 107 | private static void loadAndUpdate(final ConfigChecker cc, final ReloadableTokenizerFactory reloadFactory, final ResourceLoader loader, final String confName){ 108 | 109 | try { 110 | List dicts = cc.currentToReload(loader.openResource(confName)); 111 | if (!dicts.isEmpty()){ 112 | List insFromLoader = new ArrayList(dicts.size()); 113 | for(String dictName : dicts){ 114 | try{ 115 | insFromLoader.add(loader.openResource(dictName)); 116 | }catch(IOException e){ 117 | System.out.println("missing dict source : " + dictName); 118 | } 119 | } 120 | reloadFactory.update(insFromLoader); 121 | System.out.println("reload finish! 
" + dicts); 122 | } 123 | } catch (IOException e) { 124 | e.printStackTrace(); 125 | } 126 | } 127 | 128 | 129 | } 130 | -------------------------------------------------------------------------------- /mlcsseg-filter/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.mlcs.search 8 | mlcsseg 9 | 4.6.0-SNAPSHOT 10 | 11 | mlcsseg-filter 12 | mlcsseg-filter 13 | http://maven.apache.org 14 | 15 | UTF-8 16 | 17 | 18 | 19 | 20 | junit 21 | junit 22 | 3.8.1 23 | test 24 | 25 | 26 | com.mlcs.search 27 | mlcsseg-common 28 | 4.6.0-SNAPSHOT 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /mlcsseg-filter/src/main/java/org/apache/solr/analysis/DStopFilter.java: -------------------------------------------------------------------------------- 1 | package org.apache.solr.analysis; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.lucene.analysis.TokenStream; 6 | 7 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 8 | import org.apache.lucene.analysis.util.CharArraySet; 9 | import org.apache.lucene.analysis.util.FilteringTokenFilter; 10 | import org.apache.lucene.util.Version; 11 | 12 | public class DStopFilter extends FilteringTokenFilter { 13 | 14 | private final CharArraySet stopWords; 15 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 16 | 17 | public DStopFilter(TokenStream input, CharArraySet stopWords) { 18 | super(Version.LUCENE_46, input); 19 | 20 | this.stopWords = stopWords; 21 | } 22 | 23 | @Override 24 | protected boolean accept() throws IOException { 25 | 26 | // System.out.println("accept()"+termAtt.toString()); 27 | return !stopWords.contains(termAtt.buffer(), 0, termAtt.length()); // 未被赋值过?隐藏操作在哪里实现? 
28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /mlcsseg-filter/src/main/java/org/apache/solr/analysis/DStopFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.apache.solr.analysis; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.util.Map; 6 | import java.util.Properties; 7 | 8 | import org.apache.lucene.analysis.TokenStream; 9 | import org.apache.lucene.analysis.util.CharArraySet; 10 | import org.apache.lucene.analysis.util.ResourceLoader; 11 | import org.apache.lucene.analysis.util.ResourceLoaderAware; 12 | import org.apache.lucene.analysis.util.TokenFilterFactory; 13 | 14 | import com.mlcs.search.mlcsseg.common.ScheduledExecutor; 15 | 16 | 17 | public class DStopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { 18 | 19 | public DStopFilterFactory(Map args) { 20 | super(args); 21 | ignoreCase = getBoolean(args, "ignoreCase", false); 22 | // enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false); 23 | conf = get(args, "conf"); //paths & lastupdate 24 | System.out.println("construct:::::stop::::::::::::::::::::::" + conf); 25 | } 26 | 27 | private CharArraySet stopWords; 28 | private boolean ignoreCase; 29 | // private boolean enablePositionIncrements; 30 | 31 | private ResourceLoader loader; 32 | 33 | private String conf; 34 | private long lastUpdateTime = -1; 35 | 36 | public void inform(final ResourceLoader loader) throws IOException { 37 | System.out.println("inform:::::stop::::::::::::::::::::::" + conf); 38 | this.loader = loader; 39 | this.update(); 40 | if(conf != null && !conf.trim().isEmpty()){ 41 | ScheduledExecutor.submit(new Runnable() { 42 | 43 | public void run() { 44 | try { 45 | update(); 46 | } catch (IOException e) { 47 | e.printStackTrace(); 48 | } 49 | } 50 | }, 1000 * 60 ); 51 | } 52 | } 53 | 54 | @Override 55 | public TokenStream create(TokenStream arg0) { 56 | DStopFilter stopFilter = new DStopFilter( arg0, stopWords); 57 | return stopFilter; 58 | } 59 | 60 | public void update() throws IOException { 61 | Properties p = canUpdate(); 62 | if (p != null){ 63 | System.out.println(" updating~~~!! "); 64 | stopWords = getWordSet(loader, p.getProperty("files"), ignoreCase); 65 | System.out.println(" finish!! 
"); 66 | } 67 | 68 | } 69 | 70 | 71 | private Properties canUpdate() { 72 | 73 | try{ 74 | Properties p = new Properties(); 75 | InputStream confStream = loader.openResource(conf); 76 | p.load(confStream); 77 | confStream.close(); 78 | String lastupdate = p.getProperty("lastupdate", "0"); 79 | Long t = new Long(lastupdate); 80 | 81 | if (t > this.lastUpdateTime){ 82 | this.lastUpdateTime = t.longValue(); 83 | String paths = p.getProperty("files"); 84 | if (paths==null || paths.trim().isEmpty()) // 必须有地址 85 | return null; 86 | System.out.println("loading conf"); 87 | return p; 88 | }else{ 89 | this.lastUpdateTime = t.longValue(); 90 | return null; 91 | } 92 | }catch(Exception e){ 93 | System.err.println("stop parsing conf NullPointerException~~~~~" + e.getMessage()); 94 | return null; 95 | } 96 | } 97 | 98 | } 99 | -------------------------------------------------------------------------------- /mlcsseg-filter/src/main/java/org/apache/solr/analysis/DSynonymFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.apache.solr.analysis; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.io.InputStreamReader; 7 | import java.io.Reader; 8 | import java.nio.charset.Charset; 9 | import java.nio.charset.CharsetDecoder; 10 | import java.nio.charset.CodingErrorAction; 11 | import java.text.ParseException; 12 | import java.util.List; 13 | import java.util.Map; 14 | import java.util.Properties; 15 | 16 | import org.apache.lucene.analysis.TokenStream; 17 | import org.apache.lucene.analysis.Analyzer; 18 | import org.apache.lucene.analysis.core.LowerCaseFilter; 19 | import org.apache.lucene.analysis.core.WhitespaceTokenizer; 20 | import org.apache.lucene.analysis.synonym.SolrSynonymParser; 21 | import org.apache.lucene.analysis.synonym.SynonymFilter; 22 | import org.apache.lucene.analysis.synonym.SynonymMap; 23 | import org.apache.lucene.analysis.util.ResourceLoader; 24 | import org.apache.lucene.analysis.util.ResourceLoaderAware; 25 | import org.apache.lucene.analysis.util.TokenFilterFactory; 26 | import org.apache.lucene.util.Version; 27 | 28 | import com.mlcs.search.mlcsseg.common.ScheduledExecutor; 29 | 30 | 31 | public class DSynonymFilterFactory extends TokenFilterFactory implements 32 | ResourceLoaderAware { 33 | 34 | public DSynonymFilterFactory(Map args) throws IOException { 35 | super(args); 36 | expand = getBoolean(args, "expand", true); 37 | ignoreCase = getBoolean(args, "ignoreCase", false); 38 | conf = get(args, "conf"); //paths & lastupdate 39 | System.out.println(conf); 40 | } 41 | 42 | private SynonymMap map; // 词库,可以通过引用改变 43 | private boolean ignoreCase; //属性 44 | private boolean expand; 45 | private ResourceLoader loader = null; 46 | 47 | private String conf; // properties格式, 存lastupdatetime和词库路径files:逗号间隔 48 | private long lastUpdateTime = -1; 49 | 50 | public void inform(ResourceLoader loader) throws IOException { 51 | System.out.println(":::::synonym::::::::::::::::::::::" + conf); 52 | this.loader = loader; 53 | this.update(); 54 | if(conf != null && !conf.trim().isEmpty()){ 55 | ScheduledExecutor.submit(new Runnable() { 56 | 57 | public void run() { 58 | update(); 59 | 60 | } 61 | }, 1000 * 60); 62 | } 63 | } 64 | 65 | private SynonymMap loadSolrSynonyms(ResourceLoader loader, Properties p) throws IOException, ParseException { 66 | final Analyzer analyzer = new Analyzer() { 67 | @Override 68 | protected TokenStreamComponents createComponents(String fieldName, 
Reader reader) { 69 | WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_46, reader); 70 | TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_46, tokenizer) : tokenizer; 71 | return new TokenStreamComponents(tokenizer, stream); 72 | } 73 | }; 74 | String synonyms = p.getProperty("files"); 75 | 76 | CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder() 77 | .onMalformedInput(CodingErrorAction.REPORT) 78 | .onUnmappableCharacter(CodingErrorAction.REPORT); 79 | 80 | SolrSynonymParser parser = new SolrSynonymParser(true, expand, analyzer); 81 | File synonymFile = new File(synonyms); 82 | if (loader != null){ //first call in constructor 83 | if (synonymFile.exists()) { 84 | decoder.reset(); 85 | 86 | parser.parse(new InputStreamReader(loader.openResource(synonyms), 87 | decoder)); 88 | } else { 89 | List files = splitFileNames(synonyms); 90 | for (String file : files) { 91 | decoder.reset(); 92 | parser.parse(new InputStreamReader(loader.openResource(file), 93 | decoder)); 94 | } 95 | } 96 | } 97 | 98 | return parser.build(); 99 | } 100 | 101 | @Override 102 | public TokenStream create(TokenStream input) { 103 | return map.fst == null ? input : new SynonymFilter(input, map,ignoreCase); 104 | } 105 | 106 | public void update() { 107 | 108 | Properties p = canUpdate(); 109 | if (p != null){ 110 | try { 111 | System.out.println(" updating !"); 112 | map = loadSolrSynonyms(loader, p); // 内部已实现切换 113 | System.out.println(" finish~!"); 114 | } catch (IOException e) { 115 | System.err.println(" IOException!!"); 116 | e.printStackTrace(); 117 | } catch (ParseException e) { 118 | System.err.println(" ParseException!!"); 119 | e.printStackTrace(); 120 | } 121 | } 122 | } 123 | 124 | private Properties canUpdate() { 125 | 126 | try{ 127 | Properties p = new Properties(); 128 | InputStream confStream = loader.openResource(conf); 129 | p.load(confStream); 130 | confStream.close(); 131 | String lastupdate = p.getProperty("lastupdate", "0"); 132 | Long t = new Long(lastupdate); 133 | 134 | if (t > this.lastUpdateTime){ 135 | this.lastUpdateTime = t.longValue(); 136 | String paths = p.getProperty("files"); 137 | if (paths==null || paths.trim().isEmpty()) // 必须有地址 138 | return null; 139 | System.out.println("loading conf"); 140 | return p; 141 | }else{ 142 | this.lastUpdateTime = t.longValue(); 143 | return null; 144 | } 145 | }catch(Exception e){ 146 | System.err.println("synonym parsing conf NullPointerException~~~~~" + e.getMessage()); 147 | return null; 148 | } 149 | } 150 | 151 | } 152 | -------------------------------------------------------------------------------- /mlcsseg-filter/src/test/java/org/mlcsseg/filter/AppTest.java: -------------------------------------------------------------------------------- 1 | package org.mlcsseg.filter; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 
9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /mlcsseg-ik/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.mlcs.search 6 | mlcsseg 7 | 4.6.0-SNAPSHOT 8 | 9 | mlcsseg-ik 10 | mlcsseg-ik 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | com.mlcs.search 26 | mlcsseg-common 27 | 4.6.0-SNAPSHOT 28 | 29 | 30 | 31 | 32 | 33 | src/main/resources 34 | 35 | **/*.dic 36 | **/*.xml 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/cfg/Configuration.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.cfg; 26 | 27 | import java.util.List; 28 | 29 | /** 30 | * 31 | * 配置管理类接口 32 | * 33 | */ 34 | public interface Configuration { 35 | 36 | /** 37 | * 返回useSmart标志位 38 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 39 | * @return useSmart 40 | */ 41 | public boolean useSmart(); 42 | 43 | /** 44 | * 设置useSmart标志位 45 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 46 | * @param useSmart 47 | */ 48 | public void setUseSmart(boolean useSmart); 49 | 50 | 51 | /** 52 | * 获取主词典路径 53 | * 54 | * @return String 主词典路径 55 | */ 56 | public String getMainDictionary(); 57 | 58 | /** 59 | * 获取量词词典路径 60 | * @return String 量词词典路径 61 | */ 62 | public String getQuantifierDicionary(); 63 | 64 | /** 65 | * 获取扩展字典配置路径 66 | * @return List 相对类加载器的路径 67 | */ 68 | public List getExtDictionarys(); 69 | 70 | 71 | /** 72 | * 获取扩展停止词典配置路径 73 | * @return List 相对类加载器的路径 74 | */ 75 | public List getExtStopWordDictionarys(); 76 | } 77 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/cfg/DefaultConfig.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 25 | */ 26 | package org.wltea.analyzer.cfg; 27 | 28 | import java.io.IOException; 29 | import java.io.InputStream; 30 | import java.util.ArrayList; 31 | import java.util.InvalidPropertiesFormatException; 32 | import java.util.List; 33 | import java.util.Properties; 34 | 35 | /** 36 | * Configuration 默认实现 37 | * 2012-5-8 38 | * 39 | */ 40 | public class DefaultConfig implements Configuration{ 41 | 42 | /* 43 | * 分词器默认字典路径 44 | */ 45 | private static final String PATH_DIC_MAIN = "org/wltea/analyzer/dic/main2012.dic"; 46 | private static final String PATH_DIC_QUANTIFIER = "org/wltea/analyzer/dic/quantifier.dic"; 47 | 48 | /* 49 | * 分词器配置文件路径 50 | */ 51 | private static final String FILE_NAME = "IKAnalyzer.cfg.xml"; 52 | //配置属性——扩展字典 53 | private static final String EXT_DICT = "ext_dict"; 54 | //配置属性——扩展停止词典 55 | private static final String EXT_STOP = "ext_stopwords"; 56 | 57 | private Properties props; 58 | /* 59 | * 是否使用smart方式分词 60 | */ 61 | private boolean useSmart; 62 | 63 | /** 64 | * 返回单例 65 | * @return Configuration单例 66 | */ 67 | public static Configuration getInstance(){ 68 | return new DefaultConfig(); 69 | } 70 | 71 | /* 72 | * 初始化配置文件 73 | */ 74 | private DefaultConfig(){ 75 | props = new Properties(); 76 | 77 | InputStream input = this.getClass().getClassLoader().getResourceAsStream(FILE_NAME); 78 | if(input != null){ 79 | try { 80 | props.loadFromXML(input); 81 | } catch (InvalidPropertiesFormatException e) { 82 | e.printStackTrace(); 83 | } catch (IOException e) { 84 | e.printStackTrace(); 85 | } 86 | } 87 | } 88 | 89 | 90 | /** 91 | * 返回useSmart标志位 92 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 93 | * @return useSmart 94 | */ 95 | public boolean useSmart() { 96 | return useSmart; 97 | } 98 | 99 | /** 100 | * 设置useSmart标志位 101 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 102 | * @param useSmart 103 | */ 104 | public void setUseSmart(boolean useSmart) { 105 | this.useSmart = useSmart; 106 | } 107 | 108 | /** 109 | * 获取主词典路径 110 | * 111 | * @return String 主词典路径 112 | */ 113 | public String getMainDictionary(){ 114 | return PATH_DIC_MAIN; 115 | } 116 | 117 | /** 118 | * 获取量词词典路径 119 | * @return String 量词词典路径 120 | */ 121 | public String getQuantifierDicionary(){ 122 | return PATH_DIC_QUANTIFIER; 123 | } 124 | 125 | /** 126 | * 获取扩展字典配置路径 127 | * @return List 相对类加载器的路径 128 | */ 129 | public List getExtDictionarys(){ 130 | List extDictFiles = new ArrayList(2); 131 | String extDictCfg = props.getProperty(EXT_DICT); 132 | if(extDictCfg != null){ 133 | //使用;分割多个扩展字典配置 134 | String[] filePaths = extDictCfg.split(";"); 135 | if(filePaths != null){ 136 | for(String filePath : filePaths){ 137 | if(filePath != null && !"".equals(filePath.trim())){ 138 | extDictFiles.add(filePath.trim()); 139 | } 140 | } 141 | } 142 | } 143 | return extDictFiles; 144 | } 145 | 146 | 147 | /** 148 | * 获取扩展停止词典配置路径 149 | * @return List 相对类加载器的路径 150 | */ 151 | public List getExtStopWordDictionarys(){ 152 | List extStopWordDictFiles = new ArrayList(2); 153 | String extStopWordDictCfg = props.getProperty(EXT_STOP); 154 | if(extStopWordDictCfg != null){ 155 | //使用;分割多个扩展字典配置 156 | String[] filePaths = extStopWordDictCfg.split(";"); 157 | if(filePaths != null){ 158 | for(String filePath : filePaths){ 159 | if(filePath != null && !"".equals(filePath.trim())){ 160 | extStopWordDictFiles.add(filePath.trim()); 161 | } 162 | } 163 | } 164 | } 165 | return 
extStopWordDictFiles; 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import java.io.IOException; 28 | import java.io.Reader; 29 | import java.util.HashMap; 30 | import java.util.HashSet; 31 | import java.util.LinkedList; 32 | import java.util.Map; 33 | import java.util.Set; 34 | 35 | import org.wltea.analyzer.cfg.Configuration; 36 | /** 37 | * 38 | * 分词器上下文状态 39 | * 40 | */ 41 | class AnalyzeContext { 42 | 43 | //默认缓冲区大小 44 | private static final int BUFF_SIZE = 4096; 45 | //缓冲区耗尽的临界值 46 | private static final int BUFF_EXHAUST_CRITICAL = 100; 47 | 48 | 49 | //字符窜读取缓冲 50 | private char[] segmentBuff; 51 | //字符类型数组 52 | private int[] charTypes; 53 | 54 | 55 | //记录Reader内已分析的字串总长度 56 | //在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移 57 | private int buffOffset; 58 | //当前缓冲区位置指针 59 | private int cursor; 60 | //最近一次读入的,可处理的字串长度 61 | private int available; 62 | 63 | 64 | //子分词器锁 65 | //该集合非空,说明有子分词器在占用segmentBuff 66 | private Set buffLocker; 67 | 68 | //原始分词结果集合,未经歧义处理 69 | private QuickSortSet orgLexemes; 70 | //LexemePath位置索引表 71 | private Map pathMap; 72 | //最终分词结果集 73 | private LinkedList results; 74 | 75 | //分词器配置项 76 | private Configuration cfg; 77 | 78 | public AnalyzeContext(Configuration cfg){ 79 | this.cfg = cfg; 80 | this.segmentBuff = new char[BUFF_SIZE]; 81 | this.charTypes = new int[BUFF_SIZE]; 82 | this.buffLocker = new HashSet(); 83 | this.orgLexemes = new QuickSortSet(); 84 | this.pathMap = new HashMap(); 85 | this.results = new LinkedList(); 86 | } 87 | 88 | int getCursor(){ 89 | return this.cursor; 90 | } 91 | // 92 | // void setCursor(int cursor){ 93 | // this.cursor = cursor; 94 | // } 95 | 96 | char[] getSegmentBuff(){ 97 | return this.segmentBuff; 98 | } 99 | 100 | char getCurrentChar(){ 101 | return this.segmentBuff[this.cursor]; 102 | } 103 | 104 | int getCurrentCharType(){ 105 | return this.charTypes[this.cursor]; 106 | } 107 | 108 | int getBufferOffset(){ 109 | return this.buffOffset; 110 | } 111 | 112 | /** 113 | * 根据context的上下文情况,填充segmentBuff 114 | * @param reader 115 | * @return 返回待分析的(有效的)字串长度 116 | * @throws IOException 117 | */ 118 | int fillBuffer(Reader reader) throws IOException{ 119 | int readCount = 0; 120 | if(this.buffOffset == 0){ 121 | //首次读取reader 122 | 
readCount = reader.read(segmentBuff); 123 | }else{ 124 | int offset = this.available - this.cursor; 125 | if(offset > 0){ 126 | //最近一次读取的>最近一次处理的,将未处理的字串拷贝到segmentBuff头部 127 | System.arraycopy(this.segmentBuff , this.cursor , this.segmentBuff , 0 , offset); 128 | readCount = offset; 129 | } 130 | //继续读取reader ,以onceReadIn - onceAnalyzed为起始位置,继续填充segmentBuff剩余的部分 131 | readCount += reader.read(this.segmentBuff , offset , BUFF_SIZE - offset); 132 | } 133 | //记录最后一次从Reader中读入的可用字符长度 134 | this.available = readCount; 135 | //重置当前指针 136 | this.cursor = 0; 137 | return readCount; 138 | } 139 | 140 | /** 141 | * 初始化buff指针,处理第一个字符 142 | */ 143 | void initCursor(){ 144 | this.cursor = 0; 145 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]); 146 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]); 147 | } 148 | 149 | /** 150 | * 指针+1 151 | * 成功返回 true; 指针已经到了buff尾部,不能前进,返回false 152 | * 并处理当前字符 153 | */ 154 | boolean moveCursor(){ 155 | if(this.cursor < this.available - 1){ 156 | this.cursor++; 157 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]); 158 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]); 159 | return true; 160 | }else{ 161 | return false; 162 | } 163 | } 164 | 165 | /** 166 | * 设置当前segmentBuff为锁定状态 167 | * 加入占用segmentBuff的子分词器名称,表示占用segmentBuff 168 | * @param segmenterName 169 | */ 170 | void lockBuffer(String segmenterName){ 171 | this.buffLocker.add(segmenterName); 172 | } 173 | 174 | /** 175 | * 移除指定的子分词器名,释放对segmentBuff的占用 176 | * @param segmenterName 177 | */ 178 | void unlockBuffer(String segmenterName){ 179 | this.buffLocker.remove(segmenterName); 180 | } 181 | 182 | /** 183 | * 只要buffLocker中存在segmenterName 184 | * 则buffer被锁定 185 | * @return boolean 缓冲去是否被锁定 186 | */ 187 | boolean isBufferLocked(){ 188 | return this.buffLocker.size() > 0; 189 | } 190 | 191 | /** 192 | * 判断当前segmentBuff是否已经用完 193 | * 当前执针cursor移至segmentBuff末端this.available - 1 194 | * @return 195 | */ 196 | boolean isBufferConsumed(){ 197 | return this.cursor == this.available - 1; 198 | } 199 | 200 | /** 201 | * 判断segmentBuff是否需要读取新数据 202 | * 203 | * 满足一下条件时, 204 | * 1.available == BUFF_SIZE 表示buffer满载 205 | * 2.buffIndex < available - 1 && buffIndex > available - BUFF_EXHAUST_CRITICAL表示当前指针处于临界区内 206 | * 3.!context.isBufferLocked()表示没有segmenter在占用buffer 207 | * 要中断当前循环(buffer要进行移位,并再读取数据的操作) 208 | * @return 209 | */ 210 | boolean needRefillBuffer(){ 211 | return this.available == BUFF_SIZE 212 | && this.cursor < this.available - 1 213 | && this.cursor > this.available - BUFF_EXHAUST_CRITICAL 214 | && !this.isBufferLocked(); 215 | } 216 | 217 | /** 218 | * 累计当前的segmentBuff相对于reader起始位置的位移 219 | */ 220 | void markBufferOffset(){ 221 | this.buffOffset += this.cursor; 222 | } 223 | 224 | /** 225 | * 向分词结果集添加词元 226 | * @param lexeme 227 | */ 228 | void addLexeme(Lexeme lexeme){ 229 | this.orgLexemes.addLexeme(lexeme); 230 | } 231 | 232 | /** 233 | * 添加分词结果路径 234 | * 路径起始位置 ---> 路径 映射表 235 | * @param path 236 | */ 237 | void addLexemePath(LexemePath path){ 238 | if(path != null){ 239 | this.pathMap.put(path.getPathBegin(), path); 240 | } 241 | } 242 | 243 | 244 | /** 245 | * 返回原始分词结果 246 | * @return 247 | */ 248 | QuickSortSet getOrgLexemes(){ 249 | return this.orgLexemes; 250 | } 251 | 252 | /** 253 | * 推送分词结果到结果集合 254 | * 1.从buff头部遍历到this.cursor已处理位置 255 | * 2.将map中存在的分词结果推入results 256 | * 3.将map中不存在的CJDK字符以单字方式推入results 257 | */ 258 | void 
outputToResult(){ 259 | int index = 0; 260 | for( ; index <= this.cursor ;){ 261 | //跳过非CJK字符 262 | if(CharacterUtil.CHAR_USELESS == this.charTypes[index]){ 263 | index++; 264 | continue; 265 | } 266 | //从pathMap找出对应index位置的LexemePath 267 | LexemePath path = this.pathMap.get(index); 268 | if(path != null){ 269 | //输出LexemePath中的lexeme到results集合 270 | Lexeme l = path.pollFirst(); 271 | while(l != null){ 272 | this.results.add(l); 273 | //将index移至lexeme后 274 | index = l.getBegin() + l.getLength(); 275 | l = path.pollFirst(); 276 | if(l != null){ 277 | //输出path内部,词元间遗漏的单字 278 | for(;index < l.getBegin();index++){ 279 | this.outputSingleCJK(index); 280 | } 281 | } 282 | } 283 | }else{//pathMap中找不到index对应的LexemePath 284 | //单字输出 285 | this.outputSingleCJK(index); 286 | index++; 287 | } 288 | } 289 | //清空当前的Map 290 | this.pathMap.clear(); 291 | } 292 | 293 | /** 294 | * 对CJK字符进行单字输出 295 | * @param index 296 | */ 297 | private void outputSingleCJK(int index){ 298 | if(CharacterUtil.CHAR_CHINESE == this.charTypes[index]){ 299 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_CNCHAR); 300 | this.results.add(singleCharLexeme); 301 | }else if(CharacterUtil.CHAR_OTHER_CJK == this.charTypes[index]){ 302 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_OTHER_CJK); 303 | this.results.add(singleCharLexeme); 304 | } 305 | } 306 | 307 | /** 308 | * 返回lexeme 309 | * 310 | * 同时处理合并 311 | * @return 312 | */ 313 | Lexeme getNextLexeme(){ 314 | //从结果集取出,并移除第一个Lexme 315 | Lexeme result = this.results.pollFirst(); 316 | /*while(result != null){ 317 | //数量词合并 318 | this.compound(result); 319 | if(Dictionary.getSingleton().isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){ 320 | //是停止词继续取列表的下一个 321 | result = this.results.pollFirst(); 322 | }else{ 323 | //不是停止词, 生成lexeme的词元文本,输出 324 | result.setLexemeText(String.valueOf(segmentBuff , result.getBegin() , result.getLength())); 325 | break; 326 | } 327 | }*/ 328 | if(result != null) 329 | { 330 | this.compound(result); 331 | result.setLexemeText(String.valueOf(segmentBuff , result.getBegin() , result.getLength())); 332 | } 333 | return result; 334 | } 335 | 336 | /** 337 | * 重置分词上下文状态 338 | */ 339 | void reset(){ 340 | this.buffLocker.clear(); 341 | this.orgLexemes = new QuickSortSet(); 342 | this.available =0; 343 | this.buffOffset = 0; 344 | this.charTypes = new int[BUFF_SIZE]; 345 | this.cursor = 0; 346 | this.results.clear(); 347 | this.segmentBuff = new char[BUFF_SIZE]; 348 | this.pathMap.clear(); 349 | } 350 | 351 | /** 352 | * 组合词元 353 | */ 354 | private void compound(Lexeme result){ 355 | if(!this.cfg.useSmart()){ 356 | return ; 357 | } 358 | //数量词合并处理 359 | if(!this.results.isEmpty()){ 360 | 361 | if(Lexeme.TYPE_ARABIC == result.getLexemeType()){ 362 | Lexeme nextLexeme = this.results.peekFirst(); 363 | boolean appendOk = false; 364 | if(Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()){ 365 | //合并英文数词+中文数词 366 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM); 367 | }else if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){ 368 | //合并英文数词+中文量词 369 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN); 370 | } 371 | if(appendOk){ 372 | //弹出 373 | this.results.pollFirst(); 374 | } 375 | } 376 | 377 | //可能存在第二轮合并 378 | if(Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()){ 379 | Lexeme nextLexeme = this.results.peekFirst(); 380 | boolean appendOk = false; 381 | if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){ 382 | //合并中文数词+中文量词 383 | appendOk = 
result.append(nextLexeme, Lexeme.TYPE_CQUAN); 384 | } 385 | if(appendOk){ 386 | //弹出 387 | this.results.pollFirst(); 388 | } 389 | } 390 | 391 | } 392 | } 393 | 394 | } 395 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * IK 中文分词 版本 5.0 4 | * IK Analyzer release 5.0 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one or more 7 | * contributor license agreements. See the NOTICE file distributed with 8 | * this work for additional information regarding copyright ownership. 9 | * The ASF licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * 21 | * 源代码由林良益(linliangyi2005@gmail.com)提供 22 | * 版权声明 2012,乌龙茶工作室 23 | * provided by Linliangyi and copyright 2012 by Oolong studio 24 | * 25 | */ 26 | package org.wltea.analyzer.core; 27 | 28 | import java.util.LinkedList; 29 | import java.util.List; 30 | 31 | import org.wltea.analyzer.dic.Dictionary; 32 | import org.wltea.analyzer.dic.Hit; 33 | 34 | 35 | /** 36 | * 中文-日韩文子分词器 37 | */ 38 | class CJKSegmenter implements ISegmenter { 39 | 40 | //子分词器标签 41 | static final String SEGMENTER_NAME = "CJK_SEGMENTER"; 42 | //待处理的分词hit队列 43 | private List tmpHits; 44 | 45 | 46 | CJKSegmenter(){ 47 | this.tmpHits = new LinkedList(); 48 | } 49 | 50 | /* (non-Javadoc) 51 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext) 52 | */ 53 | public void analyze(AnalyzeContext context) { 54 | if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){ 55 | 56 | //优先处理tmpHits中的hit 57 | if(!this.tmpHits.isEmpty()){ 58 | //处理词段队列 59 | Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]); 60 | for(Hit hit : tmpArray){ 61 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); 62 | if(hit.isMatch()){ 63 | //输出当前的词 64 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD); 65 | context.addLexeme(newLexeme); 66 | 67 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除 68 | this.tmpHits.remove(hit); 69 | } 70 | 71 | }else if(hit.isUnmatch()){ 72 | //hit不是词,移除 73 | this.tmpHits.remove(hit); 74 | } 75 | } 76 | } 77 | 78 | //********************************* 79 | //再对当前指针位置的字符进行单字匹配 80 | Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1); 81 | if(singleCharHit.isMatch()){//首字成词 82 | //输出当前的词 83 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD); 84 | context.addLexeme(newLexeme); 85 | 86 | //同时也是词前缀 87 | if(singleCharHit.isPrefix()){ 88 | //前缀匹配则放入hit列表 89 | this.tmpHits.add(singleCharHit); 90 | } 91 | }else if(singleCharHit.isPrefix()){//首字为词前缀 92 | //前缀匹配则放入hit列表 93 | this.tmpHits.add(singleCharHit); 94 | } 95 | 96 | 97 | }else{ 
98 | //遇到CHAR_USELESS字符 99 | //清空队列 100 | this.tmpHits.clear(); 101 | } 102 | 103 | //判断缓冲区是否已经读完 104 | if(context.isBufferConsumed()){ 105 | //清空队列 106 | this.tmpHits.clear(); 107 | } 108 | 109 | //判断是否锁定缓冲区 110 | if(this.tmpHits.size() == 0){ 111 | context.unlockBuffer(SEGMENTER_NAME); 112 | 113 | }else{ 114 | context.lockBuffer(SEGMENTER_NAME); 115 | } 116 | } 117 | 118 | /* (non-Javadoc) 119 | * @see org.wltea.analyzer.core.ISegmenter#reset() 120 | */ 121 | public void reset() { 122 | //清空队列 123 | this.tmpHits.clear(); 124 | } 125 | 126 | } 127 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import java.util.HashSet; 28 | import java.util.LinkedList; 29 | import java.util.List; 30 | import java.util.Set; 31 | 32 | import org.wltea.analyzer.dic.Dictionary; 33 | import org.wltea.analyzer.dic.Hit; 34 | 35 | /** 36 | * 37 | * 中文数量词子分词器 38 | */ 39 | class CN_QuantifierSegmenter implements ISegmenter{ 40 | 41 | //子分词器标签 42 | static final String SEGMENTER_NAME = "QUAN_SEGMENTER"; 43 | 44 | //中文数词 45 | private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";//Cnum 46 | private static Set ChnNumberChars = new HashSet(); 47 | static{ 48 | char[] ca = Chn_Num.toCharArray(); 49 | for(char nChar : ca){ 50 | ChnNumberChars.add(nChar); 51 | } 52 | } 53 | 54 | /* 55 | * 词元的开始位置, 56 | * 同时作为子分词器状态标识 57 | * 当start > -1 时,标识当前的分词器正在处理字符 58 | */ 59 | private int nStart; 60 | /* 61 | * 记录词元结束位置 62 | * end记录的是在词元中最后一个出现的合理的数词结束 63 | */ 64 | private int nEnd; 65 | 66 | //待处理的量词hit队列 67 | private List countHits; 68 | 69 | 70 | CN_QuantifierSegmenter(){ 71 | nStart = -1; 72 | nEnd = -1; 73 | this.countHits = new LinkedList(); 74 | } 75 | 76 | /** 77 | * 分词 78 | */ 79 | public void analyze(AnalyzeContext context) { 80 | //处理中文数词 81 | this.processCNumber(context); 82 | //处理中文量词 83 | this.processCount(context); 84 | 85 | //判断是否锁定缓冲区 86 | if(this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()){ 87 | //对缓冲区解锁 88 | context.unlockBuffer(SEGMENTER_NAME); 89 | }else{ 90 | context.lockBuffer(SEGMENTER_NAME); 91 | } 92 | } 93 | 94 | 95 | /** 96 | * 重置子分词器状态 97 | */ 98 | public void reset() { 99 | nStart = -1; 100 | nEnd = -1; 101 | countHits.clear(); 102 | } 103 | 104 | /** 105 | * 处理数词 106 | */ 107 | private void 
processCNumber(AnalyzeContext context){ 108 | if(nStart == -1 && nEnd == -1){//初始状态 109 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType() 110 | && ChnNumberChars.contains(context.getCurrentChar())){ 111 | //记录数词的起始、结束位置 112 | nStart = context.getCursor(); 113 | nEnd = context.getCursor(); 114 | } 115 | }else{//正在处理状态 116 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType() 117 | && ChnNumberChars.contains(context.getCurrentChar())){ 118 | //记录数词的结束位置 119 | nEnd = context.getCursor(); 120 | }else{ 121 | //输出数词 122 | this.outputNumLexeme(context); 123 | //重置头尾指针 124 | nStart = -1; 125 | nEnd = -1; 126 | } 127 | } 128 | 129 | //缓冲区已经用完,还有尚未输出的数词 130 | if(context.isBufferConsumed()){ 131 | if(nStart != -1 && nEnd != -1){ 132 | //输出数词 133 | outputNumLexeme(context); 134 | //重置头尾指针 135 | nStart = -1; 136 | nEnd = -1; 137 | } 138 | } 139 | } 140 | 141 | /** 142 | * 处理中文量词 143 | * @param context 144 | */ 145 | private void processCount(AnalyzeContext context){ 146 | // 判断是否需要启动量词扫描 147 | if(!this.needCountScan(context)){ 148 | return; 149 | } 150 | 151 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()){ 152 | 153 | //优先处理countHits中的hit 154 | if(!this.countHits.isEmpty()){ 155 | //处理词段队列 156 | Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]); 157 | for(Hit hit : tmpArray){ 158 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); 159 | if(hit.isMatch()){ 160 | //输出当前的词 161 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT); 162 | context.addLexeme(newLexeme); 163 | 164 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除 165 | this.countHits.remove(hit); 166 | } 167 | 168 | }else if(hit.isUnmatch()){ 169 | //hit不是词,移除 170 | this.countHits.remove(hit); 171 | } 172 | } 173 | } 174 | 175 | //********************************* 176 | //对当前指针位置的字符进行单字匹配 177 | Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1); 178 | if(singleCharHit.isMatch()){//首字成量词词 179 | //输出当前的词 180 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT); 181 | context.addLexeme(newLexeme); 182 | 183 | //同时也是词前缀 184 | if(singleCharHit.isPrefix()){ 185 | //前缀匹配则放入hit列表 186 | this.countHits.add(singleCharHit); 187 | } 188 | }else if(singleCharHit.isPrefix()){//首字为量词前缀 189 | //前缀匹配则放入hit列表 190 | this.countHits.add(singleCharHit); 191 | } 192 | 193 | 194 | }else{ 195 | //输入的不是中文字符 196 | //清空未成形的量词 197 | this.countHits.clear(); 198 | } 199 | 200 | //缓冲区数据已经读完,还有尚未输出的量词 201 | if(context.isBufferConsumed()){ 202 | //清空未成形的量词 203 | this.countHits.clear(); 204 | } 205 | } 206 | 207 | /** 208 | * 判断是否需要扫描量词 209 | * @return 210 | */ 211 | private boolean needCountScan(AnalyzeContext context){ 212 | if((nStart != -1 && nEnd != -1 ) || !countHits.isEmpty()){ 213 | //正在处理中文数词,或者正在处理量词 214 | return true; 215 | }else{ 216 | //找到一个相邻的数词 217 | if(!context.getOrgLexemes().isEmpty()){ 218 | Lexeme l = context.getOrgLexemes().peekLast(); 219 | if(Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()){ 220 | if(l.getBegin() + l.getLength() == context.getCursor()){ 221 | return true; 222 | } 223 | } 224 | } 225 | } 226 | return false; 227 | } 228 | 229 | /** 230 | * 添加数词词元到结果集 231 | * @param context 232 | */ 233 | private void outputNumLexeme(AnalyzeContext context){ 234 | if(nStart > -1 && nEnd > -1){ 235 | //输出数词 236 | Lexeme 
newLexeme = new Lexeme(context.getBufferOffset() , nStart , nEnd - nStart + 1 , Lexeme.TYPE_CNUM); 237 | context.addLexeme(newLexeme); 238 | 239 | } 240 | } 241 | 242 | } 243 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/core/CharacterUtil.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 字符集识别工具类 25 | */ 26 | package org.wltea.analyzer.core; 27 | 28 | /** 29 | * 30 | * 字符集识别工具类 31 | */ 32 | class CharacterUtil { 33 | 34 | public static final int CHAR_USELESS = 0; 35 | 36 | public static final int CHAR_ARABIC = 0X00000001; 37 | 38 | public static final int CHAR_ENGLISH = 0X00000002; 39 | 40 | public static final int CHAR_CHINESE = 0X00000004; 41 | 42 | public static final int CHAR_OTHER_CJK = 0X00000008; 43 | 44 | 45 | /** 46 | * 识别字符类型 47 | * @param input 48 | * @return int CharacterUtil定义的字符类型常量 49 | */ 50 | static int identifyCharType(char input){ 51 | if(input >= '0' && input <= '9'){ 52 | return CHAR_ARABIC; 53 | 54 | }else if((input >= 'a' && input <= 'z') 55 | || (input >= 'A' && input <= 'Z')){ 56 | return CHAR_ENGLISH; 57 | 58 | }else { 59 | Character.UnicodeBlock ub = Character.UnicodeBlock.of(input); 60 | 61 | if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS 62 | || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS 63 | || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){ 64 | //目前已知的中文字符UTF-8集合 65 | return CHAR_CHINESE; 66 | 67 | }else if(ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //全角数字字符和日韩字符 68 | //韩文字符集 69 | || ub == Character.UnicodeBlock.HANGUL_SYLLABLES 70 | || ub == Character.UnicodeBlock.HANGUL_JAMO 71 | || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO 72 | //日文字符集 73 | || ub == Character.UnicodeBlock.HIRAGANA //平假名 74 | || ub == Character.UnicodeBlock.KATAKANA //片假名 75 | || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS){ 76 | return CHAR_OTHER_CJK; 77 | 78 | } 79 | } 80 | //其他的不做处理的字符 81 | return CHAR_USELESS; 82 | } 83 | 84 | /** 85 | * 进行字符规格化(全角转半角,大写转小写处理) 86 | * @param input 87 | * @return char 88 | */ 89 | static char regularize(char input){ 90 | if (input == 12288) { 91 | input = (char) 32; 92 | 93 | }else if (input > 65280 && input < 65375) { 94 | input = (char) (input - 65248); 95 | 96 | }else if (input >= 'A' && input <= 'Z') { 97 | input += 32; 98 | } 99 | 100 | return input; 101 | } 102 | } 103 | 
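
> CharacterUtil above drives the per-character dispatch used by every sub-segmenter: identifyCharType() buckets a char into ARABIC / ENGLISH / CHINESE / OTHER_CJK / USELESS, and regularize() folds the ideographic space, full-width forms and ASCII upper case before matching. A minimal sketch of calling it directly; it assumes compilation into the org.wltea.analyzer.core package (the class and both methods are package-private), and the sample characters are arbitrary:

```java
package org.wltea.analyzer.core;

// Hypothetical demo class, not part of the project; exercises the
// package-private CharacterUtil shown above.
public class CharacterUtilDemo {
    public static void main(String[] args) {
        System.out.println(CharacterUtil.identifyCharType('中')); // 4 = CHAR_CHINESE (CJK unified ideograph)
        System.out.println(CharacterUtil.identifyCharType('7'));  // 1 = CHAR_ARABIC
        System.out.println(CharacterUtil.identifyCharType('x'));  // 2 = CHAR_ENGLISH
        System.out.println(CharacterUtil.identifyCharType('。')); // 0 = CHAR_USELESS (punctuation)

        // regularize(): ideographic space -> ASCII space, full-width forms
        // shifted to their half-width counterparts, ASCII upper case folded.
        System.out.println((int) CharacterUtil.regularize('\u3000')); // 32
        System.out.println(CharacterUtil.regularize('\uFF21'));       // 'A' (full-width A -> half-width A)
        System.out.println(CharacterUtil.regularize('Q'));            // 'q'
    }
}
```

> Note that the three branches of regularize() are exclusive, so a full-width letter is narrowed but not also lower-cased in the same pass.
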
-------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/core/IKArbitrator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import java.util.Stack; 28 | import java.util.TreeSet; 29 | 30 | /** 31 | * IK分词歧义裁决器 32 | */ 33 | class IKArbitrator { 34 | 35 | IKArbitrator(){ 36 | 37 | } 38 | 39 | /** 40 | * 分词歧义处理 41 | * @param orgLexemes 42 | * @param useSmart 43 | */ 44 | void process(AnalyzeContext context , boolean useSmart){ 45 | QuickSortSet orgLexemes = context.getOrgLexemes(); 46 | Lexeme orgLexeme = orgLexemes.pollFirst(); 47 | 48 | LexemePath crossPath = new LexemePath(); 49 | while(orgLexeme != null){ 50 | if(!crossPath.addCrossLexeme(orgLexeme)){ 51 | //找到与crossPath不相交的下一个crossPath 52 | if(crossPath.size() == 1 || !useSmart){ 53 | //crossPath没有歧义 或者 不做歧义处理 54 | //直接输出当前crossPath 55 | context.addLexemePath(crossPath); 56 | }else{ 57 | //对当前的crossPath进行歧义处理 58 | QuickSortSet.Cell headCell = crossPath.getHead(); 59 | LexemePath judgeResult = this.judge(context,headCell, crossPath.getPathLength()); 60 | //输出歧义处理结果judgeResult 61 | context.addLexemePath(judgeResult); 62 | } 63 | 64 | //把orgLexeme加入新的crossPath中 65 | crossPath = new LexemePath(); //再次new了对象 66 | crossPath.addCrossLexeme(orgLexeme); 67 | } 68 | orgLexeme = orgLexemes.pollFirst(); 69 | } 70 | 71 | 72 | //处理最后的path 73 | if(crossPath.size() <= 1 || !useSmart){ //输入流单字情况,"额" 74 | //crossPath没有歧义 或者 不做歧义处理 75 | //直接输出当前crossPath 76 | context.addLexemePath(crossPath); 77 | }else{ 78 | //对当前的crossPath进行歧义处理 79 | QuickSortSet.Cell headCell = crossPath.getHead(); 80 | LexemePath judgeResult = this.judge(context,headCell, crossPath.getPathLength()); 81 | //输出歧义处理结果judgeResult 82 | context.addLexemePath(judgeResult); 83 | } 84 | } 85 | 86 | /** 87 | * 歧义识别 88 | * @param lexemeCell 歧义路径链表头 89 | * @param fullTextLength 歧义路径文本长度 90 | * @param option 候选结果路径 91 | * @return 92 | */ 93 | @SuppressWarnings("unused") 94 | private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){ 95 | //候选路径集合 96 | TreeSet pathOptions = new TreeSet(); 97 | //候选结果路径 98 | LexemePath option = new LexemePath(); 99 | 100 | //对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈 101 | Stack lexemeStack = this.forwardPath(lexemeCell , option); 102 | 103 | //当前词元链并非最理想的,加入候选路径集合 104 | pathOptions.add(option.copy()); 105 | 106 | //存在歧义词,处理 107 | QuickSortSet.Cell c = null; 108 
| while(!lexemeStack.isEmpty()){ 109 | c = lexemeStack.pop(); 110 | //回滚词元链 111 | this.backPath(c.getLexeme() , option); 112 | //从歧义词位置开始,递归,生成可选方案 113 | this.forwardPath(c , option); 114 | pathOptions.add(option.copy()); 115 | } 116 | 117 | //返回集合中的最优方案 118 | //return pathOptions.first(); 119 | /*Iterator it=pathOptions.iterator(); 120 | 121 | while(it.hasNext()) 122 | { 123 | System.out.println(it.next().toString()); 124 | }*/ 125 | 126 | return pathOptions.last(); 127 | 128 | } 129 | 130 | private LexemePath judge(AnalyzeContext context,QuickSortSet.Cell lexemeCell , int fullTextLength){ 131 | //候选路径集合 132 | TreeSet pathOptions = new TreeSet(); 133 | 134 | //候选结果路径 135 | 136 | LexemePath option = new LexemePath(context.getSegmentBuff(),lexemeCell.getLexeme().getBegin(),fullTextLength); 137 | 138 | //对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈 139 | Stack lexemeStack = this.forwardPath(lexemeCell , option); 140 | 141 | //当前词元链并非最理想的,加入候选路径集合 142 | pathOptions.add(option.copy()); //自定义拷贝函数 143 | 144 | //存在歧义词,处理 145 | QuickSortSet.Cell c = null; 146 | while(!lexemeStack.isEmpty()){ 147 | c = lexemeStack.pop(); 148 | //回滚词元链 149 | this.backPath(c.getLexeme() , option); 150 | //从歧义词位置开始,递归,生成可选方案 151 | this.forwardPath(c , option); 152 | pathOptions.add(option.copy()); 153 | } 154 | 155 | //路径添加完毕,比较接口没写好,导致返回0的被去重 156 | /* 157 | * ①、进行比较? 158 | * ②、单字比较? 159 | * ③、查找字典? 160 | * */ 161 | 162 | return pathOptions.last(); 163 | 164 | } 165 | 166 | /** 167 | * 向前遍历,添加词元,构造一个无歧义词元组合 168 | * @param LexemePath path 169 | * @return 170 | */ 171 | private Stack forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){ 172 | //发生冲突的Lexeme栈 173 | Stack conflictStack = new Stack(); 174 | QuickSortSet.Cell c = lexemeCell; 175 | //迭代遍历Lexeme链表 176 | while(c != null && c.getLexeme() != null){ 177 | if(!option.addNotCrossLexeme(c.getLexeme())){ 178 | //词元交叉,添加失败则加入lexemeStack栈 179 | conflictStack.push(c); 180 | } 181 | c = c.getNext(); 182 | } 183 | return conflictStack; 184 | } 185 | 186 | /** 187 | * 回滚词元链,直到它能够接受指定的词元 188 | * @param lexeme 189 | * @param l 190 | */ 191 | private void backPath(Lexeme l , LexemePath option){ 192 | while(option.checkCross(l)){ 193 | option.removeTail(); 194 | } 195 | 196 | } 197 | 198 | } 199 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/core/IKSegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | */ 24 | package org.wltea.analyzer.core; 25 | 26 | import java.io.IOException; 27 | import java.io.Reader; 28 | import java.util.ArrayList; 29 | import java.util.List; 30 | 31 | import org.wltea.analyzer.cfg.Configuration; 32 | import org.wltea.analyzer.cfg.DefaultConfig; 33 | import org.wltea.analyzer.dic.Dictionary; 34 | 35 | /** 36 | * IK分词器主类 37 | * 38 | */ 39 | public final class IKSegmenter { 40 | 41 | //字符窜reader 42 | private Reader input; 43 | //分词器配置项 44 | private Configuration cfg; 45 | //分词器上下文 46 | private AnalyzeContext context; 47 | //分词处理器列表 48 | private List segmenters; 49 | //分词歧义裁决器 50 | private IKArbitrator arbitrator; 51 | 52 | 53 | /** 54 | * IK分词器构造函数 55 | * @param input 56 | * @param useSmart 为true,使用智能分词策略 57 | * 58 | * 非智能分词:细粒度输出所有可能的切分结果 59 | * 智能分词: 合并数词和量词,对分词结果进行歧义判断 60 | */ 61 | 62 | public IKSegmenter(Reader input , boolean useSmart){ 63 | this.input = input; 64 | this.cfg = DefaultConfig.getInstance(); 65 | this.cfg.setUseSmart(useSmart); 66 | this.init(); 67 | } 68 | 69 | /** 70 | * IK分词器构造函数 71 | * @param input 72 | * @param cfg 使用自定义的Configuration构造分词器 73 | * 74 | */ 75 | public IKSegmenter(Reader input , Configuration cfg){ 76 | this.input = input; 77 | this.cfg = cfg; 78 | this.init(); 79 | } 80 | 81 | /** 82 | * 初始化 83 | */ 84 | private void init(){ 85 | //初始化词典单例 86 | Dictionary.initial(this.cfg); 87 | //初始化分词上下文 88 | this.context = new AnalyzeContext(this.cfg); 89 | //加载子分词器 90 | this.segmenters = this.loadSegmenters(); 91 | //加载歧义裁决器 92 | this.arbitrator = new IKArbitrator(); 93 | } 94 | 95 | 96 | /** 97 | * 初始化词典,加载子分词器实现 98 | * @return List 99 | */ 100 | private List loadSegmenters(){ 101 | List segmenters = new ArrayList(4); 102 | //处理字母的子分词器 103 | segmenters.add(new LetterSegmenter()); 104 | //处理中文数量词的子分词器 105 | segmenters.add(new CN_QuantifierSegmenter()); 106 | //处理中文词的子分词器 107 | segmenters.add(new CJKSegmenter()); 108 | return segmenters; 109 | } 110 | 111 | /** 112 | * 分词,获取下一个词元 113 | * @return Lexeme 词元对象 114 | * @throws IOException 115 | */ 116 | public synchronized Lexeme next()throws IOException{ 117 | Lexeme l = null; 118 | while((l = context.getNextLexeme()) == null ){ 119 | /* 120 | * 从reader中读取数据,填充buffer 121 | * 如果reader是分次读入buffer的,那么buffer要 进行移位处理 122 | * 移位处理上次读入的但未处理的数据 123 | */ 124 | int available = context.fillBuffer(this.input); 125 | if(available <= 0){ 126 | //reader已经读完 127 | context.reset(); 128 | return null; 129 | 130 | }else{ 131 | //初始化指针 132 | context.initCursor(); 133 | do{ 134 | //遍历子分词器 135 | for(ISegmenter segmenter : segmenters){ 136 | segmenter.analyze(context); 137 | } 138 | //字符缓冲区接近读完,需要读入新的字符 139 | if(context.needRefillBuffer()){ 140 | break; 141 | } 142 | //向前移动指针 143 | }while(context.moveCursor()); 144 | //重置子分词器,为下轮循环进行初始化 145 | for(ISegmenter segmenter : segmenters){ 146 | segmenter.reset(); 147 | } 148 | } 149 | //对分词进行歧义处理 150 | this.arbitrator.process(context, this.cfg.useSmart()); 151 | //将分词结果输出到结果集,并处理未切分的单个CJK字符 152 | context.outputToResult(); 153 | //记录本次分词的缓冲区位移 154 | context.markBufferOffset(); 155 | } 156 | return l; 157 | } 158 | 159 | /** 160 | * 重置分词器到初始状态 161 | * @param input 162 | */ 163 | public synchronized void reset(Reader input) { 164 | this.input = input; 165 | context.reset(); 166 | for(ISegmenter segmenter : segmenters){ 167 | segmenter.reset(); 168 | } 169 | } 170 | } 171 | 
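
> IKSegmenter above is the public entry point: each call to next() fills the character buffer, runs the three sub-segmenters, hands the raw lexemes to IKArbitrator (which only performs real disambiguation when useSmart is true), then drains the result set one Lexeme at a time. A minimal usage sketch; it assumes the dictionary resources (main2012.dic, quantifier.dic, chars.dic) are on the classpath, and the sample sentences are arbitrary:

```java
import java.io.IOException;
import java.io.StringReader;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IKSegmenterDemo {
    public static void main(String[] args) throws IOException {
        // useSmart = true merges numerals with quantifiers and applies
        // ambiguity arbitration; false emits all fine-grained candidates.
        IKSegmenter ik = new IKSegmenter(new StringReader("这是一个中文分词的例子"), true);
        Lexeme lexeme;
        while ((lexeme = ik.next()) != null) {
            System.out.println(lexeme.getBeginPosition() + "-" + lexeme.getEndPosition()
                    + " : " + lexeme.getLexemeText()
                    + " : " + lexeme.getLexemeTypeString());
        }
        // reset() lets the same (synchronized) instance tokenize another Reader.
        ik.reset(new StringReader("另一段文本"));
    }
}
```
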
-------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/core/ISegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | 28 | /** 29 | * 30 | * 子分词器接口 31 | */ 32 | interface ISegmenter { 33 | 34 | /** 35 | * 从分析器读取下一个可能分解的词元对象 36 | * @param context 分词算法上下文 37 | */ 38 | void analyze(AnalyzeContext context); 39 | 40 | 41 | /** 42 | * 重置子分析器状态 43 | */ 44 | void reset(); 45 | 46 | } 47 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/core/LetterSegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import java.util.Arrays; 28 | 29 | /** 30 | * 31 | * 英文字符及阿拉伯数字子分词器 32 | */ 33 | class LetterSegmenter implements ISegmenter { 34 | 35 | //子分词器标签 36 | static final String SEGMENTER_NAME = "LETTER_SEGMENTER"; 37 | //链接符号 38 | private static final char[] Letter_Connector = new char[]{'#' , '&' , '+' , '-' , '.' 
, '@' , '_'}; 39 | 40 | //数字符号 41 | private static final char[] Num_Connector = new char[]{',' , '.'}; 42 | 43 | /* 44 | * 词元的开始位置, 45 | * 同时作为子分词器状态标识 46 | * 当start > -1 时,标识当前的分词器正在处理字符 47 | */ 48 | private int start; 49 | /* 50 | * 记录词元结束位置 51 | * end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置 52 | */ 53 | private int end; 54 | 55 | /* 56 | * 字母起始位置 57 | */ 58 | private int englishStart; 59 | 60 | /* 61 | * 字母结束位置 62 | */ 63 | private int englishEnd; 64 | 65 | /* 66 | * 阿拉伯数字起始位置 67 | */ 68 | private int arabicStart; 69 | 70 | /* 71 | * 阿拉伯数字结束位置 72 | */ 73 | private int arabicEnd; 74 | 75 | LetterSegmenter(){ 76 | Arrays.sort(Letter_Connector); 77 | Arrays.sort(Num_Connector); 78 | this.start = -1; 79 | this.end = -1; 80 | this.englishStart = -1; 81 | this.englishEnd = -1; 82 | this.arabicStart = -1; 83 | this.arabicEnd = -1; 84 | } 85 | 86 | 87 | /* (non-Javadoc) 88 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext) 89 | */ 90 | public void analyze(AnalyzeContext context) { 91 | boolean bufferLockFlag = false; 92 | //处理英文字母 93 | bufferLockFlag = this.processEnglishLetter(context) || bufferLockFlag; 94 | //处理阿拉伯字母 95 | bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag; 96 | //处理混合字母(这个要放最后处理,可以通过QuickSortSet排除重复) 97 | bufferLockFlag = this.processMixLetter(context) || bufferLockFlag; 98 | 99 | //判断是否锁定缓冲区 100 | if(bufferLockFlag){ 101 | context.lockBuffer(SEGMENTER_NAME); 102 | }else{ 103 | //对缓冲区解锁 104 | context.unlockBuffer(SEGMENTER_NAME); 105 | } 106 | } 107 | 108 | /* (non-Javadoc) 109 | * @see org.wltea.analyzer.core.ISegmenter#reset() 110 | */ 111 | public void reset() { 112 | this.start = -1; 113 | this.end = -1; 114 | this.englishStart = -1; 115 | this.englishEnd = -1; 116 | this.arabicStart = -1; 117 | this.arabicEnd = -1; 118 | } 119 | 120 | /** 121 | * 处理数字字母混合输出 122 | * 如:windos2000 | linliangyi2005@gmail.com 123 | * @param input 124 | * @param context 125 | * @return 126 | */ 127 | private boolean processMixLetter(AnalyzeContext context){ 128 | boolean needLock = false; 129 | 130 | if(this.start == -1){//当前的分词器尚未开始处理字符 131 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType() 132 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 133 | //记录起始指针的位置,标明分词器进入处理状态 134 | this.start = context.getCursor(); 135 | this.end = start; 136 | } 137 | 138 | }else{//当前的分词器正在处理字符 139 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType() 140 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 141 | //记录下可能的结束位置 142 | this.end = context.getCursor(); 143 | 144 | }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType() 145 | && this.isLetterConnector(context.getCurrentChar())){ 146 | //记录下可能的结束位置 147 | this.end = context.getCursor(); 148 | }else{ 149 | //遇到非Letter字符,输出词元 150 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.start , this.end - this.start + 1 , Lexeme.TYPE_LETTER); 151 | context.addLexeme(newLexeme); 152 | this.start = -1; 153 | this.end = -1; 154 | } 155 | } 156 | 157 | //判断缓冲区是否已经读完 158 | if(context.isBufferConsumed()){ 159 | if(this.start != -1 && this.end != -1){ 160 | //缓冲以读完,输出词元 161 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.start , this.end - this.start + 1 , Lexeme.TYPE_LETTER); 162 | context.addLexeme(newLexeme); 163 | this.start = -1; 164 | this.end = -1; 165 | } 166 | } 167 | 168 | //判断是否锁定缓冲区 169 | if(this.start == -1 && this.end == -1){ 170 | //对缓冲区解锁 171 | needLock = false; 172 | }else{ 173 | needLock = true; 
174 | } 175 | return needLock; 176 | } 177 | 178 | /** 179 | * 处理纯英文字母输出 180 | * @param context 181 | * @return 182 | */ 183 | private boolean processEnglishLetter(AnalyzeContext context){ 184 | boolean needLock = false; 185 | 186 | if(this.englishStart == -1){//当前的分词器尚未开始处理英文字符 187 | if(CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 188 | //记录起始指针的位置,标明分词器进入处理状态 189 | this.englishStart = context.getCursor(); 190 | this.englishEnd = this.englishStart; 191 | } 192 | }else {//当前的分词器正在处理英文字符 193 | if(CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 194 | //记录当前指针位置为结束位置 195 | this.englishEnd = context.getCursor(); 196 | }else{ 197 | //遇到非English字符,输出词元 198 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.englishStart , this.englishEnd - this.englishStart + 1 , Lexeme.TYPE_ENGLISH); 199 | context.addLexeme(newLexeme); 200 | this.englishStart = -1; 201 | this.englishEnd= -1; 202 | } 203 | } 204 | 205 | //判断缓冲区是否已经读完 206 | if(context.isBufferConsumed()){ 207 | if(this.englishStart != -1 && this.englishEnd != -1){ 208 | //缓冲以读完,输出词元 209 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.englishStart , this.englishEnd - this.englishStart + 1 , Lexeme.TYPE_ENGLISH); 210 | context.addLexeme(newLexeme); 211 | this.englishStart = -1; 212 | this.englishEnd= -1; 213 | } 214 | } 215 | 216 | //判断是否锁定缓冲区 217 | if(this.englishStart == -1 && this.englishEnd == -1){ 218 | //对缓冲区解锁 219 | needLock = false; 220 | }else{ 221 | needLock = true; 222 | } 223 | return needLock; 224 | } 225 | 226 | /** 227 | * 处理阿拉伯数字输出 228 | * @param context 229 | * @return 230 | */ 231 | private boolean processArabicLetter(AnalyzeContext context){ 232 | boolean needLock = false; 233 | 234 | if(this.arabicStart == -1){//当前的分词器尚未开始处理数字字符 235 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()){ 236 | //记录起始指针的位置,标明分词器进入处理状态 237 | this.arabicStart = context.getCursor(); 238 | this.arabicEnd = this.arabicStart; 239 | } 240 | }else {//当前的分词器正在处理数字字符 241 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()){ 242 | //记录当前指针位置为结束位置 243 | this.arabicEnd = context.getCursor(); 244 | }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType() 245 | && this.isNumConnector(context.getCurrentChar())){ 246 | //不输出数字,但不标记结束 247 | }else{ 248 | ////遇到非Arabic字符,输出词元 249 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.arabicStart , this.arabicEnd - this.arabicStart + 1 , Lexeme.TYPE_ARABIC); 250 | context.addLexeme(newLexeme); 251 | this.arabicStart = -1; 252 | this.arabicEnd = -1; 253 | } 254 | } 255 | 256 | //判断缓冲区是否已经读完 257 | if(context.isBufferConsumed()){ 258 | if(this.arabicStart != -1 && this.arabicEnd != -1){ 259 | //生成已切分的词元 260 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.arabicStart , this.arabicEnd - this.arabicStart + 1 , Lexeme.TYPE_ARABIC); 261 | context.addLexeme(newLexeme); 262 | this.arabicStart = -1; 263 | this.arabicEnd = -1; 264 | } 265 | } 266 | 267 | //判断是否锁定缓冲区 268 | if(this.arabicStart == -1 && this.arabicEnd == -1){ 269 | //对缓冲区解锁 270 | needLock = false; 271 | }else{ 272 | needLock = true; 273 | } 274 | return needLock; 275 | } 276 | 277 | /** 278 | * 判断是否是字母连接符号 279 | * @param input 280 | * @return 281 | */ 282 | private boolean isLetterConnector(char input){ 283 | int index = Arrays.binarySearch(Letter_Connector, input); 284 | return index >= 0; 285 | } 286 | 287 | /** 288 | * 判断是否是数字连接符号 289 | * @param input 290 | * @return 291 | */ 292 | private boolean isNumConnector(char input){ 293 | int index = 
Arrays.binarySearch(Num_Connector, input); 294 | return index >= 0; 295 | } 296 | } 297 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/core/Lexeme.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | /** 28 | * IK词元对象 29 | */ 30 | public class Lexeme implements Comparable{ 31 | //lexemeType常量 32 | //未知 33 | public static final int TYPE_UNKNOWN = 0; 34 | //英文 35 | public static final int TYPE_ENGLISH = 1; 36 | //数字 37 | public static final int TYPE_ARABIC = 2; 38 | //英文数字混合 39 | public static final int TYPE_LETTER = 3; 40 | //中文词元 41 | public static final int TYPE_CNWORD = 4; 42 | //中文单字 43 | public static final int TYPE_CNCHAR = 64; 44 | //日韩文字 45 | public static final int TYPE_OTHER_CJK = 8; 46 | //中文数词 47 | public static final int TYPE_CNUM = 16; 48 | //中文量词 49 | public static final int TYPE_COUNT = 32; 50 | //中文数量词 51 | public static final int TYPE_CQUAN = 48; 52 | 53 | //词元的起始位移 54 | private int offset; 55 | //词元的相对起始位置 56 | private int begin; 57 | //词元的长度 58 | private int length; 59 | //词元文本 60 | private String lexemeText; 61 | //词元类型 62 | private int lexemeType; 63 | 64 | 65 | public Lexeme(int offset , int begin , int length , int lexemeType){ 66 | this.offset = offset; 67 | this.begin = begin; 68 | if(length < 0){ 69 | throw new IllegalArgumentException("length < 0"); 70 | } 71 | this.length = length; 72 | this.lexemeType = lexemeType; 73 | } 74 | 75 | /* 76 | * 判断词元相等算法 77 | * 起始位置偏移、起始位置、终止位置相同 78 | * @see java.lang.Object#equals(Object o) 79 | */ 80 | public boolean equals(Object o){ 81 | if(o == null){ 82 | return false; 83 | } 84 | 85 | if(this == o){ 86 | return true; 87 | } 88 | 89 | if(o instanceof Lexeme){ 90 | Lexeme other = (Lexeme)o; 91 | if(this.offset == other.getOffset() 92 | && this.begin == other.getBegin() 93 | && this.length == other.getLength()){ 94 | return true; 95 | }else{ 96 | return false; 97 | } 98 | }else{ 99 | return false; 100 | } 101 | } 102 | 103 | /* 104 | * 词元哈希编码算法 105 | * @see java.lang.Object#hashCode() 106 | */ 107 | public int hashCode(){ 108 | int absBegin = getBeginPosition(); 109 | int absEnd = getEndPosition(); 110 | return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11; 111 | } 112 | 113 | /* 114 | * 词元在排序集合中的比较算法 115 | * @see java.lang.Comparable#compareTo(java.lang.Object) 116 | */ 117 | public int 
compareTo(Lexeme other) { 118 | //起始位置优先 119 | if(this.begin < other.getBegin()){ 120 | return -1; 121 | }else if(this.begin == other.getBegin()){ 122 | //词元长度优先 123 | if(this.length > other.getLength()){ 124 | return -1; 125 | }else if(this.length == other.getLength()){ 126 | return 0; 127 | }else {//this.length < other.getLength() 128 | return 1; 129 | } 130 | 131 | }else{//this.begin > other.getBegin() 132 | return 1; 133 | } 134 | } 135 | 136 | public int getOffset() { 137 | return offset; 138 | } 139 | 140 | public void setOffset(int offset) { 141 | this.offset = offset; 142 | } 143 | 144 | public int getBegin() { 145 | return begin; 146 | } 147 | /** 148 | * 获取词元在文本中的起始位置 149 | * @return int 150 | */ 151 | public int getBeginPosition(){ 152 | return offset + begin; 153 | } 154 | 155 | public void setBegin(int begin) { 156 | this.begin = begin; 157 | } 158 | 159 | /** 160 | * 获取词元在文本中的结束位置 161 | * @return int 162 | */ 163 | public int getEndPosition(){ 164 | return offset + begin + length; 165 | } 166 | 167 | /** 168 | * 获取词元的字符长度 169 | * @return int 170 | */ 171 | public int getLength(){ 172 | return this.length; 173 | } 174 | 175 | public void setLength(int length) { 176 | if(this.length < 0){ 177 | throw new IllegalArgumentException("length < 0"); 178 | } 179 | this.length = length; 180 | } 181 | 182 | /** 183 | * 获取词元的文本内容 184 | * @return String 185 | */ 186 | public String getLexemeText() { 187 | if(lexemeText == null){ 188 | return ""; 189 | } 190 | return lexemeText; 191 | } 192 | 193 | public void setLexemeText(String lexemeText) { 194 | if(lexemeText == null){ 195 | this.lexemeText = ""; 196 | this.length = 0; 197 | }else{ 198 | this.lexemeText = lexemeText; 199 | this.length = lexemeText.length(); 200 | } 201 | } 202 | 203 | /** 204 | * 获取词元类型 205 | * @return int 206 | */ 207 | public int getLexemeType() { 208 | return lexemeType; 209 | } 210 | 211 | /** 212 | * 获取词元类型标示字符串 213 | * @return String 214 | */ 215 | public String getLexemeTypeString(){ 216 | switch(lexemeType) { 217 | 218 | case TYPE_ENGLISH : 219 | return "ENGLISH"; 220 | 221 | case TYPE_ARABIC : 222 | return "ARABIC"; 223 | 224 | case TYPE_LETTER : 225 | return "LETTER"; 226 | 227 | case TYPE_CNWORD : 228 | return "CN_WORD"; 229 | 230 | case TYPE_CNCHAR : 231 | return "CN_CHAR"; 232 | 233 | case TYPE_OTHER_CJK : 234 | return "OTHER_CJK"; 235 | 236 | case TYPE_COUNT : 237 | return "COUNT"; 238 | 239 | case TYPE_CNUM : 240 | return "TYPE_CNUM"; 241 | 242 | case TYPE_CQUAN: 243 | return "TYPE_CQUAN"; 244 | 245 | default : 246 | return "UNKONW"; 247 | } 248 | } 249 | 250 | 251 | public void setLexemeType(int lexemeType) { 252 | this.lexemeType = lexemeType; 253 | } 254 | 255 | /** 256 | * 合并两个相邻的词元 257 | * @param l 258 | * @param lexemeType 259 | * @return boolean 词元是否成功合并 260 | */ 261 | public boolean append(Lexeme l , int lexemeType){ 262 | if(l != null && this.getEndPosition() == l.getBeginPosition()){ 263 | this.length += l.getLength(); 264 | this.lexemeType = lexemeType; 265 | return true; 266 | }else { 267 | return false; 268 | } 269 | } 270 | 271 | 272 | /** 273 | * 274 | */ 275 | public String toString(){ 276 | StringBuffer strbuf = new StringBuffer(); 277 | strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition()); 278 | strbuf.append(" : ").append(this.lexemeText).append(" : \t"); 279 | strbuf.append(this.getLexemeTypeString()); 280 | return strbuf.toString(); 281 | } 282 | 283 | 284 | } 285 | -------------------------------------------------------------------------------- 
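
> The Lexeme class just above is the unit that every collection in this package orders and de-duplicates: compareTo() sorts by relative begin position and, on a tie, puts the longer lexeme first, while equals()/hashCode() look only at offset, begin and length. A small sketch with made-up positions:

```java
import org.wltea.analyzer.core.Lexeme;

public class LexemeOrderDemo {
    public static void main(String[] args) {
        Lexeme a = new Lexeme(0, 0, 2, Lexeme.TYPE_CNWORD); // covers chars 0-1
        Lexeme b = new Lexeme(0, 0, 4, Lexeme.TYPE_CNWORD); // same start, longer (chars 0-3)
        Lexeme c = new Lexeme(0, 3, 1, Lexeme.TYPE_CNCHAR); // starts later

        System.out.println(a.compareTo(b)); //  1 : same begin, the longer lexeme sorts first
        System.out.println(b.compareTo(a)); // -1
        System.out.println(a.compareTo(c)); // -1 : earlier begin sorts first

        // The lexeme type is ignored by equals(): two lexemes covering the
        // same span are treated as duplicates by QuickSortSet / LexemePath.
        System.out.println(a.equals(new Lexeme(0, 0, 2, Lexeme.TYPE_CNCHAR))); // true
    }
}
```
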
/mlcsseg-ik/src/main/java/org/wltea/analyzer/core/LexemePath.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import org.wltea.analyzer.dic.Dictionary; 28 | 29 | 30 | /** 31 | * Lexeme链(路径) 32 | */ 33 | class LexemePath extends QuickSortSet implements Comparable{ 34 | 35 | //起始位置 36 | private int pathBegin; 37 | //结束 38 | private int pathEnd; 39 | //词元链的有效字符长度 40 | private int payloadLength; 41 | 42 | private char[] sentenceContent; //原始输入内容 43 | private int absBegin; //交集的绝对起始处----区别于词元 44 | private int absLength; //交集的绝对长度 45 | 46 | private float _result=-1.0f; //存储返回量化后的结果 47 | 48 | LexemePath(){ 49 | this.pathBegin = -1; 50 | this.pathEnd = -1; 51 | this.payloadLength = 0; 52 | } 53 | 54 | LexemePath(char[] context,int absBegin ,int fullTextLength) 55 | { 56 | this.pathBegin = -1; 57 | this.pathEnd = -1; 58 | this.payloadLength = 0; 59 | //System.arraycopy(context, 0,sentenceContent, 0, 100); 60 | this.sentenceContent = context; 61 | this.absBegin = absBegin; 62 | this.absLength = fullTextLength; 63 | } 64 | /** 65 | * 向LexemePath追加相交的Lexeme 66 | * @param lexeme 67 | * @return 68 | */ 69 | boolean addCrossLexeme(Lexeme lexeme){ 70 | if(this.isEmpty()){ 71 | this.addLexeme(lexeme); 72 | this.pathBegin = lexeme.getBegin(); 73 | this.pathEnd = lexeme.getBegin() + lexeme.getLength(); 74 | this.payloadLength += lexeme.getLength(); 75 | return true; 76 | 77 | }else if(this.checkCross(lexeme)){ 78 | this.addLexeme(lexeme); 79 | if(lexeme.getBegin() + lexeme.getLength() > this.pathEnd){ 80 | this.pathEnd = lexeme.getBegin() + lexeme.getLength(); 81 | } 82 | this.payloadLength = this.pathEnd - this.pathBegin; //此处payloadLength,交集处不算?end减原来的begin 83 | return true; 84 | 85 | }else{ 86 | return false; 87 | 88 | } 89 | } 90 | 91 | /** 92 | * 向LexemePath追加不相交的Lexeme 93 | * @param lexeme 94 | * @return 95 | */ 96 | boolean addNotCrossLexeme(Lexeme lexeme){ 97 | if(this.isEmpty()){ 98 | this.addLexeme(lexeme); 99 | this.pathBegin = lexeme.getBegin(); 100 | this.pathEnd = lexeme.getBegin() + lexeme.getLength(); 101 | this.payloadLength += lexeme.getLength(); 102 | return true; 103 | 104 | }else if(this.checkCross(lexeme)){ 105 | return false; 106 | 107 | }else{ 108 | this.addLexeme(lexeme); 109 | this.payloadLength += lexeme.getLength(); 110 | Lexeme head = this.peekFirst(); 111 | this.pathBegin = head.getBegin(); 112 | Lexeme tail = this.peekLast(); 113 | this.pathEnd = 
tail.getBegin() + tail.getLength(); 114 | return true; 115 | 116 | } 117 | } 118 | 119 | /** 120 | * 移除尾部的Lexeme 121 | * @return 122 | */ 123 | Lexeme removeTail(){ 124 | Lexeme tail = this.pollLast(); 125 | if(this.isEmpty()){ 126 | this.pathBegin = -1; 127 | this.pathEnd = -1; 128 | this.payloadLength = 0; 129 | }else{ 130 | this.payloadLength -= tail.getLength(); 131 | Lexeme newTail = this.peekLast(); 132 | this.pathEnd = newTail.getBegin() + newTail.getLength(); 133 | } 134 | return tail; 135 | } 136 | 137 | /** 138 | * 检测词元位置交叉(有歧义的切分) 139 | * @param lexeme 140 | * @return 141 | */ 142 | boolean checkCross(Lexeme lexeme){ 143 | return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd) 144 | || (this.pathBegin >= lexeme.getBegin() && this.pathBegin < lexeme.getBegin()+ lexeme.getLength()); 145 | } 146 | 147 | int getPathBegin() { 148 | return pathBegin; 149 | } 150 | 151 | int getPathEnd() { 152 | return pathEnd; 153 | } 154 | 155 | /** 156 | * 获取Path的有效词长 157 | * @return 158 | */ 159 | int getPayloadLength(){ 160 | return this.payloadLength; 161 | } 162 | 163 | /** 164 | * 获取LexemePath的路径长度 165 | * @return 166 | */ 167 | int getPathLength(){ 168 | return this.pathEnd - this.pathBegin; 169 | } 170 | 171 | 172 | /** 173 | * X权重(词元长度积),长度越平均,值越大 174 | * @return 175 | */ 176 | int getXWeight(){ 177 | int product = 1; 178 | Cell c = this.getHead(); 179 | while( c != null && c.getLexeme() != null){ 180 | product *= c.getLexeme().getLength(); 181 | c = c.getNext(); 182 | } 183 | return product; 184 | } 185 | 186 | /** 187 | * 词元位置权重,切分结果词元越多,值为大 188 | * @return 189 | */ 190 | int getPWeight(){ 191 | int pWeight = 0; 192 | int p = 0; 193 | Cell c = this.getHead(); 194 | while( c != null && c.getLexeme() != null){ 195 | p++; 196 | //pWeight += c.getLexeme().getBegin() * c.getLexeme().getLength(); 197 | pWeight += p * c.getLexeme().getLength(); 198 | c = c.getNext(); 199 | } 200 | return pWeight; 201 | } 202 | 203 | LexemePath copy(){ 204 | LexemePath theCopy = new LexemePath(); 205 | theCopy.pathBegin = this.pathBegin; 206 | theCopy.pathEnd = this.pathEnd; 207 | theCopy.payloadLength = this.payloadLength; 208 | 209 | theCopy.sentenceContent = this.sentenceContent; 210 | theCopy.absBegin = this.absBegin; 211 | theCopy.absLength = this.absLength; 212 | 213 | Cell c = this.getHead(); 214 | while( c != null && c.getLexeme() != null){ 215 | theCopy.addLexeme(c.getLexeme()); 216 | c = c.getNext(); 217 | } 218 | return theCopy; 219 | } 220 | 221 | public int compareTo(LexemePath o) { 222 | float nowResult,OriginResult; 223 | nowResult = this.calcResult(); 224 | OriginResult = o.calcResult(); 225 | 226 | if( nowResult > OriginResult ) 227 | { 228 | return 1; 229 | } 230 | else if(nowResult < OriginResult) 231 | { 232 | return -1; 233 | } 234 | else 235 | { 236 | if(this.pathEnd > o.pathEnd) 237 | { 238 | return 1; 239 | } 240 | else if(pathEnd < o.pathEnd) 241 | { 242 | return -1; 243 | } 244 | } 245 | return 0; 246 | } 247 | 248 | private float calcResult(){ 249 | if(_result == -1.0f) //未被计算过 250 | { 251 | _result= (this.payloadLength*10) + (this.size()*(-5)) + this.getPathLength()+this.getXWeight()+this.getPWeight(); 252 | 253 | /*存在单字 254 | *①、判断单字的个数,进行单字定位,用于获取 255 | *②、在单字字典进行查找,是否存在,取其概率值 256 | * */ 257 | if(this.payloadLength < this.absLength) //存在单字 258 | { 259 | int curPoint; 260 | Cell head = this.getHead(); 261 | curPoint = this.absBegin; //从路径绝对起始处开始扫描 262 | float sumFreq=0; 263 | char singleChar=0; 264 | while(head != null){ 265 | while(curPoint 0){//词元接入链表头部 66 | 
this.head.prev = newCell; 67 | newCell.next = this.head; 68 | this.head = newCell; 69 | this.size++; 70 | return true; 71 | 72 | }else{ 73 | //从尾部上逆 74 | Cell index = this.tail; 75 | while(index != null && index.compareTo(newCell) > 0){ 76 | index = index.prev; 77 | } 78 | if(index.compareTo(newCell) == 0){//词元与集合中的词元重复,不放入集合 79 | return false; 80 | 81 | }else if(index.compareTo(newCell) < 0){//词元插入链表中的某个位置 82 | newCell.prev = index; 83 | newCell.next = index.next; 84 | index.next.prev = newCell; 85 | index.next = newCell; 86 | this.size++; 87 | return true; 88 | } 89 | } 90 | } 91 | return false; 92 | } 93 | 94 | /** 95 | * 返回链表头部元素 96 | * @return 97 | */ 98 | Lexeme peekFirst(){ 99 | if(this.head != null){ 100 | return this.head.lexeme; 101 | } 102 | return null; 103 | } 104 | 105 | /** 106 | * 取出链表集合的第一个元素 107 | * @return Lexeme 108 | */ 109 | Lexeme pollFirst(){ 110 | if(this.size == 1){ 111 | Lexeme first = this.head.lexeme; 112 | this.head = null; 113 | this.tail = null; 114 | this.size--; 115 | return first; 116 | }else if(this.size > 1){ 117 | Lexeme first = this.head.lexeme; 118 | this.head = this.head.next; 119 | this.size --; 120 | return first; 121 | }else{ 122 | return null; 123 | } 124 | } 125 | 126 | /** 127 | * 返回链表尾部元素 128 | * @return 129 | */ 130 | Lexeme peekLast(){ 131 | if(this.tail != null){ 132 | return this.tail.lexeme; 133 | } 134 | return null; 135 | } 136 | 137 | /** 138 | * 取出链表集合的最后一个元素 139 | * @return Lexeme 140 | */ 141 | Lexeme pollLast(){ 142 | if(this.size == 1){ 143 | Lexeme last = this.head.lexeme; 144 | this.head = null; 145 | this.tail = null; 146 | this.size--; 147 | return last; 148 | 149 | }else if(this.size > 1){ 150 | Lexeme last = this.tail.lexeme; 151 | this.tail = this.tail.prev; 152 | this.size--; 153 | return last; 154 | 155 | }else{ 156 | return null; 157 | } 158 | } 159 | 160 | /** 161 | * 返回集合大小 162 | * @return 163 | */ 164 | int size(){ 165 | return this.size; 166 | } 167 | 168 | /** 169 | * 判断集合是否为空 170 | * @return 171 | */ 172 | boolean isEmpty(){ 173 | return this.size == 0; 174 | } 175 | 176 | /** 177 | * 返回lexeme链的头部 178 | * @return 179 | */ 180 | Cell getHead(){ 181 | return this.head; 182 | } 183 | 184 | /** 185 | * 186 | * IK 中文分词 版本 5.0 187 | * IK Analyzer release 5.0 188 | * 189 | * Licensed to the Apache Software Foundation (ASF) under one or more 190 | * contributor license agreements. See the NOTICE file distributed with 191 | * this work for additional information regarding copyright ownership. 192 | * The ASF licenses this file to You under the Apache License, Version 2.0 193 | * (the "License"); you may not use this file except in compliance with 194 | * the License. You may obtain a copy of the License at 195 | * 196 | * http://www.apache.org/licenses/LICENSE-2.0 197 | * 198 | * Unless required by applicable law or agreed to in writing, software 199 | * distributed under the License is distributed on an "AS IS" BASIS, 200 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | * See the License for the specific language governing permissions and 202 | * limitations under the License. 
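
> QuickSortSet (and LexemePath, which extends it) keeps lexemes in a doubly linked list that stays sorted by Lexeme.compareTo() and drops exact duplicates on insert. A sketch of that behaviour through LexemePath; it assumes compilation into org.wltea.analyzer.core, since both classes are package-private, and the positions are made up:

```java
package org.wltea.analyzer.core;

// Hypothetical demo class; exercises the ordered, de-duplicating insert
// that LexemePath inherits from QuickSortSet.
public class QuickSortSetDemo {
    public static void main(String[] args) {
        LexemePath path = new LexemePath();

        path.addLexeme(new Lexeme(0, 3, 2, Lexeme.TYPE_CNWORD)); // inserted first, starts at position 3
        path.addLexeme(new Lexeme(0, 0, 2, Lexeme.TYPE_CNWORD)); // smaller begin, becomes the new head

        // Same offset/begin/length as the first lexeme -> compareTo() == 0,
        // so addLexeme() rejects it and the size stays at 2.
        boolean added = path.addLexeme(new Lexeme(0, 3, 2, Lexeme.TYPE_CNCHAR));

        System.out.println(added);                        // false
        System.out.println(path.size());                  // 2
        System.out.println(path.peekFirst().getBegin());  // 0
        System.out.println(path.peekLast().getBegin());   // 3
    }
}
```
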
203 | * 204 | * 源代码由林良益(linliangyi2005@gmail.com)提供 205 | * 版权声明 2012,乌龙茶工作室 206 | * provided by Linliangyi and copyright 2012 by Oolong studio 207 | * 208 | * QuickSortSet集合单元 209 | * 210 | */ 211 | class Cell implements Comparable{ 212 | private Cell prev; 213 | private Cell next; 214 | private Lexeme lexeme; 215 | 216 | Cell(Lexeme lexeme){ 217 | if(lexeme == null){ 218 | throw new IllegalArgumentException("lexeme must not be null"); 219 | } 220 | this.lexeme = lexeme; 221 | } 222 | 223 | public int compareTo(Cell o) { 224 | return this.lexeme.compareTo(o.lexeme); 225 | } 226 | 227 | public Cell getPrev(){ 228 | return this.prev; 229 | } 230 | 231 | public Cell getNext(){ 232 | return this.next; 233 | } 234 | 235 | public Lexeme getLexeme(){ 236 | return this.lexeme; 237 | } 238 | } 239 | } 240 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/dic/DictCharNode.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.dic; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | public class DictCharNode { 7 | private static final Map charMap = new HashMap(1024,0.8f); 8 | 9 | void addChar(Character key,Float logFreq) 10 | { 11 | charMap.put(key, logFreq); 12 | //(int)(Math.log(Integer.parseInt(w[1]))*100),默认给0 13 | } 14 | 15 | float getCharFreq(Character singleChar) 16 | { 17 | float freq=-2.0f; //非单字,则表示该路径切分存在某些问题 18 | if(charMap.containsKey(singleChar)) //如果存在 19 | { 20 | freq = charMap.get(singleChar); 21 | } 22 | return freq; 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/dic/DictSegment.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * IK 中文分词 版本 5.0 4 | * IK Analyzer release 5.0 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one or more 7 | * contributor license agreements. See the NOTICE file distributed with 8 | * this work for additional information regarding copyright ownership. 9 | * The ASF licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 
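
> DictCharNode above is the single-character frequency table used when a candidate split leaves stray single characters: Dictionary stores log10(count + 5) per character from chars.dic, and lookups of characters not in the table fall back to -2.0f (the code's own comment treats that as a sign of a questionable split). A sketch; it assumes compilation into org.wltea.analyzer.dic (addChar/getCharFreq are package-private) and uses a made-up count:

```java
package org.wltea.analyzer.dic;

// Hypothetical demo class; mirrors what Dictionary.loadCharFreqDict() does
// with one "<char> <count>" line from chars.dic.
public class DictCharNodeDemo {
    public static void main(String[] args) {
        DictCharNode charFreq = new DictCharNode();

        charFreq.addChar('的', (float) Math.log10(100000 + 5)); // the count 100000 is made up

        System.out.println(charFreq.getCharFreq('的')); // ~5.0
        System.out.println(charFreq.getCharFreq('犇')); // -2.0, not in the table
    }
}
```
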
20 | * 21 | * 源代码由林良益(linliangyi2005@gmail.com)提供 22 | * 版权声明 2012,乌龙茶工作室 23 | * provided by Linliangyi and copyright 2012 by Oolong studio 24 | * 25 | */ 26 | package org.wltea.analyzer.dic; 27 | 28 | import java.util.Arrays; 29 | import java.util.HashMap; 30 | import java.util.Map; 31 | 32 | /** 33 | * 词典树分段,表示词典树的一个分枝 34 | */ 35 | class DictSegment implements Comparable{ 36 | 37 | //公用字典表,存储汉字 38 | //private static final Map charMap = new HashMap(16 , 0.95f); 39 | //数组大小上限 40 | private static final int ARRAY_LENGTH_LIMIT = 3; 41 | 42 | 43 | //Map存储结构 44 | private Map childrenMap; 45 | //数组方式存储结构 46 | private DictSegment[] childrenArray; 47 | 48 | 49 | //当前节点上存储的字符 50 | private Character nodeChar; 51 | //当前节点存储的Segment数目 52 | //storeSize <=ARRAY_LENGTH_LIMIT ,使用数组存储, storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储 53 | private int storeSize = 0; 54 | //当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词 55 | private int nodeState = 0; 56 | 57 | 58 | DictSegment(Character nodeChar){ 59 | if(nodeChar == null){ 60 | throw new IllegalArgumentException("参数为空异常,字符不能为空"); 61 | } 62 | this.nodeChar = nodeChar; 63 | } 64 | 65 | Character getNodeChar() { 66 | return nodeChar; 67 | } 68 | 69 | /* 70 | * 判断是否有下一个节点 71 | */ 72 | boolean hasNextNode(){ 73 | return this.storeSize > 0; 74 | } 75 | 76 | /** 77 | * 匹配词段 78 | * @param charArray 79 | * @return Hit 80 | */ 81 | Hit match(char[] charArray){ 82 | return this.match(charArray , 0 , charArray.length , null); 83 | } 84 | 85 | /** 86 | * 匹配词段 87 | * @param charArray 88 | * @param begin 89 | * @param length 90 | * @return Hit 91 | */ 92 | Hit match(char[] charArray , int begin , int length){ 93 | return this.match(charArray , begin , length , null); 94 | } 95 | 96 | /** 97 | * 匹配词段 98 | * @param charArray 99 | * @param begin 100 | * @param length 101 | * @param searchHit 102 | * @return Hit 103 | */ 104 | Hit match(char[] charArray , int begin , int length , Hit searchHit){ 105 | 106 | if(searchHit == null){ 107 | //如果hit为空,新建 108 | searchHit= new Hit(); 109 | //设置hit的其实文本位置 110 | searchHit.setBegin(begin); 111 | }else{ 112 | //否则要将HIT状态重置 113 | searchHit.setUnmatch(); 114 | } 115 | //设置hit的当前处理位置 116 | searchHit.setEnd(begin); 117 | 118 | Character keyChar = new Character(charArray[begin]); 119 | DictSegment ds = null; 120 | 121 | //引用实例变量为本地变量,避免查询时遇到更新的同步问题 122 | DictSegment[] segmentArray = this.childrenArray; 123 | Map segmentMap = this.childrenMap; 124 | 125 | //STEP1 在节点中查找keyChar对应的DictSegment 126 | if(segmentArray != null){ 127 | //在数组中查找 128 | DictSegment keySegment = new DictSegment(keyChar); 129 | int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize , keySegment); 130 | if(position >= 0){ 131 | ds = segmentArray[position]; 132 | } 133 | 134 | }else if(segmentMap != null){ 135 | //在map中查找 136 | ds = (DictSegment)segmentMap.get(keyChar); 137 | } 138 | 139 | //STEP2 找到DictSegment,判断词的匹配状态,是否继续递归,还是返回结果 140 | if(ds != null){ 141 | if(length > 1){ 142 | //词未匹配完,继续往下搜索 143 | return ds.match(charArray, begin + 1 , length - 1 , searchHit); 144 | }else if (length == 1){ 145 | 146 | //搜索最后一个char 147 | if(ds.nodeState == 1){ 148 | //添加HIT状态为完全匹配 149 | searchHit.setMatch(); 150 | } 151 | if(ds.hasNextNode()){ 152 | //添加HIT状态为前缀匹配 153 | searchHit.setPrefix(); 154 | //记录当前位置的DictSegment 155 | searchHit.setMatchedDictSegment(ds); 156 | } 157 | return searchHit; 158 | } 159 | 160 | } 161 | //STEP3 没有找到DictSegment, 将HIT设置为不匹配 162 | return searchHit; 163 | } 164 | 165 | /** 166 | * 加载填充词典片段 167 | * @param charArray 168 | */ 169 | void fillSegment(char[] 
charArray){ 170 | this.fillSegment(charArray, 0 , charArray.length , 1); 171 | } 172 | 173 | /** 174 | * 屏蔽词典中的一个词 175 | * @param charArray 176 | */ 177 | void disableSegment(char[] charArray){ 178 | this.fillSegment(charArray, 0 , charArray.length , 0); 179 | } 180 | 181 | /** 182 | * 加载填充词典片段 183 | * @param charArray 184 | * @param begin 185 | * @param length 186 | * @param enabled 187 | */ 188 | private synchronized void fillSegment(char[] charArray , int begin , int length , int enabled){ 189 | //获取字典表中的汉字对象 190 | Character beginChar = new Character(charArray[begin]); 191 | /*Character keyChar = charMap.get(beginChar); 192 | //字典中没有该字,则将其添加入字典 193 | if(keyChar == null){ 194 | charMap.put(beginChar, beginChar); 195 | keyChar = beginChar; 196 | }*/ 197 | 198 | //搜索当前节点的存储,查询对应keyChar的keyChar,如果没有则创建 199 | DictSegment ds = lookforSegment(beginChar , enabled); 200 | if(ds != null){ 201 | //处理keyChar对应的segment 202 | if(length > 1){ 203 | //词元还没有完全加入词典树 204 | ds.fillSegment(charArray, begin + 1, length - 1 , enabled); 205 | }else if (length == 1){ 206 | //已经是词元的最后一个char,设置当前节点状态为enabled, 207 | //enabled=1表明一个完整的词,enabled=0表示从词典中屏蔽当前词 208 | ds.nodeState = enabled; 209 | } 210 | } 211 | 212 | } 213 | 214 | /** 215 | * 查找本节点下对应的keyChar的segment * 216 | * @param keyChar 217 | * @param create =1如果没有找到,则创建新的segment ; =0如果没有找到,不创建,返回null 218 | * @return 219 | */ 220 | private DictSegment lookforSegment(Character keyChar , int create){ 221 | 222 | DictSegment ds = null; 223 | 224 | if(this.storeSize <= ARRAY_LENGTH_LIMIT){ 225 | //获取数组容器,如果数组未创建则创建数组 226 | DictSegment[] segmentArray = getChildrenArray(); 227 | //搜寻数组 228 | DictSegment keySegment = new DictSegment(keyChar); 229 | int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize, keySegment); 230 | if(position >= 0){ 231 | ds = segmentArray[position]; 232 | } 233 | 234 | //遍历数组后没有找到对应的segment 235 | if(ds == null && create == 1){ 236 | ds = keySegment; 237 | if(this.storeSize < ARRAY_LENGTH_LIMIT){ 238 | //数组容量未满,使用数组存储 239 | segmentArray[this.storeSize] = ds; 240 | //segment数目+1 241 | this.storeSize++; 242 | Arrays.sort(segmentArray , 0 , this.storeSize); 243 | 244 | }else{ 245 | //数组容量已满,切换Map存储 246 | //获取Map容器,如果Map未创建,则创建Map 247 | Map segmentMap = getChildrenMap(); 248 | //将数组中的segment迁移到Map中 249 | migrate(segmentArray , segmentMap); 250 | //存储新的segment 251 | segmentMap.put(keyChar, ds); 252 | //segment数目+1 , 必须在释放数组前执行storeSize++ , 确保极端情况下,不会取到空的数组 253 | this.storeSize++; 254 | //释放当前的数组引用 255 | this.childrenArray = null; 256 | } 257 | 258 | } 259 | 260 | }else{ 261 | //获取Map容器,如果Map未创建,则创建Map 262 | Map segmentMap = getChildrenMap(); 263 | //搜索Map 264 | ds = (DictSegment)segmentMap.get(keyChar); 265 | if(ds == null && create == 1){ 266 | //构造新的segment 267 | ds = new DictSegment(keyChar); 268 | segmentMap.put(keyChar , ds); 269 | //当前节点存储segment数目+1 270 | this.storeSize ++; 271 | } 272 | } 273 | 274 | return ds; 275 | } 276 | 277 | 278 | /** 279 | * 获取数组容器 280 | * 线程同步方法 281 | */ 282 | private DictSegment[] getChildrenArray(){ 283 | if(this.childrenArray == null){ 284 | synchronized(this){ 285 | if(this.childrenArray == null){ 286 | this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT]; 287 | } 288 | } 289 | } 290 | return this.childrenArray; 291 | } 292 | 293 | /** 294 | * 获取Map容器 295 | * 线程同步方法 296 | */ 297 | private Map getChildrenMap(){ 298 | if(this.childrenMap == null){ 299 | synchronized(this){ 300 | if(this.childrenMap == null){ 301 | this.childrenMap = new HashMap(ARRAY_LENGTH_LIMIT * 2,0.8f); 302 | } 303 | } 304 | } 
305 | return this.childrenMap; 306 | } 307 | 308 | /** 309 | * 将数组中的segment迁移到Map中 310 | * @param segmentArray 311 | */ 312 | private void migrate(DictSegment[] segmentArray , Map segmentMap){ 313 | for(DictSegment segment : segmentArray){ 314 | if(segment != null){ 315 | segmentMap.put(segment.nodeChar, segment); 316 | } 317 | } 318 | } 319 | 320 | /** 321 | * 实现Comparable接口 322 | * @param o 323 | * @return int 324 | */ 325 | public int compareTo(DictSegment o) { 326 | //对当前节点存储的char进行比较 327 | return this.nodeChar.compareTo(o.nodeChar); 328 | } 329 | 330 | } 331 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/dic/Dictionary.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 25 | */ 26 | package org.wltea.analyzer.dic; 27 | 28 | import java.io.BufferedReader; 29 | import java.io.IOException; 30 | import java.io.InputStream; 31 | import java.io.InputStreamReader; 32 | import java.util.Collection; 33 | import java.util.List; 34 | import org.wltea.analyzer.cfg.Configuration; 35 | import org.wltea.analyzer.cfg.DefaultConfig; 36 | 37 | /** 38 | * 词典管理类,单子模式 39 | */ 40 | public class Dictionary { 41 | 42 | 43 | /* 44 | * 词典单子实例 45 | */ 46 | private volatile static Dictionary singleton; 47 | 48 | /* 49 | * 主词典对象 50 | */ 51 | private DictSegment _MainDict; 52 | 53 | /* 54 | * 停止词词典 55 | */ 56 | //private DictSegment _StopWordDict; 57 | /* 58 | * 量词词典 59 | */ 60 | private DictSegment _QuantifierDict; 61 | /* 62 | * 单字带词频词典 63 | */ 64 | private DictCharNode _CharFreqDict; 65 | /* 66 | * 配置对象 67 | */ 68 | private Configuration cfg; 69 | 70 | private Dictionary(Configuration cfg){ 71 | this.cfg = cfg; 72 | //建立一个主词典实例 73 | _MainDict = new DictSegment((char)0); 74 | this.loadMainDict(_MainDict); 75 | 76 | /*_StopWordDict = new DictSegment((char)0); 77 | this.loadStopWordDict(_StopWordDict);*/ 78 | 79 | this.loadQuantifierDict(); 80 | this.loadCharFreqDict(); 81 | 82 | } 83 | 84 | /** 85 | * 词典初始化 86 | * 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化 87 | * 只有当Dictionary类被实际调用时,才会开始载入词典, 88 | * 这将延长首次分词操作的时间 89 | * 该方法提供了一个在应用加载阶段就初始化字典的手段 90 | * @return Dictionary 91 | */ 92 | public static Dictionary initial(Configuration cfg){ 93 | if(singleton == null){ 94 | synchronized(Dictionary.class){ 95 | if(singleton == null){ 96 | singleton = new Dictionary(cfg); 97 | return singleton; 98 | } 99 | } 100 | } 101 | return singleton; 102 | } 103 | 104 | /** 105 | 
* 把solr配置的字典加入到MainDic中,进行字典切换 106 | * @param inputStreamList 多字典输入流 107 | * @return 108 | */ 109 | public static synchronized Dictionary addDic2MainDic(List inputStreamList) 110 | { 111 | if(singleton == null) 112 | { 113 | Configuration cfg = DefaultConfig.getInstance(); 114 | Dictionary.initial(cfg); 115 | } 116 | 117 | DictSegment mainDicTemp = new DictSegment((char)0); 118 | 119 | System.out.println("begin load MainDict :"); 120 | singleton.loadMainDict(mainDicTemp); 121 | 122 | System.out.println("begin loadSolrMainDict by List:"); 123 | for(InputStream is : inputStreamList) 124 | { 125 | singleton.loadWords2DictSegment(is, mainDicTemp); 126 | } 127 | 128 | singleton._MainDict = mainDicTemp; 129 | System.out.println("*********************************"); 130 | System.out.println("end switch!!"); 131 | System.out.println("*********************************"); 132 | 133 | mainDicTemp = null; 134 | 135 | return singleton; 136 | } 137 | 138 | /** 139 | * 获取词典单子实例 140 | * @return Dictionary 单例对象 141 | */ 142 | public static Dictionary getSingleton(){ 143 | if(singleton == null){ 144 | throw new IllegalStateException("词典尚未初始化,请先调用initial方法"); 145 | } 146 | return singleton; 147 | } 148 | 149 | /** 150 | * 批量加载新词条 151 | * @param words Collection词条列表 152 | */ 153 | public void addWords(Collection words){ 154 | if(words != null){ 155 | for(String word : words){ 156 | if (word != null) { 157 | //批量加载词条到主内存词典中 158 | singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray()); 159 | } 160 | } 161 | } 162 | } 163 | 164 | /** 165 | * 批量移除(屏蔽)词条 166 | * @param words 167 | */ 168 | public void disableWords(Collection words){ 169 | if(words != null){ 170 | for(String word : words){ 171 | if (word != null) { 172 | //批量屏蔽词条 173 | singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray()); 174 | } 175 | } 176 | } 177 | } 178 | 179 | /** 180 | * 检索匹配主词典 181 | * @param charArray 182 | * @return Hit 匹配结果描述 183 | */ 184 | public Hit matchInMainDict(char[] charArray){ 185 | return singleton._MainDict.match(charArray); 186 | } 187 | 188 | /** 189 | * 检索匹配主词典 190 | * @param charArray 191 | * @param begin 192 | * @param length 193 | * @return Hit 匹配结果描述 194 | */ 195 | public Hit matchInMainDict(char[] charArray , int begin, int length){ 196 | return singleton._MainDict.match(charArray, begin, length); 197 | } 198 | 199 | /** 200 | * 检索匹配量词词典 201 | * @param charArray 202 | * @param begin 203 | * @param length 204 | * @return Hit 匹配结果描述 205 | */ 206 | public Hit matchInQuantifierDict(char[] charArray , int begin, int length){ 207 | return singleton._QuantifierDict.match(charArray, begin, length); 208 | } 209 | 210 | /** 211 | * 从已匹配的Hit中直接取出DictSegment,继续向下匹配 212 | * @param charArray 213 | * @param currentIndex 214 | * @param matchedHit 215 | * @return Hit 216 | */ 217 | public Hit matchWithHit(char[] charArray , int currentIndex , Hit matchedHit){ 218 | DictSegment ds = matchedHit.getMatchedDictSegment(); 219 | return ds.match(charArray, currentIndex, 1 , matchedHit); 220 | } 221 | 222 | 223 | /** 224 | * 判断是否是停止词 225 | * @param charArray 226 | * @param begin 227 | * @param length 228 | * @return boolean 229 | */ 230 | /*public boolean isStopWord(char[] charArray , int begin, int length){ 231 | return singleton._StopWordDict.match(charArray, begin, length).isMatch(); 232 | }*/ 233 | 234 | /** 235 | * 加载主词典及扩展词典 236 | */ 237 | private void loadMainDict(DictSegment dstDicSegment){ 238 | 239 | //读取主词典文件 240 | InputStream inputStream = 
this.getClass().getClassLoader().getResourceAsStream("main2012.dic"); 241 | if(inputStream == null){ 242 | throw new RuntimeException("Main Dictionary not found!!!"); 243 | } 244 | 245 | //System.out.println("test加载主字典"); 246 | this.loadWords2DictSegment(inputStream,dstDicSegment); 247 | 248 | //System.out.println("test加载扩展字典"); 249 | this.loadExtDict(dstDicSegment); 250 | 251 | } 252 | 253 | /** 254 | * 加载用户配置的扩展词典到主词库表 255 | */ 256 | private void loadExtDict(DictSegment dstDicSegment){ 257 | //加载扩展词典配置 258 | List extDictFiles = cfg.getExtDictionarys(); 259 | if(extDictFiles != null){ 260 | InputStream is = null; 261 | for(String extDictName : extDictFiles){ 262 | //读取扩展词典文件 263 | //System.out.println("加载扩展词典:" + extDictName); 264 | is = this.getClass().getClassLoader().getResourceAsStream(extDictName); 265 | //如果找不到扩展的字典,则忽略 266 | if(is == null){ 267 | continue; 268 | } 269 | loadWords2DictSegment(is,dstDicSegment); 270 | } 271 | } 272 | } 273 | 274 | /** 275 | * 276 | * @param is 字典数据输入流 277 | * @param dstDicSegment 目标字典 278 | */ 279 | 280 | private void loadWords2DictSegment(InputStream is,DictSegment dstDicSegment) { 281 | 282 | if(is != null) 283 | { 284 | try { 285 | BufferedReader br = new BufferedReader(new InputStreamReader(is, 286 | "UTF-8")); 287 | String theWord = null; 288 | do { 289 | theWord = br.readLine(); 290 | if (theWord != null ) { 291 | String line = theWord.trim(); 292 | if (!line.isEmpty() && !line.startsWith("#")){ 293 | String[] words = line.split("[\\s=,>]+"); 294 | for(String w :words) 295 | dstDicSegment.fillSegment(w.toLowerCase().toCharArray()); 296 | } 297 | } 298 | } while (theWord != null); 299 | 300 | } catch (IOException ioe) { 301 | System.err.println(" Dictionary loading exception。ClassName: " + dstDicSegment.getClass().getName()); 302 | ioe.printStackTrace(); 303 | 304 | } finally { 305 | try { 306 | if (is != null) { 307 | is.close(); 308 | is = null; 309 | } 310 | } catch (IOException e) { 311 | e.printStackTrace(); 312 | } 313 | } 314 | } 315 | } 316 | 317 | /** 318 | * 加载量词词典 319 | */ 320 | private void loadQuantifierDict(){ 321 | //建立一个量词典实例 322 | _QuantifierDict = new DictSegment((char)0); 323 | //读取量词词典文件 324 | InputStream is = this.getClass().getClassLoader().getResourceAsStream("quantifier.dic"); 325 | if(is == null){ 326 | throw new RuntimeException("Quantifier Dictionary not found!!!"); 327 | } 328 | loadWords2DictSegment(is, _QuantifierDict); 329 | } 330 | 331 | private void loadCharFreqDict(){ 332 | _CharFreqDict = new DictCharNode(); 333 | //读取量词词典文件 334 | InputStream is = this.getClass().getClassLoader().getResourceAsStream("chars.dic"); 335 | if(is == null){ 336 | throw new RuntimeException("Chars Dictionary not found!!!"); 337 | } 338 | try { //此处可以抽象出一个接口,或公用函数 339 | BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); 340 | String theWord = null; 341 | do { 342 | theWord = br.readLine(); 343 | if (theWord != null && !"".equals(theWord.trim())) { 344 | String[] w = theWord.split(" "); 345 | if(w.length == 2) 346 | { 347 | _CharFreqDict.addChar(w[0].charAt(0), (float)(Math.log10(Integer.parseInt(w[1])+5))); 348 | } 349 | /*else 350 | { 351 | _CharFreqDict.addChar(w[0].charAt(0), 0); //默认无词性该给多少权重?是否该存在(对于没词频数据)? 
352 | }*/ 353 | } 354 | } while (theWord != null); 355 | 356 | } catch (IOException ioe) { 357 | System.err.println("Chars Dictionary loading exception."); 358 | ioe.printStackTrace(); 359 | }finally{ 360 | try { 361 | if(is != null){ 362 | is.close(); 363 | is = null; 364 | } 365 | } catch (IOException e) { 366 | e.printStackTrace(); 367 | } 368 | } 369 | } 370 | 371 | public float getCharFreq(Character key) 372 | { 373 | return _CharFreqDict.getCharFreq(key); 374 | } 375 | 376 | } 377 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/dic/Hit.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * IK 中文分词 版本 5.0 4 | * IK Analyzer release 5.0 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one or more 7 | * contributor license agreements. See the NOTICE file distributed with 8 | * this work for additional information regarding copyright ownership. 9 | * The ASF licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * 21 | * 源代码由林良益(linliangyi2005@gmail.com)提供 22 | * 版权声明 2012,乌龙茶工作室 23 | * provided by Linliangyi and copyright 2012 by Oolong studio 24 | * 25 | */ 26 | package org.wltea.analyzer.dic; 27 | 28 | /** 29 | * 表示一次词典匹配的命中 30 | */ 31 | public class Hit { 32 | //Hit不匹配 33 | private static final int UNMATCH = 0x00000000; 34 | //Hit完全匹配 35 | private static final int MATCH = 0x00000001; 36 | //Hit前缀匹配 37 | private static final int PREFIX = 0x00000010; 38 | 39 | 40 | //该HIT当前状态,默认未匹配 41 | private int hitState = UNMATCH; 42 | 43 | //记录词典匹配过程中,当前匹配到的词典分支节点 44 | private DictSegment matchedDictSegment; 45 | /* 46 | * 词段开始位置 47 | */ 48 | private int begin; 49 | /* 50 | * 词段的结束位置 51 | */ 52 | private int end; 53 | 54 | 55 | /** 56 | * 判断是否完全匹配 57 | */ 58 | public boolean isMatch() { 59 | return (this.hitState & MATCH) > 0; 60 | } 61 | /** 62 | * 63 | */ 64 | public void setMatch() { 65 | this.hitState = this.hitState | MATCH; 66 | } 67 | 68 | /** 69 | * 判断是否是词的前缀 70 | */ 71 | public boolean isPrefix() { 72 | return (this.hitState & PREFIX) > 0; 73 | } 74 | /** 75 | * 76 | */ 77 | public void setPrefix() { 78 | this.hitState = this.hitState | PREFIX; 79 | } 80 | /** 81 | * 判断是否是不匹配 82 | */ 83 | public boolean isUnmatch() { 84 | return this.hitState == UNMATCH ; 85 | } 86 | /** 87 | * 88 | */ 89 | public void setUnmatch() { 90 | this.hitState = UNMATCH; 91 | } 92 | 93 | public DictSegment getMatchedDictSegment() { 94 | return matchedDictSegment; 95 | } 96 | 97 | public void setMatchedDictSegment(DictSegment matchedDictSegment) { 98 | this.matchedDictSegment = matchedDictSegment; 99 | } 100 | 101 | public int getBegin() { 102 | return begin; 103 | } 104 | 105 | public void setBegin(int begin) { 106 | this.begin = begin; 107 | } 108 | 109 | public int getEnd() { 110 | return end; 111 | } 112 | 113 | public void setEnd(int end) { 114 | this.end = end; 115 | } 116 | 117 | } 118 | 
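A minimal sketch, not part of the repository, of driving the dictionary hot-swap above by hand. Dictionary.addDic2MainDic builds a fresh trie from the bundled main2012.dic plus the supplied streams and only then assigns it to _MainDict, which is the same call IKTokenizerFactory.update() makes when Solr reloads the extension dictionaries. The file names and the sample word are assumptions borrowed from test1/conf; the list element type is InputStream, as the loop inside addDic2MainDic implies.

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;

public class DictReloadDemo {
    public static void main(String[] args) throws Exception {
        // hypothetical paths; under Solr these streams come from the core's conf directory
        List<InputStream> extDics = Arrays.<InputStream>asList(
                new FileInputStream("extDic.txt"),
                new FileInputStream("extDic1.txt"));

        // rebuild main2012.dic plus the given word lists into a new trie, then swap it in
        Dictionary dic = Dictionary.addDic2MainDic(extDics);

        // a word contained in extDic.txt should now fully match against the main dictionary
        Hit hit = dic.matchInMainDict("七匹狼".toCharArray());
        System.out.println("matched: " + hit.isMatch());
    }
}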
-------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0.1 3 | * IK Analyzer release 5.0.1 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | 25 | * 26 | */ 27 | package org.wltea.analyzer.lucene; 28 | 29 | import java.io.IOException; 30 | import java.io.Reader; 31 | 32 | import org.apache.lucene.analysis.Tokenizer; 33 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 34 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 35 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 36 | 37 | import org.wltea.analyzer.core.IKSegmenter; 38 | import org.wltea.analyzer.core.Lexeme; 39 | 40 | /** 41 | * IK分词器 Lucene Tokenizer适配器类 兼容Lucene 4.0版本 42 | */ 43 | public final class IKTokenizer extends Tokenizer { 44 | 45 | // IK分词器实现 46 | private IKSegmenter _IKImplement; 47 | 48 | // 词元文本属性 49 | private final CharTermAttribute termAtt; 50 | // 词元位移属性 51 | private final OffsetAttribute offsetAtt; 52 | // 词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量) 53 | private final TypeAttribute typeAtt; 54 | // 记录最后一个词元的结束位置 55 | private int endPosition; 56 | 57 | /** 58 | * Lucene 4.0 Tokenizer适配器类构造函数 59 | * 60 | * @param in 61 | * @param useSmart 62 | */ 63 | 64 | public IKTokenizer(Reader in, boolean useSmart) { 65 | super(in); 66 | offsetAtt = addAttribute(OffsetAttribute.class); 67 | termAtt = addAttribute(CharTermAttribute.class); 68 | typeAtt = addAttribute(TypeAttribute.class); 69 | _IKImplement = new IKSegmenter(input, useSmart); 70 | } 71 | 72 | /* 73 | * (non-Javadoc) 74 | * 75 | * @see org.apache.lucene.analysis.TokenStream#incrementToken() 76 | */ 77 | @Override 78 | public boolean incrementToken() throws IOException { 79 | // 清除所有的词元属性 80 | clearAttributes(); 81 | Lexeme nextLexeme = _IKImplement.next(); 82 | if (nextLexeme != null) { 83 | // 将Lexeme转成Attributes 84 | // 设置词元文本 85 | termAtt.append(nextLexeme.getLexemeText()); 86 | // 设置词元长度 87 | termAtt.setLength(nextLexeme.getLength()); 88 | // 设置词元位移 89 | offsetAtt.setOffset(nextLexeme.getBeginPosition(), 90 | nextLexeme.getEndPosition()); 91 | // 记录分词的最后位置 92 | endPosition = nextLexeme.getEndPosition(); 93 | // 记录词元分类 94 | typeAtt.setType(nextLexeme.getLexemeTypeString()); 95 | // 返会true告知还有下个词元 96 | return true; 97 | } 98 | // 返会false告知词元输出完毕 99 | return false; 100 | } 101 | 102 | /* 103 | * (non-Javadoc) 104 | * 105 | * @see 
org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader) 106 | */ 107 | @Override 108 | public void reset() throws IOException { 109 | super.reset(); 110 | _IKImplement.reset(input); 111 | } 112 | 113 | @Override 114 | public final void end() { 115 | // set final offset 116 | int finalOffset = correctOffset(this.endPosition); 117 | offsetAtt.setOffset(finalOffset, finalOffset); 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/lucene/IKTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.lucene; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.Reader; 6 | import java.util.List; 7 | import java.util.Map; 8 | import org.apache.lucene.analysis.Tokenizer; 9 | import org.apache.lucene.analysis.util.ResourceLoader; 10 | import org.apache.lucene.util.AttributeSource.AttributeFactory; 11 | import org.wltea.analyzer.dic.Dictionary; 12 | 13 | import com.mlcs.search.mlcsseg.lucene.ReloadableTokenizerFactory; 14 | import com.mlcs.search.mlcsseg.lucene.ReloaderRegister; 15 | 16 | public class IKTokenizerFactory extends ReloadableTokenizerFactory { 17 | 18 | 19 | public IKTokenizerFactory(Map args) { 20 | super(args); 21 | 22 | useSmart = getBoolean(args, "useSmart", false); 23 | System.out.println(":::ik:construction::::::::::::::::::::::::::" + conf); 24 | } 25 | private boolean useSmart = false; 26 | 27 | private boolean useSmart() { 28 | return useSmart; 29 | } 30 | 31 | 32 | // 通过这个实现,调用自身分词器 33 | public Tokenizer create(AttributeFactory attributeFactory, Reader in) { // 会多次被调用 34 | return new IKTokenizer(in, this.useSmart()); // 初始化词典,分词器,消歧器 35 | } 36 | 37 | public void inform(ResourceLoader loader) throws IOException { // 在启动时初始化一次 38 | System.out.println(":::ik:::inform::::::::::::::::::::::::" + conf); 39 | ReloaderRegister.register(this, loader, conf); 40 | } 41 | 42 | 43 | 44 | @Override 45 | public void update(List inputStreams) { 46 | Dictionary.addDic2MainDic(inputStreams); 47 | } 48 | 49 | 50 | } 51 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/resources/quantifier.dic: -------------------------------------------------------------------------------- 1 | 丈 2 | 下 3 | 世 4 | 世纪 5 | 两 6 | 个 7 | 中 8 | 串 9 | 亩 10 | 人 11 | 介 12 | 付 13 | 代 14 | 件 15 | 任 16 | 份 17 | 伏 18 | 伙 19 | 位 20 | 位数 21 | 例 22 | 倍 23 | 像素 24 | 元 25 | 克 26 | 克拉 27 | 公亩 28 | 公克 29 | 公分 30 | 公升 31 | 公尺 32 | 公担 33 | 公斤 34 | 公里 35 | 公顷 36 | 具 37 | 册 38 | 出 39 | 刀 40 | 分 41 | 分钟 42 | 分米 43 | 划 44 | 列 45 | 则 46 | 刻 47 | 剂 48 | 剑 49 | 副 50 | 加仑 51 | 勺 52 | 包 53 | 匙 54 | 匹 55 | 区 56 | 千克 57 | 千米 58 | 升 59 | 卷 60 | 厅 61 | 厘 62 | 厘米 63 | 双 64 | 发 65 | 口 66 | 句 67 | 只 68 | 台 69 | 叶 70 | 号 71 | 名 72 | 吨 73 | 听 74 | 员 75 | 周 76 | 周年 77 | 品 78 | 回 79 | 团 80 | 圆 81 | 圈 82 | 地 83 | 场 84 | 块 85 | 坪 86 | 堆 87 | 声 88 | 壶 89 | 处 90 | 夜 91 | 大 92 | 天 93 | 头 94 | 套 95 | 女 96 | 孔 97 | 字 98 | 宗 99 | 室 100 | 家 101 | 寸 102 | 对 103 | 封 104 | 尊 105 | 小时 106 | 尺 107 | 尾 108 | 局 109 | 层 110 | 届 111 | 岁 112 | 师 113 | 帧 114 | 幅 115 | 幕 116 | 幢 117 | 平方 118 | 平方公尺 119 | 平方公里 120 | 平方分米 121 | 平方厘米 122 | 平方码 123 | 平方米 124 | 平方英寸 125 | 平方英尺 126 | 平方英里 127 | 平米 128 | 年 129 | 年代 130 | 年级 131 | 度 132 | 座 133 | 式 134 | 引 135 | 张 136 | 成 137 | 战 138 | 截 139 | 户 140 | 房 141 | 所 142 | 扇 143 | 手 144 | 打 145 | 批 146 | 把 147 | 折 148 | 担 149 | 拍 150 | 招 151 | 拨 152 | 拳 153 | 指 154 | 掌 155 | 排 156 | 
撮 157 | 支 158 | 文 159 | 斗 160 | 斤 161 | 方 162 | 族 163 | 日 164 | 时 165 | 曲 166 | 月 167 | 月份 168 | 期 169 | 本 170 | 朵 171 | 村 172 | 束 173 | 条 174 | 来 175 | 杯 176 | 枚 177 | 枝 178 | 枪 179 | 架 180 | 柄 181 | 柜 182 | 栋 183 | 栏 184 | 株 185 | 样 186 | 根 187 | 格 188 | 案 189 | 桌 190 | 档 191 | 桩 192 | 桶 193 | 梯 194 | 棵 195 | 楼 196 | 次 197 | 款 198 | 步 199 | 段 200 | 毛 201 | 毫 202 | 毫升 203 | 毫米 204 | 毫克 205 | 池 206 | 洲 207 | 派 208 | 海里 209 | 滴 210 | 炮 211 | 点 212 | 点钟 213 | 片 214 | 版 215 | 环 216 | 班 217 | 瓣 218 | 瓶 219 | 生 220 | 男 221 | 画 222 | 界 223 | 盆 224 | 盎司 225 | 盏 226 | 盒 227 | 盘 228 | 相 229 | 眼 230 | 石 231 | 码 232 | 碗 233 | 碟 234 | 磅 235 | 种 236 | 科 237 | 秒 238 | 秒钟 239 | 窝 240 | 立方公尺 241 | 立方分米 242 | 立方厘米 243 | 立方码 244 | 立方米 245 | 立方英寸 246 | 立方英尺 247 | 站 248 | 章 249 | 笔 250 | 等 251 | 筐 252 | 筒 253 | 箱 254 | 篇 255 | 篓 256 | 篮 257 | 簇 258 | 米 259 | 类 260 | 粒 261 | 级 262 | 组 263 | 维 264 | 缕 265 | 缸 266 | 罐 267 | 网 268 | 群 269 | 股 270 | 脚 271 | 船 272 | 艇 273 | 艘 274 | 色 275 | 节 276 | 英亩 277 | 英寸 278 | 英尺 279 | 英里 280 | 行 281 | 袋 282 | 角 283 | 言 284 | 课 285 | 起 286 | 趟 287 | 路 288 | 车 289 | 转 290 | 轮 291 | 辆 292 | 辈 293 | 连 294 | 通 295 | 遍 296 | 部 297 | 里 298 | 重 299 | 针 300 | 钟 301 | 钱 302 | 锅 303 | 门 304 | 间 305 | 队 306 | 阶段 307 | 隅 308 | 集 309 | 页 310 | 顶 311 | 顷 312 | 项 313 | 顿 314 | 颗 315 | 餐 316 | 首 -------------------------------------------------------------------------------- /mlcsseg-ik/src/test/java/org/wltea/analyzer/test/TestIk.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.test; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | 6 | import org.wltea.analyzer.core.IKSegmenter; 7 | 8 | public class TestIk { 9 | public static void main(String[] args) throws IOException { 10 | IKSegmenter ik = new IKSegmenter(new StringReader(""), true); 11 | ik.next(); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.mlcs.search 6 | mlcsseg 7 | 4.6.0-SNAPSHOT 8 | pom 9 | 10 | mlcsseg 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | mlcsseg-common 19 | mlcsseg-ik 20 | mlcsseg-filter 21 | mlcsseg-ansj 22 | 23 | 24 | 25 | 26 | org.apache.lucene 27 | lucene-analyzers-common 28 | 4.6.1 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /test1/conf/admin-extra.html: -------------------------------------------------------------------------------- 1 | 17 | 18 | 32 | -------------------------------------------------------------------------------- /test1/conf/admin-extra.menu-bottom.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test1/conf/admin-extra.menu-top.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test1/conf/extDic.txt: -------------------------------------------------------------------------------- 1 |  2 | 七匹狼 3 | 秋装 4 | 伊莲娜 5 | 格男仕 6 | 李东垣 7 | 卡扎菲 8 | 大舒服 9 | 惠国吉 10 | 楠 11 | 木 12 | 金 13 | 丝 -------------------------------------------------------------------------------- /test1/conf/extDic1.txt: -------------------------------------------------------------------------------- 1 |  2 | 古妃奇 3 | 简直笨 4 | 并发编程 5 | 穆定喜 
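The extension dictionaries above (extDic.txt, extDic1.txt) are plain UTF-8 word lists, one entry per line. As loadWords2DictSegment in Dictionary.java shows, every non-empty line that does not start with '#' is split on whitespace, '=', ',' and '>', and each token is lower-cased before being filled into the trie. A small standalone sketch of that per-line handling; the class name and the sample line are illustrative only.

public class DictLineDemo {
    public static void main(String[] args) {
        String line = "七匹狼,秋装".trim();              // a raw line as it might appear in an extension dictionary
        if (!line.isEmpty() && !line.startsWith("#")) {
            String[] words = line.split("[\\s=,>]+");    // same pattern used by loadWords2DictSegment
            for (String w : words) {
                // each token would be handed to DictSegment.fillSegment(w.toLowerCase().toCharArray())
                System.out.println(w.toLowerCase());
            }
        }
    }
}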
-------------------------------------------------------------------------------- /test1/conf/ik.conf: -------------------------------------------------------------------------------- 1 | lastupdate=11223 2 | files=extDic.txt,extDic1.txt,synonyms.txt,isynonyms.txt -------------------------------------------------------------------------------- /test1/conf/isynonyms.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | #----------------------------------------------------------------------- 14 | #some test synonym mappings unlikely to appear in real input text 15 | aaafoo => aaabar 16 | bbbfoo => bbbfoo bbbbar 17 | cccfoo => cccbar cccbaz 18 | fooaaa,baraaa,bazaaa 19 | 20 | # Some synonym groups specific to this example 21 | GB,gib,gigabyte,gigabytes 22 | MB,mib,megabyte,megabytes 23 | Television, Televisions, TV, TVs 24 | #notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming 25 | #after us won't split it into two words. 26 | 27 | # Synonym mappings can be used for spelling correction too 28 | 男式=>男 29 | -------------------------------------------------------------------------------- /test1/conf/schema.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 47 | 48 | 49 | 65 | 66 | 67 | 92 | 93 | 99 | 100 | 101 | 102 | 103 | 104 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 121 | id 122 | 123 | 124 | 125 | 126 | 133 | 134 | 140 | 141 | 144 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 184 | 185 | 188 | 189 | 190 | 191 | 192 | 193 | 203 | 204 | 205 | 206 | 207 | 208 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 265 | 266 | 267 | 278 | 279 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 305 | 306 | 307 | 308 | 309 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 345 | 354 | 355 | 356 | 357 | 362 | 367 | 368 | 369 | -------------------------------------------------------------------------------- /test1/conf/stop.conf: -------------------------------------------------------------------------------- 1 | lastupdate=111221 2 | files=stopwords.txt -------------------------------------------------------------------------------- /test1/conf/stopwords.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. 
You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 和 16 | -------------------------------------------------------------------------------- /test1/conf/synonym.conf: -------------------------------------------------------------------------------- 1 | lastupdate=12 2 | files=synonyms.txt -------------------------------------------------------------------------------- /test1/conf/synonym2.conf: -------------------------------------------------------------------------------- 1 | lastupdate=12 2 | files=isynonyms.txt -------------------------------------------------------------------------------- /test1/conf/synonyms.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | #----------------------------------------------------------------------- 14 | #some test synonym mappings unlikely to appear in real input text 15 | aaafoo => aaabar 16 | bbbfoo => bbbfoo bbbbar 17 | cccfoo => cccbar cccbaz 18 | fooaaa,baraaa,bazaaa 19 | 20 | # Some synonym groups specific to this example 21 | GB,gib,gigabyte,gigabytes 22 | MB,mib,megabyte,megabytes 23 | Television, Televisions, TV, TVs 24 | #notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming 25 | #after us won't split it into two words. 26 | 27 | # Synonym mappings can be used for spelling correction too 28 | pixima => pixma 29 | 李东垣 => 李东阳 30 | 卡扎菲,卡扎渣,卡炸飞 31 | 穆定喜 => 木丁西 -------------------------------------------------------------------------------- /test1/conf/update-script.js: -------------------------------------------------------------------------------- 1 | /* 2 | This is a basic skeleton JavaScript update processor. 3 | 4 | In order for this to be executed, it must be properly wired into solrconfig.xml; by default it is commented out in 5 | the example solrconfig.xml and must be uncommented to be enabled. 6 | 7 | See http://wiki.apache.org/solr/ScriptUpdateProcessor for more details. 
8 | */ 9 | 10 | function processAdd(cmd) { 11 | 12 | doc = cmd.solrDoc; // org.apache.solr.common.SolrInputDocument 13 | id = doc.getFieldValue("id"); 14 | logger.info("update-script#processAdd: id=" + id); 15 | 16 | // Set a field value: 17 | // doc.setField("foo_s", "whatever"); 18 | 19 | // Get a configuration parameter: 20 | // config_param = params.get('config_param'); // "params" only exists if processor configured with 21 | 22 | // Get a request parameter: 23 | // some_param = req.getParams().get("some_param") 24 | 25 | // Add a field of field names that match a pattern: 26 | // - Potentially useful to determine the fields/attributes represented in a result set, via faceting on field_name_ss 27 | // field_names = doc.getFieldNames().toArray(); 28 | // for(i=0; i < field_names.length; i++) { 29 | // field_name = field_names[i]; 30 | // if (/attr_.*/.test(field_name)) { doc.addField("attribute_ss", field_names[i]); } 31 | // } 32 | 33 | } 34 | 35 | function processDelete(cmd) { 36 | // no-op 37 | } 38 | 39 | function processMergeIndexes(cmd) { 40 | // no-op 41 | } 42 | 43 | function processCommit(cmd) { 44 | // no-op 45 | } 46 | 47 | function processRollback(cmd) { 48 | // no-op 49 | } 50 | 51 | function finish() { 52 | // no-op 53 | } 54 | -------------------------------------------------------------------------------- /test2/conf/admin-extra.html: -------------------------------------------------------------------------------- 1 | 17 | 18 | 32 | -------------------------------------------------------------------------------- /test2/conf/admin-extra.menu-bottom.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test2/conf/admin-extra.menu-top.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test2/conf/ansj.conf: -------------------------------------------------------------------------------- 1 | lastupdate=1226 2 | files=extDic.txt,extDic1.txt -------------------------------------------------------------------------------- /test2/conf/extDic.txt: -------------------------------------------------------------------------------- 1 |  2 | 七匹狼 3 | 秋装 4 | 伊莲娜 5 | 格男仕 6 | 李东垣 7 | 卡扎菲 8 | 大舒服 9 | -------------------------------------------------------------------------------- /test2/conf/extDic1.txt: -------------------------------------------------------------------------------- 1 |  2 | 古妃奇 3 | 简直笨 4 | -------------------------------------------------------------------------------- /test2/conf/isynonyms.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | 13 | #----------------------------------------------------------------------- 14 | #some test synonym mappings unlikely to appear in real input text 15 | aaafoo => aaabar 16 | bbbfoo => bbbfoo bbbbar 17 | cccfoo => cccbar cccbaz 18 | fooaaa,baraaa,bazaaa 19 | 20 | # Some synonym groups specific to this example 21 | GB,gib,gigabyte,gigabytes 22 | MB,mib,megabyte,megabytes 23 | Television, Televisions, TV, TVs 24 | #notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming 25 | #after us won't split it into two words. 26 | 27 | # Synonym mappings can be used for spelling correction too 28 | 男式=>男 29 | -------------------------------------------------------------------------------- /test2/conf/schema.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 47 | 48 | 49 | 65 | 66 | 67 | 92 | 93 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 122 | id 123 | 124 | 125 | 126 | 127 | 134 | 135 | 141 | 142 | 145 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 185 | 186 | 189 | 190 | 191 | 192 | 193 | 194 | 204 | 205 | 206 | 207 | 208 | 209 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 266 | 267 | 268 | 279 | 280 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 306 | 307 | 308 | 309 | 310 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 339 | 348 | 349 | 350 | 351 | 356 | 361 | 362 | 363 | -------------------------------------------------------------------------------- /test2/conf/stop.conf: -------------------------------------------------------------------------------- 1 | lastupdate=11122 2 | files=stopwords.txt -------------------------------------------------------------------------------- /test2/conf/stopwords.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 和 16 | 你 17 | -------------------------------------------------------------------------------- /test2/conf/synonym.conf: -------------------------------------------------------------------------------- 1 | lastupdate=1 2 | files=synonyms.txt -------------------------------------------------------------------------------- /test2/conf/synonym2.conf: -------------------------------------------------------------------------------- 1 | lastupdate=12 2 | files=isynonyms.txt -------------------------------------------------------------------------------- /test2/conf/synonyms.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | #----------------------------------------------------------------------- 14 | #some test synonym mappings unlikely to appear in real input text 15 | aaafoo => aaabar 16 | bbbfoo => bbbfoo bbbbar 17 | cccfoo => cccbar cccbaz 18 | fooaaa,baraaa,bazaaa 19 | 20 | # Some synonym groups specific to this example 21 | GB,gib,gigabyte,gigabytes 22 | MB,mib,megabyte,megabytes 23 | Television, Televisions, TV, TVs 24 | #notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming 25 | #after us won't split it into two words. 26 | 27 | # Synonym mappings can be used for spelling correction too 28 | pixima => pixma 29 | 李东垣 => 李东阳 30 | 卡扎菲,卡扎渣,卡炸飞 31 | -------------------------------------------------------------------------------- /test2/conf/update-script.js: -------------------------------------------------------------------------------- 1 | /* 2 | This is a basic skeleton JavaScript update processor. 3 | 4 | In order for this to be executed, it must be properly wired into solrconfig.xml; by default it is commented out in 5 | the example solrconfig.xml and must be uncommented to be enabled. 6 | 7 | See http://wiki.apache.org/solr/ScriptUpdateProcessor for more details. 
8 | */ 9 | 10 | function processAdd(cmd) { 11 | 12 | doc = cmd.solrDoc; // org.apache.solr.common.SolrInputDocument 13 | id = doc.getFieldValue("id"); 14 | logger.info("update-script#processAdd: id=" + id); 15 | 16 | // Set a field value: 17 | // doc.setField("foo_s", "whatever"); 18 | 19 | // Get a configuration parameter: 20 | // config_param = params.get('config_param'); // "params" only exists if processor configured with 21 | 22 | // Get a request parameter: 23 | // some_param = req.getParams().get("some_param") 24 | 25 | // Add a field of field names that match a pattern: 26 | // - Potentially useful to determine the fields/attributes represented in a result set, via faceting on field_name_ss 27 | // field_names = doc.getFieldNames().toArray(); 28 | // for(i=0; i < field_names.length; i++) { 29 | // field_name = field_names[i]; 30 | // if (/attr_.*/.test(field_name)) { doc.addField("attribute_ss", field_names[i]); } 31 | // } 32 | 33 | } 34 | 35 | function processDelete(cmd) { 36 | // no-op 37 | } 38 | 39 | function processMergeIndexes(cmd) { 40 | // no-op 41 | } 42 | 43 | function processCommit(cmd) { 44 | // no-op 45 | } 46 | 47 | function processRollback(cmd) { 48 | // no-op 49 | } 50 | 51 | function finish() { 52 | // no-op 53 | } 54 | --------------------------------------------------------------------------------
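For completeness, a slightly fuller variant of the TestIk.java shown earlier: it iterates the segmenter until next() returns null and prints each lexeme the same way IKTokenizer.incrementToken() maps it to Lucene attributes. The sample sentence and class name are illustrative; the API calls mirror those already used in IKTokenizer.

import java.io.IOException;
import java.io.StringReader;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class TestIkVerbose {
    public static void main(String[] args) throws IOException {
        // true enables smart mode, the same flag IKTokenizerFactory reads from its "useSmart" attribute
        IKSegmenter ik = new IKSegmenter(new StringReader("七匹狼秋装新品上市"), true);
        Lexeme lexeme;
        while ((lexeme = ik.next()) != null) {
            System.out.println(lexeme.getLexemeText()
                    + " [" + lexeme.getBeginPosition() + "," + lexeme.getEndPosition() + ")"
                    + " type=" + lexeme.getLexemeTypeString());
        }
    }
}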