├── .gitignore ├── README.md ├── mlcsseg-ansj ├── lib │ ├── ansj_seg-1.4-min.jar │ └── tree_split-1.3.jar ├── pom.xml └── src │ └── main │ ├── assembly │ └── zip.xml │ └── java │ └── org │ └── ansj │ └── solr │ ├── AnsjTokenizer.java │ ├── AnsjTokenizerFactory.java │ └── TestAnsj.java ├── mlcsseg-common ├── pom.xml └── src │ └── main │ └── java │ └── com │ └── mlcs │ └── search │ └── mlcsseg │ ├── common │ └── ScheduledExecutor.java │ └── lucene │ ├── CnTokenizer.java │ ├── ReloadableTokenizerFactory.java │ └── ReloaderRegister.java ├── mlcsseg-filter ├── pom.xml └── src │ ├── main │ └── java │ │ └── org │ │ └── apache │ │ └── solr │ │ └── analysis │ │ ├── DStopFilter.java │ │ ├── DStopFilterFactory.java │ │ └── DSynonymFilterFactory.java │ └── test │ └── java │ └── org │ └── mlcsseg │ └── filter │ └── AppTest.java ├── mlcsseg-ik ├── pom.xml └── src │ ├── main │ ├── java │ │ └── org │ │ │ └── wltea │ │ │ └── analyzer │ │ │ ├── cfg │ │ │ ├── Configuration.java │ │ │ └── DefaultConfig.java │ │ │ ├── core │ │ │ ├── AnalyzeContext.java │ │ │ ├── CJKSegmenter.java │ │ │ ├── CN_QuantifierSegmenter.java │ │ │ ├── CharacterUtil.java │ │ │ ├── IKArbitrator.java │ │ │ ├── IKSegmenter.java │ │ │ ├── ISegmenter.java │ │ │ ├── LetterSegmenter.java │ │ │ ├── Lexeme.java │ │ │ ├── LexemePath.java │ │ │ └── QuickSortSet.java │ │ │ ├── dic │ │ │ ├── DictCharNode.java │ │ │ ├── DictSegment.java │ │ │ ├── Dictionary.java │ │ │ └── Hit.java │ │ │ └── lucene │ │ │ ├── IKTokenizer.java │ │ │ └── IKTokenizerFactory.java │ └── resources │ │ ├── chars.dic │ │ ├── main2012.dic │ │ └── quantifier.dic │ └── test │ └── java │ └── org │ └── wltea │ └── analyzer │ └── test │ └── TestIk.java ├── pom.xml ├── test1 └── conf │ ├── admin-extra.html │ ├── admin-extra.menu-bottom.html │ ├── admin-extra.menu-top.html │ ├── extDic.txt │ ├── extDic1.txt │ ├── ik.conf │ ├── isynonyms.txt │ ├── schema.xml │ ├── solrconfig.xml │ ├── stop.conf │ ├── stopwords.txt │ ├── synonym.conf │ ├── synonym2.conf │ ├── synonyms.txt │ └── update-script.js └── test2 └── conf ├── admin-extra.html ├── admin-extra.menu-bottom.html ├── admin-extra.menu-top.html ├── ansj.conf ├── extDic.txt ├── extDic1.txt ├── isynonyms.txt ├── schema.xml ├── solrconfig.xml ├── stop.conf ├── stopwords.txt ├── synonym.conf ├── synonym2.conf ├── synonyms.txt └── update-script.js /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Package Files # 4 | *.war 5 | *.ear 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | mlcsseg :solr分词器大补贴 2 | ======= 3 | 4 | 包括IK, ANSJ,过滤器。支持动态加载solr配置路径下的自定义词库。 5 | 6 | 支持最新的`4.6`版本。master分支是4.6的,其他分支支持对应的solr版本 7 | 8 | 配置和说明都在:http://mlcsdev.iteye.com/blog/2037109 9 | 10 | 欢迎使用,并以任何方式提供意见和建议。 11 | -------------------------------------------------------------------------------- /mlcsseg-ansj/lib/ansj_seg-1.4-min.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcsdev/mlcsseg/942bc12ee27e5297bf6a042952299a82c490ca19/mlcsseg-ansj/lib/ansj_seg-1.4-min.jar -------------------------------------------------------------------------------- /mlcsseg-ansj/lib/tree_split-1.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcsdev/mlcsseg/942bc12ee27e5297bf6a042952299a82c490ca19/mlcsseg-ansj/lib/tree_split-1.3.jar 
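The dynamic dictionary loading mentioned in the README is driven by small properties files kept in the core's conf directory (ik.conf, stop.conf, synonym.conf and ansj.conf under test1/conf and test2/conf). Judging from how ReloaderRegister.ConfigChecker, DStopFilterFactory and DSynonymFilterFactory parse these files later in this dump, each one carries a lastupdate marker and a files list split on commas or whitespace, and the factories reload the listed dictionaries only when lastupdate increases. The values below are an illustrative sketch, not shipped defaults:

# example ik.conf / stop.conf (hypothetical values)
# bump lastupdate whenever the dictionaries change; otherwise nothing is reloaded
lastupdate=1
files=extDic.txt,extDic1.txt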
-------------------------------------------------------------------------------- /mlcsseg-ansj/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.mlcs.search 6 | mlcsseg 7 | 4.6.0-SNAPSHOT 8 | 9 | mlcsseg-ansj 10 | 11 | 12 | 13 | com.mlcs.search 14 | mlcsseg-common 15 | 4.6.0-SNAPSHOT 16 | 17 | 18 | org.ansj 19 | ansj_seg 20 | 1.4 21 | system 22 | ${project.basedir}/lib/ansj_seg-1.4-min.jar 23 | 24 | 25 | org.ansj 26 | tree_split 27 | 1.3 28 | system 29 | ${project.basedir}/lib/tree_split-1.3.jar 30 | 31 | 32 | 33 | 34 | 35 | 36 | org.apache.maven.plugins 37 | maven-compiler-plugin 38 | 39 | 1.6 40 | 1.6 41 | utf8 42 | 43 | 44 | 45 | org.apache.maven.plugins 46 | maven-resources-plugin 47 | 2.5 48 | 49 | UTF-8 50 | 51 | 52 | 53 | maven-assembly-plugin 54 | 2.2.1 55 | 56 | 57 | src/main/assembly/zip.xml 58 | 59 | 60 | 61 | 62 | org.apache.maven.plugins 63 | maven-jar-plugin 64 | 2.4 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /mlcsseg-ansj/src/main/assembly/zip.xml: -------------------------------------------------------------------------------- 1 | 5 | bin 6 | 7 | zip 8 | 9 | 10 | 11 | true 12 | jar 13 | 14 | 15 | org.apache.solr:solr* 16 | com.spatial4j:spatial4j 17 | org.apache.lucene:lucene* 18 | com.google.guava:guava* 19 | commons*:commons* 20 | org.restlet.jee:org.restlet* 21 | org.apache.zookeeper:zookeeper* 22 | org.noggit:noggit* 23 | org.slf4j*:slf4j* 24 | org.codehaus.woodstox:wstx-asl* 25 | org.apache.httpcomponents:http* 26 | 27 | 28 | 29 | 30 | 31 | lib 32 | jar 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /mlcsseg-ansj/src/main/java/org/ansj/solr/AnsjTokenizer.java: -------------------------------------------------------------------------------- 1 | package org.ansj.solr; 2 | 3 | import java.io.IOException; 4 | import java.io.Reader; 5 | import java.util.ArrayList; 6 | import java.util.Iterator; 7 | import java.util.List; 8 | import org.ansj.domain.Term; 9 | import org.ansj.splitWord.analysis.IndexAnalysis; 10 | import org.ansj.splitWord.analysis.ToAnalysis; 11 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 12 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 13 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 14 | 15 | import com.mlcs.search.mlcsseg.lucene.CnTokenizer; 16 | 17 | 18 | public class AnsjTokenizer extends CnTokenizer{ 19 | private int analysisType ; 20 | private boolean removePunc; 21 | 22 | private CharTermAttribute termAtt; 23 | private OffsetAttribute offsetAtt; 24 | private TypeAttribute typeAtt; 25 | int lastOffset = 0; 26 | int endPosition =0; 27 | private Iterator tokenIter; 28 | private List tokenBuffer; 29 | static 30 | { 31 | ToAnalysis.parse(""); 32 | } 33 | 34 | public AnsjTokenizer(Reader input, int analysisType, boolean removePunc) { 35 | super(input); 36 | offsetAtt = addAttribute(OffsetAttribute.class); 37 | termAtt = addAttribute(CharTermAttribute.class); 38 | typeAtt = addAttribute(TypeAttribute.class); 39 | this.analysisType = analysisType; 40 | this.removePunc = removePunc; 41 | } 42 | 43 | @Override 44 | public boolean incrementToken() throws IOException { 45 | if (tokenIter == null || !tokenIter.hasNext()){ 46 | String currentSentence = checkSentences(); 47 | if (currentSentence!= null){ 48 | tokenBuffer = new ArrayList(); 49 | if (analysisType == 1){ 50 | for(Term term : 
ToAnalysis.parse(currentSentence)){ 51 | if (removePunc && stopwords.contains(term.getName())) 52 | continue; 53 | tokenBuffer.add(term); 54 | } 55 | 56 | }else { 57 | for(Term term : IndexAnalysis.parse(currentSentence)){ 58 | if (removePunc && stopwords.contains(term.getName())) 59 | continue; 60 | tokenBuffer.add(term); 61 | } 62 | } 63 | tokenIter = tokenBuffer.iterator(); 64 | if (!tokenIter.hasNext()){ 65 | return false; 66 | } 67 | } else { 68 | return false; // no more sentences, end of stream! 69 | } 70 | } 71 | clearAttributes(); 72 | 73 | Term term = tokenIter.next(); 74 | if (removePunc){ 75 | while(stopwords.contains(term.getName())){ 76 | if (!tokenIter.hasNext()){ 77 | }else{ 78 | term = tokenIter.next(); 79 | } 80 | } 81 | } 82 | termAtt.append(term.getName()); 83 | termAtt.setLength(term.getName().length()); 84 | 85 | int currentStart = tokenStart + term.getOffe(); 86 | int currentEnd = tokenStart + term.getToValue(); 87 | offsetAtt.setOffset(currentStart,currentEnd); 88 | typeAtt.setType("word"); 89 | 90 | // int pi = currentStart - lastOffset; 91 | // if(term.getOffe() <= 0) { 92 | // pi = 1; 93 | // } 94 | // positionIncrementAtt.setPositionIncrement( pi ); 95 | lastOffset = currentStart; 96 | endPosition = currentEnd; 97 | return true; 98 | } 99 | 100 | 101 | 102 | @Override 103 | public void reset() throws IOException { 104 | super.reset(); 105 | } 106 | 107 | public final void end() { 108 | // set final offset 109 | int finalOffset = correctOffset(this.endPosition); 110 | offsetAtt.setOffset(finalOffset, finalOffset); 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /mlcsseg-ansj/src/main/java/org/ansj/solr/AnsjTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.ansj.solr; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.Reader; 6 | import java.util.List; 7 | import java.util.Map; 8 | import org.ansj.library.UserDefineLibrary; 9 | import org.apache.lucene.analysis.Tokenizer; 10 | import org.apache.lucene.analysis.util.ResourceLoader; 11 | import org.apache.lucene.util.AttributeSource.AttributeFactory; 12 | 13 | import com.mlcs.search.mlcsseg.lucene.ReloadableTokenizerFactory; 14 | import com.mlcs.search.mlcsseg.lucene.ReloaderRegister; 15 | 16 | 17 | public class AnsjTokenizerFactory extends ReloadableTokenizerFactory { 18 | 19 | private int analysisType = 0; 20 | private boolean rmPunc = true; 21 | 22 | public AnsjTokenizerFactory(Map args) { 23 | super(args); 24 | analysisType = getInt(args, "analysisType", 0); 25 | rmPunc = getBoolean(args, "rmPunc", true); 26 | System.out.println(":::ansj:construction::::::::::::::::::::::::::" + conf); 27 | } 28 | 29 | 30 | 31 | public void inform(ResourceLoader loader) throws IOException { 32 | System.out.println(":::ansj:::inform::::::::::::::::::::::::" + conf); 33 | ReloaderRegister.register(this, loader, conf); 34 | } 35 | 36 | @Override 37 | public Tokenizer create(AttributeFactory factory, Reader input) { 38 | return new AnsjTokenizer(input, analysisType, rmPunc); 39 | } 40 | 41 | 42 | 43 | @Override 44 | public void update(List inputStreams) { 45 | if (inputStreams!= null){ 46 | UserDefineLibrary.reloadMainAndAdd(inputStreams); 47 | } 48 | } 49 | 50 | 51 | 52 | 53 | } 54 | -------------------------------------------------------------------------------- /mlcsseg-ansj/src/main/java/org/ansj/solr/TestAnsj.java: 
-------------------------------------------------------------------------------- 1 | package org.ansj.solr; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | import java.util.List; 6 | 7 | import org.ansj.domain.Term; 8 | import org.ansj.splitWord.analysis.IndexAnalysis; 9 | import org.ansj.splitWord.analysis.ToAnalysis; 10 | import org.apache.lucene.analysis.Tokenizer; 11 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 12 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 13 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 14 | 15 | 16 | 17 | public class TestAnsj { 18 | 19 | public static void main(String[] args) throws IOException { 20 | List parse = ToAnalysis.parse("天天向上,媒体打打。《回家真好》"); 21 | System.out.println(parse); 22 | Tokenizer tokenizer = new AnsjTokenizer(new StringReader("天天向上,媒体打打。《回家真好》"), 0, true); 23 | CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class); 24 | OffsetAttribute offsetAtt = 25 | tokenizer.addAttribute(OffsetAttribute.class); 26 | PositionIncrementAttribute positionIncrementAtt = 27 | tokenizer.addAttribute(PositionIncrementAttribute.class); 28 | 29 | 30 | while (tokenizer.incrementToken()){ 31 | 32 | System.out.print(new String(termAtt.toString()) ); 33 | System.out.print( offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-" ); 34 | System.out.print( positionIncrementAtt.getPositionIncrement() +"/"); 35 | 36 | } 37 | tokenizer.close(); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /mlcsseg-common/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | com.mlcs.search 7 | mlcsseg 8 | 4.6.0-SNAPSHOT 9 | 10 | mlcsseg-common 11 | mlcsseg-common 12 | http://maven.apache.org 13 | 14 | UTF-8 15 | 16 | 17 | -------------------------------------------------------------------------------- /mlcsseg-common/src/main/java/com/mlcs/search/mlcsseg/common/ScheduledExecutor.java: -------------------------------------------------------------------------------- 1 | package com.mlcs.search.mlcsseg.common; 2 | import java.util.concurrent.Executors; 3 | import java.util.concurrent.ScheduledExecutorService; 4 | import java.util.concurrent.ThreadFactory; 5 | import java.util.concurrent.TimeUnit; 6 | 7 | 8 | 9 | public class ScheduledExecutor { 10 | 11 | static class SegTF implements ThreadFactory{ 12 | 13 | public Thread newThread(Runnable r) { 14 | Thread t = new Thread(r, "SegmentScheduledExecutorThread"); 15 | t.setDaemon(true); 16 | return t; 17 | } 18 | 19 | } 20 | 21 | final public static ScheduledExecutorService ScheduledService = Executors.newSingleThreadScheduledExecutor(new SegTF()); 22 | 23 | 24 | public static void submit(Runnable cmd, long periodMilliSenconds){ 25 | ScheduledService.scheduleAtFixedRate(cmd, 10l, periodMilliSenconds, TimeUnit.MILLISECONDS); 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /mlcsseg-common/src/main/java/com/mlcs/search/mlcsseg/lucene/CnTokenizer.java: -------------------------------------------------------------------------------- 1 | package com.mlcs.search.mlcsseg.lucene; 2 | 3 | import java.io.IOException; 4 | import java.io.Reader; 5 | import java.io.StringReader; 6 | import java.util.HashSet; 7 | import java.util.Set; 8 | 9 | import org.apache.lucene.analysis.Tokenizer; 10 | 11 | /** 12 | * 增加基础的停用词过滤,切长句的能力。分词细节没做. 
13 | * @Description TODO 14 | * @author shanbo.liang 15 | */ 16 | public abstract class CnTokenizer extends Tokenizer{ 17 | public final static String SPACES = "  \t\r\n"; 18 | public final static String PUNCTUATION = "。,!?;,!?;"; 19 | public final static String stop = "',.`-_=?\'|\"(){}[]<>*#&^$@!~:;+/《》—-,。、:;!·?“”)(【】[]●'"; 20 | public static Set stopwords = new HashSet(); 21 | 22 | protected final StringBuilder buffer = new StringBuilder(); 23 | protected int tokenStart = 0, tokenEnd = 0; 24 | 25 | 26 | static 27 | { 28 | for(String c : stop.split("")){ 29 | stopwords.add(c); 30 | } 31 | } 32 | 33 | protected CnTokenizer(Reader input) { 34 | super(input); 35 | } 36 | 37 | protected String checkSentences() throws IOException{ 38 | buffer.setLength(0); 39 | int ci; 40 | char ch, pch; 41 | boolean atBegin = true; 42 | tokenStart = tokenEnd; 43 | ci = input.read(); 44 | ch = (char) ci; 45 | 46 | while (true) { 47 | if (ci == -1) { 48 | break; 49 | } else if (PUNCTUATION.indexOf(ch) != -1) { 50 | // End of a sentence 51 | buffer.append(ch); 52 | tokenEnd++; 53 | break; 54 | } else if (atBegin && SPACES.indexOf(ch) != -1) { 55 | tokenStart++; 56 | tokenEnd++; 57 | ci = input.read(); 58 | ch = (char) ci; 59 | } else { 60 | buffer.append(ch); 61 | atBegin = false; 62 | tokenEnd++; 63 | pch = ch; 64 | ci = input.read(); 65 | ch = (char) ci; 66 | // Two spaces, such as CR, LF 67 | if (SPACES.indexOf(ch) != -1 68 | && SPACES.indexOf(pch) != -1) { 69 | // buffer.append(ch); 70 | tokenEnd++; 71 | break; 72 | } 73 | } 74 | } 75 | if (buffer.length() == 0){ 76 | //sentences finished~ 77 | return null; 78 | }else { 79 | return buffer.toString(); 80 | } 81 | 82 | } 83 | 84 | public void reset() throws IOException { 85 | super.reset(); 86 | tokenStart = tokenEnd = 0; 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /mlcsseg-common/src/main/java/com/mlcs/search/mlcsseg/lucene/ReloadableTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package com.mlcs.search.mlcsseg.lucene; 2 | 3 | import java.io.InputStream; 4 | import java.util.List; 5 | import java.util.Map; 6 | 7 | import org.apache.lucene.analysis.util.ResourceLoaderAware; 8 | import org.apache.lucene.analysis.util.TokenizerFactory; 9 | 10 | 11 | public abstract class ReloadableTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware{ 12 | 13 | protected String conf; 14 | 15 | protected ReloadableTokenizerFactory(Map args) { 16 | super(args); 17 | assureMatchVersion(); 18 | conf = get(args, "conf"); 19 | } 20 | 21 | public abstract void update(List inputStreams); 22 | 23 | public String getBeanName(){ 24 | return this.getClass().toString(); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /mlcsseg-common/src/main/java/com/mlcs/search/mlcsseg/lucene/ReloaderRegister.java: -------------------------------------------------------------------------------- 1 | package com.mlcs.search.mlcsseg.lucene; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.util.ArrayList; 6 | import java.util.Collections; 7 | import java.util.HashMap; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Properties; 11 | 12 | import org.apache.lucene.analysis.util.ResourceLoader; 13 | 14 | import com.mlcs.search.mlcsseg.common.ScheduledExecutor; 15 | 16 | /** 17 | * register it in 'inform(ResourceLoader loader)' 18 | * @Description TODO 19 | * @author 
shanbo.liang 20 | */ 21 | public class ReloaderRegister{ 22 | 23 | 24 | 25 | 26 | private static Map reloadAwares = new HashMap(); 27 | 28 | 29 | public static class ConfigChecker { 30 | 31 | private long lastUpdateTime = Long.MIN_VALUE; 32 | 33 | 34 | 35 | 36 | public static List SplitFileNames(String fileNames) { 37 | if (fileNames == null || fileNames.isEmpty()) 38 | return Collections.emptyList(); 39 | 40 | List result = new ArrayList(); 41 | for (String file : fileNames.split("[,\\s]+")) { 42 | result.add(file); 43 | } 44 | 45 | return result; 46 | } 47 | 48 | public List currentToReload(InputStream confStream){ 49 | try{ 50 | Properties p = new Properties(); 51 | p.load(confStream); 52 | confStream.close(); 53 | String lastupdate = p.getProperty("lastupdate", "0"); 54 | Long t = new Long(lastupdate); 55 | // System.out.println(" => " + toString() + "=========loading conf========= : " + p.toString() ); 56 | if (t > this.lastUpdateTime){ 57 | System.out.println("lastUpdateTime is new, files will be loaded!" ); 58 | this.lastUpdateTime = t.longValue(); 59 | String paths = p.getProperty("files"); 60 | if (paths==null || paths.trim().isEmpty()) // 必须有地址 61 | return Collections.emptyList(); 62 | 63 | List dicPaths = SplitFileNames(p.getProperty("files")); 64 | return dicPaths; 65 | }else{ 66 | this.lastUpdateTime = t.longValue(); 67 | return Collections.emptyList(); 68 | } 69 | }catch(IOException e){ 70 | return Collections.emptyList(); 71 | } 72 | } 73 | 74 | public String toString(){ 75 | return "configchecker@" + lastUpdateTime; 76 | } 77 | 78 | } 79 | 80 | 81 | /** 82 | * 向注册机注册一个可定时更新的tokenfactory;register it in 'inform(ResourceLoader loader)' 83 | * @param reloadFactory 84 | * @param loader 85 | * @param confName 86 | * @return 87 | */ 88 | public static synchronized String register(final ReloadableTokenizerFactory reloadFactory, final ResourceLoader loader, final String confName){ 89 | if ( reloadAwares.containsKey(reloadFactory.getBeanName())){ 90 | return "already"; 91 | }else{ 92 | if(confName != null && !confName.trim().isEmpty()){ //存在conf才注册进来 93 | final ConfigChecker cc = new ConfigChecker(); 94 | reloadAwares.put(reloadFactory.getBeanName(), cc); 95 | loadAndUpdate(cc, reloadFactory, loader, confName); 96 | ScheduledExecutor.submit(new Runnable() { 97 | public void run() { 98 | loadAndUpdate(cc, reloadFactory, loader, confName); 99 | } 100 | }, 30 * 1000); 101 | return "ok"; 102 | } 103 | return "conf is empty"; 104 | } 105 | } 106 | 107 | private static void loadAndUpdate(final ConfigChecker cc, final ReloadableTokenizerFactory reloadFactory, final ResourceLoader loader, final String confName){ 108 | 109 | try { 110 | List dicts = cc.currentToReload(loader.openResource(confName)); 111 | if (!dicts.isEmpty()){ 112 | List insFromLoader = new ArrayList(dicts.size()); 113 | for(String dictName : dicts){ 114 | try{ 115 | insFromLoader.add(loader.openResource(dictName)); 116 | }catch(IOException e){ 117 | System.out.println("missing dict source : " + dictName); 118 | } 119 | } 120 | reloadFactory.update(insFromLoader); 121 | System.out.println("reload finish! 
" + dicts); 122 | } 123 | } catch (IOException e) { 124 | e.printStackTrace(); 125 | } 126 | } 127 | 128 | 129 | } 130 | -------------------------------------------------------------------------------- /mlcsseg-filter/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.mlcs.search 8 | mlcsseg 9 | 4.6.0-SNAPSHOT 10 | 11 | mlcsseg-filter 12 | mlcsseg-filter 13 | http://maven.apache.org 14 | 15 | UTF-8 16 | 17 | 18 | 19 | 20 | junit 21 | junit 22 | 3.8.1 23 | test 24 | 25 | 26 | com.mlcs.search 27 | mlcsseg-common 28 | 4.6.0-SNAPSHOT 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /mlcsseg-filter/src/main/java/org/apache/solr/analysis/DStopFilter.java: -------------------------------------------------------------------------------- 1 | package org.apache.solr.analysis; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.lucene.analysis.TokenStream; 6 | 7 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 8 | import org.apache.lucene.analysis.util.CharArraySet; 9 | import org.apache.lucene.analysis.util.FilteringTokenFilter; 10 | import org.apache.lucene.util.Version; 11 | 12 | public class DStopFilter extends FilteringTokenFilter { 13 | 14 | private final CharArraySet stopWords; 15 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 16 | 17 | public DStopFilter(TokenStream input, CharArraySet stopWords) { 18 | super(Version.LUCENE_46, input); 19 | 20 | this.stopWords = stopWords; 21 | } 22 | 23 | @Override 24 | protected boolean accept() throws IOException { 25 | 26 | // System.out.println("accept()"+termAtt.toString()); 27 | return !stopWords.contains(termAtt.buffer(), 0, termAtt.length()); // 未被赋值过?隐藏操作在哪里实现? 
28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /mlcsseg-filter/src/main/java/org/apache/solr/analysis/DStopFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.apache.solr.analysis; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.util.Map; 6 | import java.util.Properties; 7 | 8 | import org.apache.lucene.analysis.TokenStream; 9 | import org.apache.lucene.analysis.util.CharArraySet; 10 | import org.apache.lucene.analysis.util.ResourceLoader; 11 | import org.apache.lucene.analysis.util.ResourceLoaderAware; 12 | import org.apache.lucene.analysis.util.TokenFilterFactory; 13 | 14 | import com.mlcs.search.mlcsseg.common.ScheduledExecutor; 15 | 16 | 17 | public class DStopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { 18 | 19 | public DStopFilterFactory(Map args) { 20 | super(args); 21 | ignoreCase = getBoolean(args, "ignoreCase", false); 22 | // enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false); 23 | conf = get(args, "conf"); //paths & lastupdate 24 | System.out.println("construct:::::stop::::::::::::::::::::::" + conf); 25 | } 26 | 27 | private CharArraySet stopWords; 28 | private boolean ignoreCase; 29 | // private boolean enablePositionIncrements; 30 | 31 | private ResourceLoader loader; 32 | 33 | private String conf; 34 | private long lastUpdateTime = -1; 35 | 36 | public void inform(final ResourceLoader loader) throws IOException { 37 | System.out.println("inform:::::stop::::::::::::::::::::::" + conf); 38 | this.loader = loader; 39 | this.update(); 40 | if(conf != null && !conf.trim().isEmpty()){ 41 | ScheduledExecutor.submit(new Runnable() { 42 | 43 | public void run() { 44 | try { 45 | update(); 46 | } catch (IOException e) { 47 | e.printStackTrace(); 48 | } 49 | } 50 | }, 1000 * 60 ); 51 | } 52 | } 53 | 54 | @Override 55 | public TokenStream create(TokenStream arg0) { 56 | DStopFilter stopFilter = new DStopFilter( arg0, stopWords); 57 | return stopFilter; 58 | } 59 | 60 | public void update() throws IOException { 61 | Properties p = canUpdate(); 62 | if (p != null){ 63 | System.out.println(" updating~~~!! "); 64 | stopWords = getWordSet(loader, p.getProperty("files"), ignoreCase); 65 | System.out.println(" finish!! 
"); 66 | } 67 | 68 | } 69 | 70 | 71 | private Properties canUpdate() { 72 | 73 | try{ 74 | Properties p = new Properties(); 75 | InputStream confStream = loader.openResource(conf); 76 | p.load(confStream); 77 | confStream.close(); 78 | String lastupdate = p.getProperty("lastupdate", "0"); 79 | Long t = new Long(lastupdate); 80 | 81 | if (t > this.lastUpdateTime){ 82 | this.lastUpdateTime = t.longValue(); 83 | String paths = p.getProperty("files"); 84 | if (paths==null || paths.trim().isEmpty()) // 必须有地址 85 | return null; 86 | System.out.println("loading conf"); 87 | return p; 88 | }else{ 89 | this.lastUpdateTime = t.longValue(); 90 | return null; 91 | } 92 | }catch(Exception e){ 93 | System.err.println("stop parsing conf NullPointerException~~~~~" + e.getMessage()); 94 | return null; 95 | } 96 | } 97 | 98 | } 99 | -------------------------------------------------------------------------------- /mlcsseg-filter/src/main/java/org/apache/solr/analysis/DSynonymFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.apache.solr.analysis; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.io.InputStreamReader; 7 | import java.io.Reader; 8 | import java.nio.charset.Charset; 9 | import java.nio.charset.CharsetDecoder; 10 | import java.nio.charset.CodingErrorAction; 11 | import java.text.ParseException; 12 | import java.util.List; 13 | import java.util.Map; 14 | import java.util.Properties; 15 | 16 | import org.apache.lucene.analysis.TokenStream; 17 | import org.apache.lucene.analysis.Analyzer; 18 | import org.apache.lucene.analysis.core.LowerCaseFilter; 19 | import org.apache.lucene.analysis.core.WhitespaceTokenizer; 20 | import org.apache.lucene.analysis.synonym.SolrSynonymParser; 21 | import org.apache.lucene.analysis.synonym.SynonymFilter; 22 | import org.apache.lucene.analysis.synonym.SynonymMap; 23 | import org.apache.lucene.analysis.util.ResourceLoader; 24 | import org.apache.lucene.analysis.util.ResourceLoaderAware; 25 | import org.apache.lucene.analysis.util.TokenFilterFactory; 26 | import org.apache.lucene.util.Version; 27 | 28 | import com.mlcs.search.mlcsseg.common.ScheduledExecutor; 29 | 30 | 31 | public class DSynonymFilterFactory extends TokenFilterFactory implements 32 | ResourceLoaderAware { 33 | 34 | public DSynonymFilterFactory(Map args) throws IOException { 35 | super(args); 36 | expand = getBoolean(args, "expand", true); 37 | ignoreCase = getBoolean(args, "ignoreCase", false); 38 | conf = get(args, "conf"); //paths & lastupdate 39 | System.out.println(conf); 40 | } 41 | 42 | private SynonymMap map; // 词库,可以通过引用改变 43 | private boolean ignoreCase; //属性 44 | private boolean expand; 45 | private ResourceLoader loader = null; 46 | 47 | private String conf; // properties格式, 存lastupdatetime和词库路径files:逗号间隔 48 | private long lastUpdateTime = -1; 49 | 50 | public void inform(ResourceLoader loader) throws IOException { 51 | System.out.println(":::::synonym::::::::::::::::::::::" + conf); 52 | this.loader = loader; 53 | this.update(); 54 | if(conf != null && !conf.trim().isEmpty()){ 55 | ScheduledExecutor.submit(new Runnable() { 56 | 57 | public void run() { 58 | update(); 59 | 60 | } 61 | }, 1000 * 60); 62 | } 63 | } 64 | 65 | private SynonymMap loadSolrSynonyms(ResourceLoader loader, Properties p) throws IOException, ParseException { 66 | final Analyzer analyzer = new Analyzer() { 67 | @Override 68 | protected TokenStreamComponents createComponents(String fieldName, 
Reader reader) { 69 | WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_46, reader); 70 | TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_46, tokenizer) : tokenizer; 71 | return new TokenStreamComponents(tokenizer, stream); 72 | } 73 | }; 74 | String synonyms = p.getProperty("files"); 75 | 76 | CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder() 77 | .onMalformedInput(CodingErrorAction.REPORT) 78 | .onUnmappableCharacter(CodingErrorAction.REPORT); 79 | 80 | SolrSynonymParser parser = new SolrSynonymParser(true, expand, analyzer); 81 | File synonymFile = new File(synonyms); 82 | if (loader != null){ //first call in constructor 83 | if (synonymFile.exists()) { 84 | decoder.reset(); 85 | 86 | parser.parse(new InputStreamReader(loader.openResource(synonyms), 87 | decoder)); 88 | } else { 89 | List files = splitFileNames(synonyms); 90 | for (String file : files) { 91 | decoder.reset(); 92 | parser.parse(new InputStreamReader(loader.openResource(file), 93 | decoder)); 94 | } 95 | } 96 | } 97 | 98 | return parser.build(); 99 | } 100 | 101 | @Override 102 | public TokenStream create(TokenStream input) { 103 | return map.fst == null ? input : new SynonymFilter(input, map,ignoreCase); 104 | } 105 | 106 | public void update() { 107 | 108 | Properties p = canUpdate(); 109 | if (p != null){ 110 | try { 111 | System.out.println(" updating !"); 112 | map = loadSolrSynonyms(loader, p); // 内部已实现切换 113 | System.out.println(" finish~!"); 114 | } catch (IOException e) { 115 | System.err.println(" IOException!!"); 116 | e.printStackTrace(); 117 | } catch (ParseException e) { 118 | System.err.println(" ParseException!!"); 119 | e.printStackTrace(); 120 | } 121 | } 122 | } 123 | 124 | private Properties canUpdate() { 125 | 126 | try{ 127 | Properties p = new Properties(); 128 | InputStream confStream = loader.openResource(conf); 129 | p.load(confStream); 130 | confStream.close(); 131 | String lastupdate = p.getProperty("lastupdate", "0"); 132 | Long t = new Long(lastupdate); 133 | 134 | if (t > this.lastUpdateTime){ 135 | this.lastUpdateTime = t.longValue(); 136 | String paths = p.getProperty("files"); 137 | if (paths==null || paths.trim().isEmpty()) // 必须有地址 138 | return null; 139 | System.out.println("loading conf"); 140 | return p; 141 | }else{ 142 | this.lastUpdateTime = t.longValue(); 143 | return null; 144 | } 145 | }catch(Exception e){ 146 | System.err.println("synonym parsing conf NullPointerException~~~~~" + e.getMessage()); 147 | return null; 148 | } 149 | } 150 | 151 | } 152 | -------------------------------------------------------------------------------- /mlcsseg-filter/src/test/java/org/mlcsseg/filter/AppTest.java: -------------------------------------------------------------------------------- 1 | package org.mlcsseg.filter; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 
9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /mlcsseg-ik/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.mlcs.search 6 | mlcsseg 7 | 4.6.0-SNAPSHOT 8 | 9 | mlcsseg-ik 10 | mlcsseg-ik 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | com.mlcs.search 26 | mlcsseg-common 27 | 4.6.0-SNAPSHOT 28 | 29 | 30 | 31 | 32 | 33 | src/main/resources 34 | 35 | **/*.dic 36 | **/*.xml 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/cfg/Configuration.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.cfg; 26 | 27 | import java.util.List; 28 | 29 | /** 30 | * 31 | * 配置管理类接口 32 | * 33 | */ 34 | public interface Configuration { 35 | 36 | /** 37 | * 返回useSmart标志位 38 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 39 | * @return useSmart 40 | */ 41 | public boolean useSmart(); 42 | 43 | /** 44 | * 设置useSmart标志位 45 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 46 | * @param useSmart 47 | */ 48 | public void setUseSmart(boolean useSmart); 49 | 50 | 51 | /** 52 | * 获取主词典路径 53 | * 54 | * @return String 主词典路径 55 | */ 56 | public String getMainDictionary(); 57 | 58 | /** 59 | * 获取量词词典路径 60 | * @return String 量词词典路径 61 | */ 62 | public String getQuantifierDicionary(); 63 | 64 | /** 65 | * 获取扩展字典配置路径 66 | * @return List 相对类加载器的路径 67 | */ 68 | public List getExtDictionarys(); 69 | 70 | 71 | /** 72 | * 获取扩展停止词典配置路径 73 | * @return List 相对类加载器的路径 74 | */ 75 | public List getExtStopWordDictionarys(); 76 | } 77 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/cfg/DefaultConfig.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 25 | */ 26 | package org.wltea.analyzer.cfg; 27 | 28 | import java.io.IOException; 29 | import java.io.InputStream; 30 | import java.util.ArrayList; 31 | import java.util.InvalidPropertiesFormatException; 32 | import java.util.List; 33 | import java.util.Properties; 34 | 35 | /** 36 | * Configuration 默认实现 37 | * 2012-5-8 38 | * 39 | */ 40 | public class DefaultConfig implements Configuration{ 41 | 42 | /* 43 | * 分词器默认字典路径 44 | */ 45 | private static final String PATH_DIC_MAIN = "org/wltea/analyzer/dic/main2012.dic"; 46 | private static final String PATH_DIC_QUANTIFIER = "org/wltea/analyzer/dic/quantifier.dic"; 47 | 48 | /* 49 | * 分词器配置文件路径 50 | */ 51 | private static final String FILE_NAME = "IKAnalyzer.cfg.xml"; 52 | //配置属性——扩展字典 53 | private static final String EXT_DICT = "ext_dict"; 54 | //配置属性——扩展停止词典 55 | private static final String EXT_STOP = "ext_stopwords"; 56 | 57 | private Properties props; 58 | /* 59 | * 是否使用smart方式分词 60 | */ 61 | private boolean useSmart; 62 | 63 | /** 64 | * 返回单例 65 | * @return Configuration单例 66 | */ 67 | public static Configuration getInstance(){ 68 | return new DefaultConfig(); 69 | } 70 | 71 | /* 72 | * 初始化配置文件 73 | */ 74 | private DefaultConfig(){ 75 | props = new Properties(); 76 | 77 | InputStream input = this.getClass().getClassLoader().getResourceAsStream(FILE_NAME); 78 | if(input != null){ 79 | try { 80 | props.loadFromXML(input); 81 | } catch (InvalidPropertiesFormatException e) { 82 | e.printStackTrace(); 83 | } catch (IOException e) { 84 | e.printStackTrace(); 85 | } 86 | } 87 | } 88 | 89 | 90 | /** 91 | * 返回useSmart标志位 92 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 93 | * @return useSmart 94 | */ 95 | public boolean useSmart() { 96 | return useSmart; 97 | } 98 | 99 | /** 100 | * 设置useSmart标志位 101 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 102 | * @param useSmart 103 | */ 104 | public void setUseSmart(boolean useSmart) { 105 | this.useSmart = useSmart; 106 | } 107 | 108 | /** 109 | * 获取主词典路径 110 | * 111 | * @return String 主词典路径 112 | */ 113 | public String getMainDictionary(){ 114 | return PATH_DIC_MAIN; 115 | } 116 | 117 | /** 118 | * 获取量词词典路径 119 | * @return String 量词词典路径 120 | */ 121 | public String getQuantifierDicionary(){ 122 | return PATH_DIC_QUANTIFIER; 123 | } 124 | 125 | /** 126 | * 获取扩展字典配置路径 127 | * @return List 相对类加载器的路径 128 | */ 129 | public List getExtDictionarys(){ 130 | List extDictFiles = new ArrayList(2); 131 | String extDictCfg = props.getProperty(EXT_DICT); 132 | if(extDictCfg != null){ 133 | //使用;分割多个扩展字典配置 134 | String[] filePaths = extDictCfg.split(";"); 135 | if(filePaths != null){ 136 | for(String filePath : filePaths){ 137 | if(filePath != null && !"".equals(filePath.trim())){ 138 | extDictFiles.add(filePath.trim()); 139 | } 140 | } 141 | } 142 | } 143 | return extDictFiles; 144 | } 145 | 146 | 147 | /** 148 | * 获取扩展停止词典配置路径 149 | * @return List 相对类加载器的路径 150 | */ 151 | public List getExtStopWordDictionarys(){ 152 | List extStopWordDictFiles = new ArrayList(2); 153 | String extStopWordDictCfg = props.getProperty(EXT_STOP); 154 | if(extStopWordDictCfg != null){ 155 | //使用;分割多个扩展字典配置 156 | String[] filePaths = extStopWordDictCfg.split(";"); 157 | if(filePaths != null){ 158 | for(String filePath : filePaths){ 159 | if(filePath != null && !"".equals(filePath.trim())){ 160 | extStopWordDictFiles.add(filePath.trim()); 161 | } 162 | } 163 | } 164 | } 165 | return 
extStopWordDictFiles; 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import java.io.IOException; 28 | import java.io.Reader; 29 | import java.util.HashMap; 30 | import java.util.HashSet; 31 | import java.util.LinkedList; 32 | import java.util.Map; 33 | import java.util.Set; 34 | 35 | import org.wltea.analyzer.cfg.Configuration; 36 | /** 37 | * 38 | * 分词器上下文状态 39 | * 40 | */ 41 | class AnalyzeContext { 42 | 43 | //默认缓冲区大小 44 | private static final int BUFF_SIZE = 4096; 45 | //缓冲区耗尽的临界值 46 | private static final int BUFF_EXHAUST_CRITICAL = 100; 47 | 48 | 49 | //字符窜读取缓冲 50 | private char[] segmentBuff; 51 | //字符类型数组 52 | private int[] charTypes; 53 | 54 | 55 | //记录Reader内已分析的字串总长度 56 | //在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移 57 | private int buffOffset; 58 | //当前缓冲区位置指针 59 | private int cursor; 60 | //最近一次读入的,可处理的字串长度 61 | private int available; 62 | 63 | 64 | //子分词器锁 65 | //该集合非空,说明有子分词器在占用segmentBuff 66 | private Set buffLocker; 67 | 68 | //原始分词结果集合,未经歧义处理 69 | private QuickSortSet orgLexemes; 70 | //LexemePath位置索引表 71 | private Map pathMap; 72 | //最终分词结果集 73 | private LinkedList results; 74 | 75 | //分词器配置项 76 | private Configuration cfg; 77 | 78 | public AnalyzeContext(Configuration cfg){ 79 | this.cfg = cfg; 80 | this.segmentBuff = new char[BUFF_SIZE]; 81 | this.charTypes = new int[BUFF_SIZE]; 82 | this.buffLocker = new HashSet(); 83 | this.orgLexemes = new QuickSortSet(); 84 | this.pathMap = new HashMap(); 85 | this.results = new LinkedList(); 86 | } 87 | 88 | int getCursor(){ 89 | return this.cursor; 90 | } 91 | // 92 | // void setCursor(int cursor){ 93 | // this.cursor = cursor; 94 | // } 95 | 96 | char[] getSegmentBuff(){ 97 | return this.segmentBuff; 98 | } 99 | 100 | char getCurrentChar(){ 101 | return this.segmentBuff[this.cursor]; 102 | } 103 | 104 | int getCurrentCharType(){ 105 | return this.charTypes[this.cursor]; 106 | } 107 | 108 | int getBufferOffset(){ 109 | return this.buffOffset; 110 | } 111 | 112 | /** 113 | * 根据context的上下文情况,填充segmentBuff 114 | * @param reader 115 | * @return 返回待分析的(有效的)字串长度 116 | * @throws IOException 117 | */ 118 | int fillBuffer(Reader reader) throws IOException{ 119 | int readCount = 0; 120 | if(this.buffOffset == 0){ 121 | //首次读取reader 122 | 
readCount = reader.read(segmentBuff); 123 | }else{ 124 | int offset = this.available - this.cursor; 125 | if(offset > 0){ 126 | //最近一次读取的>最近一次处理的,将未处理的字串拷贝到segmentBuff头部 127 | System.arraycopy(this.segmentBuff , this.cursor , this.segmentBuff , 0 , offset); 128 | readCount = offset; 129 | } 130 | //继续读取reader ,以onceReadIn - onceAnalyzed为起始位置,继续填充segmentBuff剩余的部分 131 | readCount += reader.read(this.segmentBuff , offset , BUFF_SIZE - offset); 132 | } 133 | //记录最后一次从Reader中读入的可用字符长度 134 | this.available = readCount; 135 | //重置当前指针 136 | this.cursor = 0; 137 | return readCount; 138 | } 139 | 140 | /** 141 | * 初始化buff指针,处理第一个字符 142 | */ 143 | void initCursor(){ 144 | this.cursor = 0; 145 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]); 146 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]); 147 | } 148 | 149 | /** 150 | * 指针+1 151 | * 成功返回 true; 指针已经到了buff尾部,不能前进,返回false 152 | * 并处理当前字符 153 | */ 154 | boolean moveCursor(){ 155 | if(this.cursor < this.available - 1){ 156 | this.cursor++; 157 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]); 158 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]); 159 | return true; 160 | }else{ 161 | return false; 162 | } 163 | } 164 | 165 | /** 166 | * 设置当前segmentBuff为锁定状态 167 | * 加入占用segmentBuff的子分词器名称,表示占用segmentBuff 168 | * @param segmenterName 169 | */ 170 | void lockBuffer(String segmenterName){ 171 | this.buffLocker.add(segmenterName); 172 | } 173 | 174 | /** 175 | * 移除指定的子分词器名,释放对segmentBuff的占用 176 | * @param segmenterName 177 | */ 178 | void unlockBuffer(String segmenterName){ 179 | this.buffLocker.remove(segmenterName); 180 | } 181 | 182 | /** 183 | * 只要buffLocker中存在segmenterName 184 | * 则buffer被锁定 185 | * @return boolean 缓冲去是否被锁定 186 | */ 187 | boolean isBufferLocked(){ 188 | return this.buffLocker.size() > 0; 189 | } 190 | 191 | /** 192 | * 判断当前segmentBuff是否已经用完 193 | * 当前执针cursor移至segmentBuff末端this.available - 1 194 | * @return 195 | */ 196 | boolean isBufferConsumed(){ 197 | return this.cursor == this.available - 1; 198 | } 199 | 200 | /** 201 | * 判断segmentBuff是否需要读取新数据 202 | * 203 | * 满足一下条件时, 204 | * 1.available == BUFF_SIZE 表示buffer满载 205 | * 2.buffIndex < available - 1 && buffIndex > available - BUFF_EXHAUST_CRITICAL表示当前指针处于临界区内 206 | * 3.!context.isBufferLocked()表示没有segmenter在占用buffer 207 | * 要中断当前循环(buffer要进行移位,并再读取数据的操作) 208 | * @return 209 | */ 210 | boolean needRefillBuffer(){ 211 | return this.available == BUFF_SIZE 212 | && this.cursor < this.available - 1 213 | && this.cursor > this.available - BUFF_EXHAUST_CRITICAL 214 | && !this.isBufferLocked(); 215 | } 216 | 217 | /** 218 | * 累计当前的segmentBuff相对于reader起始位置的位移 219 | */ 220 | void markBufferOffset(){ 221 | this.buffOffset += this.cursor; 222 | } 223 | 224 | /** 225 | * 向分词结果集添加词元 226 | * @param lexeme 227 | */ 228 | void addLexeme(Lexeme lexeme){ 229 | this.orgLexemes.addLexeme(lexeme); 230 | } 231 | 232 | /** 233 | * 添加分词结果路径 234 | * 路径起始位置 ---> 路径 映射表 235 | * @param path 236 | */ 237 | void addLexemePath(LexemePath path){ 238 | if(path != null){ 239 | this.pathMap.put(path.getPathBegin(), path); 240 | } 241 | } 242 | 243 | 244 | /** 245 | * 返回原始分词结果 246 | * @return 247 | */ 248 | QuickSortSet getOrgLexemes(){ 249 | return this.orgLexemes; 250 | } 251 | 252 | /** 253 | * 推送分词结果到结果集合 254 | * 1.从buff头部遍历到this.cursor已处理位置 255 | * 2.将map中存在的分词结果推入results 256 | * 3.将map中不存在的CJDK字符以单字方式推入results 257 | */ 258 | void 
outputToResult(){ 259 | int index = 0; 260 | for( ; index <= this.cursor ;){ 261 | //跳过非CJK字符 262 | if(CharacterUtil.CHAR_USELESS == this.charTypes[index]){ 263 | index++; 264 | continue; 265 | } 266 | //从pathMap找出对应index位置的LexemePath 267 | LexemePath path = this.pathMap.get(index); 268 | if(path != null){ 269 | //输出LexemePath中的lexeme到results集合 270 | Lexeme l = path.pollFirst(); 271 | while(l != null){ 272 | this.results.add(l); 273 | //将index移至lexeme后 274 | index = l.getBegin() + l.getLength(); 275 | l = path.pollFirst(); 276 | if(l != null){ 277 | //输出path内部,词元间遗漏的单字 278 | for(;index < l.getBegin();index++){ 279 | this.outputSingleCJK(index); 280 | } 281 | } 282 | } 283 | }else{//pathMap中找不到index对应的LexemePath 284 | //单字输出 285 | this.outputSingleCJK(index); 286 | index++; 287 | } 288 | } 289 | //清空当前的Map 290 | this.pathMap.clear(); 291 | } 292 | 293 | /** 294 | * 对CJK字符进行单字输出 295 | * @param index 296 | */ 297 | private void outputSingleCJK(int index){ 298 | if(CharacterUtil.CHAR_CHINESE == this.charTypes[index]){ 299 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_CNCHAR); 300 | this.results.add(singleCharLexeme); 301 | }else if(CharacterUtil.CHAR_OTHER_CJK == this.charTypes[index]){ 302 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_OTHER_CJK); 303 | this.results.add(singleCharLexeme); 304 | } 305 | } 306 | 307 | /** 308 | * 返回lexeme 309 | * 310 | * 同时处理合并 311 | * @return 312 | */ 313 | Lexeme getNextLexeme(){ 314 | //从结果集取出,并移除第一个Lexme 315 | Lexeme result = this.results.pollFirst(); 316 | /*while(result != null){ 317 | //数量词合并 318 | this.compound(result); 319 | if(Dictionary.getSingleton().isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){ 320 | //是停止词继续取列表的下一个 321 | result = this.results.pollFirst(); 322 | }else{ 323 | //不是停止词, 生成lexeme的词元文本,输出 324 | result.setLexemeText(String.valueOf(segmentBuff , result.getBegin() , result.getLength())); 325 | break; 326 | } 327 | }*/ 328 | if(result != null) 329 | { 330 | this.compound(result); 331 | result.setLexemeText(String.valueOf(segmentBuff , result.getBegin() , result.getLength())); 332 | } 333 | return result; 334 | } 335 | 336 | /** 337 | * 重置分词上下文状态 338 | */ 339 | void reset(){ 340 | this.buffLocker.clear(); 341 | this.orgLexemes = new QuickSortSet(); 342 | this.available =0; 343 | this.buffOffset = 0; 344 | this.charTypes = new int[BUFF_SIZE]; 345 | this.cursor = 0; 346 | this.results.clear(); 347 | this.segmentBuff = new char[BUFF_SIZE]; 348 | this.pathMap.clear(); 349 | } 350 | 351 | /** 352 | * 组合词元 353 | */ 354 | private void compound(Lexeme result){ 355 | if(!this.cfg.useSmart()){ 356 | return ; 357 | } 358 | //数量词合并处理 359 | if(!this.results.isEmpty()){ 360 | 361 | if(Lexeme.TYPE_ARABIC == result.getLexemeType()){ 362 | Lexeme nextLexeme = this.results.peekFirst(); 363 | boolean appendOk = false; 364 | if(Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()){ 365 | //合并英文数词+中文数词 366 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM); 367 | }else if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){ 368 | //合并英文数词+中文量词 369 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN); 370 | } 371 | if(appendOk){ 372 | //弹出 373 | this.results.pollFirst(); 374 | } 375 | } 376 | 377 | //可能存在第二轮合并 378 | if(Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()){ 379 | Lexeme nextLexeme = this.results.peekFirst(); 380 | boolean appendOk = false; 381 | if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){ 382 | //合并中文数词+中文量词 383 | appendOk = 
result.append(nextLexeme, Lexeme.TYPE_CQUAN); 384 | } 385 | if(appendOk){ 386 | //弹出 387 | this.results.pollFirst(); 388 | } 389 | } 390 | 391 | } 392 | } 393 | 394 | } 395 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * IK 中文分词 版本 5.0 4 | * IK Analyzer release 5.0 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one or more 7 | * contributor license agreements. See the NOTICE file distributed with 8 | * this work for additional information regarding copyright ownership. 9 | * The ASF licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * 21 | * 源代码由林良益(linliangyi2005@gmail.com)提供 22 | * 版权声明 2012,乌龙茶工作室 23 | * provided by Linliangyi and copyright 2012 by Oolong studio 24 | * 25 | */ 26 | package org.wltea.analyzer.core; 27 | 28 | import java.util.LinkedList; 29 | import java.util.List; 30 | 31 | import org.wltea.analyzer.dic.Dictionary; 32 | import org.wltea.analyzer.dic.Hit; 33 | 34 | 35 | /** 36 | * 中文-日韩文子分词器 37 | */ 38 | class CJKSegmenter implements ISegmenter { 39 | 40 | //子分词器标签 41 | static final String SEGMENTER_NAME = "CJK_SEGMENTER"; 42 | //待处理的分词hit队列 43 | private List tmpHits; 44 | 45 | 46 | CJKSegmenter(){ 47 | this.tmpHits = new LinkedList(); 48 | } 49 | 50 | /* (non-Javadoc) 51 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext) 52 | */ 53 | public void analyze(AnalyzeContext context) { 54 | if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){ 55 | 56 | //优先处理tmpHits中的hit 57 | if(!this.tmpHits.isEmpty()){ 58 | //处理词段队列 59 | Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]); 60 | for(Hit hit : tmpArray){ 61 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); 62 | if(hit.isMatch()){ 63 | //输出当前的词 64 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD); 65 | context.addLexeme(newLexeme); 66 | 67 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除 68 | this.tmpHits.remove(hit); 69 | } 70 | 71 | }else if(hit.isUnmatch()){ 72 | //hit不是词,移除 73 | this.tmpHits.remove(hit); 74 | } 75 | } 76 | } 77 | 78 | //********************************* 79 | //再对当前指针位置的字符进行单字匹配 80 | Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1); 81 | if(singleCharHit.isMatch()){//首字成词 82 | //输出当前的词 83 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD); 84 | context.addLexeme(newLexeme); 85 | 86 | //同时也是词前缀 87 | if(singleCharHit.isPrefix()){ 88 | //前缀匹配则放入hit列表 89 | this.tmpHits.add(singleCharHit); 90 | } 91 | }else if(singleCharHit.isPrefix()){//首字为词前缀 92 | //前缀匹配则放入hit列表 93 | this.tmpHits.add(singleCharHit); 94 | } 95 | 96 | 97 | }else{ 
98 | //遇到CHAR_USELESS字符 99 | //清空队列 100 | this.tmpHits.clear(); 101 | } 102 | 103 | //判断缓冲区是否已经读完 104 | if(context.isBufferConsumed()){ 105 | //清空队列 106 | this.tmpHits.clear(); 107 | } 108 | 109 | //判断是否锁定缓冲区 110 | if(this.tmpHits.size() == 0){ 111 | context.unlockBuffer(SEGMENTER_NAME); 112 | 113 | }else{ 114 | context.lockBuffer(SEGMENTER_NAME); 115 | } 116 | } 117 | 118 | /* (non-Javadoc) 119 | * @see org.wltea.analyzer.core.ISegmenter#reset() 120 | */ 121 | public void reset() { 122 | //清空队列 123 | this.tmpHits.clear(); 124 | } 125 | 126 | } 127 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import java.util.HashSet; 28 | import java.util.LinkedList; 29 | import java.util.List; 30 | import java.util.Set; 31 | 32 | import org.wltea.analyzer.dic.Dictionary; 33 | import org.wltea.analyzer.dic.Hit; 34 | 35 | /** 36 | * 37 | * 中文数量词子分词器 38 | */ 39 | class CN_QuantifierSegmenter implements ISegmenter{ 40 | 41 | //子分词器标签 42 | static final String SEGMENTER_NAME = "QUAN_SEGMENTER"; 43 | 44 | //中文数词 45 | private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";//Cnum 46 | private static Set ChnNumberChars = new HashSet(); 47 | static{ 48 | char[] ca = Chn_Num.toCharArray(); 49 | for(char nChar : ca){ 50 | ChnNumberChars.add(nChar); 51 | } 52 | } 53 | 54 | /* 55 | * 词元的开始位置, 56 | * 同时作为子分词器状态标识 57 | * 当start > -1 时,标识当前的分词器正在处理字符 58 | */ 59 | private int nStart; 60 | /* 61 | * 记录词元结束位置 62 | * end记录的是在词元中最后一个出现的合理的数词结束 63 | */ 64 | private int nEnd; 65 | 66 | //待处理的量词hit队列 67 | private List countHits; 68 | 69 | 70 | CN_QuantifierSegmenter(){ 71 | nStart = -1; 72 | nEnd = -1; 73 | this.countHits = new LinkedList(); 74 | } 75 | 76 | /** 77 | * 分词 78 | */ 79 | public void analyze(AnalyzeContext context) { 80 | //处理中文数词 81 | this.processCNumber(context); 82 | //处理中文量词 83 | this.processCount(context); 84 | 85 | //判断是否锁定缓冲区 86 | if(this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()){ 87 | //对缓冲区解锁 88 | context.unlockBuffer(SEGMENTER_NAME); 89 | }else{ 90 | context.lockBuffer(SEGMENTER_NAME); 91 | } 92 | } 93 | 94 | 95 | /** 96 | * 重置子分词器状态 97 | */ 98 | public void reset() { 99 | nStart = -1; 100 | nEnd = -1; 101 | countHits.clear(); 102 | } 103 | 104 | /** 105 | * 处理数词 106 | */ 107 | private void 
processCNumber(AnalyzeContext context){ 108 | if(nStart == -1 && nEnd == -1){//初始状态 109 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType() 110 | && ChnNumberChars.contains(context.getCurrentChar())){ 111 | //记录数词的起始、结束位置 112 | nStart = context.getCursor(); 113 | nEnd = context.getCursor(); 114 | } 115 | }else{//正在处理状态 116 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType() 117 | && ChnNumberChars.contains(context.getCurrentChar())){ 118 | //记录数词的结束位置 119 | nEnd = context.getCursor(); 120 | }else{ 121 | //输出数词 122 | this.outputNumLexeme(context); 123 | //重置头尾指针 124 | nStart = -1; 125 | nEnd = -1; 126 | } 127 | } 128 | 129 | //缓冲区已经用完,还有尚未输出的数词 130 | if(context.isBufferConsumed()){ 131 | if(nStart != -1 && nEnd != -1){ 132 | //输出数词 133 | outputNumLexeme(context); 134 | //重置头尾指针 135 | nStart = -1; 136 | nEnd = -1; 137 | } 138 | } 139 | } 140 | 141 | /** 142 | * 处理中文量词 143 | * @param context 144 | */ 145 | private void processCount(AnalyzeContext context){ 146 | // 判断是否需要启动量词扫描 147 | if(!this.needCountScan(context)){ 148 | return; 149 | } 150 | 151 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()){ 152 | 153 | //优先处理countHits中的hit 154 | if(!this.countHits.isEmpty()){ 155 | //处理词段队列 156 | Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]); 157 | for(Hit hit : tmpArray){ 158 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); 159 | if(hit.isMatch()){ 160 | //输出当前的词 161 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT); 162 | context.addLexeme(newLexeme); 163 | 164 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除 165 | this.countHits.remove(hit); 166 | } 167 | 168 | }else if(hit.isUnmatch()){ 169 | //hit不是词,移除 170 | this.countHits.remove(hit); 171 | } 172 | } 173 | } 174 | 175 | //********************************* 176 | //对当前指针位置的字符进行单字匹配 177 | Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1); 178 | if(singleCharHit.isMatch()){//首字成量词词 179 | //输出当前的词 180 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT); 181 | context.addLexeme(newLexeme); 182 | 183 | //同时也是词前缀 184 | if(singleCharHit.isPrefix()){ 185 | //前缀匹配则放入hit列表 186 | this.countHits.add(singleCharHit); 187 | } 188 | }else if(singleCharHit.isPrefix()){//首字为量词前缀 189 | //前缀匹配则放入hit列表 190 | this.countHits.add(singleCharHit); 191 | } 192 | 193 | 194 | }else{ 195 | //输入的不是中文字符 196 | //清空未成形的量词 197 | this.countHits.clear(); 198 | } 199 | 200 | //缓冲区数据已经读完,还有尚未输出的量词 201 | if(context.isBufferConsumed()){ 202 | //清空未成形的量词 203 | this.countHits.clear(); 204 | } 205 | } 206 | 207 | /** 208 | * 判断是否需要扫描量词 209 | * @return 210 | */ 211 | private boolean needCountScan(AnalyzeContext context){ 212 | if((nStart != -1 && nEnd != -1 ) || !countHits.isEmpty()){ 213 | //正在处理中文数词,或者正在处理量词 214 | return true; 215 | }else{ 216 | //找到一个相邻的数词 217 | if(!context.getOrgLexemes().isEmpty()){ 218 | Lexeme l = context.getOrgLexemes().peekLast(); 219 | if(Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()){ 220 | if(l.getBegin() + l.getLength() == context.getCursor()){ 221 | return true; 222 | } 223 | } 224 | } 225 | } 226 | return false; 227 | } 228 | 229 | /** 230 | * 添加数词词元到结果集 231 | * @param context 232 | */ 233 | private void outputNumLexeme(AnalyzeContext context){ 234 | if(nStart > -1 && nEnd > -1){ 235 | //输出数词 236 | Lexeme 
newLexeme = new Lexeme(context.getBufferOffset() , nStart , nEnd - nStart + 1 , Lexeme.TYPE_CNUM); 237 | context.addLexeme(newLexeme); 238 | 239 | } 240 | } 241 | 242 | } 243 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/core/CharacterUtil.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 字符集识别工具类 25 | */ 26 | package org.wltea.analyzer.core; 27 | 28 | /** 29 | * 30 | * 字符集识别工具类 31 | */ 32 | class CharacterUtil { 33 | 34 | public static final int CHAR_USELESS = 0; 35 | 36 | public static final int CHAR_ARABIC = 0X00000001; 37 | 38 | public static final int CHAR_ENGLISH = 0X00000002; 39 | 40 | public static final int CHAR_CHINESE = 0X00000004; 41 | 42 | public static final int CHAR_OTHER_CJK = 0X00000008; 43 | 44 | 45 | /** 46 | * 识别字符类型 47 | * @param input 48 | * @return int CharacterUtil定义的字符类型常量 49 | */ 50 | static int identifyCharType(char input){ 51 | if(input >= '0' && input <= '9'){ 52 | return CHAR_ARABIC; 53 | 54 | }else if((input >= 'a' && input <= 'z') 55 | || (input >= 'A' && input <= 'Z')){ 56 | return CHAR_ENGLISH; 57 | 58 | }else { 59 | Character.UnicodeBlock ub = Character.UnicodeBlock.of(input); 60 | 61 | if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS 62 | || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS 63 | || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){ 64 | //目前已知的中文字符UTF-8集合 65 | return CHAR_CHINESE; 66 | 67 | }else if(ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //全角数字字符和日韩字符 68 | //韩文字符集 69 | || ub == Character.UnicodeBlock.HANGUL_SYLLABLES 70 | || ub == Character.UnicodeBlock.HANGUL_JAMO 71 | || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO 72 | //日文字符集 73 | || ub == Character.UnicodeBlock.HIRAGANA //平假名 74 | || ub == Character.UnicodeBlock.KATAKANA //片假名 75 | || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS){ 76 | return CHAR_OTHER_CJK; 77 | 78 | } 79 | } 80 | //其他的不做处理的字符 81 | return CHAR_USELESS; 82 | } 83 | 84 | /** 85 | * 进行字符规格化(全角转半角,大写转小写处理) 86 | * @param input 87 | * @return char 88 | */ 89 | static char regularize(char input){ 90 | if (input == 12288) { 91 | input = (char) 32; 92 | 93 | }else if (input > 65280 && input < 65375) { 94 | input = (char) (input - 65248); 95 | 96 | }else if (input >= 'A' && input <= 'Z') { 97 | input += 32; 98 | } 99 | 100 | return input; 101 | } 102 | } 103 | 
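
> CharacterUtil above drives the per-character dispatch used by every sub-segmenter: identifyCharType() buckets a char into ARABIC / ENGLISH / CHINESE / OTHER_CJK / USELESS, and regularize() folds the ideographic space, full-width forms and ASCII upper case before matching. A minimal sketch of calling it directly; it assumes compilation into the org.wltea.analyzer.core package (the class and both methods are package-private), and the sample characters are arbitrary:

```java
package org.wltea.analyzer.core;

// Hypothetical demo class, not part of the project; exercises the
// package-private CharacterUtil shown above.
public class CharacterUtilDemo {
    public static void main(String[] args) {
        System.out.println(CharacterUtil.identifyCharType('中')); // 4 = CHAR_CHINESE (CJK unified ideograph)
        System.out.println(CharacterUtil.identifyCharType('7'));  // 1 = CHAR_ARABIC
        System.out.println(CharacterUtil.identifyCharType('x'));  // 2 = CHAR_ENGLISH
        System.out.println(CharacterUtil.identifyCharType('。')); // 0 = CHAR_USELESS (punctuation)

        // regularize(): ideographic space -> ASCII space, full-width forms
        // shifted to their half-width counterparts, ASCII upper case folded.
        System.out.println((int) CharacterUtil.regularize('\u3000')); // 32
        System.out.println(CharacterUtil.regularize('\uFF21'));       // 'A' (full-width A -> half-width A)
        System.out.println(CharacterUtil.regularize('Q'));            // 'q'
    }
}
```

> Note that the three branches of regularize() are exclusive, so a full-width letter is narrowed but not also lower-cased in the same pass.
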
-------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/core/IKArbitrator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import java.util.Stack; 28 | import java.util.TreeSet; 29 | 30 | /** 31 | * IK分词歧义裁决器 32 | */ 33 | class IKArbitrator { 34 | 35 | IKArbitrator(){ 36 | 37 | } 38 | 39 | /** 40 | * 分词歧义处理 41 | * @param orgLexemes 42 | * @param useSmart 43 | */ 44 | void process(AnalyzeContext context , boolean useSmart){ 45 | QuickSortSet orgLexemes = context.getOrgLexemes(); 46 | Lexeme orgLexeme = orgLexemes.pollFirst(); 47 | 48 | LexemePath crossPath = new LexemePath(); 49 | while(orgLexeme != null){ 50 | if(!crossPath.addCrossLexeme(orgLexeme)){ 51 | //找到与crossPath不相交的下一个crossPath 52 | if(crossPath.size() == 1 || !useSmart){ 53 | //crossPath没有歧义 或者 不做歧义处理 54 | //直接输出当前crossPath 55 | context.addLexemePath(crossPath); 56 | }else{ 57 | //对当前的crossPath进行歧义处理 58 | QuickSortSet.Cell headCell = crossPath.getHead(); 59 | LexemePath judgeResult = this.judge(context,headCell, crossPath.getPathLength()); 60 | //输出歧义处理结果judgeResult 61 | context.addLexemePath(judgeResult); 62 | } 63 | 64 | //把orgLexeme加入新的crossPath中 65 | crossPath = new LexemePath(); //再次new了对象 66 | crossPath.addCrossLexeme(orgLexeme); 67 | } 68 | orgLexeme = orgLexemes.pollFirst(); 69 | } 70 | 71 | 72 | //处理最后的path 73 | if(crossPath.size() <= 1 || !useSmart){ //输入流单字情况,"额" 74 | //crossPath没有歧义 或者 不做歧义处理 75 | //直接输出当前crossPath 76 | context.addLexemePath(crossPath); 77 | }else{ 78 | //对当前的crossPath进行歧义处理 79 | QuickSortSet.Cell headCell = crossPath.getHead(); 80 | LexemePath judgeResult = this.judge(context,headCell, crossPath.getPathLength()); 81 | //输出歧义处理结果judgeResult 82 | context.addLexemePath(judgeResult); 83 | } 84 | } 85 | 86 | /** 87 | * 歧义识别 88 | * @param lexemeCell 歧义路径链表头 89 | * @param fullTextLength 歧义路径文本长度 90 | * @param option 候选结果路径 91 | * @return 92 | */ 93 | @SuppressWarnings("unused") 94 | private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){ 95 | //候选路径集合 96 | TreeSet pathOptions = new TreeSet(); 97 | //候选结果路径 98 | LexemePath option = new LexemePath(); 99 | 100 | //对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈 101 | Stack lexemeStack = this.forwardPath(lexemeCell , option); 102 | 103 | //当前词元链并非最理想的,加入候选路径集合 104 | pathOptions.add(option.copy()); 105 | 106 | //存在歧义词,处理 107 | QuickSortSet.Cell c = null; 108 
| while(!lexemeStack.isEmpty()){ 109 | c = lexemeStack.pop(); 110 | //回滚词元链 111 | this.backPath(c.getLexeme() , option); 112 | //从歧义词位置开始,递归,生成可选方案 113 | this.forwardPath(c , option); 114 | pathOptions.add(option.copy()); 115 | } 116 | 117 | //返回集合中的最优方案 118 | //return pathOptions.first(); 119 | /*Iterator it=pathOptions.iterator(); 120 | 121 | while(it.hasNext()) 122 | { 123 | System.out.println(it.next().toString()); 124 | }*/ 125 | 126 | return pathOptions.last(); 127 | 128 | } 129 | 130 | private LexemePath judge(AnalyzeContext context,QuickSortSet.Cell lexemeCell , int fullTextLength){ 131 | //候选路径集合 132 | TreeSet pathOptions = new TreeSet(); 133 | 134 | //候选结果路径 135 | 136 | LexemePath option = new LexemePath(context.getSegmentBuff(),lexemeCell.getLexeme().getBegin(),fullTextLength); 137 | 138 | //对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈 139 | Stack lexemeStack = this.forwardPath(lexemeCell , option); 140 | 141 | //当前词元链并非最理想的,加入候选路径集合 142 | pathOptions.add(option.copy()); //自定义拷贝函数 143 | 144 | //存在歧义词,处理 145 | QuickSortSet.Cell c = null; 146 | while(!lexemeStack.isEmpty()){ 147 | c = lexemeStack.pop(); 148 | //回滚词元链 149 | this.backPath(c.getLexeme() , option); 150 | //从歧义词位置开始,递归,生成可选方案 151 | this.forwardPath(c , option); 152 | pathOptions.add(option.copy()); 153 | } 154 | 155 | //路径添加完毕,比较接口没写好,导致返回0的被去重 156 | /* 157 | * ①、进行比较? 158 | * ②、单字比较? 159 | * ③、查找字典? 160 | * */ 161 | 162 | return pathOptions.last(); 163 | 164 | } 165 | 166 | /** 167 | * 向前遍历,添加词元,构造一个无歧义词元组合 168 | * @param LexemePath path 169 | * @return 170 | */ 171 | private Stack forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){ 172 | //发生冲突的Lexeme栈 173 | Stack conflictStack = new Stack(); 174 | QuickSortSet.Cell c = lexemeCell; 175 | //迭代遍历Lexeme链表 176 | while(c != null && c.getLexeme() != null){ 177 | if(!option.addNotCrossLexeme(c.getLexeme())){ 178 | //词元交叉,添加失败则加入lexemeStack栈 179 | conflictStack.push(c); 180 | } 181 | c = c.getNext(); 182 | } 183 | return conflictStack; 184 | } 185 | 186 | /** 187 | * 回滚词元链,直到它能够接受指定的词元 188 | * @param lexeme 189 | * @param l 190 | */ 191 | private void backPath(Lexeme l , LexemePath option){ 192 | while(option.checkCross(l)){ 193 | option.removeTail(); 194 | } 195 | 196 | } 197 | 198 | } 199 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/core/IKSegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | */ 24 | package org.wltea.analyzer.core; 25 | 26 | import java.io.IOException; 27 | import java.io.Reader; 28 | import java.util.ArrayList; 29 | import java.util.List; 30 | 31 | import org.wltea.analyzer.cfg.Configuration; 32 | import org.wltea.analyzer.cfg.DefaultConfig; 33 | import org.wltea.analyzer.dic.Dictionary; 34 | 35 | /** 36 | * IK分词器主类 37 | * 38 | */ 39 | public final class IKSegmenter { 40 | 41 | //字符窜reader 42 | private Reader input; 43 | //分词器配置项 44 | private Configuration cfg; 45 | //分词器上下文 46 | private AnalyzeContext context; 47 | //分词处理器列表 48 | private List segmenters; 49 | //分词歧义裁决器 50 | private IKArbitrator arbitrator; 51 | 52 | 53 | /** 54 | * IK分词器构造函数 55 | * @param input 56 | * @param useSmart 为true,使用智能分词策略 57 | * 58 | * 非智能分词:细粒度输出所有可能的切分结果 59 | * 智能分词: 合并数词和量词,对分词结果进行歧义判断 60 | */ 61 | 62 | public IKSegmenter(Reader input , boolean useSmart){ 63 | this.input = input; 64 | this.cfg = DefaultConfig.getInstance(); 65 | this.cfg.setUseSmart(useSmart); 66 | this.init(); 67 | } 68 | 69 | /** 70 | * IK分词器构造函数 71 | * @param input 72 | * @param cfg 使用自定义的Configuration构造分词器 73 | * 74 | */ 75 | public IKSegmenter(Reader input , Configuration cfg){ 76 | this.input = input; 77 | this.cfg = cfg; 78 | this.init(); 79 | } 80 | 81 | /** 82 | * 初始化 83 | */ 84 | private void init(){ 85 | //初始化词典单例 86 | Dictionary.initial(this.cfg); 87 | //初始化分词上下文 88 | this.context = new AnalyzeContext(this.cfg); 89 | //加载子分词器 90 | this.segmenters = this.loadSegmenters(); 91 | //加载歧义裁决器 92 | this.arbitrator = new IKArbitrator(); 93 | } 94 | 95 | 96 | /** 97 | * 初始化词典,加载子分词器实现 98 | * @return List 99 | */ 100 | private List loadSegmenters(){ 101 | List segmenters = new ArrayList(4); 102 | //处理字母的子分词器 103 | segmenters.add(new LetterSegmenter()); 104 | //处理中文数量词的子分词器 105 | segmenters.add(new CN_QuantifierSegmenter()); 106 | //处理中文词的子分词器 107 | segmenters.add(new CJKSegmenter()); 108 | return segmenters; 109 | } 110 | 111 | /** 112 | * 分词,获取下一个词元 113 | * @return Lexeme 词元对象 114 | * @throws IOException 115 | */ 116 | public synchronized Lexeme next()throws IOException{ 117 | Lexeme l = null; 118 | while((l = context.getNextLexeme()) == null ){ 119 | /* 120 | * 从reader中读取数据,填充buffer 121 | * 如果reader是分次读入buffer的,那么buffer要 进行移位处理 122 | * 移位处理上次读入的但未处理的数据 123 | */ 124 | int available = context.fillBuffer(this.input); 125 | if(available <= 0){ 126 | //reader已经读完 127 | context.reset(); 128 | return null; 129 | 130 | }else{ 131 | //初始化指针 132 | context.initCursor(); 133 | do{ 134 | //遍历子分词器 135 | for(ISegmenter segmenter : segmenters){ 136 | segmenter.analyze(context); 137 | } 138 | //字符缓冲区接近读完,需要读入新的字符 139 | if(context.needRefillBuffer()){ 140 | break; 141 | } 142 | //向前移动指针 143 | }while(context.moveCursor()); 144 | //重置子分词器,为下轮循环进行初始化 145 | for(ISegmenter segmenter : segmenters){ 146 | segmenter.reset(); 147 | } 148 | } 149 | //对分词进行歧义处理 150 | this.arbitrator.process(context, this.cfg.useSmart()); 151 | //将分词结果输出到结果集,并处理未切分的单个CJK字符 152 | context.outputToResult(); 153 | //记录本次分词的缓冲区位移 154 | context.markBufferOffset(); 155 | } 156 | return l; 157 | } 158 | 159 | /** 160 | * 重置分词器到初始状态 161 | * @param input 162 | */ 163 | public synchronized void reset(Reader input) { 164 | this.input = input; 165 | context.reset(); 166 | for(ISegmenter segmenter : segmenters){ 167 | segmenter.reset(); 168 | } 169 | } 170 | } 171 | 
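
> IKSegmenter above is the public entry point: each call to next() fills the character buffer, runs the three sub-segmenters, hands the raw lexemes to IKArbitrator (which only performs real disambiguation when useSmart is true), then drains the result set one Lexeme at a time. A minimal usage sketch; it assumes the dictionary resources (main2012.dic, quantifier.dic, chars.dic) are on the classpath, and the sample sentences are arbitrary:

```java
import java.io.IOException;
import java.io.StringReader;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IKSegmenterDemo {
    public static void main(String[] args) throws IOException {
        // useSmart = true merges numerals with quantifiers and applies
        // ambiguity arbitration; false emits all fine-grained candidates.
        IKSegmenter ik = new IKSegmenter(new StringReader("这是一个中文分词的例子"), true);
        Lexeme lexeme;
        while ((lexeme = ik.next()) != null) {
            System.out.println(lexeme.getBeginPosition() + "-" + lexeme.getEndPosition()
                    + " : " + lexeme.getLexemeText()
                    + " : " + lexeme.getLexemeTypeString());
        }
        // reset() lets the same (synchronized) instance tokenize another Reader.
        ik.reset(new StringReader("另一段文本"));
    }
}
```
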
-------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/core/ISegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | 28 | /** 29 | * 30 | * 子分词器接口 31 | */ 32 | interface ISegmenter { 33 | 34 | /** 35 | * 从分析器读取下一个可能分解的词元对象 36 | * @param context 分词算法上下文 37 | */ 38 | void analyze(AnalyzeContext context); 39 | 40 | 41 | /** 42 | * 重置子分析器状态 43 | */ 44 | void reset(); 45 | 46 | } 47 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/core/LetterSegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import java.util.Arrays; 28 | 29 | /** 30 | * 31 | * 英文字符及阿拉伯数字子分词器 32 | */ 33 | class LetterSegmenter implements ISegmenter { 34 | 35 | //子分词器标签 36 | static final String SEGMENTER_NAME = "LETTER_SEGMENTER"; 37 | //链接符号 38 | private static final char[] Letter_Connector = new char[]{'#' , '&' , '+' , '-' , '.' 
, '@' , '_'}; 39 | 40 | //数字符号 41 | private static final char[] Num_Connector = new char[]{',' , '.'}; 42 | 43 | /* 44 | * 词元的开始位置, 45 | * 同时作为子分词器状态标识 46 | * 当start > -1 时,标识当前的分词器正在处理字符 47 | */ 48 | private int start; 49 | /* 50 | * 记录词元结束位置 51 | * end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置 52 | */ 53 | private int end; 54 | 55 | /* 56 | * 字母起始位置 57 | */ 58 | private int englishStart; 59 | 60 | /* 61 | * 字母结束位置 62 | */ 63 | private int englishEnd; 64 | 65 | /* 66 | * 阿拉伯数字起始位置 67 | */ 68 | private int arabicStart; 69 | 70 | /* 71 | * 阿拉伯数字结束位置 72 | */ 73 | private int arabicEnd; 74 | 75 | LetterSegmenter(){ 76 | Arrays.sort(Letter_Connector); 77 | Arrays.sort(Num_Connector); 78 | this.start = -1; 79 | this.end = -1; 80 | this.englishStart = -1; 81 | this.englishEnd = -1; 82 | this.arabicStart = -1; 83 | this.arabicEnd = -1; 84 | } 85 | 86 | 87 | /* (non-Javadoc) 88 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext) 89 | */ 90 | public void analyze(AnalyzeContext context) { 91 | boolean bufferLockFlag = false; 92 | //处理英文字母 93 | bufferLockFlag = this.processEnglishLetter(context) || bufferLockFlag; 94 | //处理阿拉伯字母 95 | bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag; 96 | //处理混合字母(这个要放最后处理,可以通过QuickSortSet排除重复) 97 | bufferLockFlag = this.processMixLetter(context) || bufferLockFlag; 98 | 99 | //判断是否锁定缓冲区 100 | if(bufferLockFlag){ 101 | context.lockBuffer(SEGMENTER_NAME); 102 | }else{ 103 | //对缓冲区解锁 104 | context.unlockBuffer(SEGMENTER_NAME); 105 | } 106 | } 107 | 108 | /* (non-Javadoc) 109 | * @see org.wltea.analyzer.core.ISegmenter#reset() 110 | */ 111 | public void reset() { 112 | this.start = -1; 113 | this.end = -1; 114 | this.englishStart = -1; 115 | this.englishEnd = -1; 116 | this.arabicStart = -1; 117 | this.arabicEnd = -1; 118 | } 119 | 120 | /** 121 | * 处理数字字母混合输出 122 | * 如:windos2000 | linliangyi2005@gmail.com 123 | * @param input 124 | * @param context 125 | * @return 126 | */ 127 | private boolean processMixLetter(AnalyzeContext context){ 128 | boolean needLock = false; 129 | 130 | if(this.start == -1){//当前的分词器尚未开始处理字符 131 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType() 132 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 133 | //记录起始指针的位置,标明分词器进入处理状态 134 | this.start = context.getCursor(); 135 | this.end = start; 136 | } 137 | 138 | }else{//当前的分词器正在处理字符 139 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType() 140 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 141 | //记录下可能的结束位置 142 | this.end = context.getCursor(); 143 | 144 | }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType() 145 | && this.isLetterConnector(context.getCurrentChar())){ 146 | //记录下可能的结束位置 147 | this.end = context.getCursor(); 148 | }else{ 149 | //遇到非Letter字符,输出词元 150 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.start , this.end - this.start + 1 , Lexeme.TYPE_LETTER); 151 | context.addLexeme(newLexeme); 152 | this.start = -1; 153 | this.end = -1; 154 | } 155 | } 156 | 157 | //判断缓冲区是否已经读完 158 | if(context.isBufferConsumed()){ 159 | if(this.start != -1 && this.end != -1){ 160 | //缓冲以读完,输出词元 161 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.start , this.end - this.start + 1 , Lexeme.TYPE_LETTER); 162 | context.addLexeme(newLexeme); 163 | this.start = -1; 164 | this.end = -1; 165 | } 166 | } 167 | 168 | //判断是否锁定缓冲区 169 | if(this.start == -1 && this.end == -1){ 170 | //对缓冲区解锁 171 | needLock = false; 172 | }else{ 173 | needLock = true; 
174 | } 175 | return needLock; 176 | } 177 | 178 | /** 179 | * 处理纯英文字母输出 180 | * @param context 181 | * @return 182 | */ 183 | private boolean processEnglishLetter(AnalyzeContext context){ 184 | boolean needLock = false; 185 | 186 | if(this.englishStart == -1){//当前的分词器尚未开始处理英文字符 187 | if(CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 188 | //记录起始指针的位置,标明分词器进入处理状态 189 | this.englishStart = context.getCursor(); 190 | this.englishEnd = this.englishStart; 191 | } 192 | }else {//当前的分词器正在处理英文字符 193 | if(CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 194 | //记录当前指针位置为结束位置 195 | this.englishEnd = context.getCursor(); 196 | }else{ 197 | //遇到非English字符,输出词元 198 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.englishStart , this.englishEnd - this.englishStart + 1 , Lexeme.TYPE_ENGLISH); 199 | context.addLexeme(newLexeme); 200 | this.englishStart = -1; 201 | this.englishEnd= -1; 202 | } 203 | } 204 | 205 | //判断缓冲区是否已经读完 206 | if(context.isBufferConsumed()){ 207 | if(this.englishStart != -1 && this.englishEnd != -1){ 208 | //缓冲以读完,输出词元 209 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.englishStart , this.englishEnd - this.englishStart + 1 , Lexeme.TYPE_ENGLISH); 210 | context.addLexeme(newLexeme); 211 | this.englishStart = -1; 212 | this.englishEnd= -1; 213 | } 214 | } 215 | 216 | //判断是否锁定缓冲区 217 | if(this.englishStart == -1 && this.englishEnd == -1){ 218 | //对缓冲区解锁 219 | needLock = false; 220 | }else{ 221 | needLock = true; 222 | } 223 | return needLock; 224 | } 225 | 226 | /** 227 | * 处理阿拉伯数字输出 228 | * @param context 229 | * @return 230 | */ 231 | private boolean processArabicLetter(AnalyzeContext context){ 232 | boolean needLock = false; 233 | 234 | if(this.arabicStart == -1){//当前的分词器尚未开始处理数字字符 235 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()){ 236 | //记录起始指针的位置,标明分词器进入处理状态 237 | this.arabicStart = context.getCursor(); 238 | this.arabicEnd = this.arabicStart; 239 | } 240 | }else {//当前的分词器正在处理数字字符 241 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()){ 242 | //记录当前指针位置为结束位置 243 | this.arabicEnd = context.getCursor(); 244 | }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType() 245 | && this.isNumConnector(context.getCurrentChar())){ 246 | //不输出数字,但不标记结束 247 | }else{ 248 | ////遇到非Arabic字符,输出词元 249 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.arabicStart , this.arabicEnd - this.arabicStart + 1 , Lexeme.TYPE_ARABIC); 250 | context.addLexeme(newLexeme); 251 | this.arabicStart = -1; 252 | this.arabicEnd = -1; 253 | } 254 | } 255 | 256 | //判断缓冲区是否已经读完 257 | if(context.isBufferConsumed()){ 258 | if(this.arabicStart != -1 && this.arabicEnd != -1){ 259 | //生成已切分的词元 260 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.arabicStart , this.arabicEnd - this.arabicStart + 1 , Lexeme.TYPE_ARABIC); 261 | context.addLexeme(newLexeme); 262 | this.arabicStart = -1; 263 | this.arabicEnd = -1; 264 | } 265 | } 266 | 267 | //判断是否锁定缓冲区 268 | if(this.arabicStart == -1 && this.arabicEnd == -1){ 269 | //对缓冲区解锁 270 | needLock = false; 271 | }else{ 272 | needLock = true; 273 | } 274 | return needLock; 275 | } 276 | 277 | /** 278 | * 判断是否是字母连接符号 279 | * @param input 280 | * @return 281 | */ 282 | private boolean isLetterConnector(char input){ 283 | int index = Arrays.binarySearch(Letter_Connector, input); 284 | return index >= 0; 285 | } 286 | 287 | /** 288 | * 判断是否是数字连接符号 289 | * @param input 290 | * @return 291 | */ 292 | private boolean isNumConnector(char input){ 293 | int index = 
Arrays.binarySearch(Num_Connector, input); 294 | return index >= 0; 295 | } 296 | } 297 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/core/Lexeme.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | /** 28 | * IK词元对象 29 | */ 30 | public class Lexeme implements Comparable{ 31 | //lexemeType常量 32 | //未知 33 | public static final int TYPE_UNKNOWN = 0; 34 | //英文 35 | public static final int TYPE_ENGLISH = 1; 36 | //数字 37 | public static final int TYPE_ARABIC = 2; 38 | //英文数字混合 39 | public static final int TYPE_LETTER = 3; 40 | //中文词元 41 | public static final int TYPE_CNWORD = 4; 42 | //中文单字 43 | public static final int TYPE_CNCHAR = 64; 44 | //日韩文字 45 | public static final int TYPE_OTHER_CJK = 8; 46 | //中文数词 47 | public static final int TYPE_CNUM = 16; 48 | //中文量词 49 | public static final int TYPE_COUNT = 32; 50 | //中文数量词 51 | public static final int TYPE_CQUAN = 48; 52 | 53 | //词元的起始位移 54 | private int offset; 55 | //词元的相对起始位置 56 | private int begin; 57 | //词元的长度 58 | private int length; 59 | //词元文本 60 | private String lexemeText; 61 | //词元类型 62 | private int lexemeType; 63 | 64 | 65 | public Lexeme(int offset , int begin , int length , int lexemeType){ 66 | this.offset = offset; 67 | this.begin = begin; 68 | if(length < 0){ 69 | throw new IllegalArgumentException("length < 0"); 70 | } 71 | this.length = length; 72 | this.lexemeType = lexemeType; 73 | } 74 | 75 | /* 76 | * 判断词元相等算法 77 | * 起始位置偏移、起始位置、终止位置相同 78 | * @see java.lang.Object#equals(Object o) 79 | */ 80 | public boolean equals(Object o){ 81 | if(o == null){ 82 | return false; 83 | } 84 | 85 | if(this == o){ 86 | return true; 87 | } 88 | 89 | if(o instanceof Lexeme){ 90 | Lexeme other = (Lexeme)o; 91 | if(this.offset == other.getOffset() 92 | && this.begin == other.getBegin() 93 | && this.length == other.getLength()){ 94 | return true; 95 | }else{ 96 | return false; 97 | } 98 | }else{ 99 | return false; 100 | } 101 | } 102 | 103 | /* 104 | * 词元哈希编码算法 105 | * @see java.lang.Object#hashCode() 106 | */ 107 | public int hashCode(){ 108 | int absBegin = getBeginPosition(); 109 | int absEnd = getEndPosition(); 110 | return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11; 111 | } 112 | 113 | /* 114 | * 词元在排序集合中的比较算法 115 | * @see java.lang.Comparable#compareTo(java.lang.Object) 116 | */ 117 | public int 
compareTo(Lexeme other) { 118 | //起始位置优先 119 | if(this.begin < other.getBegin()){ 120 | return -1; 121 | }else if(this.begin == other.getBegin()){ 122 | //词元长度优先 123 | if(this.length > other.getLength()){ 124 | return -1; 125 | }else if(this.length == other.getLength()){ 126 | return 0; 127 | }else {//this.length < other.getLength() 128 | return 1; 129 | } 130 | 131 | }else{//this.begin > other.getBegin() 132 | return 1; 133 | } 134 | } 135 | 136 | public int getOffset() { 137 | return offset; 138 | } 139 | 140 | public void setOffset(int offset) { 141 | this.offset = offset; 142 | } 143 | 144 | public int getBegin() { 145 | return begin; 146 | } 147 | /** 148 | * 获取词元在文本中的起始位置 149 | * @return int 150 | */ 151 | public int getBeginPosition(){ 152 | return offset + begin; 153 | } 154 | 155 | public void setBegin(int begin) { 156 | this.begin = begin; 157 | } 158 | 159 | /** 160 | * 获取词元在文本中的结束位置 161 | * @return int 162 | */ 163 | public int getEndPosition(){ 164 | return offset + begin + length; 165 | } 166 | 167 | /** 168 | * 获取词元的字符长度 169 | * @return int 170 | */ 171 | public int getLength(){ 172 | return this.length; 173 | } 174 | 175 | public void setLength(int length) { 176 | if(this.length < 0){ 177 | throw new IllegalArgumentException("length < 0"); 178 | } 179 | this.length = length; 180 | } 181 | 182 | /** 183 | * 获取词元的文本内容 184 | * @return String 185 | */ 186 | public String getLexemeText() { 187 | if(lexemeText == null){ 188 | return ""; 189 | } 190 | return lexemeText; 191 | } 192 | 193 | public void setLexemeText(String lexemeText) { 194 | if(lexemeText == null){ 195 | this.lexemeText = ""; 196 | this.length = 0; 197 | }else{ 198 | this.lexemeText = lexemeText; 199 | this.length = lexemeText.length(); 200 | } 201 | } 202 | 203 | /** 204 | * 获取词元类型 205 | * @return int 206 | */ 207 | public int getLexemeType() { 208 | return lexemeType; 209 | } 210 | 211 | /** 212 | * 获取词元类型标示字符串 213 | * @return String 214 | */ 215 | public String getLexemeTypeString(){ 216 | switch(lexemeType) { 217 | 218 | case TYPE_ENGLISH : 219 | return "ENGLISH"; 220 | 221 | case TYPE_ARABIC : 222 | return "ARABIC"; 223 | 224 | case TYPE_LETTER : 225 | return "LETTER"; 226 | 227 | case TYPE_CNWORD : 228 | return "CN_WORD"; 229 | 230 | case TYPE_CNCHAR : 231 | return "CN_CHAR"; 232 | 233 | case TYPE_OTHER_CJK : 234 | return "OTHER_CJK"; 235 | 236 | case TYPE_COUNT : 237 | return "COUNT"; 238 | 239 | case TYPE_CNUM : 240 | return "TYPE_CNUM"; 241 | 242 | case TYPE_CQUAN: 243 | return "TYPE_CQUAN"; 244 | 245 | default : 246 | return "UNKONW"; 247 | } 248 | } 249 | 250 | 251 | public void setLexemeType(int lexemeType) { 252 | this.lexemeType = lexemeType; 253 | } 254 | 255 | /** 256 | * 合并两个相邻的词元 257 | * @param l 258 | * @param lexemeType 259 | * @return boolean 词元是否成功合并 260 | */ 261 | public boolean append(Lexeme l , int lexemeType){ 262 | if(l != null && this.getEndPosition() == l.getBeginPosition()){ 263 | this.length += l.getLength(); 264 | this.lexemeType = lexemeType; 265 | return true; 266 | }else { 267 | return false; 268 | } 269 | } 270 | 271 | 272 | /** 273 | * 274 | */ 275 | public String toString(){ 276 | StringBuffer strbuf = new StringBuffer(); 277 | strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition()); 278 | strbuf.append(" : ").append(this.lexemeText).append(" : \t"); 279 | strbuf.append(this.getLexemeTypeString()); 280 | return strbuf.toString(); 281 | } 282 | 283 | 284 | } 285 | -------------------------------------------------------------------------------- 
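
> The Lexeme class just above is the unit that every collection in this package orders and de-duplicates: compareTo() sorts by relative begin position and, on a tie, puts the longer lexeme first, while equals()/hashCode() look only at offset, begin and length. A small sketch with made-up positions:

```java
import org.wltea.analyzer.core.Lexeme;

public class LexemeOrderDemo {
    public static void main(String[] args) {
        Lexeme a = new Lexeme(0, 0, 2, Lexeme.TYPE_CNWORD); // covers chars 0-1
        Lexeme b = new Lexeme(0, 0, 4, Lexeme.TYPE_CNWORD); // same start, longer (chars 0-3)
        Lexeme c = new Lexeme(0, 3, 1, Lexeme.TYPE_CNCHAR); // starts later

        System.out.println(a.compareTo(b)); //  1 : same begin, the longer lexeme sorts first
        System.out.println(b.compareTo(a)); // -1
        System.out.println(a.compareTo(c)); // -1 : earlier begin sorts first

        // The lexeme type is ignored by equals(): two lexemes covering the
        // same span are treated as duplicates by QuickSortSet / LexemePath.
        System.out.println(a.equals(new Lexeme(0, 0, 2, Lexeme.TYPE_CNCHAR))); // true
    }
}
```
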
/mlcsseg-ik/src/main/java/org/wltea/analyzer/core/LexemePath.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import org.wltea.analyzer.dic.Dictionary; 28 | 29 | 30 | /** 31 | * Lexeme链(路径) 32 | */ 33 | class LexemePath extends QuickSortSet implements Comparable{ 34 | 35 | //起始位置 36 | private int pathBegin; 37 | //结束 38 | private int pathEnd; 39 | //词元链的有效字符长度 40 | private int payloadLength; 41 | 42 | private char[] sentenceContent; //原始输入内容 43 | private int absBegin; //交集的绝对起始处----区别于词元 44 | private int absLength; //交集的绝对长度 45 | 46 | private float _result=-1.0f; //存储返回量化后的结果 47 | 48 | LexemePath(){ 49 | this.pathBegin = -1; 50 | this.pathEnd = -1; 51 | this.payloadLength = 0; 52 | } 53 | 54 | LexemePath(char[] context,int absBegin ,int fullTextLength) 55 | { 56 | this.pathBegin = -1; 57 | this.pathEnd = -1; 58 | this.payloadLength = 0; 59 | //System.arraycopy(context, 0,sentenceContent, 0, 100); 60 | this.sentenceContent = context; 61 | this.absBegin = absBegin; 62 | this.absLength = fullTextLength; 63 | } 64 | /** 65 | * 向LexemePath追加相交的Lexeme 66 | * @param lexeme 67 | * @return 68 | */ 69 | boolean addCrossLexeme(Lexeme lexeme){ 70 | if(this.isEmpty()){ 71 | this.addLexeme(lexeme); 72 | this.pathBegin = lexeme.getBegin(); 73 | this.pathEnd = lexeme.getBegin() + lexeme.getLength(); 74 | this.payloadLength += lexeme.getLength(); 75 | return true; 76 | 77 | }else if(this.checkCross(lexeme)){ 78 | this.addLexeme(lexeme); 79 | if(lexeme.getBegin() + lexeme.getLength() > this.pathEnd){ 80 | this.pathEnd = lexeme.getBegin() + lexeme.getLength(); 81 | } 82 | this.payloadLength = this.pathEnd - this.pathBegin; //此处payloadLength,交集处不算?end减原来的begin 83 | return true; 84 | 85 | }else{ 86 | return false; 87 | 88 | } 89 | } 90 | 91 | /** 92 | * 向LexemePath追加不相交的Lexeme 93 | * @param lexeme 94 | * @return 95 | */ 96 | boolean addNotCrossLexeme(Lexeme lexeme){ 97 | if(this.isEmpty()){ 98 | this.addLexeme(lexeme); 99 | this.pathBegin = lexeme.getBegin(); 100 | this.pathEnd = lexeme.getBegin() + lexeme.getLength(); 101 | this.payloadLength += lexeme.getLength(); 102 | return true; 103 | 104 | }else if(this.checkCross(lexeme)){ 105 | return false; 106 | 107 | }else{ 108 | this.addLexeme(lexeme); 109 | this.payloadLength += lexeme.getLength(); 110 | Lexeme head = this.peekFirst(); 111 | this.pathBegin = head.getBegin(); 112 | Lexeme tail = this.peekLast(); 113 | this.pathEnd = 
tail.getBegin() + tail.getLength(); 114 | return true; 115 | 116 | } 117 | } 118 | 119 | /** 120 | * 移除尾部的Lexeme 121 | * @return 122 | */ 123 | Lexeme removeTail(){ 124 | Lexeme tail = this.pollLast(); 125 | if(this.isEmpty()){ 126 | this.pathBegin = -1; 127 | this.pathEnd = -1; 128 | this.payloadLength = 0; 129 | }else{ 130 | this.payloadLength -= tail.getLength(); 131 | Lexeme newTail = this.peekLast(); 132 | this.pathEnd = newTail.getBegin() + newTail.getLength(); 133 | } 134 | return tail; 135 | } 136 | 137 | /** 138 | * 检测词元位置交叉(有歧义的切分) 139 | * @param lexeme 140 | * @return 141 | */ 142 | boolean checkCross(Lexeme lexeme){ 143 | return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd) 144 | || (this.pathBegin >= lexeme.getBegin() && this.pathBegin < lexeme.getBegin()+ lexeme.getLength()); 145 | } 146 | 147 | int getPathBegin() { 148 | return pathBegin; 149 | } 150 | 151 | int getPathEnd() { 152 | return pathEnd; 153 | } 154 | 155 | /** 156 | * 获取Path的有效词长 157 | * @return 158 | */ 159 | int getPayloadLength(){ 160 | return this.payloadLength; 161 | } 162 | 163 | /** 164 | * 获取LexemePath的路径长度 165 | * @return 166 | */ 167 | int getPathLength(){ 168 | return this.pathEnd - this.pathBegin; 169 | } 170 | 171 | 172 | /** 173 | * X权重(词元长度积),长度越平均,值越大 174 | * @return 175 | */ 176 | int getXWeight(){ 177 | int product = 1; 178 | Cell c = this.getHead(); 179 | while( c != null && c.getLexeme() != null){ 180 | product *= c.getLexeme().getLength(); 181 | c = c.getNext(); 182 | } 183 | return product; 184 | } 185 | 186 | /** 187 | * 词元位置权重,切分结果词元越多,值为大 188 | * @return 189 | */ 190 | int getPWeight(){ 191 | int pWeight = 0; 192 | int p = 0; 193 | Cell c = this.getHead(); 194 | while( c != null && c.getLexeme() != null){ 195 | p++; 196 | //pWeight += c.getLexeme().getBegin() * c.getLexeme().getLength(); 197 | pWeight += p * c.getLexeme().getLength(); 198 | c = c.getNext(); 199 | } 200 | return pWeight; 201 | } 202 | 203 | LexemePath copy(){ 204 | LexemePath theCopy = new LexemePath(); 205 | theCopy.pathBegin = this.pathBegin; 206 | theCopy.pathEnd = this.pathEnd; 207 | theCopy.payloadLength = this.payloadLength; 208 | 209 | theCopy.sentenceContent = this.sentenceContent; 210 | theCopy.absBegin = this.absBegin; 211 | theCopy.absLength = this.absLength; 212 | 213 | Cell c = this.getHead(); 214 | while( c != null && c.getLexeme() != null){ 215 | theCopy.addLexeme(c.getLexeme()); 216 | c = c.getNext(); 217 | } 218 | return theCopy; 219 | } 220 | 221 | public int compareTo(LexemePath o) { 222 | float nowResult,OriginResult; 223 | nowResult = this.calcResult(); 224 | OriginResult = o.calcResult(); 225 | 226 | if( nowResult > OriginResult ) 227 | { 228 | return 1; 229 | } 230 | else if(nowResult < OriginResult) 231 | { 232 | return -1; 233 | } 234 | else 235 | { 236 | if(this.pathEnd > o.pathEnd) 237 | { 238 | return 1; 239 | } 240 | else if(pathEnd < o.pathEnd) 241 | { 242 | return -1; 243 | } 244 | } 245 | return 0; 246 | } 247 | 248 | private float calcResult(){ 249 | if(_result == -1.0f) //未被计算过 250 | { 251 | _result= (this.payloadLength*10) + (this.size()*(-5)) + this.getPathLength()+this.getXWeight()+this.getPWeight(); 252 | 253 | /*存在单字 254 | *①、判断单字的个数,进行单字定位,用于获取 255 | *②、在单字字典进行查找,是否存在,取其概率值 256 | * */ 257 | if(this.payloadLength < this.absLength) //存在单字 258 | { 259 | int curPoint; 260 | Cell head = this.getHead(); 261 | curPoint = this.absBegin; //从路径绝对起始处开始扫描 262 | float sumFreq=0; 263 | char singleChar=0; 264 | while(head != null){ 265 | while(curPoint 0){//词元接入链表头部 66 | 
this.head.prev = newCell; 67 | newCell.next = this.head; 68 | this.head = newCell; 69 | this.size++; 70 | return true; 71 | 72 | }else{ 73 | //从尾部上逆 74 | Cell index = this.tail; 75 | while(index != null && index.compareTo(newCell) > 0){ 76 | index = index.prev; 77 | } 78 | if(index.compareTo(newCell) == 0){//词元与集合中的词元重复,不放入集合 79 | return false; 80 | 81 | }else if(index.compareTo(newCell) < 0){//词元插入链表中的某个位置 82 | newCell.prev = index; 83 | newCell.next = index.next; 84 | index.next.prev = newCell; 85 | index.next = newCell; 86 | this.size++; 87 | return true; 88 | } 89 | } 90 | } 91 | return false; 92 | } 93 | 94 | /** 95 | * 返回链表头部元素 96 | * @return 97 | */ 98 | Lexeme peekFirst(){ 99 | if(this.head != null){ 100 | return this.head.lexeme; 101 | } 102 | return null; 103 | } 104 | 105 | /** 106 | * 取出链表集合的第一个元素 107 | * @return Lexeme 108 | */ 109 | Lexeme pollFirst(){ 110 | if(this.size == 1){ 111 | Lexeme first = this.head.lexeme; 112 | this.head = null; 113 | this.tail = null; 114 | this.size--; 115 | return first; 116 | }else if(this.size > 1){ 117 | Lexeme first = this.head.lexeme; 118 | this.head = this.head.next; 119 | this.size --; 120 | return first; 121 | }else{ 122 | return null; 123 | } 124 | } 125 | 126 | /** 127 | * 返回链表尾部元素 128 | * @return 129 | */ 130 | Lexeme peekLast(){ 131 | if(this.tail != null){ 132 | return this.tail.lexeme; 133 | } 134 | return null; 135 | } 136 | 137 | /** 138 | * 取出链表集合的最后一个元素 139 | * @return Lexeme 140 | */ 141 | Lexeme pollLast(){ 142 | if(this.size == 1){ 143 | Lexeme last = this.head.lexeme; 144 | this.head = null; 145 | this.tail = null; 146 | this.size--; 147 | return last; 148 | 149 | }else if(this.size > 1){ 150 | Lexeme last = this.tail.lexeme; 151 | this.tail = this.tail.prev; 152 | this.size--; 153 | return last; 154 | 155 | }else{ 156 | return null; 157 | } 158 | } 159 | 160 | /** 161 | * 返回集合大小 162 | * @return 163 | */ 164 | int size(){ 165 | return this.size; 166 | } 167 | 168 | /** 169 | * 判断集合是否为空 170 | * @return 171 | */ 172 | boolean isEmpty(){ 173 | return this.size == 0; 174 | } 175 | 176 | /** 177 | * 返回lexeme链的头部 178 | * @return 179 | */ 180 | Cell getHead(){ 181 | return this.head; 182 | } 183 | 184 | /** 185 | * 186 | * IK 中文分词 版本 5.0 187 | * IK Analyzer release 5.0 188 | * 189 | * Licensed to the Apache Software Foundation (ASF) under one or more 190 | * contributor license agreements. See the NOTICE file distributed with 191 | * this work for additional information regarding copyright ownership. 192 | * The ASF licenses this file to You under the Apache License, Version 2.0 193 | * (the "License"); you may not use this file except in compliance with 194 | * the License. You may obtain a copy of the License at 195 | * 196 | * http://www.apache.org/licenses/LICENSE-2.0 197 | * 198 | * Unless required by applicable law or agreed to in writing, software 199 | * distributed under the License is distributed on an "AS IS" BASIS, 200 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | * See the License for the specific language governing permissions and 202 | * limitations under the License. 
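
> QuickSortSet (and LexemePath, which extends it) keeps lexemes in a doubly linked list that stays sorted by Lexeme.compareTo() and drops exact duplicates on insert. A sketch of that behaviour through LexemePath; it assumes compilation into org.wltea.analyzer.core, since both classes are package-private, and the positions are made up:

```java
package org.wltea.analyzer.core;

// Hypothetical demo class; exercises the ordered, de-duplicating insert
// that LexemePath inherits from QuickSortSet.
public class QuickSortSetDemo {
    public static void main(String[] args) {
        LexemePath path = new LexemePath();

        path.addLexeme(new Lexeme(0, 3, 2, Lexeme.TYPE_CNWORD)); // inserted first, starts at position 3
        path.addLexeme(new Lexeme(0, 0, 2, Lexeme.TYPE_CNWORD)); // smaller begin, becomes the new head

        // Same offset/begin/length as the first lexeme -> compareTo() == 0,
        // so addLexeme() rejects it and the size stays at 2.
        boolean added = path.addLexeme(new Lexeme(0, 3, 2, Lexeme.TYPE_CNCHAR));

        System.out.println(added);                        // false
        System.out.println(path.size());                  // 2
        System.out.println(path.peekFirst().getBegin());  // 0
        System.out.println(path.peekLast().getBegin());   // 3
    }
}
```
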
203 | * 204 | * 源代码由林良益(linliangyi2005@gmail.com)提供 205 | * 版权声明 2012,乌龙茶工作室 206 | * provided by Linliangyi and copyright 2012 by Oolong studio 207 | * 208 | * QuickSortSet集合单元 209 | * 210 | */ 211 | class Cell implements Comparable{ 212 | private Cell prev; 213 | private Cell next; 214 | private Lexeme lexeme; 215 | 216 | Cell(Lexeme lexeme){ 217 | if(lexeme == null){ 218 | throw new IllegalArgumentException("lexeme must not be null"); 219 | } 220 | this.lexeme = lexeme; 221 | } 222 | 223 | public int compareTo(Cell o) { 224 | return this.lexeme.compareTo(o.lexeme); 225 | } 226 | 227 | public Cell getPrev(){ 228 | return this.prev; 229 | } 230 | 231 | public Cell getNext(){ 232 | return this.next; 233 | } 234 | 235 | public Lexeme getLexeme(){ 236 | return this.lexeme; 237 | } 238 | } 239 | } 240 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/dic/DictCharNode.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.dic; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | public class DictCharNode { 7 | private static final Map charMap = new HashMap(1024,0.8f); 8 | 9 | void addChar(Character key,Float logFreq) 10 | { 11 | charMap.put(key, logFreq); 12 | //(int)(Math.log(Integer.parseInt(w[1]))*100),默认给0 13 | } 14 | 15 | float getCharFreq(Character singleChar) 16 | { 17 | float freq=-2.0f; //非单字,则表示该路径切分存在某些问题 18 | if(charMap.containsKey(singleChar)) //如果存在 19 | { 20 | freq = charMap.get(singleChar); 21 | } 22 | return freq; 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/dic/DictSegment.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * IK 中文分词 版本 5.0 4 | * IK Analyzer release 5.0 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one or more 7 | * contributor license agreements. See the NOTICE file distributed with 8 | * this work for additional information regarding copyright ownership. 9 | * The ASF licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 
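
> DictCharNode above is the single-character frequency table used when a candidate split leaves stray single characters: Dictionary stores log10(count + 5) per character from chars.dic, and lookups of characters not in the table fall back to -2.0f (the code's own comment treats that as a sign of a questionable split). A sketch; it assumes compilation into org.wltea.analyzer.dic (addChar/getCharFreq are package-private) and uses a made-up count:

```java
package org.wltea.analyzer.dic;

// Hypothetical demo class; mirrors what Dictionary.loadCharFreqDict() does
// with one "<char> <count>" line from chars.dic.
public class DictCharNodeDemo {
    public static void main(String[] args) {
        DictCharNode charFreq = new DictCharNode();

        charFreq.addChar('的', (float) Math.log10(100000 + 5)); // the count 100000 is made up

        System.out.println(charFreq.getCharFreq('的')); // ~5.0
        System.out.println(charFreq.getCharFreq('犇')); // -2.0, not in the table
    }
}
```
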
20 | * 21 | * 源代码由林良益(linliangyi2005@gmail.com)提供 22 | * 版权声明 2012,乌龙茶工作室 23 | * provided by Linliangyi and copyright 2012 by Oolong studio 24 | * 25 | */ 26 | package org.wltea.analyzer.dic; 27 | 28 | import java.util.Arrays; 29 | import java.util.HashMap; 30 | import java.util.Map; 31 | 32 | /** 33 | * 词典树分段,表示词典树的一个分枝 34 | */ 35 | class DictSegment implements Comparable{ 36 | 37 | //公用字典表,存储汉字 38 | //private static final Map charMap = new HashMap(16 , 0.95f); 39 | //数组大小上限 40 | private static final int ARRAY_LENGTH_LIMIT = 3; 41 | 42 | 43 | //Map存储结构 44 | private Map childrenMap; 45 | //数组方式存储结构 46 | private DictSegment[] childrenArray; 47 | 48 | 49 | //当前节点上存储的字符 50 | private Character nodeChar; 51 | //当前节点存储的Segment数目 52 | //storeSize <=ARRAY_LENGTH_LIMIT ,使用数组存储, storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储 53 | private int storeSize = 0; 54 | //当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词 55 | private int nodeState = 0; 56 | 57 | 58 | DictSegment(Character nodeChar){ 59 | if(nodeChar == null){ 60 | throw new IllegalArgumentException("参数为空异常,字符不能为空"); 61 | } 62 | this.nodeChar = nodeChar; 63 | } 64 | 65 | Character getNodeChar() { 66 | return nodeChar; 67 | } 68 | 69 | /* 70 | * 判断是否有下一个节点 71 | */ 72 | boolean hasNextNode(){ 73 | return this.storeSize > 0; 74 | } 75 | 76 | /** 77 | * 匹配词段 78 | * @param charArray 79 | * @return Hit 80 | */ 81 | Hit match(char[] charArray){ 82 | return this.match(charArray , 0 , charArray.length , null); 83 | } 84 | 85 | /** 86 | * 匹配词段 87 | * @param charArray 88 | * @param begin 89 | * @param length 90 | * @return Hit 91 | */ 92 | Hit match(char[] charArray , int begin , int length){ 93 | return this.match(charArray , begin , length , null); 94 | } 95 | 96 | /** 97 | * 匹配词段 98 | * @param charArray 99 | * @param begin 100 | * @param length 101 | * @param searchHit 102 | * @return Hit 103 | */ 104 | Hit match(char[] charArray , int begin , int length , Hit searchHit){ 105 | 106 | if(searchHit == null){ 107 | //如果hit为空,新建 108 | searchHit= new Hit(); 109 | //设置hit的其实文本位置 110 | searchHit.setBegin(begin); 111 | }else{ 112 | //否则要将HIT状态重置 113 | searchHit.setUnmatch(); 114 | } 115 | //设置hit的当前处理位置 116 | searchHit.setEnd(begin); 117 | 118 | Character keyChar = new Character(charArray[begin]); 119 | DictSegment ds = null; 120 | 121 | //引用实例变量为本地变量,避免查询时遇到更新的同步问题 122 | DictSegment[] segmentArray = this.childrenArray; 123 | Map segmentMap = this.childrenMap; 124 | 125 | //STEP1 在节点中查找keyChar对应的DictSegment 126 | if(segmentArray != null){ 127 | //在数组中查找 128 | DictSegment keySegment = new DictSegment(keyChar); 129 | int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize , keySegment); 130 | if(position >= 0){ 131 | ds = segmentArray[position]; 132 | } 133 | 134 | }else if(segmentMap != null){ 135 | //在map中查找 136 | ds = (DictSegment)segmentMap.get(keyChar); 137 | } 138 | 139 | //STEP2 找到DictSegment,判断词的匹配状态,是否继续递归,还是返回结果 140 | if(ds != null){ 141 | if(length > 1){ 142 | //词未匹配完,继续往下搜索 143 | return ds.match(charArray, begin + 1 , length - 1 , searchHit); 144 | }else if (length == 1){ 145 | 146 | //搜索最后一个char 147 | if(ds.nodeState == 1){ 148 | //添加HIT状态为完全匹配 149 | searchHit.setMatch(); 150 | } 151 | if(ds.hasNextNode()){ 152 | //添加HIT状态为前缀匹配 153 | searchHit.setPrefix(); 154 | //记录当前位置的DictSegment 155 | searchHit.setMatchedDictSegment(ds); 156 | } 157 | return searchHit; 158 | } 159 | 160 | } 161 | //STEP3 没有找到DictSegment, 将HIT设置为不匹配 162 | return searchHit; 163 | } 164 | 165 | /** 166 | * 加载填充词典片段 167 | * @param charArray 168 | */ 169 | void fillSegment(char[] 
charArray){ 170 | this.fillSegment(charArray, 0 , charArray.length , 1); 171 | } 172 | 173 | /** 174 | * 屏蔽词典中的一个词 175 | * @param charArray 176 | */ 177 | void disableSegment(char[] charArray){ 178 | this.fillSegment(charArray, 0 , charArray.length , 0); 179 | } 180 | 181 | /** 182 | * 加载填充词典片段 183 | * @param charArray 184 | * @param begin 185 | * @param length 186 | * @param enabled 187 | */ 188 | private synchronized void fillSegment(char[] charArray , int begin , int length , int enabled){ 189 | //获取字典表中的汉字对象 190 | Character beginChar = new Character(charArray[begin]); 191 | /*Character keyChar = charMap.get(beginChar); 192 | //字典中没有该字,则将其添加入字典 193 | if(keyChar == null){ 194 | charMap.put(beginChar, beginChar); 195 | keyChar = beginChar; 196 | }*/ 197 | 198 | //搜索当前节点的存储,查询对应keyChar的keyChar,如果没有则创建 199 | DictSegment ds = lookforSegment(beginChar , enabled); 200 | if(ds != null){ 201 | //处理keyChar对应的segment 202 | if(length > 1){ 203 | //词元还没有完全加入词典树 204 | ds.fillSegment(charArray, begin + 1, length - 1 , enabled); 205 | }else if (length == 1){ 206 | //已经是词元的最后一个char,设置当前节点状态为enabled, 207 | //enabled=1表明一个完整的词,enabled=0表示从词典中屏蔽当前词 208 | ds.nodeState = enabled; 209 | } 210 | } 211 | 212 | } 213 | 214 | /** 215 | * 查找本节点下对应的keyChar的segment * 216 | * @param keyChar 217 | * @param create =1如果没有找到,则创建新的segment ; =0如果没有找到,不创建,返回null 218 | * @return 219 | */ 220 | private DictSegment lookforSegment(Character keyChar , int create){ 221 | 222 | DictSegment ds = null; 223 | 224 | if(this.storeSize <= ARRAY_LENGTH_LIMIT){ 225 | //获取数组容器,如果数组未创建则创建数组 226 | DictSegment[] segmentArray = getChildrenArray(); 227 | //搜寻数组 228 | DictSegment keySegment = new DictSegment(keyChar); 229 | int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize, keySegment); 230 | if(position >= 0){ 231 | ds = segmentArray[position]; 232 | } 233 | 234 | //遍历数组后没有找到对应的segment 235 | if(ds == null && create == 1){ 236 | ds = keySegment; 237 | if(this.storeSize < ARRAY_LENGTH_LIMIT){ 238 | //数组容量未满,使用数组存储 239 | segmentArray[this.storeSize] = ds; 240 | //segment数目+1 241 | this.storeSize++; 242 | Arrays.sort(segmentArray , 0 , this.storeSize); 243 | 244 | }else{ 245 | //数组容量已满,切换Map存储 246 | //获取Map容器,如果Map未创建,则创建Map 247 | Map segmentMap = getChildrenMap(); 248 | //将数组中的segment迁移到Map中 249 | migrate(segmentArray , segmentMap); 250 | //存储新的segment 251 | segmentMap.put(keyChar, ds); 252 | //segment数目+1 , 必须在释放数组前执行storeSize++ , 确保极端情况下,不会取到空的数组 253 | this.storeSize++; 254 | //释放当前的数组引用 255 | this.childrenArray = null; 256 | } 257 | 258 | } 259 | 260 | }else{ 261 | //获取Map容器,如果Map未创建,则创建Map 262 | Map segmentMap = getChildrenMap(); 263 | //搜索Map 264 | ds = (DictSegment)segmentMap.get(keyChar); 265 | if(ds == null && create == 1){ 266 | //构造新的segment 267 | ds = new DictSegment(keyChar); 268 | segmentMap.put(keyChar , ds); 269 | //当前节点存储segment数目+1 270 | this.storeSize ++; 271 | } 272 | } 273 | 274 | return ds; 275 | } 276 | 277 | 278 | /** 279 | * 获取数组容器 280 | * 线程同步方法 281 | */ 282 | private DictSegment[] getChildrenArray(){ 283 | if(this.childrenArray == null){ 284 | synchronized(this){ 285 | if(this.childrenArray == null){ 286 | this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT]; 287 | } 288 | } 289 | } 290 | return this.childrenArray; 291 | } 292 | 293 | /** 294 | * 获取Map容器 295 | * 线程同步方法 296 | */ 297 | private Map getChildrenMap(){ 298 | if(this.childrenMap == null){ 299 | synchronized(this){ 300 | if(this.childrenMap == null){ 301 | this.childrenMap = new HashMap(ARRAY_LENGTH_LIMIT * 2,0.8f); 302 | } 303 | } 304 | } 
305 | return this.childrenMap; 306 | } 307 | 308 | /** 309 | * 将数组中的segment迁移到Map中 310 | * @param segmentArray 311 | */ 312 | private void migrate(DictSegment[] segmentArray , Map segmentMap){ 313 | for(DictSegment segment : segmentArray){ 314 | if(segment != null){ 315 | segmentMap.put(segment.nodeChar, segment); 316 | } 317 | } 318 | } 319 | 320 | /** 321 | * 实现Comparable接口 322 | * @param o 323 | * @return int 324 | */ 325 | public int compareTo(DictSegment o) { 326 | //对当前节点存储的char进行比较 327 | return this.nodeChar.compareTo(o.nodeChar); 328 | } 329 | 330 | } 331 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/dic/Dictionary.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 25 | */ 26 | package org.wltea.analyzer.dic; 27 | 28 | import java.io.BufferedReader; 29 | import java.io.IOException; 30 | import java.io.InputStream; 31 | import java.io.InputStreamReader; 32 | import java.util.Collection; 33 | import java.util.List; 34 | import org.wltea.analyzer.cfg.Configuration; 35 | import org.wltea.analyzer.cfg.DefaultConfig; 36 | 37 | /** 38 | * 词典管理类,单子模式 39 | */ 40 | public class Dictionary { 41 | 42 | 43 | /* 44 | * 词典单子实例 45 | */ 46 | private volatile static Dictionary singleton; 47 | 48 | /* 49 | * 主词典对象 50 | */ 51 | private DictSegment _MainDict; 52 | 53 | /* 54 | * 停止词词典 55 | */ 56 | //private DictSegment _StopWordDict; 57 | /* 58 | * 量词词典 59 | */ 60 | private DictSegment _QuantifierDict; 61 | /* 62 | * 单字带词频词典 63 | */ 64 | private DictCharNode _CharFreqDict; 65 | /* 66 | * 配置对象 67 | */ 68 | private Configuration cfg; 69 | 70 | private Dictionary(Configuration cfg){ 71 | this.cfg = cfg; 72 | //建立一个主词典实例 73 | _MainDict = new DictSegment((char)0); 74 | this.loadMainDict(_MainDict); 75 | 76 | /*_StopWordDict = new DictSegment((char)0); 77 | this.loadStopWordDict(_StopWordDict);*/ 78 | 79 | this.loadQuantifierDict(); 80 | this.loadCharFreqDict(); 81 | 82 | } 83 | 84 | /** 85 | * 词典初始化 86 | * 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化 87 | * 只有当Dictionary类被实际调用时,才会开始载入词典, 88 | * 这将延长首次分词操作的时间 89 | * 该方法提供了一个在应用加载阶段就初始化字典的手段 90 | * @return Dictionary 91 | */ 92 | public static Dictionary initial(Configuration cfg){ 93 | if(singleton == null){ 94 | synchronized(Dictionary.class){ 95 | if(singleton == null){ 96 | singleton = new Dictionary(cfg); 97 | return singleton; 98 | } 99 | } 100 | } 101 | return singleton; 102 | } 103 | 104 | /** 105 | 
* 把solr配置的字典加入到MainDic中,进行字典切换 106 | * @param inputStreamList 多字典输入流 107 | * @return 108 | */ 109 | public static synchronized Dictionary addDic2MainDic(List inputStreamList) 110 | { 111 | if(singleton == null) 112 | { 113 | Configuration cfg = DefaultConfig.getInstance(); 114 | Dictionary.initial(cfg); 115 | } 116 | 117 | DictSegment mainDicTemp = new DictSegment((char)0); 118 | 119 | System.out.println("begin load MainDict :"); 120 | singleton.loadMainDict(mainDicTemp); 121 | 122 | System.out.println("begin loadSolrMainDict by List:"); 123 | for(InputStream is : inputStreamList) 124 | { 125 | singleton.loadWords2DictSegment(is, mainDicTemp); 126 | } 127 | 128 | singleton._MainDict = mainDicTemp; 129 | System.out.println("*********************************"); 130 | System.out.println("end switch!!"); 131 | System.out.println("*********************************"); 132 | 133 | mainDicTemp = null; 134 | 135 | return singleton; 136 | } 137 | 138 | /** 139 | * 获取词典单子实例 140 | * @return Dictionary 单例对象 141 | */ 142 | public static Dictionary getSingleton(){ 143 | if(singleton == null){ 144 | throw new IllegalStateException("词典尚未初始化,请先调用initial方法"); 145 | } 146 | return singleton; 147 | } 148 | 149 | /** 150 | * 批量加载新词条 151 | * @param words Collection词条列表 152 | */ 153 | public void addWords(Collection words){ 154 | if(words != null){ 155 | for(String word : words){ 156 | if (word != null) { 157 | //批量加载词条到主内存词典中 158 | singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray()); 159 | } 160 | } 161 | } 162 | } 163 | 164 | /** 165 | * 批量移除(屏蔽)词条 166 | * @param words 167 | */ 168 | public void disableWords(Collection words){ 169 | if(words != null){ 170 | for(String word : words){ 171 | if (word != null) { 172 | //批量屏蔽词条 173 | singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray()); 174 | } 175 | } 176 | } 177 | } 178 | 179 | /** 180 | * 检索匹配主词典 181 | * @param charArray 182 | * @return Hit 匹配结果描述 183 | */ 184 | public Hit matchInMainDict(char[] charArray){ 185 | return singleton._MainDict.match(charArray); 186 | } 187 | 188 | /** 189 | * 检索匹配主词典 190 | * @param charArray 191 | * @param begin 192 | * @param length 193 | * @return Hit 匹配结果描述 194 | */ 195 | public Hit matchInMainDict(char[] charArray , int begin, int length){ 196 | return singleton._MainDict.match(charArray, begin, length); 197 | } 198 | 199 | /** 200 | * 检索匹配量词词典 201 | * @param charArray 202 | * @param begin 203 | * @param length 204 | * @return Hit 匹配结果描述 205 | */ 206 | public Hit matchInQuantifierDict(char[] charArray , int begin, int length){ 207 | return singleton._QuantifierDict.match(charArray, begin, length); 208 | } 209 | 210 | /** 211 | * 从已匹配的Hit中直接取出DictSegment,继续向下匹配 212 | * @param charArray 213 | * @param currentIndex 214 | * @param matchedHit 215 | * @return Hit 216 | */ 217 | public Hit matchWithHit(char[] charArray , int currentIndex , Hit matchedHit){ 218 | DictSegment ds = matchedHit.getMatchedDictSegment(); 219 | return ds.match(charArray, currentIndex, 1 , matchedHit); 220 | } 221 | 222 | 223 | /** 224 | * 判断是否是停止词 225 | * @param charArray 226 | * @param begin 227 | * @param length 228 | * @return boolean 229 | */ 230 | /*public boolean isStopWord(char[] charArray , int begin, int length){ 231 | return singleton._StopWordDict.match(charArray, begin, length).isMatch(); 232 | }*/ 233 | 234 | /** 235 | * 加载主词典及扩展词典 236 | */ 237 | private void loadMainDict(DictSegment dstDicSegment){ 238 | 239 | //读取主词典文件 240 | InputStream inputStream = 
this.getClass().getClassLoader().getResourceAsStream("main2012.dic"); 241 | if(inputStream == null){ 242 | throw new RuntimeException("Main Dictionary not found!!!"); 243 | } 244 | 245 | //System.out.println("test加载主字典"); 246 | this.loadWords2DictSegment(inputStream,dstDicSegment); 247 | 248 | //System.out.println("test加载扩展字典"); 249 | this.loadExtDict(dstDicSegment); 250 | 251 | } 252 | 253 | /** 254 | * 加载用户配置的扩展词典到主词库表 255 | */ 256 | private void loadExtDict(DictSegment dstDicSegment){ 257 | //加载扩展词典配置 258 | List extDictFiles = cfg.getExtDictionarys(); 259 | if(extDictFiles != null){ 260 | InputStream is = null; 261 | for(String extDictName : extDictFiles){ 262 | //读取扩展词典文件 263 | //System.out.println("加载扩展词典:" + extDictName); 264 | is = this.getClass().getClassLoader().getResourceAsStream(extDictName); 265 | //如果找不到扩展的字典,则忽略 266 | if(is == null){ 267 | continue; 268 | } 269 | loadWords2DictSegment(is,dstDicSegment); 270 | } 271 | } 272 | } 273 | 274 | /** 275 | * 276 | * @param is 字典数据输入流 277 | * @param dstDicSegment 目标字典 278 | */ 279 | 280 | private void loadWords2DictSegment(InputStream is,DictSegment dstDicSegment) { 281 | 282 | if(is != null) 283 | { 284 | try { 285 | BufferedReader br = new BufferedReader(new InputStreamReader(is, 286 | "UTF-8")); 287 | String theWord = null; 288 | do { 289 | theWord = br.readLine(); 290 | if (theWord != null ) { 291 | String line = theWord.trim(); 292 | if (!line.isEmpty() && !line.startsWith("#")){ 293 | String[] words = line.split("[\\s=,>]+"); 294 | for(String w :words) 295 | dstDicSegment.fillSegment(w.toLowerCase().toCharArray()); 296 | } 297 | } 298 | } while (theWord != null); 299 | 300 | } catch (IOException ioe) { 301 | System.err.println(" Dictionary loading exception。ClassName: " + dstDicSegment.getClass().getName()); 302 | ioe.printStackTrace(); 303 | 304 | } finally { 305 | try { 306 | if (is != null) { 307 | is.close(); 308 | is = null; 309 | } 310 | } catch (IOException e) { 311 | e.printStackTrace(); 312 | } 313 | } 314 | } 315 | } 316 | 317 | /** 318 | * 加载量词词典 319 | */ 320 | private void loadQuantifierDict(){ 321 | //建立一个量词典实例 322 | _QuantifierDict = new DictSegment((char)0); 323 | //读取量词词典文件 324 | InputStream is = this.getClass().getClassLoader().getResourceAsStream("quantifier.dic"); 325 | if(is == null){ 326 | throw new RuntimeException("Quantifier Dictionary not found!!!"); 327 | } 328 | loadWords2DictSegment(is, _QuantifierDict); 329 | } 330 | 331 | private void loadCharFreqDict(){ 332 | _CharFreqDict = new DictCharNode(); 333 | //读取量词词典文件 334 | InputStream is = this.getClass().getClassLoader().getResourceAsStream("chars.dic"); 335 | if(is == null){ 336 | throw new RuntimeException("Chars Dictionary not found!!!"); 337 | } 338 | try { //此处可以抽象出一个接口,或公用函数 339 | BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); 340 | String theWord = null; 341 | do { 342 | theWord = br.readLine(); 343 | if (theWord != null && !"".equals(theWord.trim())) { 344 | String[] w = theWord.split(" "); 345 | if(w.length == 2) 346 | { 347 | _CharFreqDict.addChar(w[0].charAt(0), (float)(Math.log10(Integer.parseInt(w[1])+5))); 348 | } 349 | /*else 350 | { 351 | _CharFreqDict.addChar(w[0].charAt(0), 0); //默认无词性该给多少权重?是否该存在(对于没词频数据)? 
352 | }*/ 353 | } 354 | } while (theWord != null); 355 | 356 | } catch (IOException ioe) { 357 | System.err.println("Chars Dictionary loading exception."); 358 | ioe.printStackTrace(); 359 | }finally{ 360 | try { 361 | if(is != null){ 362 | is.close(); 363 | is = null; 364 | } 365 | } catch (IOException e) { 366 | e.printStackTrace(); 367 | } 368 | } 369 | } 370 | 371 | public float getCharFreq(Character key) 372 | { 373 | return _CharFreqDict.getCharFreq(key); 374 | } 375 | 376 | } 377 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/dic/Hit.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * IK 中文分词 版本 5.0 4 | * IK Analyzer release 5.0 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one or more 7 | * contributor license agreements. See the NOTICE file distributed with 8 | * this work for additional information regarding copyright ownership. 9 | * The ASF licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * 21 | * 源代码由林良益(linliangyi2005@gmail.com)提供 22 | * 版权声明 2012,乌龙茶工作室 23 | * provided by Linliangyi and copyright 2012 by Oolong studio 24 | * 25 | */ 26 | package org.wltea.analyzer.dic; 27 | 28 | /** 29 | * 表示一次词典匹配的命中 30 | */ 31 | public class Hit { 32 | //Hit不匹配 33 | private static final int UNMATCH = 0x00000000; 34 | //Hit完全匹配 35 | private static final int MATCH = 0x00000001; 36 | //Hit前缀匹配 37 | private static final int PREFIX = 0x00000010; 38 | 39 | 40 | //该HIT当前状态,默认未匹配 41 | private int hitState = UNMATCH; 42 | 43 | //记录词典匹配过程中,当前匹配到的词典分支节点 44 | private DictSegment matchedDictSegment; 45 | /* 46 | * 词段开始位置 47 | */ 48 | private int begin; 49 | /* 50 | * 词段的结束位置 51 | */ 52 | private int end; 53 | 54 | 55 | /** 56 | * 判断是否完全匹配 57 | */ 58 | public boolean isMatch() { 59 | return (this.hitState & MATCH) > 0; 60 | } 61 | /** 62 | * 63 | */ 64 | public void setMatch() { 65 | this.hitState = this.hitState | MATCH; 66 | } 67 | 68 | /** 69 | * 判断是否是词的前缀 70 | */ 71 | public boolean isPrefix() { 72 | return (this.hitState & PREFIX) > 0; 73 | } 74 | /** 75 | * 76 | */ 77 | public void setPrefix() { 78 | this.hitState = this.hitState | PREFIX; 79 | } 80 | /** 81 | * 判断是否是不匹配 82 | */ 83 | public boolean isUnmatch() { 84 | return this.hitState == UNMATCH ; 85 | } 86 | /** 87 | * 88 | */ 89 | public void setUnmatch() { 90 | this.hitState = UNMATCH; 91 | } 92 | 93 | public DictSegment getMatchedDictSegment() { 94 | return matchedDictSegment; 95 | } 96 | 97 | public void setMatchedDictSegment(DictSegment matchedDictSegment) { 98 | this.matchedDictSegment = matchedDictSegment; 99 | } 100 | 101 | public int getBegin() { 102 | return begin; 103 | } 104 | 105 | public void setBegin(int begin) { 106 | this.begin = begin; 107 | } 108 | 109 | public int getEnd() { 110 | return end; 111 | } 112 | 113 | public void setEnd(int end) { 114 | this.end = end; 115 | } 116 | 117 | } 118 | 
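A minimal sketch, not part of the repository, of driving the dictionary hot-swap above by hand. Dictionary.addDic2MainDic builds a fresh trie from the bundled main2012.dic plus the supplied streams and only then assigns it to _MainDict, which is the same call IKTokenizerFactory.update() makes when Solr reloads the extension dictionaries. The file names and the sample word are assumptions borrowed from test1/conf; the list element type is InputStream, as the loop inside addDic2MainDic implies.

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;

public class DictReloadDemo {
    public static void main(String[] args) throws Exception {
        // hypothetical paths; under Solr these streams come from the core's conf directory
        List<InputStream> extDics = Arrays.<InputStream>asList(
                new FileInputStream("extDic.txt"),
                new FileInputStream("extDic1.txt"));

        // rebuild main2012.dic plus the given word lists into a new trie, then swap it in
        Dictionary dic = Dictionary.addDic2MainDic(extDics);

        // a word contained in extDic.txt should now fully match against the main dictionary
        Hit hit = dic.matchInMainDict("七匹狼".toCharArray());
        System.out.println("matched: " + hit.isMatch());
    }
}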
-------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0.1 3 | * IK Analyzer release 5.0.1 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | 25 | * 26 | */ 27 | package org.wltea.analyzer.lucene; 28 | 29 | import java.io.IOException; 30 | import java.io.Reader; 31 | 32 | import org.apache.lucene.analysis.Tokenizer; 33 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 34 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 35 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 36 | 37 | import org.wltea.analyzer.core.IKSegmenter; 38 | import org.wltea.analyzer.core.Lexeme; 39 | 40 | /** 41 | * IK分词器 Lucene Tokenizer适配器类 兼容Lucene 4.0版本 42 | */ 43 | public final class IKTokenizer extends Tokenizer { 44 | 45 | // IK分词器实现 46 | private IKSegmenter _IKImplement; 47 | 48 | // 词元文本属性 49 | private final CharTermAttribute termAtt; 50 | // 词元位移属性 51 | private final OffsetAttribute offsetAtt; 52 | // 词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量) 53 | private final TypeAttribute typeAtt; 54 | // 记录最后一个词元的结束位置 55 | private int endPosition; 56 | 57 | /** 58 | * Lucene 4.0 Tokenizer适配器类构造函数 59 | * 60 | * @param in 61 | * @param useSmart 62 | */ 63 | 64 | public IKTokenizer(Reader in, boolean useSmart) { 65 | super(in); 66 | offsetAtt = addAttribute(OffsetAttribute.class); 67 | termAtt = addAttribute(CharTermAttribute.class); 68 | typeAtt = addAttribute(TypeAttribute.class); 69 | _IKImplement = new IKSegmenter(input, useSmart); 70 | } 71 | 72 | /* 73 | * (non-Javadoc) 74 | * 75 | * @see org.apache.lucene.analysis.TokenStream#incrementToken() 76 | */ 77 | @Override 78 | public boolean incrementToken() throws IOException { 79 | // 清除所有的词元属性 80 | clearAttributes(); 81 | Lexeme nextLexeme = _IKImplement.next(); 82 | if (nextLexeme != null) { 83 | // 将Lexeme转成Attributes 84 | // 设置词元文本 85 | termAtt.append(nextLexeme.getLexemeText()); 86 | // 设置词元长度 87 | termAtt.setLength(nextLexeme.getLength()); 88 | // 设置词元位移 89 | offsetAtt.setOffset(nextLexeme.getBeginPosition(), 90 | nextLexeme.getEndPosition()); 91 | // 记录分词的最后位置 92 | endPosition = nextLexeme.getEndPosition(); 93 | // 记录词元分类 94 | typeAtt.setType(nextLexeme.getLexemeTypeString()); 95 | // 返会true告知还有下个词元 96 | return true; 97 | } 98 | // 返会false告知词元输出完毕 99 | return false; 100 | } 101 | 102 | /* 103 | * (non-Javadoc) 104 | * 105 | * @see 
org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader) 106 | */ 107 | @Override 108 | public void reset() throws IOException { 109 | super.reset(); 110 | _IKImplement.reset(input); 111 | } 112 | 113 | @Override 114 | public final void end() { 115 | // set final offset 116 | int finalOffset = correctOffset(this.endPosition); 117 | offsetAtt.setOffset(finalOffset, finalOffset); 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/java/org/wltea/analyzer/lucene/IKTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.lucene; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.Reader; 6 | import java.util.List; 7 | import java.util.Map; 8 | import org.apache.lucene.analysis.Tokenizer; 9 | import org.apache.lucene.analysis.util.ResourceLoader; 10 | import org.apache.lucene.util.AttributeSource.AttributeFactory; 11 | import org.wltea.analyzer.dic.Dictionary; 12 | 13 | import com.mlcs.search.mlcsseg.lucene.ReloadableTokenizerFactory; 14 | import com.mlcs.search.mlcsseg.lucene.ReloaderRegister; 15 | 16 | public class IKTokenizerFactory extends ReloadableTokenizerFactory { 17 | 18 | 19 | public IKTokenizerFactory(Map args) { 20 | super(args); 21 | 22 | useSmart = getBoolean(args, "useSmart", false); 23 | System.out.println(":::ik:construction::::::::::::::::::::::::::" + conf); 24 | } 25 | private boolean useSmart = false; 26 | 27 | private boolean useSmart() { 28 | return useSmart; 29 | } 30 | 31 | 32 | // 通过这个实现,调用自身分词器 33 | public Tokenizer create(AttributeFactory attributeFactory, Reader in) { // 会多次被调用 34 | return new IKTokenizer(in, this.useSmart()); // 初始化词典,分词器,消歧器 35 | } 36 | 37 | public void inform(ResourceLoader loader) throws IOException { // 在启动时初始化一次 38 | System.out.println(":::ik:::inform::::::::::::::::::::::::" + conf); 39 | ReloaderRegister.register(this, loader, conf); 40 | } 41 | 42 | 43 | 44 | @Override 45 | public void update(List inputStreams) { 46 | Dictionary.addDic2MainDic(inputStreams); 47 | } 48 | 49 | 50 | } 51 | -------------------------------------------------------------------------------- /mlcsseg-ik/src/main/resources/quantifier.dic: -------------------------------------------------------------------------------- 1 | 丈 2 | 下 3 | 世 4 | 世纪 5 | 两 6 | 个 7 | 中 8 | 串 9 | 亩 10 | 人 11 | 介 12 | 付 13 | 代 14 | 件 15 | 任 16 | 份 17 | 伏 18 | 伙 19 | 位 20 | 位数 21 | 例 22 | 倍 23 | 像素 24 | 元 25 | 克 26 | 克拉 27 | 公亩 28 | 公克 29 | 公分 30 | 公升 31 | 公尺 32 | 公担 33 | 公斤 34 | 公里 35 | 公顷 36 | 具 37 | 册 38 | 出 39 | 刀 40 | 分 41 | 分钟 42 | 分米 43 | 划 44 | 列 45 | 则 46 | 刻 47 | 剂 48 | 剑 49 | 副 50 | 加仑 51 | 勺 52 | 包 53 | 匙 54 | 匹 55 | 区 56 | 千克 57 | 千米 58 | 升 59 | 卷 60 | 厅 61 | 厘 62 | 厘米 63 | 双 64 | 发 65 | 口 66 | 句 67 | 只 68 | 台 69 | 叶 70 | 号 71 | 名 72 | 吨 73 | 听 74 | 员 75 | 周 76 | 周年 77 | 品 78 | 回 79 | 团 80 | 圆 81 | 圈 82 | 地 83 | 场 84 | 块 85 | 坪 86 | 堆 87 | 声 88 | 壶 89 | 处 90 | 夜 91 | 大 92 | 天 93 | 头 94 | 套 95 | 女 96 | 孔 97 | 字 98 | 宗 99 | 室 100 | 家 101 | 寸 102 | 对 103 | 封 104 | 尊 105 | 小时 106 | 尺 107 | 尾 108 | 局 109 | 层 110 | 届 111 | 岁 112 | 师 113 | 帧 114 | 幅 115 | 幕 116 | 幢 117 | 平方 118 | 平方公尺 119 | 平方公里 120 | 平方分米 121 | 平方厘米 122 | 平方码 123 | 平方米 124 | 平方英寸 125 | 平方英尺 126 | 平方英里 127 | 平米 128 | 年 129 | 年代 130 | 年级 131 | 度 132 | 座 133 | 式 134 | 引 135 | 张 136 | 成 137 | 战 138 | 截 139 | 户 140 | 房 141 | 所 142 | 扇 143 | 手 144 | 打 145 | 批 146 | 把 147 | 折 148 | 担 149 | 拍 150 | 招 151 | 拨 152 | 拳 153 | 指 154 | 掌 155 | 排 156 | 
撮 157 | 支 158 | 文 159 | 斗 160 | 斤 161 | 方 162 | 族 163 | 日 164 | 时 165 | 曲 166 | 月 167 | 月份 168 | 期 169 | 本 170 | 朵 171 | 村 172 | 束 173 | 条 174 | 来 175 | 杯 176 | 枚 177 | 枝 178 | 枪 179 | 架 180 | 柄 181 | 柜 182 | 栋 183 | 栏 184 | 株 185 | 样 186 | 根 187 | 格 188 | 案 189 | 桌 190 | 档 191 | 桩 192 | 桶 193 | 梯 194 | 棵 195 | 楼 196 | 次 197 | 款 198 | 步 199 | 段 200 | 毛 201 | 毫 202 | 毫升 203 | 毫米 204 | 毫克 205 | 池 206 | 洲 207 | 派 208 | 海里 209 | 滴 210 | 炮 211 | 点 212 | 点钟 213 | 片 214 | 版 215 | 环 216 | 班 217 | 瓣 218 | 瓶 219 | 生 220 | 男 221 | 画 222 | 界 223 | 盆 224 | 盎司 225 | 盏 226 | 盒 227 | 盘 228 | 相 229 | 眼 230 | 石 231 | 码 232 | 碗 233 | 碟 234 | 磅 235 | 种 236 | 科 237 | 秒 238 | 秒钟 239 | 窝 240 | 立方公尺 241 | 立方分米 242 | 立方厘米 243 | 立方码 244 | 立方米 245 | 立方英寸 246 | 立方英尺 247 | 站 248 | 章 249 | 笔 250 | 等 251 | 筐 252 | 筒 253 | 箱 254 | 篇 255 | 篓 256 | 篮 257 | 簇 258 | 米 259 | 类 260 | 粒 261 | 级 262 | 组 263 | 维 264 | 缕 265 | 缸 266 | 罐 267 | 网 268 | 群 269 | 股 270 | 脚 271 | 船 272 | 艇 273 | 艘 274 | 色 275 | 节 276 | 英亩 277 | 英寸 278 | 英尺 279 | 英里 280 | 行 281 | 袋 282 | 角 283 | 言 284 | 课 285 | 起 286 | 趟 287 | 路 288 | 车 289 | 转 290 | 轮 291 | 辆 292 | 辈 293 | 连 294 | 通 295 | 遍 296 | 部 297 | 里 298 | 重 299 | 针 300 | 钟 301 | 钱 302 | 锅 303 | 门 304 | 间 305 | 队 306 | 阶段 307 | 隅 308 | 集 309 | 页 310 | 顶 311 | 顷 312 | 项 313 | 顿 314 | 颗 315 | 餐 316 | 首 -------------------------------------------------------------------------------- /mlcsseg-ik/src/test/java/org/wltea/analyzer/test/TestIk.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.test; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | 6 | import org.wltea.analyzer.core.IKSegmenter; 7 | 8 | public class TestIk { 9 | public static void main(String[] args) throws IOException { 10 | IKSegmenter ik = new IKSegmenter(new StringReader(""), true); 11 | ik.next(); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.mlcs.search 6 | mlcsseg 7 | 4.6.0-SNAPSHOT 8 | pom 9 | 10 | mlcsseg 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | mlcsseg-common 19 | mlcsseg-ik 20 | mlcsseg-filter 21 | mlcsseg-ansj 22 | 23 | 24 | 25 | 26 | org.apache.lucene 27 | lucene-analyzers-common 28 | 4.6.1 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /test1/conf/admin-extra.html: -------------------------------------------------------------------------------- 1 | 17 | 18 | 32 | -------------------------------------------------------------------------------- /test1/conf/admin-extra.menu-bottom.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test1/conf/admin-extra.menu-top.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test1/conf/extDic.txt: -------------------------------------------------------------------------------- 1 |  2 | 七匹狼 3 | 秋装 4 | 伊莲娜 5 | 格男仕 6 | 李东垣 7 | 卡扎菲 8 | 大舒服 9 | 惠国吉 10 | 楠 11 | 木 12 | 金 13 | 丝 -------------------------------------------------------------------------------- /test1/conf/extDic1.txt: -------------------------------------------------------------------------------- 1 |  2 | 古妃奇 3 | 简直笨 4 | 并发编程 5 | 穆定喜 
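The extension dictionaries above (extDic.txt, extDic1.txt) are plain UTF-8 word lists, one entry per line. As loadWords2DictSegment in Dictionary.java shows, every non-empty line that does not start with '#' is split on whitespace, '=', ',' and '>', and each token is lower-cased before being filled into the trie. A small standalone sketch of that per-line handling; the class name and the sample line are illustrative only.

public class DictLineDemo {
    public static void main(String[] args) {
        String line = "七匹狼,秋装".trim();              // a raw line as it might appear in an extension dictionary
        if (!line.isEmpty() && !line.startsWith("#")) {
            String[] words = line.split("[\\s=,>]+");    // same pattern used by loadWords2DictSegment
            for (String w : words) {
                // each token would be handed to DictSegment.fillSegment(w.toLowerCase().toCharArray())
                System.out.println(w.toLowerCase());
            }
        }
    }
}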
-------------------------------------------------------------------------------- /test1/conf/ik.conf: -------------------------------------------------------------------------------- 1 | lastupdate=11223 2 | files=extDic.txt,extDic1.txt,synonyms.txt,isynonyms.txt -------------------------------------------------------------------------------- /test1/conf/isynonyms.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | #----------------------------------------------------------------------- 14 | #some test synonym mappings unlikely to appear in real input text 15 | aaafoo => aaabar 16 | bbbfoo => bbbfoo bbbbar 17 | cccfoo => cccbar cccbaz 18 | fooaaa,baraaa,bazaaa 19 | 20 | # Some synonym groups specific to this example 21 | GB,gib,gigabyte,gigabytes 22 | MB,mib,megabyte,megabytes 23 | Television, Televisions, TV, TVs 24 | #notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming 25 | #after us won't split it into two words. 26 | 27 | # Synonym mappings can be used for spelling correction too 28 | 男式=>男 29 | -------------------------------------------------------------------------------- /test1/conf/schema.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 47 | 48 | 49 | 65 | 66 | 67 | 92 | 93 | 99 | 100 | 101 | 102 | 103 | 104 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 121 | id 122 | 123 | 124 | 125 | 126 | 133 | 134 | 140 | 141 | 144 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 184 | 185 | 188 | 189 | 190 | 191 | 192 | 193 | 203 | 204 | 205 | 206 | 207 | 208 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 265 | 266 | 267 | 278 | 279 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 305 | 306 | 307 | 308 | 309 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 345 | 354 | 355 | 356 | 357 | 362 | 367 | 368 | 369 | -------------------------------------------------------------------------------- /test1/conf/stop.conf: -------------------------------------------------------------------------------- 1 | lastupdate=111221 2 | files=stopwords.txt -------------------------------------------------------------------------------- /test1/conf/stopwords.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. 
You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 和 16 | -------------------------------------------------------------------------------- /test1/conf/synonym.conf: -------------------------------------------------------------------------------- 1 | lastupdate=12 2 | files=synonyms.txt -------------------------------------------------------------------------------- /test1/conf/synonym2.conf: -------------------------------------------------------------------------------- 1 | lastupdate=12 2 | files=isynonyms.txt -------------------------------------------------------------------------------- /test1/conf/synonyms.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | #----------------------------------------------------------------------- 14 | #some test synonym mappings unlikely to appear in real input text 15 | aaafoo => aaabar 16 | bbbfoo => bbbfoo bbbbar 17 | cccfoo => cccbar cccbaz 18 | fooaaa,baraaa,bazaaa 19 | 20 | # Some synonym groups specific to this example 21 | GB,gib,gigabyte,gigabytes 22 | MB,mib,megabyte,megabytes 23 | Television, Televisions, TV, TVs 24 | #notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming 25 | #after us won't split it into two words. 26 | 27 | # Synonym mappings can be used for spelling correction too 28 | pixima => pixma 29 | 李东垣 => 李东阳 30 | 卡扎菲,卡扎渣,卡炸飞 31 | 穆定喜 => 木丁西 -------------------------------------------------------------------------------- /test1/conf/update-script.js: -------------------------------------------------------------------------------- 1 | /* 2 | This is a basic skeleton JavaScript update processor. 3 | 4 | In order for this to be executed, it must be properly wired into solrconfig.xml; by default it is commented out in 5 | the example solrconfig.xml and must be uncommented to be enabled. 6 | 7 | See http://wiki.apache.org/solr/ScriptUpdateProcessor for more details. 
8 | */ 9 | 10 | function processAdd(cmd) { 11 | 12 | doc = cmd.solrDoc; // org.apache.solr.common.SolrInputDocument 13 | id = doc.getFieldValue("id"); 14 | logger.info("update-script#processAdd: id=" + id); 15 | 16 | // Set a field value: 17 | // doc.setField("foo_s", "whatever"); 18 | 19 | // Get a configuration parameter: 20 | // config_param = params.get('config_param'); // "params" only exists if processor configured with 21 | 22 | // Get a request parameter: 23 | // some_param = req.getParams().get("some_param") 24 | 25 | // Add a field of field names that match a pattern: 26 | // - Potentially useful to determine the fields/attributes represented in a result set, via faceting on field_name_ss 27 | // field_names = doc.getFieldNames().toArray(); 28 | // for(i=0; i < field_names.length; i++) { 29 | // field_name = field_names[i]; 30 | // if (/attr_.*/.test(field_name)) { doc.addField("attribute_ss", field_names[i]); } 31 | // } 32 | 33 | } 34 | 35 | function processDelete(cmd) { 36 | // no-op 37 | } 38 | 39 | function processMergeIndexes(cmd) { 40 | // no-op 41 | } 42 | 43 | function processCommit(cmd) { 44 | // no-op 45 | } 46 | 47 | function processRollback(cmd) { 48 | // no-op 49 | } 50 | 51 | function finish() { 52 | // no-op 53 | } 54 | -------------------------------------------------------------------------------- /test2/conf/admin-extra.html: -------------------------------------------------------------------------------- 1 | 17 | 18 | 32 | -------------------------------------------------------------------------------- /test2/conf/admin-extra.menu-bottom.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test2/conf/admin-extra.menu-top.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test2/conf/ansj.conf: -------------------------------------------------------------------------------- 1 | lastupdate=1226 2 | files=extDic.txt,extDic1.txt -------------------------------------------------------------------------------- /test2/conf/extDic.txt: -------------------------------------------------------------------------------- 1 |  2 | 七匹狼 3 | 秋装 4 | 伊莲娜 5 | 格男仕 6 | 李东垣 7 | 卡扎菲 8 | 大舒服 9 | -------------------------------------------------------------------------------- /test2/conf/extDic1.txt: -------------------------------------------------------------------------------- 1 |  2 | 古妃奇 3 | 简直笨 4 | -------------------------------------------------------------------------------- /test2/conf/isynonyms.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | 13 | #----------------------------------------------------------------------- 14 | #some test synonym mappings unlikely to appear in real input text 15 | aaafoo => aaabar 16 | bbbfoo => bbbfoo bbbbar 17 | cccfoo => cccbar cccbaz 18 | fooaaa,baraaa,bazaaa 19 | 20 | # Some synonym groups specific to this example 21 | GB,gib,gigabyte,gigabytes 22 | MB,mib,megabyte,megabytes 23 | Television, Televisions, TV, TVs 24 | #notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming 25 | #after us won't split it into two words. 26 | 27 | # Synonym mappings can be used for spelling correction too 28 | 男式=>男 29 | -------------------------------------------------------------------------------- /test2/conf/schema.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 47 | 48 | 49 | 65 | 66 | 67 | 92 | 93 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 122 | id 123 | 124 | 125 | 126 | 127 | 134 | 135 | 141 | 142 | 145 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 185 | 186 | 189 | 190 | 191 | 192 | 193 | 194 | 204 | 205 | 206 | 207 | 208 | 209 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 266 | 267 | 268 | 279 | 280 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 306 | 307 | 308 | 309 | 310 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 339 | 348 | 349 | 350 | 351 | 356 | 361 | 362 | 363 | -------------------------------------------------------------------------------- /test2/conf/stop.conf: -------------------------------------------------------------------------------- 1 | lastupdate=11122 2 | files=stopwords.txt -------------------------------------------------------------------------------- /test2/conf/stopwords.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 和 16 | 你 17 | -------------------------------------------------------------------------------- /test2/conf/synonym.conf: -------------------------------------------------------------------------------- 1 | lastupdate=1 2 | files=synonyms.txt -------------------------------------------------------------------------------- /test2/conf/synonym2.conf: -------------------------------------------------------------------------------- 1 | lastupdate=12 2 | files=isynonyms.txt -------------------------------------------------------------------------------- /test2/conf/synonyms.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | #----------------------------------------------------------------------- 14 | #some test synonym mappings unlikely to appear in real input text 15 | aaafoo => aaabar 16 | bbbfoo => bbbfoo bbbbar 17 | cccfoo => cccbar cccbaz 18 | fooaaa,baraaa,bazaaa 19 | 20 | # Some synonym groups specific to this example 21 | GB,gib,gigabyte,gigabytes 22 | MB,mib,megabyte,megabytes 23 | Television, Televisions, TV, TVs 24 | #notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming 25 | #after us won't split it into two words. 26 | 27 | # Synonym mappings can be used for spelling correction too 28 | pixima => pixma 29 | 李东垣 => 李东阳 30 | 卡扎菲,卡扎渣,卡炸飞 31 | -------------------------------------------------------------------------------- /test2/conf/update-script.js: -------------------------------------------------------------------------------- 1 | /* 2 | This is a basic skeleton JavaScript update processor. 3 | 4 | In order for this to be executed, it must be properly wired into solrconfig.xml; by default it is commented out in 5 | the example solrconfig.xml and must be uncommented to be enabled. 6 | 7 | See http://wiki.apache.org/solr/ScriptUpdateProcessor for more details. 
8 | */ 9 | 10 | function processAdd(cmd) { 11 | 12 | doc = cmd.solrDoc; // org.apache.solr.common.SolrInputDocument 13 | id = doc.getFieldValue("id"); 14 | logger.info("update-script#processAdd: id=" + id); 15 | 16 | // Set a field value: 17 | // doc.setField("foo_s", "whatever"); 18 | 19 | // Get a configuration parameter: 20 | // config_param = params.get('config_param'); // "params" only exists if processor configured with 21 | 22 | // Get a request parameter: 23 | // some_param = req.getParams().get("some_param") 24 | 25 | // Add a field of field names that match a pattern: 26 | // - Potentially useful to determine the fields/attributes represented in a result set, via faceting on field_name_ss 27 | // field_names = doc.getFieldNames().toArray(); 28 | // for(i=0; i < field_names.length; i++) { 29 | // field_name = field_names[i]; 30 | // if (/attr_.*/.test(field_name)) { doc.addField("attribute_ss", field_names[i]); } 31 | // } 32 | 33 | } 34 | 35 | function processDelete(cmd) { 36 | // no-op 37 | } 38 | 39 | function processMergeIndexes(cmd) { 40 | // no-op 41 | } 42 | 43 | function processCommit(cmd) { 44 | // no-op 45 | } 46 | 47 | function processRollback(cmd) { 48 | // no-op 49 | } 50 | 51 | function finish() { 52 | // no-op 53 | } 54 | --------------------------------------------------------------------------------
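For completeness, a slightly fuller variant of the TestIk.java shown earlier: it iterates the segmenter until next() returns null and prints each lexeme the same way IKTokenizer.incrementToken() maps it to Lucene attributes. The sample sentence and class name are illustrative; the API calls mirror those already used in IKTokenizer.

import java.io.IOException;
import java.io.StringReader;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class TestIkVerbose {
    public static void main(String[] args) throws IOException {
        // true enables smart mode, the same flag IKTokenizerFactory reads from its "useSmart" attribute
        IKSegmenter ik = new IKSegmenter(new StringReader("七匹狼秋装新品上市"), true);
        Lexeme lexeme;
        while ((lexeme = ik.next()) != null) {
            System.out.println(lexeme.getLexemeText()
                    + " [" + lexeme.getBeginPosition() + "," + lexeme.getEndPosition() + ")"
                    + " type=" + lexeme.getLexemeTypeString());
        }
    }
}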