├── .gitignore
├── README.md
├── mlcsseg-ansj
│   ├── lib
│   │   ├── ansj_seg-1.4-min.jar
│   │   └── tree_split-1.3.jar
│   ├── pom.xml
│   └── src
│       └── main
│           ├── assembly
│           │   └── zip.xml
│           └── java
│               └── org
│                   └── ansj
│                       └── solr
│                           ├── AnsjTokenizer.java
│                           ├── AnsjTokenizerFactory.java
│                           └── TestAnsj.java
├── mlcsseg-common
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── com
│                   └── mlcs
│                       └── search
│                           └── mlcsseg
│                               ├── common
│                               │   └── ScheduledExecutor.java
│                               └── lucene
│                                   ├── CnTokenizer.java
│                                   ├── ReloadableTokenizerFactory.java
│                                   └── ReloaderRegister.java
├── mlcsseg-filter
│   ├── pom.xml
│   └── src
│       ├── main
│       │   └── java
│       │       └── org
│       │           └── apache
│       │               └── solr
│       │                   └── analysis
│       │                       ├── DStopFilter.java
│       │                       ├── DStopFilterFactory.java
│       │                       └── DSynonymFilterFactory.java
│       └── test
│           └── java
│               └── org
│                   └── mlcsseg
│                       └── filter
│                           └── AppTest.java
├── mlcsseg-ik
│   ├── pom.xml
│   └── src
│       ├── main
│       │   ├── java
│       │   │   └── org
│       │   │       └── wltea
│       │   │           └── analyzer
│       │   │               ├── cfg
│       │   │               │   ├── Configuration.java
│       │   │               │   └── DefaultConfig.java
│       │   │               ├── core
│       │   │               │   ├── AnalyzeContext.java
│       │   │               │   ├── CJKSegmenter.java
│       │   │               │   ├── CN_QuantifierSegmenter.java
│       │   │               │   ├── CharacterUtil.java
│       │   │               │   ├── IKArbitrator.java
│       │   │               │   ├── IKSegmenter.java
│       │   │               │   ├── ISegmenter.java
│       │   │               │   ├── LetterSegmenter.java
│       │   │               │   ├── Lexeme.java
│       │   │               │   ├── LexemePath.java
│       │   │               │   └── QuickSortSet.java
│       │   │               ├── dic
│       │   │               │   ├── DictCharNode.java
│       │   │               │   ├── DictSegment.java
│       │   │               │   ├── Dictionary.java
│       │   │               │   └── Hit.java
│       │   │               └── lucene
│       │   │                   ├── IKTokenizer.java
│       │   │                   └── IKTokenizerFactory.java
│       │   └── resources
│       │       ├── chars.dic
│       │       ├── main2012.dic
│       │       └── quantifier.dic
│       └── test
│           └── java
│               └── org
│                   └── wltea
│                       └── analyzer
│                           └── test
│                               └── TestIk.java
├── pom.xml
├── test1
│   └── conf
│       ├── admin-extra.html
│       ├── admin-extra.menu-bottom.html
│       ├── admin-extra.menu-top.html
│       ├── extDic.txt
│       ├── extDic1.txt
│       ├── ik.conf
│       ├── isynonyms.txt
│       ├── schema.xml
│       ├── solrconfig.xml
│       ├── stop.conf
│       ├── stopwords.txt
│       ├── synonym.conf
│       ├── synonym2.conf
│       ├── synonyms.txt
│       └── update-script.js
└── test2
    └── conf
        ├── admin-extra.html
        ├── admin-extra.menu-bottom.html
        ├── admin-extra.menu-top.html
        ├── ansj.conf
        ├── extDic.txt
        ├── extDic1.txt
        ├── isynonyms.txt
        ├── schema.xml
        ├── solrconfig.xml
        ├── stop.conf
        ├── stopwords.txt
        ├── synonym.conf
        ├── synonym2.conf
        ├── synonyms.txt
        └── update-script.js
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 |
3 | # Package Files #
4 | *.war
5 | *.ear
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | mlcsseg: a grab-bag of Solr tokenizers
2 | =======
3 |
4 | Includes IK, ANSJ, and filters. Custom dictionaries placed under the Solr config path can be reloaded dynamically.
5 |
6 | The latest `4.6` release is supported. The master branch is for 4.6; the other branches support their corresponding Solr versions.
7 |
8 | Configuration and documentation: http://mlcsdev.iteye.com/blog/2037109
9 |
10 | You are welcome to use it, and to send comments and suggestions in any form.
11 |
--------------------------------------------------------------------------------
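A minimal sketch of how these factories are wired into a Solr core (the field name and attribute values below are illustrative, not copied from this repo; the real examples live under test1/conf, test2/conf and the blog post above). In schema.xml each factory takes a `conf` attribute naming a properties file in the Solr config directory:

    <fieldType name="text_cn" class="solr.TextField">
      <analyzer>
        <tokenizer class="org.ansj.solr.AnsjTokenizerFactory" analysisType="1" rmPunc="true" conf="ansj.conf"/>
        <filter class="org.apache.solr.analysis.DStopFilterFactory" ignoreCase="true" conf="stop.conf"/>
        <filter class="org.apache.solr.analysis.DSynonymFilterFactory" ignoreCase="true" expand="true" conf="synonym.conf"/>
      </analyzer>
    </fieldType>

Each *.conf file is a plain Java properties file with a `lastupdate` counter and a comma- or whitespace-separated `files` list; bumping `lastupdate` makes the next scheduled check reload the listed dictionaries (the values below are made up):

    lastupdate=2
    files=extDic.txt,extDic1.txt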
/mlcsseg-ansj/lib/ansj_seg-1.4-min.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlcsdev/mlcsseg/942bc12ee27e5297bf6a042952299a82c490ca19/mlcsseg-ansj/lib/ansj_seg-1.4-min.jar
--------------------------------------------------------------------------------
/mlcsseg-ansj/lib/tree_split-1.3.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlcsdev/mlcsseg/942bc12ee27e5297bf6a042952299a82c490ca19/mlcsseg-ansj/lib/tree_split-1.3.jar
--------------------------------------------------------------------------------
/mlcsseg-ansj/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.mlcs.search
6 | mlcsseg
7 | 4.6.0-SNAPSHOT
8 |
9 | mlcsseg-ansj
10 |
11 |
12 |
13 | com.mlcs.search
14 | mlcsseg-common
15 | 4.6.0-SNAPSHOT
16 |
17 |
18 | org.ansj
19 | ansj_seg
20 | 1.4
21 | system
22 | ${project.basedir}/lib/ansj_seg-1.4-min.jar
23 |
24 |
25 | org.ansj
26 | tree_split
27 | 1.3
28 | system
29 | ${project.basedir}/lib/tree_split-1.3.jar
30 |
31 |
32 |
33 |
34 |
35 |
36 | org.apache.maven.plugins
37 | maven-compiler-plugin
38 |
39 | 1.6
40 | 1.6
41 | utf8
42 |
43 |
44 |
45 | org.apache.maven.plugins
46 | maven-resources-plugin
47 | 2.5
48 |
49 | UTF-8
50 |
51 |
52 |
53 | maven-assembly-plugin
54 | 2.2.1
55 |
56 |
57 | src/main/assembly/zip.xml
58 |
59 |
60 |
61 |
62 | org.apache.maven.plugins
63 | maven-jar-plugin
64 | 2.4
65 |
66 |
67 |
68 |
--------------------------------------------------------------------------------
/mlcsseg-ansj/src/main/assembly/zip.xml:
--------------------------------------------------------------------------------
1 |
5 | bin
6 |
7 | zip
8 |
9 |
10 |
11 | true
12 | jar
13 |
14 |
15 | org.apache.solr:solr*
16 | com.spatial4j:spatial4j
17 | org.apache.lucene:lucene*
18 | com.google.guava:guava*
19 | commons*:commons*
20 | org.restlet.jee:org.restlet*
21 | org.apache.zookeeper:zookeeper*
22 | org.noggit:noggit*
23 | org.slf4j*:slf4j*
24 | org.codehaus.woodstox:wstx-asl*
25 | org.apache.httpcomponents:http*
26 |
27 |
28 |
29 |
30 |
31 | lib
32 | jar
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/mlcsseg-ansj/src/main/java/org/ansj/solr/AnsjTokenizer.java:
--------------------------------------------------------------------------------
1 | package org.ansj.solr;
2 |
3 | import java.io.IOException;
4 | import java.io.Reader;
5 | import java.util.ArrayList;
6 | import java.util.Iterator;
7 | import java.util.List;
8 | import org.ansj.domain.Term;
9 | import org.ansj.splitWord.analysis.IndexAnalysis;
10 | import org.ansj.splitWord.analysis.ToAnalysis;
11 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
12 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
13 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
14 |
15 | import com.mlcs.search.mlcsseg.lucene.CnTokenizer;
16 |
17 |
18 | public class AnsjTokenizer extends CnTokenizer{
19 | private int analysisType ;
20 | private boolean removePunc;
21 |
22 | private CharTermAttribute termAtt;
23 | private OffsetAttribute offsetAtt;
24 | private TypeAttribute typeAtt;
25 | int lastOffset = 0;
26 | int endPosition =0;
 27 | private Iterator<Term> tokenIter;
 28 | private List<Term> tokenBuffer;
29 | static
30 | {
31 | ToAnalysis.parse("");
32 | }
33 |
34 | public AnsjTokenizer(Reader input, int analysisType, boolean removePunc) {
35 | super(input);
36 | offsetAtt = addAttribute(OffsetAttribute.class);
37 | termAtt = addAttribute(CharTermAttribute.class);
38 | typeAtt = addAttribute(TypeAttribute.class);
39 | this.analysisType = analysisType;
40 | this.removePunc = removePunc;
41 | }
42 |
43 | @Override
44 | public boolean incrementToken() throws IOException {
45 | if (tokenIter == null || !tokenIter.hasNext()){
46 | String currentSentence = checkSentences();
 47 | if (currentSentence != null){
 48 | tokenBuffer = new ArrayList<Term>();
49 | if (analysisType == 1){
50 | for(Term term : ToAnalysis.parse(currentSentence)){
51 | if (removePunc && stopwords.contains(term.getName()))
52 | continue;
53 | tokenBuffer.add(term);
54 | }
55 |
56 | }else {
57 | for(Term term : IndexAnalysis.parse(currentSentence)){
58 | if (removePunc && stopwords.contains(term.getName()))
59 | continue;
60 | tokenBuffer.add(term);
61 | }
62 | }
63 | tokenIter = tokenBuffer.iterator();
64 | if (!tokenIter.hasNext()){
65 | return false;
66 | }
67 | } else {
68 | return false; // no more sentences, end of stream!
69 | }
70 | }
71 | clearAttributes();
72 |
73 | Term term = tokenIter.next();
 74 | if (removePunc){
 75 | while(stopwords.contains(term.getName())){
 76 | if (!tokenIter.hasNext()){
 77 | break; // no more tokens in this sentence; avoid looping forever on a trailing stop token
 78 | }
 79 | term = tokenIter.next();
 80 | }
 81 | }
82 | termAtt.append(term.getName());
83 | termAtt.setLength(term.getName().length());
84 |
85 | int currentStart = tokenStart + term.getOffe();
86 | int currentEnd = tokenStart + term.getToValue();
87 | offsetAtt.setOffset(currentStart,currentEnd);
88 | typeAtt.setType("word");
89 |
90 | // int pi = currentStart - lastOffset;
91 | // if(term.getOffe() <= 0) {
92 | // pi = 1;
93 | // }
94 | // positionIncrementAtt.setPositionIncrement( pi );
95 | lastOffset = currentStart;
96 | endPosition = currentEnd;
97 | return true;
98 | }
99 |
100 |
101 |
102 | @Override
103 | public void reset() throws IOException {
104 | super.reset();
105 | }
106 |
107 | public final void end() {
108 | // set final offset
109 | int finalOffset = correctOffset(this.endPosition);
110 | offsetAtt.setOffset(finalOffset, finalOffset);
111 | }
112 |
113 | }
114 |
--------------------------------------------------------------------------------
/mlcsseg-ansj/src/main/java/org/ansj/solr/AnsjTokenizerFactory.java:
--------------------------------------------------------------------------------
1 | package org.ansj.solr;
2 |
3 | import java.io.IOException;
4 | import java.io.InputStream;
5 | import java.io.Reader;
6 | import java.util.List;
7 | import java.util.Map;
8 | import org.ansj.library.UserDefineLibrary;
9 | import org.apache.lucene.analysis.Tokenizer;
10 | import org.apache.lucene.analysis.util.ResourceLoader;
11 | import org.apache.lucene.util.AttributeSource.AttributeFactory;
12 |
13 | import com.mlcs.search.mlcsseg.lucene.ReloadableTokenizerFactory;
14 | import com.mlcs.search.mlcsseg.lucene.ReloaderRegister;
15 |
16 |
17 | public class AnsjTokenizerFactory extends ReloadableTokenizerFactory {
18 |
19 | private int analysisType = 0;
20 | private boolean rmPunc = true;
21 |
 22 | public AnsjTokenizerFactory(Map<String, String> args) {
23 | super(args);
24 | analysisType = getInt(args, "analysisType", 0);
25 | rmPunc = getBoolean(args, "rmPunc", true);
26 | System.out.println(":::ansj:construction::::::::::::::::::::::::::" + conf);
27 | }
28 |
29 |
30 |
31 | public void inform(ResourceLoader loader) throws IOException {
32 | System.out.println(":::ansj:::inform::::::::::::::::::::::::" + conf);
33 | ReloaderRegister.register(this, loader, conf);
34 | }
35 |
36 | @Override
37 | public Tokenizer create(AttributeFactory factory, Reader input) {
38 | return new AnsjTokenizer(input, analysisType, rmPunc);
39 | }
40 |
41 |
42 |
43 | @Override
 44 | public void update(List<InputStream> inputStreams) {
 45 | if (inputStreams != null){
46 | UserDefineLibrary.reloadMainAndAdd(inputStreams);
47 | }
48 | }
49 |
50 |
51 |
52 |
53 | }
54 |
--------------------------------------------------------------------------------
/mlcsseg-ansj/src/main/java/org/ansj/solr/TestAnsj.java:
--------------------------------------------------------------------------------
1 | package org.ansj.solr;
2 |
3 | import java.io.IOException;
4 | import java.io.StringReader;
5 | import java.util.List;
6 |
7 | import org.ansj.domain.Term;
8 | import org.ansj.splitWord.analysis.IndexAnalysis;
9 | import org.ansj.splitWord.analysis.ToAnalysis;
10 | import org.apache.lucene.analysis.Tokenizer;
11 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
12 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
13 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
14 |
15 |
16 |
17 | public class TestAnsj {
18 |
19 | public static void main(String[] args) throws IOException {
 20 | List<Term> parse = ToAnalysis.parse("天天向上,媒体打打。《回家真好》");
21 | System.out.println(parse);
22 | Tokenizer tokenizer = new AnsjTokenizer(new StringReader("天天向上,媒体打打。《回家真好》"), 0, true);
23 | CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
24 | OffsetAttribute offsetAtt =
25 | tokenizer.addAttribute(OffsetAttribute.class);
26 | PositionIncrementAttribute positionIncrementAtt =
27 | tokenizer.addAttribute(PositionIncrementAttribute.class);
28 |
 29 | tokenizer.reset(); // the stream must be reset before the first incrementToken() call
 30 | while (tokenizer.incrementToken()){
31 |
32 | System.out.print(new String(termAtt.toString()) );
33 | System.out.print( offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-" );
34 | System.out.print( positionIncrementAtt.getPositionIncrement() +"/");
35 |
36 | }
37 | tokenizer.close();
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/mlcsseg-common/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 | 4.0.0
5 |
6 | com.mlcs.search
7 | mlcsseg
8 | 4.6.0-SNAPSHOT
9 |
10 | mlcsseg-common
11 | mlcsseg-common
12 | http://maven.apache.org
13 |
14 | UTF-8
15 |
16 |
17 |
--------------------------------------------------------------------------------
/mlcsseg-common/src/main/java/com/mlcs/search/mlcsseg/common/ScheduledExecutor.java:
--------------------------------------------------------------------------------
1 | package com.mlcs.search.mlcsseg.common;
2 | import java.util.concurrent.Executors;
3 | import java.util.concurrent.ScheduledExecutorService;
4 | import java.util.concurrent.ThreadFactory;
5 | import java.util.concurrent.TimeUnit;
6 |
7 |
8 |
9 | public class ScheduledExecutor {
10 |
11 | static class SegTF implements ThreadFactory{
12 |
13 | public Thread newThread(Runnable r) {
14 | Thread t = new Thread(r, "SegmentScheduledExecutorThread");
15 | t.setDaemon(true);
16 | return t;
17 | }
18 |
19 | }
20 |
21 | final public static ScheduledExecutorService ScheduledService = Executors.newSingleThreadScheduledExecutor(new SegTF());
22 |
23 |
 24 | public static void submit(Runnable cmd, long periodMilliSeconds){
 25 | ScheduledService.scheduleAtFixedRate(cmd, 10L, periodMilliSeconds, TimeUnit.MILLISECONDS);
26 | }
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/mlcsseg-common/src/main/java/com/mlcs/search/mlcsseg/lucene/CnTokenizer.java:
--------------------------------------------------------------------------------
1 | package com.mlcs.search.mlcsseg.lucene;
2 |
3 | import java.io.IOException;
4 | import java.io.Reader;
5 | import java.io.StringReader;
6 | import java.util.HashSet;
7 | import java.util.Set;
8 |
9 | import org.apache.lucene.analysis.Tokenizer;
10 |
11 | /**
 12 |  * Adds basic stop-word filtering and the ability to split long text into sentences; the actual tokenization is left to subclasses.
13 | * @Description TODO
14 | * @author shanbo.liang
15 | */
16 | public abstract class CnTokenizer extends Tokenizer{
17 | public final static String SPACES = " \t\r\n";
18 | public final static String PUNCTUATION = "。,!?;,!?;";
19 | public final static String stop = "',.`-_=?\'|\"(){}[]<>*#&^$@!~:;+/《》—-,。、:;!·?“”)(【】[]●'";
 20 | public static Set<String> stopwords = new HashSet<String>();
21 |
22 | protected final StringBuilder buffer = new StringBuilder();
23 | protected int tokenStart = 0, tokenEnd = 0;
24 |
25 |
26 | static
27 | {
28 | for(String c : stop.split("")){
29 | stopwords.add(c);
30 | }
31 | }
32 |
33 | protected CnTokenizer(Reader input) {
34 | super(input);
35 | }
36 |
37 | protected String checkSentences() throws IOException{
38 | buffer.setLength(0);
39 | int ci;
40 | char ch, pch;
41 | boolean atBegin = true;
42 | tokenStart = tokenEnd;
43 | ci = input.read();
44 | ch = (char) ci;
45 |
46 | while (true) {
47 | if (ci == -1) {
48 | break;
49 | } else if (PUNCTUATION.indexOf(ch) != -1) {
50 | // End of a sentence
51 | buffer.append(ch);
52 | tokenEnd++;
53 | break;
54 | } else if (atBegin && SPACES.indexOf(ch) != -1) {
55 | tokenStart++;
56 | tokenEnd++;
57 | ci = input.read();
58 | ch = (char) ci;
59 | } else {
60 | buffer.append(ch);
61 | atBegin = false;
62 | tokenEnd++;
63 | pch = ch;
64 | ci = input.read();
65 | ch = (char) ci;
66 | // Two spaces, such as CR, LF
67 | if (SPACES.indexOf(ch) != -1
68 | && SPACES.indexOf(pch) != -1) {
69 | // buffer.append(ch);
70 | tokenEnd++;
71 | break;
72 | }
73 | }
74 | }
75 | if (buffer.length() == 0){
76 | //sentences finished~
77 | return null;
78 | }else {
79 | return buffer.toString();
80 | }
81 |
82 | }
83 |
84 | public void reset() throws IOException {
85 | super.reset();
86 | tokenStart = tokenEnd = 0;
87 | }
88 | }
89 |
--------------------------------------------------------------------------------
/mlcsseg-common/src/main/java/com/mlcs/search/mlcsseg/lucene/ReloadableTokenizerFactory.java:
--------------------------------------------------------------------------------
1 | package com.mlcs.search.mlcsseg.lucene;
2 |
3 | import java.io.InputStream;
4 | import java.util.List;
5 | import java.util.Map;
6 |
7 | import org.apache.lucene.analysis.util.ResourceLoaderAware;
8 | import org.apache.lucene.analysis.util.TokenizerFactory;
9 |
10 |
11 | public abstract class ReloadableTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware{
12 |
13 | protected String conf;
14 |
 15 | protected ReloadableTokenizerFactory(Map<String, String> args) {
16 | super(args);
17 | assureMatchVersion();
18 | conf = get(args, "conf");
19 | }
20 |
 21 | public abstract void update(List<InputStream> inputStreams);
22 |
23 | public String getBeanName(){
24 | return this.getClass().toString();
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/mlcsseg-common/src/main/java/com/mlcs/search/mlcsseg/lucene/ReloaderRegister.java:
--------------------------------------------------------------------------------
1 | package com.mlcs.search.mlcsseg.lucene;
2 |
3 | import java.io.IOException;
4 | import java.io.InputStream;
5 | import java.util.ArrayList;
6 | import java.util.Collections;
7 | import java.util.HashMap;
8 | import java.util.List;
9 | import java.util.Map;
10 | import java.util.Properties;
11 |
12 | import org.apache.lucene.analysis.util.ResourceLoader;
13 |
14 | import com.mlcs.search.mlcsseg.common.ScheduledExecutor;
15 |
16 | /**
17 | * register it in 'inform(ResourceLoader loader)'
18 | * @Description TODO
19 | * @author shanbo.liang
20 | */
21 | public class ReloaderRegister{
22 |
23 |
24 |
25 |
 26 | private static Map<String, ConfigChecker> reloadAwares = new HashMap<String, ConfigChecker>();
27 |
28 |
29 | public static class ConfigChecker {
30 |
31 | private long lastUpdateTime = Long.MIN_VALUE;
32 |
33 |
34 |
35 |
 36 | public static List<String> SplitFileNames(String fileNames) {
37 | if (fileNames == null || fileNames.isEmpty())
38 | return Collections.emptyList();
39 |
 40 | List<String> result = new ArrayList<String>();
41 | for (String file : fileNames.split("[,\\s]+")) {
42 | result.add(file);
43 | }
44 |
45 | return result;
46 | }
47 |
 48 | public List<String> currentToReload(InputStream confStream){
49 | try{
50 | Properties p = new Properties();
51 | p.load(confStream);
52 | confStream.close();
53 | String lastupdate = p.getProperty("lastupdate", "0");
54 | Long t = new Long(lastupdate);
55 | // System.out.println(" => " + toString() + "=========loading conf========= : " + p.toString() );
56 | if (t > this.lastUpdateTime){
57 | System.out.println("lastUpdateTime is new, files will be loaded!" );
58 | this.lastUpdateTime = t.longValue();
59 | String paths = p.getProperty("files");
 60 | if (paths == null || paths.trim().isEmpty()) // the conf must list dictionary files
61 | return Collections.emptyList();
62 |
 63 | List<String> dicPaths = SplitFileNames(p.getProperty("files"));
64 | return dicPaths;
65 | }else{
66 | this.lastUpdateTime = t.longValue();
67 | return Collections.emptyList();
68 | }
69 | }catch(IOException e){
70 | return Collections.emptyList();
71 | }
72 | }
73 |
74 | public String toString(){
75 | return "configchecker@" + lastUpdateTime;
76 | }
77 |
78 | }
79 |
80 |
81 | /**
 82 |  * Registers a tokenizer factory whose dictionaries are refreshed on a schedule; call this from 'inform(ResourceLoader loader)'.
83 | * @param reloadFactory
84 | * @param loader
85 | * @param confName
86 | * @return
87 | */
88 | public static synchronized String register(final ReloadableTokenizerFactory reloadFactory, final ResourceLoader loader, final String confName){
89 | if ( reloadAwares.containsKey(reloadFactory.getBeanName())){
90 | return "already";
91 | }else{
 92 | if(confName != null && !confName.trim().isEmpty()){ // only register when a conf file is configured
93 | final ConfigChecker cc = new ConfigChecker();
94 | reloadAwares.put(reloadFactory.getBeanName(), cc);
95 | loadAndUpdate(cc, reloadFactory, loader, confName);
96 | ScheduledExecutor.submit(new Runnable() {
97 | public void run() {
98 | loadAndUpdate(cc, reloadFactory, loader, confName);
99 | }
100 | }, 30 * 1000);
101 | return "ok";
102 | }
103 | return "conf is empty";
104 | }
105 | }
106 |
107 | private static void loadAndUpdate(final ConfigChecker cc, final ReloadableTokenizerFactory reloadFactory, final ResourceLoader loader, final String confName){
108 |
109 | try {
110 | List<String> dicts = cc.currentToReload(loader.openResource(confName));
111 | if (!dicts.isEmpty()){
112 | List<InputStream> insFromLoader = new ArrayList<InputStream>(dicts.size());
113 | for(String dictName : dicts){
114 | try{
115 | insFromLoader.add(loader.openResource(dictName));
116 | }catch(IOException e){
117 | System.out.println("missing dict source : " + dictName);
118 | }
119 | }
120 | reloadFactory.update(insFromLoader);
121 | System.out.println("reload finish! " + dicts);
122 | }
123 | } catch (IOException e) {
124 | e.printStackTrace();
125 | }
126 | }
127 |
128 |
129 | }
130 |
--------------------------------------------------------------------------------
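A self-contained sketch (not part of the repo) of the reload contract that ConfigChecker implements above: currentToReload() hands back the dictionary list only when the conf's `lastupdate` value has grown since the previous check.

    import java.io.ByteArrayInputStream;
    import java.io.InputStream;
    import java.util.List;

    import com.mlcs.search.mlcsseg.lucene.ReloaderRegister.ConfigChecker;

    public class ConfigCheckerDemo {

        // Builds an in-memory conf stream in the same properties format the factories read from disk.
        private static InputStream conf(String lastupdate, String files) {
            String text = "lastupdate=" + lastupdate + "\nfiles=" + files + "\n";
            return new ByteArrayInputStream(text.getBytes());
        }

        public static void main(String[] args) {
            ConfigChecker checker = new ConfigChecker();
            // First check: lastupdate=1 is newer than the initial Long.MIN_VALUE, so the files are returned.
            List<String> first = checker.currentToReload(conf("1", "extDic.txt,extDic1.txt"));
            System.out.println(first);   // [extDic.txt, extDic1.txt]
            // Same lastupdate again: nothing to reload.
            List<String> second = checker.currentToReload(conf("1", "extDic.txt,extDic1.txt"));
            System.out.println(second);  // []
            // lastupdate bumped: the dictionaries are handed back for reloading.
            List<String> third = checker.currentToReload(conf("2", "extDic.txt extDic1.txt"));
            System.out.println(third);   // [extDic.txt, extDic1.txt]
        }
    }

This is the same contract AnsjTokenizerFactory (earlier in this dump) relies on: ReloaderRegister.register() runs the check once at inform() time and then every 30 seconds on the shared daemon scheduler.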
/mlcsseg-filter/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | com.mlcs.search
8 | mlcsseg
9 | 4.6.0-SNAPSHOT
10 |
11 | mlcsseg-filter
12 | mlcsseg-filter
13 | http://maven.apache.org
14 |
15 | UTF-8
16 |
17 |
18 |
19 |
20 | junit
21 | junit
22 | 3.8.1
23 | test
24 |
25 |
26 | com.mlcs.search
27 | mlcsseg-common
28 | 4.6.0-SNAPSHOT
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/mlcsseg-filter/src/main/java/org/apache/solr/analysis/DStopFilter.java:
--------------------------------------------------------------------------------
1 | package org.apache.solr.analysis;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.lucene.analysis.TokenStream;
6 |
7 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
8 | import org.apache.lucene.analysis.util.CharArraySet;
9 | import org.apache.lucene.analysis.util.FilteringTokenFilter;
10 | import org.apache.lucene.util.Version;
11 |
12 | public class DStopFilter extends FilteringTokenFilter {
13 |
14 | private final CharArraySet stopWords;
15 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
16 |
17 | public DStopFilter(TokenStream input, CharArraySet stopWords) {
18 | super(Version.LUCENE_46, input);
19 |
20 | this.stopWords = stopWords;
21 | }
22 |
23 | @Override
24 | protected boolean accept() throws IOException {
25 |
26 | // System.out.println("accept()"+termAtt.toString());
 27 | return !stopWords.contains(termAtt.buffer(), 0, termAtt.length()); // termAtt is shared attribute state, populated by the wrapped stream before accept() is called
28 | }
29 |
30 | }
31 |
--------------------------------------------------------------------------------
/mlcsseg-filter/src/main/java/org/apache/solr/analysis/DStopFilterFactory.java:
--------------------------------------------------------------------------------
1 | package org.apache.solr.analysis;
2 |
3 | import java.io.IOException;
4 | import java.io.InputStream;
5 | import java.util.Map;
6 | import java.util.Properties;
7 |
8 | import org.apache.lucene.analysis.TokenStream;
9 | import org.apache.lucene.analysis.util.CharArraySet;
10 | import org.apache.lucene.analysis.util.ResourceLoader;
11 | import org.apache.lucene.analysis.util.ResourceLoaderAware;
12 | import org.apache.lucene.analysis.util.TokenFilterFactory;
13 |
14 | import com.mlcs.search.mlcsseg.common.ScheduledExecutor;
15 |
16 |
17 | public class DStopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
18 |
 19 | public DStopFilterFactory(Map<String, String> args) {
20 | super(args);
21 | ignoreCase = getBoolean(args, "ignoreCase", false);
22 | // enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
23 | conf = get(args, "conf"); //paths & lastupdate
24 | System.out.println("construct:::::stop::::::::::::::::::::::" + conf);
25 | }
26 |
27 | private CharArraySet stopWords;
28 | private boolean ignoreCase;
29 | // private boolean enablePositionIncrements;
30 |
31 | private ResourceLoader loader;
32 |
33 | private String conf;
34 | private long lastUpdateTime = -1;
35 |
36 | public void inform(final ResourceLoader loader) throws IOException {
37 | System.out.println("inform:::::stop::::::::::::::::::::::" + conf);
38 | this.loader = loader;
39 | this.update();
40 | if(conf != null && !conf.trim().isEmpty()){
41 | ScheduledExecutor.submit(new Runnable() {
42 |
43 | public void run() {
44 | try {
45 | update();
46 | } catch (IOException e) {
47 | e.printStackTrace();
48 | }
49 | }
50 | }, 1000 * 60 );
51 | }
52 | }
53 |
54 | @Override
55 | public TokenStream create(TokenStream arg0) {
56 | DStopFilter stopFilter = new DStopFilter( arg0, stopWords);
57 | return stopFilter;
58 | }
59 |
60 | public void update() throws IOException {
61 | Properties p = canUpdate();
62 | if (p != null){
63 | System.out.println(" updating~~~!! ");
64 | stopWords = getWordSet(loader, p.getProperty("files"), ignoreCase);
65 | System.out.println(" finish!! ");
66 | }
67 |
68 | }
69 |
70 |
71 | private Properties canUpdate() {
72 |
73 | try{
74 | Properties p = new Properties();
75 | InputStream confStream = loader.openResource(conf);
76 | p.load(confStream);
77 | confStream.close();
78 | String lastupdate = p.getProperty("lastupdate", "0");
79 | Long t = new Long(lastupdate);
80 |
81 | if (t > this.lastUpdateTime){
82 | this.lastUpdateTime = t.longValue();
83 | String paths = p.getProperty("files");
 84 | if (paths == null || paths.trim().isEmpty()) // the conf must list dictionary files
85 | return null;
86 | System.out.println("loading conf");
87 | return p;
88 | }else{
89 | this.lastUpdateTime = t.longValue();
90 | return null;
91 | }
92 | }catch(Exception e){
 93 | System.err.println("stop filter: failed to parse conf '" + conf + "': " + e.getMessage());
94 | return null;
95 | }
96 | }
97 |
98 | }
99 |
--------------------------------------------------------------------------------
/mlcsseg-filter/src/main/java/org/apache/solr/analysis/DSynonymFilterFactory.java:
--------------------------------------------------------------------------------
1 | package org.apache.solr.analysis;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.io.InputStream;
6 | import java.io.InputStreamReader;
7 | import java.io.Reader;
8 | import java.nio.charset.Charset;
9 | import java.nio.charset.CharsetDecoder;
10 | import java.nio.charset.CodingErrorAction;
11 | import java.text.ParseException;
12 | import java.util.List;
13 | import java.util.Map;
14 | import java.util.Properties;
15 |
16 | import org.apache.lucene.analysis.TokenStream;
17 | import org.apache.lucene.analysis.Analyzer;
18 | import org.apache.lucene.analysis.core.LowerCaseFilter;
19 | import org.apache.lucene.analysis.core.WhitespaceTokenizer;
20 | import org.apache.lucene.analysis.synonym.SolrSynonymParser;
21 | import org.apache.lucene.analysis.synonym.SynonymFilter;
22 | import org.apache.lucene.analysis.synonym.SynonymMap;
23 | import org.apache.lucene.analysis.util.ResourceLoader;
24 | import org.apache.lucene.analysis.util.ResourceLoaderAware;
25 | import org.apache.lucene.analysis.util.TokenFilterFactory;
26 | import org.apache.lucene.util.Version;
27 |
28 | import com.mlcs.search.mlcsseg.common.ScheduledExecutor;
29 |
30 |
31 | public class DSynonymFilterFactory extends TokenFilterFactory implements
32 | ResourceLoaderAware {
33 |
 34 | public DSynonymFilterFactory(Map<String, String> args) throws IOException {
35 | super(args);
36 | expand = getBoolean(args, "expand", true);
37 | ignoreCase = getBoolean(args, "ignoreCase", false);
38 | conf = get(args, "conf"); //paths & lastupdate
39 | System.out.println(conf);
40 | }
41 |
 42 | private SynonymMap map; // the synonym map; hot-swapped by reassigning this reference
 43 | private boolean ignoreCase; // options
 44 | private boolean expand;
 45 | private ResourceLoader loader = null;
 46 |
 47 | private String conf; // properties file holding 'lastupdate' and the comma-separated dictionary paths in 'files'
48 | private long lastUpdateTime = -1;
49 |
50 | public void inform(ResourceLoader loader) throws IOException {
51 | System.out.println(":::::synonym::::::::::::::::::::::" + conf);
52 | this.loader = loader;
53 | this.update();
54 | if(conf != null && !conf.trim().isEmpty()){
55 | ScheduledExecutor.submit(new Runnable() {
56 |
57 | public void run() {
58 | update();
59 |
60 | }
61 | }, 1000 * 60);
62 | }
63 | }
64 |
65 | private SynonymMap loadSolrSynonyms(ResourceLoader loader, Properties p) throws IOException, ParseException {
66 | final Analyzer analyzer = new Analyzer() {
67 | @Override
68 | protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
69 | WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_46, reader);
70 | TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_46, tokenizer) : tokenizer;
71 | return new TokenStreamComponents(tokenizer, stream);
72 | }
73 | };
74 | String synonyms = p.getProperty("files");
75 |
76 | CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
77 | .onMalformedInput(CodingErrorAction.REPORT)
78 | .onUnmappableCharacter(CodingErrorAction.REPORT);
79 |
80 | SolrSynonymParser parser = new SolrSynonymParser(true, expand, analyzer);
81 | File synonymFile = new File(synonyms);
82 | if (loader != null){ //first call in constructor
83 | if (synonymFile.exists()) {
84 | decoder.reset();
85 |
86 | parser.parse(new InputStreamReader(loader.openResource(synonyms),
87 | decoder));
88 | } else {
 89 | List<String> files = splitFileNames(synonyms);
90 | for (String file : files) {
91 | decoder.reset();
92 | parser.parse(new InputStreamReader(loader.openResource(file),
93 | decoder));
94 | }
95 | }
96 | }
97 |
98 | return parser.build();
99 | }
100 |
101 | @Override
102 | public TokenStream create(TokenStream input) {
103 | return map.fst == null ? input : new SynonymFilter(input, map,ignoreCase);
104 | }
105 |
106 | public void update() {
107 |
108 | Properties p = canUpdate();
109 | if (p != null){
110 | try {
111 | System.out.println(" updating !");
112 | map = loadSolrSynonyms(loader, p); // builds a fresh map; assigning it swaps the synonyms in
113 | System.out.println(" finish~!");
114 | } catch (IOException e) {
115 | System.err.println(" IOException!!");
116 | e.printStackTrace();
117 | } catch (ParseException e) {
118 | System.err.println(" ParseException!!");
119 | e.printStackTrace();
120 | }
121 | }
122 | }
123 |
124 | private Properties canUpdate() {
125 |
126 | try{
127 | Properties p = new Properties();
128 | InputStream confStream = loader.openResource(conf);
129 | p.load(confStream);
130 | confStream.close();
131 | String lastupdate = p.getProperty("lastupdate", "0");
132 | Long t = new Long(lastupdate);
133 |
134 | if (t > this.lastUpdateTime){
135 | this.lastUpdateTime = t.longValue();
136 | String paths = p.getProperty("files");
137 | if (paths == null || paths.trim().isEmpty()) // the conf must list dictionary files
138 | return null;
139 | System.out.println("loading conf");
140 | return p;
141 | }else{
142 | this.lastUpdateTime = t.longValue();
143 | return null;
144 | }
145 | }catch(Exception e){
146 | System.err.println("synonym filter: failed to parse conf '" + conf + "': " + e.getMessage());
147 | return null;
148 | }
149 | }
150 |
151 | }
152 |
--------------------------------------------------------------------------------
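For reference, the dictionaries listed in the synonym conf's `files` are parsed by Lucene's SolrSynonymParser (used in loadSolrSynonyms above), i.e. the standard Solr synonyms text format; the entries below are only an illustration, not taken from this repo:

    # comma-separated terms form an equivalence group (expanded in both directions when expand=true)
    番茄, 西红柿
    # "=>" maps the left-hand terms onto the right-hand terms only
    usa, u.s.a => united states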
/mlcsseg-filter/src/test/java/org/mlcsseg/filter/AppTest.java:
--------------------------------------------------------------------------------
1 | package org.mlcsseg.filter;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 | * Unit test for simple App.
9 | */
10 | public class AppTest
11 | extends TestCase
12 | {
13 | /**
14 | * Create the test case
15 | *
16 | * @param testName name of the test case
17 | */
18 | public AppTest( String testName )
19 | {
20 | super( testName );
21 | }
22 |
23 | /**
24 | * @return the suite of tests being tested
25 | */
26 | public static Test suite()
27 | {
28 | return new TestSuite( AppTest.class );
29 | }
30 |
31 | /**
32 | * Rigourous Test :-)
33 | */
34 | public void testApp()
35 | {
36 | assertTrue( true );
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/mlcsseg-ik/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.mlcs.search
6 | mlcsseg
7 | 4.6.0-SNAPSHOT
8 |
9 | mlcsseg-ik
10 | mlcsseg-ik
11 | http://maven.apache.org
12 |
13 |
14 | UTF-8
15 |
16 |
17 |
18 |
19 | junit
20 | junit
21 | 3.8.1
22 | test
23 |
24 |
25 | com.mlcs.search
26 | mlcsseg-common
27 | 4.6.0-SNAPSHOT
28 |
29 |
30 |
31 |
32 |
33 | src/main/resources
34 |
35 | **/*.dic
36 | **/*.xml
37 |
38 |
39 |
40 |
41 |
--------------------------------------------------------------------------------
/mlcsseg-ik/src/main/java/org/wltea/analyzer/cfg/Configuration.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.cfg;
26 |
27 | import java.util.List;
28 |
29 | /**
30 | *
 31 |  * Configuration management interface
32 | *
33 | */
34 | public interface Configuration {
35 |
36 | /**
37 | * 返回useSmart标志位
38 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分
39 | * @return useSmart
40 | */
41 | public boolean useSmart();
42 |
43 | /**
44 | * 设置useSmart标志位
45 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分
46 | * @param useSmart
47 | */
48 | public void setUseSmart(boolean useSmart);
49 |
50 |
51 | /**
52 | * 获取主词典路径
53 | *
54 | * @return String 主词典路径
55 | */
56 | public String getMainDictionary();
57 |
58 | /**
59 | * 获取量词词典路径
60 | * @return String 量词词典路径
61 | */
62 | public String getQuantifierDicionary();
63 |
64 | /**
65 | * 获取扩展字典配置路径
66 | * @return List 相对类加载器的路径
67 | */
 68 | public List<String> getExtDictionarys();
69 |
70 |
71 | /**
72 | * 获取扩展停止词典配置路径
73 | * @return List 相对类加载器的路径
74 | */
 75 | public List<String> getExtStopWordDictionarys();
76 | }
77 |
--------------------------------------------------------------------------------
/mlcsseg-ik/src/main/java/org/wltea/analyzer/cfg/DefaultConfig.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | *
25 | */
26 | package org.wltea.analyzer.cfg;
27 |
28 | import java.io.IOException;
29 | import java.io.InputStream;
30 | import java.util.ArrayList;
31 | import java.util.InvalidPropertiesFormatException;
32 | import java.util.List;
33 | import java.util.Properties;
34 |
35 | /**
 36 |  * Default implementation of Configuration
37 | * 2012-5-8
38 | *
39 | */
40 | public class DefaultConfig implements Configuration{
41 |
42 | /*
43 | * 分词器默认字典路径
44 | */
45 | private static final String PATH_DIC_MAIN = "org/wltea/analyzer/dic/main2012.dic";
46 | private static final String PATH_DIC_QUANTIFIER = "org/wltea/analyzer/dic/quantifier.dic";
47 |
48 | /*
49 | * 分词器配置文件路径
50 | */
51 | private static final String FILE_NAME = "IKAnalyzer.cfg.xml";
52 | //配置属性——扩展字典
53 | private static final String EXT_DICT = "ext_dict";
54 | //配置属性——扩展停止词典
55 | private static final String EXT_STOP = "ext_stopwords";
56 |
57 | private Properties props;
58 | /*
59 | * 是否使用smart方式分词
60 | */
61 | private boolean useSmart;
62 |
63 | /**
64 | * 返回单例
65 | * @return Configuration单例
66 | */
67 | public static Configuration getInstance(){
68 | return new DefaultConfig();
69 | }
70 |
71 | /*
72 | * 初始化配置文件
73 | */
74 | private DefaultConfig(){
75 | props = new Properties();
76 |
77 | InputStream input = this.getClass().getClassLoader().getResourceAsStream(FILE_NAME);
78 | if(input != null){
79 | try {
80 | props.loadFromXML(input);
81 | } catch (InvalidPropertiesFormatException e) {
82 | e.printStackTrace();
83 | } catch (IOException e) {
84 | e.printStackTrace();
85 | }
86 | }
87 | }
88 |
89 |
90 | /**
91 | * 返回useSmart标志位
92 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分
93 | * @return useSmart
94 | */
95 | public boolean useSmart() {
96 | return useSmart;
97 | }
98 |
99 | /**
100 | * 设置useSmart标志位
101 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分
102 | * @param useSmart
103 | */
104 | public void setUseSmart(boolean useSmart) {
105 | this.useSmart = useSmart;
106 | }
107 |
108 | /**
109 | * 获取主词典路径
110 | *
111 | * @return String 主词典路径
112 | */
113 | public String getMainDictionary(){
114 | return PATH_DIC_MAIN;
115 | }
116 |
117 | /**
118 | * 获取量词词典路径
119 | * @return String 量词词典路径
120 | */
121 | public String getQuantifierDicionary(){
122 | return PATH_DIC_QUANTIFIER;
123 | }
124 |
125 | /**
126 | * 获取扩展字典配置路径
127 | * @return List 相对类加载器的路径
128 | */
129 | public List<String> getExtDictionarys(){
130 | List<String> extDictFiles = new ArrayList<String>(2);
131 | String extDictCfg = props.getProperty(EXT_DICT);
132 | if(extDictCfg != null){
133 | //使用;分割多个扩展字典配置
134 | String[] filePaths = extDictCfg.split(";");
135 | if(filePaths != null){
136 | for(String filePath : filePaths){
137 | if(filePath != null && !"".equals(filePath.trim())){
138 | extDictFiles.add(filePath.trim());
139 | }
140 | }
141 | }
142 | }
143 | return extDictFiles;
144 | }
145 |
146 |
147 | /**
148 | * 获取扩展停止词典配置路径
149 | * @return List 相对类加载器的路径
150 | */
151 | public List<String> getExtStopWordDictionarys(){
152 | List<String> extStopWordDictFiles = new ArrayList<String>(2);
153 | String extStopWordDictCfg = props.getProperty(EXT_STOP);
154 | if(extStopWordDictCfg != null){
155 | //使用;分割多个扩展字典配置
156 | String[] filePaths = extStopWordDictCfg.split(";");
157 | if(filePaths != null){
158 | for(String filePath : filePaths){
159 | if(filePath != null && !"".equals(filePath.trim())){
160 | extStopWordDictFiles.add(filePath.trim());
161 | }
162 | }
163 | }
164 | }
165 | return extStopWordDictFiles;
166 | }
167 | }
168 |
--------------------------------------------------------------------------------
/mlcsseg-ik/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | import java.io.IOException;
28 | import java.io.Reader;
29 | import java.util.HashMap;
30 | import java.util.HashSet;
31 | import java.util.LinkedList;
32 | import java.util.Map;
33 | import java.util.Set;
34 |
35 | import org.wltea.analyzer.cfg.Configuration;
36 | /**
37 | *
 38 |  * Tokenizer context state
39 | *
40 | */
41 | class AnalyzeContext {
42 |
43 | //默认缓冲区大小
44 | private static final int BUFF_SIZE = 4096;
45 | //缓冲区耗尽的临界值
46 | private static final int BUFF_EXHAUST_CRITICAL = 100;
47 |
48 |
49 | //字符窜读取缓冲
50 | private char[] segmentBuff;
51 | //字符类型数组
52 | private int[] charTypes;
53 |
54 |
55 | //记录Reader内已分析的字串总长度
56 | //在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移
57 | private int buffOffset;
58 | //当前缓冲区位置指针
59 | private int cursor;
60 | //最近一次读入的,可处理的字串长度
61 | private int available;
62 |
63 |
64 | //子分词器锁
65 | //该集合非空,说明有子分词器在占用segmentBuff
 66 | private Set<String> buffLocker;
67 |
68 | //原始分词结果集合,未经歧义处理
69 | private QuickSortSet orgLexemes;
70 | //LexemePath位置索引表
 71 | private Map<Integer, LexemePath> pathMap;
72 | //最终分词结果集
 73 | private LinkedList<Lexeme> results;
74 |
75 | //分词器配置项
76 | private Configuration cfg;
77 |
78 | public AnalyzeContext(Configuration cfg){
79 | this.cfg = cfg;
80 | this.segmentBuff = new char[BUFF_SIZE];
81 | this.charTypes = new int[BUFF_SIZE];
 82 | this.buffLocker = new HashSet<String>();
 83 | this.orgLexemes = new QuickSortSet();
 84 | this.pathMap = new HashMap<Integer, LexemePath>();
 85 | this.results = new LinkedList<Lexeme>();
86 | }
87 |
88 | int getCursor(){
89 | return this.cursor;
90 | }
91 | //
92 | // void setCursor(int cursor){
93 | // this.cursor = cursor;
94 | // }
95 |
96 | char[] getSegmentBuff(){
97 | return this.segmentBuff;
98 | }
99 |
100 | char getCurrentChar(){
101 | return this.segmentBuff[this.cursor];
102 | }
103 |
104 | int getCurrentCharType(){
105 | return this.charTypes[this.cursor];
106 | }
107 |
108 | int getBufferOffset(){
109 | return this.buffOffset;
110 | }
111 |
112 | /**
113 | * 根据context的上下文情况,填充segmentBuff
114 | * @param reader
115 | * @return 返回待分析的(有效的)字串长度
116 | * @throws IOException
117 | */
118 | int fillBuffer(Reader reader) throws IOException{
119 | int readCount = 0;
120 | if(this.buffOffset == 0){
121 | //首次读取reader
122 | readCount = reader.read(segmentBuff);
123 | }else{
124 | int offset = this.available - this.cursor;
125 | if(offset > 0){
126 | //最近一次读取的>最近一次处理的,将未处理的字串拷贝到segmentBuff头部
127 | System.arraycopy(this.segmentBuff , this.cursor , this.segmentBuff , 0 , offset);
128 | readCount = offset;
129 | }
130 | //继续读取reader ,以onceReadIn - onceAnalyzed为起始位置,继续填充segmentBuff剩余的部分
131 | readCount += reader.read(this.segmentBuff , offset , BUFF_SIZE - offset);
132 | }
133 | //记录最后一次从Reader中读入的可用字符长度
134 | this.available = readCount;
135 | //重置当前指针
136 | this.cursor = 0;
137 | return readCount;
138 | }
139 |
140 | /**
141 | * 初始化buff指针,处理第一个字符
142 | */
143 | void initCursor(){
144 | this.cursor = 0;
145 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
146 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
147 | }
148 |
149 | /**
150 | * 指针+1
151 | * 成功返回 true; 指针已经到了buff尾部,不能前进,返回false
152 | * 并处理当前字符
153 | */
154 | boolean moveCursor(){
155 | if(this.cursor < this.available - 1){
156 | this.cursor++;
157 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
158 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
159 | return true;
160 | }else{
161 | return false;
162 | }
163 | }
164 |
165 | /**
166 | * 设置当前segmentBuff为锁定状态
167 | * 加入占用segmentBuff的子分词器名称,表示占用segmentBuff
168 | * @param segmenterName
169 | */
170 | void lockBuffer(String segmenterName){
171 | this.buffLocker.add(segmenterName);
172 | }
173 |
174 | /**
175 | * 移除指定的子分词器名,释放对segmentBuff的占用
176 | * @param segmenterName
177 | */
178 | void unlockBuffer(String segmenterName){
179 | this.buffLocker.remove(segmenterName);
180 | }
181 |
182 | /**
183 | * 只要buffLocker中存在segmenterName
184 | * 则buffer被锁定
185 | * @return boolean 缓冲去是否被锁定
186 | */
187 | boolean isBufferLocked(){
188 | return this.buffLocker.size() > 0;
189 | }
190 |
191 | /**
192 | * 判断当前segmentBuff是否已经用完
193 | * 当前执针cursor移至segmentBuff末端this.available - 1
194 | * @return
195 | */
196 | boolean isBufferConsumed(){
197 | return this.cursor == this.available - 1;
198 | }
199 |
200 | /**
201 | * 判断segmentBuff是否需要读取新数据
202 | *
203 | * 满足一下条件时,
204 | * 1.available == BUFF_SIZE 表示buffer满载
205 | * 2.buffIndex < available - 1 && buffIndex > available - BUFF_EXHAUST_CRITICAL表示当前指针处于临界区内
206 | * 3.!context.isBufferLocked()表示没有segmenter在占用buffer
207 | * 要中断当前循环(buffer要进行移位,并再读取数据的操作)
208 | * @return
209 | */
210 | boolean needRefillBuffer(){
211 | return this.available == BUFF_SIZE
212 | && this.cursor < this.available - 1
213 | && this.cursor > this.available - BUFF_EXHAUST_CRITICAL
214 | && !this.isBufferLocked();
215 | }
216 |
217 | /**
218 | * 累计当前的segmentBuff相对于reader起始位置的位移
219 | */
220 | void markBufferOffset(){
221 | this.buffOffset += this.cursor;
222 | }
223 |
224 | /**
225 | * 向分词结果集添加词元
226 | * @param lexeme
227 | */
228 | void addLexeme(Lexeme lexeme){
229 | this.orgLexemes.addLexeme(lexeme);
230 | }
231 |
232 | /**
233 | * 添加分词结果路径
234 | * 路径起始位置 ---> 路径 映射表
235 | * @param path
236 | */
237 | void addLexemePath(LexemePath path){
238 | if(path != null){
239 | this.pathMap.put(path.getPathBegin(), path);
240 | }
241 | }
242 |
243 |
244 | /**
245 | * 返回原始分词结果
246 | * @return
247 | */
248 | QuickSortSet getOrgLexemes(){
249 | return this.orgLexemes;
250 | }
251 |
252 | /**
253 | * 推送分词结果到结果集合
254 | * 1.从buff头部遍历到this.cursor已处理位置
255 | * 2.将map中存在的分词结果推入results
256 | * 3.将map中不存在的CJDK字符以单字方式推入results
257 | */
258 | void outputToResult(){
259 | int index = 0;
260 | for( ; index <= this.cursor ;){
261 | //跳过非CJK字符
262 | if(CharacterUtil.CHAR_USELESS == this.charTypes[index]){
263 | index++;
264 | continue;
265 | }
266 | //从pathMap找出对应index位置的LexemePath
267 | LexemePath path = this.pathMap.get(index);
268 | if(path != null){
269 | //输出LexemePath中的lexeme到results集合
270 | Lexeme l = path.pollFirst();
271 | while(l != null){
272 | this.results.add(l);
273 | //将index移至lexeme后
274 | index = l.getBegin() + l.getLength();
275 | l = path.pollFirst();
276 | if(l != null){
277 | //输出path内部,词元间遗漏的单字
278 | for(;index < l.getBegin();index++){
279 | this.outputSingleCJK(index);
280 | }
281 | }
282 | }
283 | }else{//pathMap中找不到index对应的LexemePath
284 | //单字输出
285 | this.outputSingleCJK(index);
286 | index++;
287 | }
288 | }
289 | //清空当前的Map
290 | this.pathMap.clear();
291 | }
292 |
293 | /**
294 | * 对CJK字符进行单字输出
295 | * @param index
296 | */
297 | private void outputSingleCJK(int index){
298 | if(CharacterUtil.CHAR_CHINESE == this.charTypes[index]){
299 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_CNCHAR);
300 | this.results.add(singleCharLexeme);
301 | }else if(CharacterUtil.CHAR_OTHER_CJK == this.charTypes[index]){
302 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_OTHER_CJK);
303 | this.results.add(singleCharLexeme);
304 | }
305 | }
306 |
307 | /**
308 | * 返回lexeme
309 | *
310 | * 同时处理合并
311 | * @return
312 | */
313 | Lexeme getNextLexeme(){
314 | //从结果集取出,并移除第一个Lexme
315 | Lexeme result = this.results.pollFirst();
316 | /*while(result != null){
317 | //数量词合并
318 | this.compound(result);
319 | if(Dictionary.getSingleton().isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){
320 | //是停止词继续取列表的下一个
321 | result = this.results.pollFirst();
322 | }else{
323 | //不是停止词, 生成lexeme的词元文本,输出
324 | result.setLexemeText(String.valueOf(segmentBuff , result.getBegin() , result.getLength()));
325 | break;
326 | }
327 | }*/
328 | if(result != null)
329 | {
330 | this.compound(result);
331 | result.setLexemeText(String.valueOf(segmentBuff , result.getBegin() , result.getLength()));
332 | }
333 | return result;
334 | }
335 |
336 | /**
337 | * 重置分词上下文状态
338 | */
339 | void reset(){
340 | this.buffLocker.clear();
341 | this.orgLexemes = new QuickSortSet();
342 | this.available =0;
343 | this.buffOffset = 0;
344 | this.charTypes = new int[BUFF_SIZE];
345 | this.cursor = 0;
346 | this.results.clear();
347 | this.segmentBuff = new char[BUFF_SIZE];
348 | this.pathMap.clear();
349 | }
350 |
351 | /**
352 | * 组合词元
353 | */
354 | private void compound(Lexeme result){
355 | if(!this.cfg.useSmart()){
356 | return ;
357 | }
358 | //数量词合并处理
359 | if(!this.results.isEmpty()){
360 |
361 | if(Lexeme.TYPE_ARABIC == result.getLexemeType()){
362 | Lexeme nextLexeme = this.results.peekFirst();
363 | boolean appendOk = false;
364 | if(Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()){
365 | //合并英文数词+中文数词
366 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
367 | }else if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){
368 | //合并英文数词+中文量词
369 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
370 | }
371 | if(appendOk){
372 | //弹出
373 | this.results.pollFirst();
374 | }
375 | }
376 |
377 | //可能存在第二轮合并
378 | if(Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()){
379 | Lexeme nextLexeme = this.results.peekFirst();
380 | boolean appendOk = false;
381 | if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){
382 | //合并中文数词+中文量词
383 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
384 | }
385 | if(appendOk){
386 | //弹出
387 | this.results.pollFirst();
388 | }
389 | }
390 |
391 | }
392 | }
393 |
394 | }
395 |
--------------------------------------------------------------------------------
/mlcsseg-ik/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java:
--------------------------------------------------------------------------------
1 |
2 | /**
3 | * IK 中文分词 版本 5.0
4 | * IK Analyzer release 5.0
5 | *
6 | * Licensed to the Apache Software Foundation (ASF) under one or more
7 | * contributor license agreements. See the NOTICE file distributed with
8 | * this work for additional information regarding copyright ownership.
9 | * The ASF licenses this file to You under the Apache License, Version 2.0
10 | * (the "License"); you may not use this file except in compliance with
11 | * the License. You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | *
21 | * 源代码由林良益(linliangyi2005@gmail.com)提供
22 | * 版权声明 2012,乌龙茶工作室
23 | * provided by Linliangyi and copyright 2012 by Oolong studio
24 | *
25 | */
26 | package org.wltea.analyzer.core;
27 |
28 | import java.util.LinkedList;
29 | import java.util.List;
30 |
31 | import org.wltea.analyzer.dic.Dictionary;
32 | import org.wltea.analyzer.dic.Hit;
33 |
34 |
35 | /**
 36 |  * CJK (Chinese and Japanese/Korean) sub-segmenter
37 | */
38 | class CJKSegmenter implements ISegmenter {
39 |
40 | //子分词器标签
41 | static final String SEGMENTER_NAME = "CJK_SEGMENTER";
42 | //待处理的分词hit队列
 43 | private List<Hit> tmpHits;
44 |
45 |
46 | CJKSegmenter(){
 47 | this.tmpHits = new LinkedList<Hit>();
48 | }
49 |
50 | /* (non-Javadoc)
51 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
52 | */
53 | public void analyze(AnalyzeContext context) {
54 | if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){
55 |
56 | //优先处理tmpHits中的hit
57 | if(!this.tmpHits.isEmpty()){
58 | //处理词段队列
59 | Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
60 | for(Hit hit : tmpArray){
61 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
62 | if(hit.isMatch()){
63 | //输出当前的词
64 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
65 | context.addLexeme(newLexeme);
66 |
67 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除
68 | this.tmpHits.remove(hit);
69 | }
70 |
71 | }else if(hit.isUnmatch()){
72 | //hit不是词,移除
73 | this.tmpHits.remove(hit);
74 | }
75 | }
76 | }
77 |
78 | //*********************************
79 | //再对当前指针位置的字符进行单字匹配
80 | Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
81 | if(singleCharHit.isMatch()){//首字成词
82 | //输出当前的词
83 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
84 | context.addLexeme(newLexeme);
85 |
86 | //同时也是词前缀
87 | if(singleCharHit.isPrefix()){
88 | //前缀匹配则放入hit列表
89 | this.tmpHits.add(singleCharHit);
90 | }
91 | }else if(singleCharHit.isPrefix()){//首字为词前缀
92 | //前缀匹配则放入hit列表
93 | this.tmpHits.add(singleCharHit);
94 | }
95 |
96 |
97 | }else{
98 | //遇到CHAR_USELESS字符
99 | //清空队列
100 | this.tmpHits.clear();
101 | }
102 |
103 | //判断缓冲区是否已经读完
104 | if(context.isBufferConsumed()){
105 | //清空队列
106 | this.tmpHits.clear();
107 | }
108 |
109 | //判断是否锁定缓冲区
110 | if(this.tmpHits.size() == 0){
111 | context.unlockBuffer(SEGMENTER_NAME);
112 |
113 | }else{
114 | context.lockBuffer(SEGMENTER_NAME);
115 | }
116 | }
117 |
118 | /* (non-Javadoc)
119 | * @see org.wltea.analyzer.core.ISegmenter#reset()
120 | */
121 | public void reset() {
122 | //清空队列
123 | this.tmpHits.clear();
124 | }
125 |
126 | }
127 |
--------------------------------------------------------------------------------
/mlcsseg-ik/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | import java.util.HashSet;
28 | import java.util.LinkedList;
29 | import java.util.List;
30 | import java.util.Set;
31 |
32 | import org.wltea.analyzer.dic.Dictionary;
33 | import org.wltea.analyzer.dic.Hit;
34 |
35 | /**
36 | *
 37 |  * Chinese numeral and quantifier sub-segmenter
38 | */
39 | class CN_QuantifierSegmenter implements ISegmenter{
40 |
41 | //子分词器标签
42 | static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
43 |
44 | //中文数词
45 | private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";//Cnum
 46 | private static Set<Character> ChnNumberChars = new HashSet<Character>();
47 | static{
48 | char[] ca = Chn_Num.toCharArray();
49 | for(char nChar : ca){
50 | ChnNumberChars.add(nChar);
51 | }
52 | }
53 |
54 | /*
55 | * 词元的开始位置,
56 | * 同时作为子分词器状态标识
57 | * 当start > -1 时,标识当前的分词器正在处理字符
58 | */
59 | private int nStart;
60 | /*
61 | * 记录词元结束位置
62 | * end记录的是在词元中最后一个出现的合理的数词结束
63 | */
64 | private int nEnd;
65 |
66 | //待处理的量词hit队列
 67 | private List<Hit> countHits;
68 |
69 |
70 | CN_QuantifierSegmenter(){
71 | nStart = -1;
72 | nEnd = -1;
 73 | this.countHits = new LinkedList<Hit>();
74 | }
75 |
76 | /**
77 | * 分词
78 | */
79 | public void analyze(AnalyzeContext context) {
80 | //处理中文数词
81 | this.processCNumber(context);
82 | //处理中文量词
83 | this.processCount(context);
84 |
85 | //判断是否锁定缓冲区
86 | if(this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()){
87 | //对缓冲区解锁
88 | context.unlockBuffer(SEGMENTER_NAME);
89 | }else{
90 | context.lockBuffer(SEGMENTER_NAME);
91 | }
92 | }
93 |
94 |
95 | /**
96 | * 重置子分词器状态
97 | */
98 | public void reset() {
99 | nStart = -1;
100 | nEnd = -1;
101 | countHits.clear();
102 | }
103 |
104 | /**
105 | * 处理数词
106 | */
107 | private void processCNumber(AnalyzeContext context){
108 | if(nStart == -1 && nEnd == -1){//初始状态
109 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
110 | && ChnNumberChars.contains(context.getCurrentChar())){
111 | //记录数词的起始、结束位置
112 | nStart = context.getCursor();
113 | nEnd = context.getCursor();
114 | }
115 | }else{//正在处理状态
116 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
117 | && ChnNumberChars.contains(context.getCurrentChar())){
118 | //记录数词的结束位置
119 | nEnd = context.getCursor();
120 | }else{
121 | //输出数词
122 | this.outputNumLexeme(context);
123 | //重置头尾指针
124 | nStart = -1;
125 | nEnd = -1;
126 | }
127 | }
128 |
129 | //缓冲区已经用完,还有尚未输出的数词
130 | if(context.isBufferConsumed()){
131 | if(nStart != -1 && nEnd != -1){
132 | //输出数词
133 | outputNumLexeme(context);
134 | //重置头尾指针
135 | nStart = -1;
136 | nEnd = -1;
137 | }
138 | }
139 | }
140 |
141 | /**
142 | * 处理中文量词
143 | * @param context
144 | */
145 | private void processCount(AnalyzeContext context){
146 | // 判断是否需要启动量词扫描
147 | if(!this.needCountScan(context)){
148 | return;
149 | }
150 |
151 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()){
152 |
153 | //优先处理countHits中的hit
154 | if(!this.countHits.isEmpty()){
155 | //处理词段队列
156 | Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]);
157 | for(Hit hit : tmpArray){
158 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
159 | if(hit.isMatch()){
160 | //输出当前的词
161 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
162 | context.addLexeme(newLexeme);
163 |
164 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除
165 | this.countHits.remove(hit);
166 | }
167 |
168 | }else if(hit.isUnmatch()){
169 | //hit不是词,移除
170 | this.countHits.remove(hit);
171 | }
172 | }
173 | }
174 |
175 | //*********************************
176 | //对当前指针位置的字符进行单字匹配
177 | Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
178 | if(singleCharHit.isMatch()){//首字成量词词
179 | //输出当前的词
180 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
181 | context.addLexeme(newLexeme);
182 |
183 | //同时也是词前缀
184 | if(singleCharHit.isPrefix()){
185 | //前缀匹配则放入hit列表
186 | this.countHits.add(singleCharHit);
187 | }
188 | }else if(singleCharHit.isPrefix()){//首字为量词前缀
189 | //前缀匹配则放入hit列表
190 | this.countHits.add(singleCharHit);
191 | }
192 |
193 |
194 | }else{
195 | //输入的不是中文字符
196 | //清空未成形的量词
197 | this.countHits.clear();
198 | }
199 |
200 | //缓冲区数据已经读完,还有尚未输出的量词
201 | if(context.isBufferConsumed()){
202 | //清空未成形的量词
203 | this.countHits.clear();
204 | }
205 | }
206 |
207 | /**
208 | * 判断是否需要扫描量词
209 | * @return
210 | */
211 | private boolean needCountScan(AnalyzeContext context){
212 | if((nStart != -1 && nEnd != -1 ) || !countHits.isEmpty()){
213 | //正在处理中文数词,或者正在处理量词
214 | return true;
215 | }else{
216 | //找到一个相邻的数词
217 | if(!context.getOrgLexemes().isEmpty()){
218 | Lexeme l = context.getOrgLexemes().peekLast();
219 | if(Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()){
220 | if(l.getBegin() + l.getLength() == context.getCursor()){
221 | return true;
222 | }
223 | }
224 | }
225 | }
226 | return false;
227 | }
228 |
229 | /**
230 | * 添加数词词元到结果集
231 | * @param context
232 | */
233 | private void outputNumLexeme(AnalyzeContext context){
234 | if(nStart > -1 && nEnd > -1){
235 | //输出数词
236 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , nStart , nEnd - nStart + 1 , Lexeme.TYPE_CNUM);
237 | context.addLexeme(newLexeme);
238 |
239 | }
240 | }
241 |
242 | }
243 |
--------------------------------------------------------------------------------
/mlcsseg-ik/src/main/java/org/wltea/analyzer/core/CharacterUtil.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | * 字符集识别工具类
25 | */
26 | package org.wltea.analyzer.core;
27 |
28 | /**
29 | *
30 | * 字符集识别工具类
31 | */
32 | class CharacterUtil {
33 |
34 | public static final int CHAR_USELESS = 0;
35 |
36 | public static final int CHAR_ARABIC = 0X00000001;
37 |
38 | public static final int CHAR_ENGLISH = 0X00000002;
39 |
40 | public static final int CHAR_CHINESE = 0X00000004;
41 |
42 | public static final int CHAR_OTHER_CJK = 0X00000008;
43 |
44 |
45 | /**
46 | * 识别字符类型
47 | * @param input
48 | * @return int CharacterUtil定义的字符类型常量
49 | */
50 | static int identifyCharType(char input){
51 | if(input >= '0' && input <= '9'){
52 | return CHAR_ARABIC;
53 |
54 | }else if((input >= 'a' && input <= 'z')
55 | || (input >= 'A' && input <= 'Z')){
56 | return CHAR_ENGLISH;
57 |
58 | }else {
59 | Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
60 |
61 | if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
62 | || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
63 | || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){
64 | //目前已知的中文字符UTF-8集合
65 | return CHAR_CHINESE;
66 |
67 | }else if(ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //全角数字字符和日韩字符
68 | //韩文字符集
69 | || ub == Character.UnicodeBlock.HANGUL_SYLLABLES
70 | || ub == Character.UnicodeBlock.HANGUL_JAMO
71 | || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
72 | //日文字符集
73 | || ub == Character.UnicodeBlock.HIRAGANA //平假名
74 | || ub == Character.UnicodeBlock.KATAKANA //片假名
75 | || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS){
76 | return CHAR_OTHER_CJK;
77 |
78 | }
79 | }
80 | //其他的不做处理的字符
81 | return CHAR_USELESS;
82 | }
83 |
84 | /**
85 | * 进行字符规格化(全角转半角,大写转小写处理)
86 | * @param input
87 | * @return char
88 | */
89 | static char regularize(char input){
90 | if (input == 12288) {
91 | input = (char) 32;
92 |
93 | }else if (input > 65280 && input < 65375) {
94 | input = (char) (input - 65248);
95 |
96 | }else if (input >= 'A' && input <= 'Z') {
97 | input += 32;
98 | }
99 |
100 | return input;
101 | }
102 | }
103 |
--------------------------------------------------------------------------------
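A minimal sketch of what the two helpers above do. `identifyCharType` and `regularize` are package-private, so this hypothetical `CharacterUtilDemo` class only compiles inside `org.wltea.analyzer.core`; the sample characters are illustrative.

```java
package org.wltea.analyzer.core;

public class CharacterUtilDemo {
    public static void main(String[] args) {
        // Full-width 'A' (U+FF21) is mapped to half-width 'A'; the ideographic
        // space (U+3000) is mapped to an ordinary space (code point 32).
        System.out.println(CharacterUtil.regularize('\uFF21'));        // A
        System.out.println((int) CharacterUtil.regularize('\u3000'));  // 32
        // ASCII upper case is lower-cased.
        System.out.println(CharacterUtil.regularize('Q'));             // q

        // Character classification used by the sub-segmenters.
        System.out.println(CharacterUtil.identifyCharType('中') == CharacterUtil.CHAR_CHINESE); // true
        System.out.println(CharacterUtil.identifyCharType('7') == CharacterUtil.CHAR_ARABIC);   // true
        System.out.println(CharacterUtil.identifyCharType('#') == CharacterUtil.CHAR_USELESS);  // true
    }
}
```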
/mlcsseg-ik/src/main/java/org/wltea/analyzer/core/IKArbitrator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | import java.util.Stack;
28 | import java.util.TreeSet;
29 |
30 | /**
31 | * IK分词歧义裁决器
32 | */
33 | class IKArbitrator {
34 |
35 | IKArbitrator(){
36 |
37 | }
38 |
39 | /**
40 | * 分词歧义处理
41 | * @param orgLexemes
42 | * @param useSmart
43 | */
44 | void process(AnalyzeContext context , boolean useSmart){
45 | QuickSortSet orgLexemes = context.getOrgLexemes();
46 | Lexeme orgLexeme = orgLexemes.pollFirst();
47 |
48 | LexemePath crossPath = new LexemePath();
49 | while(orgLexeme != null){
50 | if(!crossPath.addCrossLexeme(orgLexeme)){
51 | //找到与crossPath不相交的下一个crossPath
52 | if(crossPath.size() == 1 || !useSmart){
53 | //crossPath没有歧义 或者 不做歧义处理
54 | //直接输出当前crossPath
55 | context.addLexemePath(crossPath);
56 | }else{
57 | //对当前的crossPath进行歧义处理
58 | QuickSortSet.Cell headCell = crossPath.getHead();
59 | LexemePath judgeResult = this.judge(context,headCell, crossPath.getPathLength());
60 | //输出歧义处理结果judgeResult
61 | context.addLexemePath(judgeResult);
62 | }
63 |
64 | //把orgLexeme加入新的crossPath中
65 | crossPath = new LexemePath(); //再次new了对象
66 | crossPath.addCrossLexeme(orgLexeme);
67 | }
68 | orgLexeme = orgLexemes.pollFirst();
69 | }
70 |
71 |
72 | //处理最后的path
73 | if(crossPath.size() <= 1 || !useSmart){ //输入流单字情况,"额"
74 | //crossPath没有歧义 或者 不做歧义处理
75 | //直接输出当前crossPath
76 | context.addLexemePath(crossPath);
77 | }else{
78 | //对当前的crossPath进行歧义处理
79 | QuickSortSet.Cell headCell = crossPath.getHead();
80 | LexemePath judgeResult = this.judge(context,headCell, crossPath.getPathLength());
81 | //输出歧义处理结果judgeResult
82 | context.addLexemePath(judgeResult);
83 | }
84 | }
85 |
86 | /**
87 | * 歧义识别
88 | * @param lexemeCell 歧义路径链表头
89 | * @param fullTextLength 歧义路径文本长度
90 | * @param option 候选结果路径
91 | * @return
92 | */
93 | @SuppressWarnings("unused")
94 | private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){
95 | //候选路径集合
96 | TreeSet<LexemePath> pathOptions = new TreeSet<LexemePath>();
97 | //候选结果路径
98 | LexemePath option = new LexemePath();
99 |
100 | //对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈
101 | Stack<QuickSortSet.Cell> lexemeStack = this.forwardPath(lexemeCell , option);
102 |
103 | //当前词元链并非最理想的,加入候选路径集合
104 | pathOptions.add(option.copy());
105 |
106 | //存在歧义词,处理
107 | QuickSortSet.Cell c = null;
108 | while(!lexemeStack.isEmpty()){
109 | c = lexemeStack.pop();
110 | //回滚词元链
111 | this.backPath(c.getLexeme() , option);
112 | //从歧义词位置开始,递归,生成可选方案
113 | this.forwardPath(c , option);
114 | pathOptions.add(option.copy());
115 | }
116 |
117 | //返回集合中的最优方案
118 | //return pathOptions.first();
119 | /*Iterator it=pathOptions.iterator();
120 |
121 | while(it.hasNext())
122 | {
123 | System.out.println(it.next().toString());
124 | }*/
125 |
126 | return pathOptions.last();
127 |
128 | }
129 |
130 | private LexemePath judge(AnalyzeContext context,QuickSortSet.Cell lexemeCell , int fullTextLength){
131 | //候选路径集合
132 | TreeSet<LexemePath> pathOptions = new TreeSet<LexemePath>();
133 |
134 | //候选结果路径
135 |
136 | LexemePath option = new LexemePath(context.getSegmentBuff(),lexemeCell.getLexeme().getBegin(),fullTextLength);
137 |
138 | //对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈
139 | Stack<QuickSortSet.Cell> lexemeStack = this.forwardPath(lexemeCell , option);
140 |
141 | //当前词元链并非最理想的,加入候选路径集合
142 | pathOptions.add(option.copy()); //自定义拷贝函数
143 |
144 | //存在歧义词,处理
145 | QuickSortSet.Cell c = null;
146 | while(!lexemeStack.isEmpty()){
147 | c = lexemeStack.pop();
148 | //回滚词元链
149 | this.backPath(c.getLexeme() , option);
150 | //从歧义词位置开始,递归,生成可选方案
151 | this.forwardPath(c , option);
152 | pathOptions.add(option.copy());
153 | }
154 |
155 | //路径添加完毕,比较接口没写好,导致返回0的被去重
156 | /*
157 | * ①、进行比较?
158 | * ②、单字比较?
159 | * ③、查找字典?
160 | * */
161 |
162 | return pathOptions.last();
163 |
164 | }
165 |
166 | /**
167 | * 向前遍历,添加词元,构造一个无歧义词元组合
168 | * @param LexemePath path
169 | * @return
170 | */
171 | private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){
172 | //发生冲突的Lexeme栈
173 | Stack<QuickSortSet.Cell> conflictStack = new Stack<QuickSortSet.Cell>();
174 | QuickSortSet.Cell c = lexemeCell;
175 | //迭代遍历Lexeme链表
176 | while(c != null && c.getLexeme() != null){
177 | if(!option.addNotCrossLexeme(c.getLexeme())){
178 | //词元交叉,添加失败则加入lexemeStack栈
179 | conflictStack.push(c);
180 | }
181 | c = c.getNext();
182 | }
183 | return conflictStack;
184 | }
185 |
186 | /**
187 | * 回滚词元链,直到它能够接受指定的词元
188 | * @param lexeme
189 | * @param l
190 | */
191 | private void backPath(Lexeme l , LexemePath option){
192 | while(option.checkCross(l)){
193 | option.removeTail();
194 | }
195 |
196 | }
197 |
198 | }
199 |
--------------------------------------------------------------------------------
/mlcsseg-ik/src/main/java/org/wltea/analyzer/core/IKSegmenter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | */
24 | package org.wltea.analyzer.core;
25 |
26 | import java.io.IOException;
27 | import java.io.Reader;
28 | import java.util.ArrayList;
29 | import java.util.List;
30 |
31 | import org.wltea.analyzer.cfg.Configuration;
32 | import org.wltea.analyzer.cfg.DefaultConfig;
33 | import org.wltea.analyzer.dic.Dictionary;
34 |
35 | /**
36 | * IK分词器主类
37 | *
38 | */
39 | public final class IKSegmenter {
40 |
41 | //字符窜reader
42 | private Reader input;
43 | //分词器配置项
44 | private Configuration cfg;
45 | //分词器上下文
46 | private AnalyzeContext context;
47 | //分词处理器列表
48 | private List<ISegmenter> segmenters;
49 | //分词歧义裁决器
50 | private IKArbitrator arbitrator;
51 |
52 |
53 | /**
54 | * IK分词器构造函数
55 | * @param input
56 | * @param useSmart 为true,使用智能分词策略
57 | *
58 | * 非智能分词:细粒度输出所有可能的切分结果
59 | * 智能分词: 合并数词和量词,对分词结果进行歧义判断
60 | */
61 |
62 | public IKSegmenter(Reader input , boolean useSmart){
63 | this.input = input;
64 | this.cfg = DefaultConfig.getInstance();
65 | this.cfg.setUseSmart(useSmart);
66 | this.init();
67 | }
68 |
69 | /**
70 | * IK分词器构造函数
71 | * @param input
72 | * @param cfg 使用自定义的Configuration构造分词器
73 | *
74 | */
75 | public IKSegmenter(Reader input , Configuration cfg){
76 | this.input = input;
77 | this.cfg = cfg;
78 | this.init();
79 | }
80 |
81 | /**
82 | * 初始化
83 | */
84 | private void init(){
85 | //初始化词典单例
86 | Dictionary.initial(this.cfg);
87 | //初始化分词上下文
88 | this.context = new AnalyzeContext(this.cfg);
89 | //加载子分词器
90 | this.segmenters = this.loadSegmenters();
91 | //加载歧义裁决器
92 | this.arbitrator = new IKArbitrator();
93 | }
94 |
95 |
96 | /**
97 | * 初始化词典,加载子分词器实现
98 | * @return List<ISegmenter>
99 | */
100 | private List<ISegmenter> loadSegmenters(){
101 | List<ISegmenter> segmenters = new ArrayList<ISegmenter>(4);
102 | //处理字母的子分词器
103 | segmenters.add(new LetterSegmenter());
104 | //处理中文数量词的子分词器
105 | segmenters.add(new CN_QuantifierSegmenter());
106 | //处理中文词的子分词器
107 | segmenters.add(new CJKSegmenter());
108 | return segmenters;
109 | }
110 |
111 | /**
112 | * 分词,获取下一个词元
113 | * @return Lexeme 词元对象
114 | * @throws IOException
115 | */
116 | public synchronized Lexeme next()throws IOException{
117 | Lexeme l = null;
118 | while((l = context.getNextLexeme()) == null ){
119 | /*
120 | * 从reader中读取数据,填充buffer
121 | * 如果reader是分次读入buffer的,那么buffer要 进行移位处理
122 | * 移位处理上次读入的但未处理的数据
123 | */
124 | int available = context.fillBuffer(this.input);
125 | if(available <= 0){
126 | //reader已经读完
127 | context.reset();
128 | return null;
129 |
130 | }else{
131 | //初始化指针
132 | context.initCursor();
133 | do{
134 | //遍历子分词器
135 | for(ISegmenter segmenter : segmenters){
136 | segmenter.analyze(context);
137 | }
138 | //字符缓冲区接近读完,需要读入新的字符
139 | if(context.needRefillBuffer()){
140 | break;
141 | }
142 | //向前移动指针
143 | }while(context.moveCursor());
144 | //重置子分词器,为下轮循环进行初始化
145 | for(ISegmenter segmenter : segmenters){
146 | segmenter.reset();
147 | }
148 | }
149 | //对分词进行歧义处理
150 | this.arbitrator.process(context, this.cfg.useSmart());
151 | //将分词结果输出到结果集,并处理未切分的单个CJK字符
152 | context.outputToResult();
153 | //记录本次分词的缓冲区位移
154 | context.markBufferOffset();
155 | }
156 | return l;
157 | }
158 |
159 | /**
160 | * 重置分词器到初始状态
161 | * @param input
162 | */
163 | public synchronized void reset(Reader input) {
164 | this.input = input;
165 | context.reset();
166 | for(ISegmenter segmenter : segmenters){
167 | segmenter.reset();
168 | }
169 | }
170 | }
171 |
--------------------------------------------------------------------------------
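A minimal usage sketch of the segmenter above: keep calling `next()` until it returns null. The `IKSegmenterDemo` class name and the sample sentence are made up for illustration, and the default configuration is assumed to find `main2012.dic` and `quantifier.dic` on the classpath (they ship under `mlcsseg-ik/src/main/resources`).

```java
import java.io.IOException;
import java.io.StringReader;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IKSegmenterDemo {
    public static void main(String[] args) throws IOException {
        // useSmart = true: merge numerals with quantifiers and resolve ambiguity;
        // useSmart = false: emit every possible fine-grained split.
        IKSegmenter seg = new IKSegmenter(new StringReader("这是一个中文分词的例子"), true);
        Lexeme lex;
        while ((lex = seg.next()) != null) {
            System.out.println(lex.getBeginPosition() + "-" + lex.getEndPosition()
                    + "\t" + lex.getLexemeText()
                    + "\t" + lex.getLexemeTypeString());
        }
    }
}
```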
/mlcsseg-ik/src/main/java/org/wltea/analyzer/core/ISegmenter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 |
28 | /**
29 | *
30 | * 子分词器接口
31 | */
32 | interface ISegmenter {
33 |
34 | /**
35 | * 从分析器读取下一个可能分解的词元对象
36 | * @param context 分词算法上下文
37 | */
38 | void analyze(AnalyzeContext context);
39 |
40 |
41 | /**
42 | * 重置子分析器状态
43 | */
44 | void reset();
45 |
46 | }
47 |
--------------------------------------------------------------------------------
/mlcsseg-ik/src/main/java/org/wltea/analyzer/core/LetterSegmenter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | import java.util.Arrays;
28 |
29 | /**
30 | *
31 | * 英文字符及阿拉伯数字子分词器
32 | */
33 | class LetterSegmenter implements ISegmenter {
34 |
35 | //子分词器标签
36 | static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
37 | //链接符号
38 | private static final char[] Letter_Connector = new char[]{'#' , '&' , '+' , '-' , '.' , '@' , '_'};
39 |
40 | //数字符号
41 | private static final char[] Num_Connector = new char[]{',' , '.'};
42 |
43 | /*
44 | * 词元的开始位置,
45 | * 同时作为子分词器状态标识
46 | * 当start > -1 时,标识当前的分词器正在处理字符
47 | */
48 | private int start;
49 | /*
50 | * 记录词元结束位置
51 | * end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置
52 | */
53 | private int end;
54 |
55 | /*
56 | * 字母起始位置
57 | */
58 | private int englishStart;
59 |
60 | /*
61 | * 字母结束位置
62 | */
63 | private int englishEnd;
64 |
65 | /*
66 | * 阿拉伯数字起始位置
67 | */
68 | private int arabicStart;
69 |
70 | /*
71 | * 阿拉伯数字结束位置
72 | */
73 | private int arabicEnd;
74 |
75 | LetterSegmenter(){
76 | Arrays.sort(Letter_Connector);
77 | Arrays.sort(Num_Connector);
78 | this.start = -1;
79 | this.end = -1;
80 | this.englishStart = -1;
81 | this.englishEnd = -1;
82 | this.arabicStart = -1;
83 | this.arabicEnd = -1;
84 | }
85 |
86 |
87 | /* (non-Javadoc)
88 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
89 | */
90 | public void analyze(AnalyzeContext context) {
91 | boolean bufferLockFlag = false;
92 | //处理英文字母
93 | bufferLockFlag = this.processEnglishLetter(context) || bufferLockFlag;
94 | //处理阿拉伯字母
95 | bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag;
96 | //处理混合字母(这个要放最后处理,可以通过QuickSortSet排除重复)
97 | bufferLockFlag = this.processMixLetter(context) || bufferLockFlag;
98 |
99 | //判断是否锁定缓冲区
100 | if(bufferLockFlag){
101 | context.lockBuffer(SEGMENTER_NAME);
102 | }else{
103 | //对缓冲区解锁
104 | context.unlockBuffer(SEGMENTER_NAME);
105 | }
106 | }
107 |
108 | /* (non-Javadoc)
109 | * @see org.wltea.analyzer.core.ISegmenter#reset()
110 | */
111 | public void reset() {
112 | this.start = -1;
113 | this.end = -1;
114 | this.englishStart = -1;
115 | this.englishEnd = -1;
116 | this.arabicStart = -1;
117 | this.arabicEnd = -1;
118 | }
119 |
120 | /**
121 | * 处理数字字母混合输出
122 | * 如:windos2000 | linliangyi2005@gmail.com
123 | * @param input
124 | * @param context
125 | * @return
126 | */
127 | private boolean processMixLetter(AnalyzeContext context){
128 | boolean needLock = false;
129 |
130 | if(this.start == -1){//当前的分词器尚未开始处理字符
131 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
132 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){
133 | //记录起始指针的位置,标明分词器进入处理状态
134 | this.start = context.getCursor();
135 | this.end = start;
136 | }
137 |
138 | }else{//当前的分词器正在处理字符
139 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
140 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){
141 | //记录下可能的结束位置
142 | this.end = context.getCursor();
143 |
144 | }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
145 | && this.isLetterConnector(context.getCurrentChar())){
146 | //记录下可能的结束位置
147 | this.end = context.getCursor();
148 | }else{
149 | //遇到非Letter字符,输出词元
150 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.start , this.end - this.start + 1 , Lexeme.TYPE_LETTER);
151 | context.addLexeme(newLexeme);
152 | this.start = -1;
153 | this.end = -1;
154 | }
155 | }
156 |
157 | //判断缓冲区是否已经读完
158 | if(context.isBufferConsumed()){
159 | if(this.start != -1 && this.end != -1){
160 | //缓冲以读完,输出词元
161 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.start , this.end - this.start + 1 , Lexeme.TYPE_LETTER);
162 | context.addLexeme(newLexeme);
163 | this.start = -1;
164 | this.end = -1;
165 | }
166 | }
167 |
168 | //判断是否锁定缓冲区
169 | if(this.start == -1 && this.end == -1){
170 | //对缓冲区解锁
171 | needLock = false;
172 | }else{
173 | needLock = true;
174 | }
175 | return needLock;
176 | }
177 |
178 | /**
179 | * 处理纯英文字母输出
180 | * @param context
181 | * @return
182 | */
183 | private boolean processEnglishLetter(AnalyzeContext context){
184 | boolean needLock = false;
185 |
186 | if(this.englishStart == -1){//当前的分词器尚未开始处理英文字符
187 | if(CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){
188 | //记录起始指针的位置,标明分词器进入处理状态
189 | this.englishStart = context.getCursor();
190 | this.englishEnd = this.englishStart;
191 | }
192 | }else {//当前的分词器正在处理英文字符
193 | if(CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){
194 | //记录当前指针位置为结束位置
195 | this.englishEnd = context.getCursor();
196 | }else{
197 | //遇到非English字符,输出词元
198 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.englishStart , this.englishEnd - this.englishStart + 1 , Lexeme.TYPE_ENGLISH);
199 | context.addLexeme(newLexeme);
200 | this.englishStart = -1;
201 | this.englishEnd= -1;
202 | }
203 | }
204 |
205 | //判断缓冲区是否已经读完
206 | if(context.isBufferConsumed()){
207 | if(this.englishStart != -1 && this.englishEnd != -1){
208 | //缓冲以读完,输出词元
209 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.englishStart , this.englishEnd - this.englishStart + 1 , Lexeme.TYPE_ENGLISH);
210 | context.addLexeme(newLexeme);
211 | this.englishStart = -1;
212 | this.englishEnd= -1;
213 | }
214 | }
215 |
216 | //判断是否锁定缓冲区
217 | if(this.englishStart == -1 && this.englishEnd == -1){
218 | //对缓冲区解锁
219 | needLock = false;
220 | }else{
221 | needLock = true;
222 | }
223 | return needLock;
224 | }
225 |
226 | /**
227 | * 处理阿拉伯数字输出
228 | * @param context
229 | * @return
230 | */
231 | private boolean processArabicLetter(AnalyzeContext context){
232 | boolean needLock = false;
233 |
234 | if(this.arabicStart == -1){//当前的分词器尚未开始处理数字字符
235 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()){
236 | //记录起始指针的位置,标明分词器进入处理状态
237 | this.arabicStart = context.getCursor();
238 | this.arabicEnd = this.arabicStart;
239 | }
240 | }else {//当前的分词器正在处理数字字符
241 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()){
242 | //记录当前指针位置为结束位置
243 | this.arabicEnd = context.getCursor();
244 | }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
245 | && this.isNumConnector(context.getCurrentChar())){
246 | //不输出数字,但不标记结束
247 | }else{
248 | ////遇到非Arabic字符,输出词元
249 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.arabicStart , this.arabicEnd - this.arabicStart + 1 , Lexeme.TYPE_ARABIC);
250 | context.addLexeme(newLexeme);
251 | this.arabicStart = -1;
252 | this.arabicEnd = -1;
253 | }
254 | }
255 |
256 | //判断缓冲区是否已经读完
257 | if(context.isBufferConsumed()){
258 | if(this.arabicStart != -1 && this.arabicEnd != -1){
259 | //生成已切分的词元
260 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.arabicStart , this.arabicEnd - this.arabicStart + 1 , Lexeme.TYPE_ARABIC);
261 | context.addLexeme(newLexeme);
262 | this.arabicStart = -1;
263 | this.arabicEnd = -1;
264 | }
265 | }
266 |
267 | //判断是否锁定缓冲区
268 | if(this.arabicStart == -1 && this.arabicEnd == -1){
269 | //对缓冲区解锁
270 | needLock = false;
271 | }else{
272 | needLock = true;
273 | }
274 | return needLock;
275 | }
276 |
277 | /**
278 | * 判断是否是字母连接符号
279 | * @param input
280 | * @return
281 | */
282 | private boolean isLetterConnector(char input){
283 | int index = Arrays.binarySearch(Letter_Connector, input);
284 | return index >= 0;
285 | }
286 |
287 | /**
288 | * 判断是否是数字连接符号
289 | * @param input
290 | * @return
291 | */
292 | private boolean isNumConnector(char input){
293 | int index = Arrays.binarySearch(Num_Connector, input);
294 | return index >= 0;
295 | }
296 | }
297 |
--------------------------------------------------------------------------------
/mlcsseg-ik/src/main/java/org/wltea/analyzer/core/Lexeme.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | /**
28 | * IK词元对象
29 | */
30 | public class Lexeme implements Comparable<Lexeme>{
31 | //lexemeType常量
32 | //未知
33 | public static final int TYPE_UNKNOWN = 0;
34 | //英文
35 | public static final int TYPE_ENGLISH = 1;
36 | //数字
37 | public static final int TYPE_ARABIC = 2;
38 | //英文数字混合
39 | public static final int TYPE_LETTER = 3;
40 | //中文词元
41 | public static final int TYPE_CNWORD = 4;
42 | //中文单字
43 | public static final int TYPE_CNCHAR = 64;
44 | //日韩文字
45 | public static final int TYPE_OTHER_CJK = 8;
46 | //中文数词
47 | public static final int TYPE_CNUM = 16;
48 | //中文量词
49 | public static final int TYPE_COUNT = 32;
50 | //中文数量词
51 | public static final int TYPE_CQUAN = 48;
52 |
53 | //词元的起始位移
54 | private int offset;
55 | //词元的相对起始位置
56 | private int begin;
57 | //词元的长度
58 | private int length;
59 | //词元文本
60 | private String lexemeText;
61 | //词元类型
62 | private int lexemeType;
63 |
64 |
65 | public Lexeme(int offset , int begin , int length , int lexemeType){
66 | this.offset = offset;
67 | this.begin = begin;
68 | if(length < 0){
69 | throw new IllegalArgumentException("length < 0");
70 | }
71 | this.length = length;
72 | this.lexemeType = lexemeType;
73 | }
74 |
75 | /*
76 | * 判断词元相等算法
77 | * 起始位置偏移、起始位置、终止位置相同
78 | * @see java.lang.Object#equals(Object o)
79 | */
80 | public boolean equals(Object o){
81 | if(o == null){
82 | return false;
83 | }
84 |
85 | if(this == o){
86 | return true;
87 | }
88 |
89 | if(o instanceof Lexeme){
90 | Lexeme other = (Lexeme)o;
91 | if(this.offset == other.getOffset()
92 | && this.begin == other.getBegin()
93 | && this.length == other.getLength()){
94 | return true;
95 | }else{
96 | return false;
97 | }
98 | }else{
99 | return false;
100 | }
101 | }
102 |
103 | /*
104 | * 词元哈希编码算法
105 | * @see java.lang.Object#hashCode()
106 | */
107 | public int hashCode(){
108 | int absBegin = getBeginPosition();
109 | int absEnd = getEndPosition();
110 | return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
111 | }
112 |
113 | /*
114 | * 词元在排序集合中的比较算法
115 | * @see java.lang.Comparable#compareTo(java.lang.Object)
116 | */
117 | public int compareTo(Lexeme other) {
118 | //起始位置优先
119 | if(this.begin < other.getBegin()){
120 | return -1;
121 | }else if(this.begin == other.getBegin()){
122 | //词元长度优先
123 | if(this.length > other.getLength()){
124 | return -1;
125 | }else if(this.length == other.getLength()){
126 | return 0;
127 | }else {//this.length < other.getLength()
128 | return 1;
129 | }
130 |
131 | }else{//this.begin > other.getBegin()
132 | return 1;
133 | }
134 | }
135 |
136 | public int getOffset() {
137 | return offset;
138 | }
139 |
140 | public void setOffset(int offset) {
141 | this.offset = offset;
142 | }
143 |
144 | public int getBegin() {
145 | return begin;
146 | }
147 | /**
148 | * 获取词元在文本中的起始位置
149 | * @return int
150 | */
151 | public int getBeginPosition(){
152 | return offset + begin;
153 | }
154 |
155 | public void setBegin(int begin) {
156 | this.begin = begin;
157 | }
158 |
159 | /**
160 | * 获取词元在文本中的结束位置
161 | * @return int
162 | */
163 | public int getEndPosition(){
164 | return offset + begin + length;
165 | }
166 |
167 | /**
168 | * 获取词元的字符长度
169 | * @return int
170 | */
171 | public int getLength(){
172 | return this.length;
173 | }
174 |
175 | public void setLength(int length) {
176 | if(length < 0){
177 | throw new IllegalArgumentException("length < 0");
178 | }
179 | this.length = length;
180 | }
181 |
182 | /**
183 | * 获取词元的文本内容
184 | * @return String
185 | */
186 | public String getLexemeText() {
187 | if(lexemeText == null){
188 | return "";
189 | }
190 | return lexemeText;
191 | }
192 |
193 | public void setLexemeText(String lexemeText) {
194 | if(lexemeText == null){
195 | this.lexemeText = "";
196 | this.length = 0;
197 | }else{
198 | this.lexemeText = lexemeText;
199 | this.length = lexemeText.length();
200 | }
201 | }
202 |
203 | /**
204 | * 获取词元类型
205 | * @return int
206 | */
207 | public int getLexemeType() {
208 | return lexemeType;
209 | }
210 |
211 | /**
212 | * 获取词元类型标示字符串
213 | * @return String
214 | */
215 | public String getLexemeTypeString(){
216 | switch(lexemeType) {
217 |
218 | case TYPE_ENGLISH :
219 | return "ENGLISH";
220 |
221 | case TYPE_ARABIC :
222 | return "ARABIC";
223 |
224 | case TYPE_LETTER :
225 | return "LETTER";
226 |
227 | case TYPE_CNWORD :
228 | return "CN_WORD";
229 |
230 | case TYPE_CNCHAR :
231 | return "CN_CHAR";
232 |
233 | case TYPE_OTHER_CJK :
234 | return "OTHER_CJK";
235 |
236 | case TYPE_COUNT :
237 | return "COUNT";
238 |
239 | case TYPE_CNUM :
240 | return "TYPE_CNUM";
241 |
242 | case TYPE_CQUAN:
243 | return "TYPE_CQUAN";
244 |
245 | default :
246 | return "UNKONW";
247 | }
248 | }
249 |
250 |
251 | public void setLexemeType(int lexemeType) {
252 | this.lexemeType = lexemeType;
253 | }
254 |
255 | /**
256 | * 合并两个相邻的词元
257 | * @param l
258 | * @param lexemeType
259 | * @return boolean 词元是否成功合并
260 | */
261 | public boolean append(Lexeme l , int lexemeType){
262 | if(l != null && this.getEndPosition() == l.getBeginPosition()){
263 | this.length += l.getLength();
264 | this.lexemeType = lexemeType;
265 | return true;
266 | }else {
267 | return false;
268 | }
269 | }
270 |
271 |
272 | /**
273 | *
274 | */
275 | public String toString(){
276 | StringBuffer strbuf = new StringBuffer();
277 | strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition());
278 | strbuf.append(" : ").append(this.lexemeText).append(" : \t");
279 | strbuf.append(this.getLexemeTypeString());
280 | return strbuf.toString();
281 | }
282 |
283 |
284 | }
285 |
--------------------------------------------------------------------------------
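The ordering defined by `compareTo` above (smaller `begin` first; at the same `begin`, the longer lexeme first) is what `QuickSortSet` and the arbitrator's `TreeSet` rely on. A small sketch, with made-up offsets, of how that ordering and `append` behave:

```java
import org.wltea.analyzer.core.Lexeme;

public class LexemeOrderDemo {
    public static void main(String[] args) {
        Lexeme longWord = new Lexeme(0, 0, 2, Lexeme.TYPE_CNWORD);   // begin 0, length 2
        Lexeme shortWord = new Lexeme(0, 0, 1, Lexeme.TYPE_CNCHAR);  // begin 0, length 1
        Lexeme nextWord = new Lexeme(0, 2, 1, Lexeme.TYPE_CNCHAR);   // starts where longWord ends

        System.out.println(longWord.compareTo(shortWord)); // -1: same begin, longer sorts first
        System.out.println(nextWord.compareTo(longWord));  //  1: later begin sorts last

        // append() merges only adjacent lexemes (end position == next begin position).
        System.out.println(longWord.append(nextWord, Lexeme.TYPE_CNWORD));  // true
        System.out.println(longWord.getLength());                           // 3
        System.out.println(longWord.append(shortWord, Lexeme.TYPE_CNWORD)); // false: not adjacent
    }
}
```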
/mlcsseg-ik/src/main/java/org/wltea/analyzer/core/LexemePath.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | import org.wltea.analyzer.dic.Dictionary;
28 |
29 |
30 | /**
31 | * Lexeme链(路径)
32 | */
33 | class LexemePath extends QuickSortSet implements Comparable<LexemePath>{
34 |
35 | //起始位置
36 | private int pathBegin;
37 | //结束
38 | private int pathEnd;
39 | //词元链的有效字符长度
40 | private int payloadLength;
41 |
42 | private char[] sentenceContent; //原始输入内容
43 | private int absBegin; //交集的绝对起始处----区别于词元
44 | private int absLength; //交集的绝对长度
45 |
46 | private float _result=-1.0f; //存储返回量化后的结果
47 |
48 | LexemePath(){
49 | this.pathBegin = -1;
50 | this.pathEnd = -1;
51 | this.payloadLength = 0;
52 | }
53 |
54 | LexemePath(char[] context,int absBegin ,int fullTextLength)
55 | {
56 | this.pathBegin = -1;
57 | this.pathEnd = -1;
58 | this.payloadLength = 0;
59 | //System.arraycopy(context, 0,sentenceContent, 0, 100);
60 | this.sentenceContent = context;
61 | this.absBegin = absBegin;
62 | this.absLength = fullTextLength;
63 | }
64 | /**
65 | * 向LexemePath追加相交的Lexeme
66 | * @param lexeme
67 | * @return
68 | */
69 | boolean addCrossLexeme(Lexeme lexeme){
70 | if(this.isEmpty()){
71 | this.addLexeme(lexeme);
72 | this.pathBegin = lexeme.getBegin();
73 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
74 | this.payloadLength += lexeme.getLength();
75 | return true;
76 |
77 | }else if(this.checkCross(lexeme)){
78 | this.addLexeme(lexeme);
79 | if(lexeme.getBegin() + lexeme.getLength() > this.pathEnd){
80 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
81 | }
82 | this.payloadLength = this.pathEnd - this.pathBegin; //此处payloadLength,交集处不算?end减原来的begin
83 | return true;
84 |
85 | }else{
86 | return false;
87 |
88 | }
89 | }
90 |
91 | /**
92 | * 向LexemePath追加不相交的Lexeme
93 | * @param lexeme
94 | * @return
95 | */
96 | boolean addNotCrossLexeme(Lexeme lexeme){
97 | if(this.isEmpty()){
98 | this.addLexeme(lexeme);
99 | this.pathBegin = lexeme.getBegin();
100 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
101 | this.payloadLength += lexeme.getLength();
102 | return true;
103 |
104 | }else if(this.checkCross(lexeme)){
105 | return false;
106 |
107 | }else{
108 | this.addLexeme(lexeme);
109 | this.payloadLength += lexeme.getLength();
110 | Lexeme head = this.peekFirst();
111 | this.pathBegin = head.getBegin();
112 | Lexeme tail = this.peekLast();
113 | this.pathEnd = tail.getBegin() + tail.getLength();
114 | return true;
115 |
116 | }
117 | }
118 |
119 | /**
120 | * 移除尾部的Lexeme
121 | * @return
122 | */
123 | Lexeme removeTail(){
124 | Lexeme tail = this.pollLast();
125 | if(this.isEmpty()){
126 | this.pathBegin = -1;
127 | this.pathEnd = -1;
128 | this.payloadLength = 0;
129 | }else{
130 | this.payloadLength -= tail.getLength();
131 | Lexeme newTail = this.peekLast();
132 | this.pathEnd = newTail.getBegin() + newTail.getLength();
133 | }
134 | return tail;
135 | }
136 |
137 | /**
138 | * 检测词元位置交叉(有歧义的切分)
139 | * @param lexeme
140 | * @return
141 | */
142 | boolean checkCross(Lexeme lexeme){
143 | return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd)
144 | || (this.pathBegin >= lexeme.getBegin() && this.pathBegin < lexeme.getBegin()+ lexeme.getLength());
145 | }
146 |
147 | int getPathBegin() {
148 | return pathBegin;
149 | }
150 |
151 | int getPathEnd() {
152 | return pathEnd;
153 | }
154 |
155 | /**
156 | * 获取Path的有效词长
157 | * @return
158 | */
159 | int getPayloadLength(){
160 | return this.payloadLength;
161 | }
162 |
163 | /**
164 | * 获取LexemePath的路径长度
165 | * @return
166 | */
167 | int getPathLength(){
168 | return this.pathEnd - this.pathBegin;
169 | }
170 |
171 |
172 | /**
173 | * X权重(词元长度积),长度越平均,值越大
174 | * @return
175 | */
176 | int getXWeight(){
177 | int product = 1;
178 | Cell c = this.getHead();
179 | while( c != null && c.getLexeme() != null){
180 | product *= c.getLexeme().getLength();
181 | c = c.getNext();
182 | }
183 | return product;
184 | }
185 |
186 | /**
187 | * 词元位置权重,切分结果词元越多,值为大
188 | * @return
189 | */
190 | int getPWeight(){
191 | int pWeight = 0;
192 | int p = 0;
193 | Cell c = this.getHead();
194 | while( c != null && c.getLexeme() != null){
195 | p++;
196 | //pWeight += c.getLexeme().getBegin() * c.getLexeme().getLength();
197 | pWeight += p * c.getLexeme().getLength();
198 | c = c.getNext();
199 | }
200 | return pWeight;
201 | }
202 |
203 | LexemePath copy(){
204 | LexemePath theCopy = new LexemePath();
205 | theCopy.pathBegin = this.pathBegin;
206 | theCopy.pathEnd = this.pathEnd;
207 | theCopy.payloadLength = this.payloadLength;
208 |
209 | theCopy.sentenceContent = this.sentenceContent;
210 | theCopy.absBegin = this.absBegin;
211 | theCopy.absLength = this.absLength;
212 |
213 | Cell c = this.getHead();
214 | while( c != null && c.getLexeme() != null){
215 | theCopy.addLexeme(c.getLexeme());
216 | c = c.getNext();
217 | }
218 | return theCopy;
219 | }
220 |
221 | public int compareTo(LexemePath o) {
222 | float nowResult,OriginResult;
223 | nowResult = this.calcResult();
224 | OriginResult = o.calcResult();
225 |
226 | if( nowResult > OriginResult )
227 | {
228 | return 1;
229 | }
230 | else if(nowResult < OriginResult)
231 | {
232 | return -1;
233 | }
234 | else
235 | {
236 | if(this.pathEnd > o.pathEnd)
237 | {
238 | return 1;
239 | }
240 | else if(pathEnd < o.pathEnd)
241 | {
242 | return -1;
243 | }
244 | }
245 | return 0;
246 | }
247 |
248 | private float calcResult(){
249 | if(_result == -1.0f) //未被计算过
250 | {
251 | _result= (this.payloadLength*10) + (this.size()*(-5)) + this.getPathLength()+this.getXWeight()+this.getPWeight();
252 |
253 | /*存在单字
254 | *①、判断单字的个数,进行单字定位,用于获取
255 | *②、在单字字典进行查找,是否存在,取其概率值
256 | * */
257 | if(this.payloadLength < this.absLength) //存在单字
258 | {
259 | int curPoint;
260 | Cell head = this.getHead();
261 | curPoint = this.absBegin; //从路径绝对起始处开始扫描
262 | float sumFreq=0;
263 | char singleChar=0;
264 | while(head != null){
265 | while(curPoint
[... remainder of LexemePath.java and the beginning of QuickSortSet.java lost in extraction ...]
--------------------------------------------------------------------------------
/mlcsseg-ik/src/main/java/org/wltea/analyzer/core/QuickSortSet.java:
--------------------------------------------------------------------------------
65 | }else if(this.head.compareTo(newCell) > 0){//词元接入链表头部
66 | this.head.prev = newCell;
67 | newCell.next = this.head;
68 | this.head = newCell;
69 | this.size++;
70 | return true;
71 |
72 | }else{
73 | //从尾部上逆
74 | Cell index = this.tail;
75 | while(index != null && index.compareTo(newCell) > 0){
76 | index = index.prev;
77 | }
78 | if(index.compareTo(newCell) == 0){//词元与集合中的词元重复,不放入集合
79 | return false;
80 |
81 | }else if(index.compareTo(newCell) < 0){//词元插入链表中的某个位置
82 | newCell.prev = index;
83 | newCell.next = index.next;
84 | index.next.prev = newCell;
85 | index.next = newCell;
86 | this.size++;
87 | return true;
88 | }
89 | }
90 | }
91 | return false;
92 | }
93 |
94 | /**
95 | * 返回链表头部元素
96 | * @return
97 | */
98 | Lexeme peekFirst(){
99 | if(this.head != null){
100 | return this.head.lexeme;
101 | }
102 | return null;
103 | }
104 |
105 | /**
106 | * 取出链表集合的第一个元素
107 | * @return Lexeme
108 | */
109 | Lexeme pollFirst(){
110 | if(this.size == 1){
111 | Lexeme first = this.head.lexeme;
112 | this.head = null;
113 | this.tail = null;
114 | this.size--;
115 | return first;
116 | }else if(this.size > 1){
117 | Lexeme first = this.head.lexeme;
118 | this.head = this.head.next;
119 | this.size --;
120 | return first;
121 | }else{
122 | return null;
123 | }
124 | }
125 |
126 | /**
127 | * 返回链表尾部元素
128 | * @return
129 | */
130 | Lexeme peekLast(){
131 | if(this.tail != null){
132 | return this.tail.lexeme;
133 | }
134 | return null;
135 | }
136 |
137 | /**
138 | * 取出链表集合的最后一个元素
139 | * @return Lexeme
140 | */
141 | Lexeme pollLast(){
142 | if(this.size == 1){
143 | Lexeme last = this.head.lexeme;
144 | this.head = null;
145 | this.tail = null;
146 | this.size--;
147 | return last;
148 |
149 | }else if(this.size > 1){
150 | Lexeme last = this.tail.lexeme;
151 | this.tail = this.tail.prev;
152 | this.size--;
153 | return last;
154 |
155 | }else{
156 | return null;
157 | }
158 | }
159 |
160 | /**
161 | * 返回集合大小
162 | * @return
163 | */
164 | int size(){
165 | return this.size;
166 | }
167 |
168 | /**
169 | * 判断集合是否为空
170 | * @return
171 | */
172 | boolean isEmpty(){
173 | return this.size == 0;
174 | }
175 |
176 | /**
177 | * 返回lexeme链的头部
178 | * @return
179 | */
180 | Cell getHead(){
181 | return this.head;
182 | }
183 |
184 | /**
185 | *
186 | * IK 中文分词 版本 5.0
187 | * IK Analyzer release 5.0
188 | *
189 | * Licensed to the Apache Software Foundation (ASF) under one or more
190 | * contributor license agreements. See the NOTICE file distributed with
191 | * this work for additional information regarding copyright ownership.
192 | * The ASF licenses this file to You under the Apache License, Version 2.0
193 | * (the "License"); you may not use this file except in compliance with
194 | * the License. You may obtain a copy of the License at
195 | *
196 | * http://www.apache.org/licenses/LICENSE-2.0
197 | *
198 | * Unless required by applicable law or agreed to in writing, software
199 | * distributed under the License is distributed on an "AS IS" BASIS,
200 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | * See the License for the specific language governing permissions and
202 | * limitations under the License.
203 | *
204 | * 源代码由林良益(linliangyi2005@gmail.com)提供
205 | * 版权声明 2012,乌龙茶工作室
206 | * provided by Linliangyi and copyright 2012 by Oolong studio
207 | *
208 | * QuickSortSet集合单元
209 | *
210 | */
211 | class Cell implements Comparable<Cell>{
212 | private Cell prev;
213 | private Cell next;
214 | private Lexeme lexeme;
215 |
216 | Cell(Lexeme lexeme){
217 | if(lexeme == null){
218 | throw new IllegalArgumentException("lexeme must not be null");
219 | }
220 | this.lexeme = lexeme;
221 | }
222 |
223 | public int compareTo(Cell o) {
224 | return this.lexeme.compareTo(o.lexeme);
225 | }
226 |
227 | public Cell getPrev(){
228 | return this.prev;
229 | }
230 |
231 | public Cell getNext(){
232 | return this.next;
233 | }
234 |
235 | public Lexeme getLexeme(){
236 | return this.lexeme;
237 | }
238 | }
239 | }
240 |
--------------------------------------------------------------------------------
/mlcsseg-ik/src/main/java/org/wltea/analyzer/dic/DictCharNode.java:
--------------------------------------------------------------------------------
1 | package org.wltea.analyzer.dic;
2 |
3 | import java.util.HashMap;
4 | import java.util.Map;
5 |
6 | public class DictCharNode {
7 | private static final Map<Character, Float> charMap = new HashMap<Character, Float>(1024,0.8f);
8 |
9 | void addChar(Character key,Float logFreq)
10 | {
11 | charMap.put(key, logFreq);
12 | //(int)(Math.log(Integer.parseInt(w[1]))*100),默认给0
13 | }
14 |
15 | float getCharFreq(Character singleChar)
16 | {
17 | float freq=-2.0f; //非单字,则表示该路径切分存在某些问题
18 | if(charMap.containsKey(singleChar)) //如果存在
19 | {
20 | freq = charMap.get(singleChar);
21 | }
22 | return freq;
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/mlcsseg-ik/src/main/java/org/wltea/analyzer/dic/DictSegment.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * IK 中文分词 版本 5.0
4 | * IK Analyzer release 5.0
5 | *
6 | * Licensed to the Apache Software Foundation (ASF) under one or more
7 | * contributor license agreements. See the NOTICE file distributed with
8 | * this work for additional information regarding copyright ownership.
9 | * The ASF licenses this file to You under the Apache License, Version 2.0
10 | * (the "License"); you may not use this file except in compliance with
11 | * the License. You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | *
21 | * 源代码由林良益(linliangyi2005@gmail.com)提供
22 | * 版权声明 2012,乌龙茶工作室
23 | * provided by Linliangyi and copyright 2012 by Oolong studio
24 | *
25 | */
26 | package org.wltea.analyzer.dic;
27 |
28 | import java.util.Arrays;
29 | import java.util.HashMap;
30 | import java.util.Map;
31 |
32 | /**
33 | * 词典树分段,表示词典树的一个分枝
34 | */
35 | class DictSegment implements Comparable<DictSegment>{
36 |
37 | //公用字典表,存储汉字
38 | //private static final Map<Character, Character> charMap = new HashMap<Character, Character>(16 , 0.95f);
39 | //数组大小上限
40 | private static final int ARRAY_LENGTH_LIMIT = 3;
41 |
42 |
43 | //Map存储结构
44 | private Map<Character, DictSegment> childrenMap;
45 | //数组方式存储结构
46 | private DictSegment[] childrenArray;
47 |
48 |
49 | //当前节点上存储的字符
50 | private Character nodeChar;
51 | //当前节点存储的Segment数目
52 | //storeSize <=ARRAY_LENGTH_LIMIT ,使用数组存储, storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储
53 | private int storeSize = 0;
54 | //当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词
55 | private int nodeState = 0;
56 |
57 |
58 | DictSegment(Character nodeChar){
59 | if(nodeChar == null){
60 | throw new IllegalArgumentException("参数为空异常,字符不能为空");
61 | }
62 | this.nodeChar = nodeChar;
63 | }
64 |
65 | Character getNodeChar() {
66 | return nodeChar;
67 | }
68 |
69 | /*
70 | * 判断是否有下一个节点
71 | */
72 | boolean hasNextNode(){
73 | return this.storeSize > 0;
74 | }
75 |
76 | /**
77 | * 匹配词段
78 | * @param charArray
79 | * @return Hit
80 | */
81 | Hit match(char[] charArray){
82 | return this.match(charArray , 0 , charArray.length , null);
83 | }
84 |
85 | /**
86 | * 匹配词段
87 | * @param charArray
88 | * @param begin
89 | * @param length
90 | * @return Hit
91 | */
92 | Hit match(char[] charArray , int begin , int length){
93 | return this.match(charArray , begin , length , null);
94 | }
95 |
96 | /**
97 | * 匹配词段
98 | * @param charArray
99 | * @param begin
100 | * @param length
101 | * @param searchHit
102 | * @return Hit
103 | */
104 | Hit match(char[] charArray , int begin , int length , Hit searchHit){
105 |
106 | if(searchHit == null){
107 | //如果hit为空,新建
108 | searchHit= new Hit();
109 | //设置hit的其实文本位置
110 | searchHit.setBegin(begin);
111 | }else{
112 | //否则要将HIT状态重置
113 | searchHit.setUnmatch();
114 | }
115 | //设置hit的当前处理位置
116 | searchHit.setEnd(begin);
117 |
118 | Character keyChar = new Character(charArray[begin]);
119 | DictSegment ds = null;
120 |
121 | //引用实例变量为本地变量,避免查询时遇到更新的同步问题
122 | DictSegment[] segmentArray = this.childrenArray;
123 | Map<Character, DictSegment> segmentMap = this.childrenMap;
124 |
125 | //STEP1 在节点中查找keyChar对应的DictSegment
126 | if(segmentArray != null){
127 | //在数组中查找
128 | DictSegment keySegment = new DictSegment(keyChar);
129 | int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize , keySegment);
130 | if(position >= 0){
131 | ds = segmentArray[position];
132 | }
133 |
134 | }else if(segmentMap != null){
135 | //在map中查找
136 | ds = (DictSegment)segmentMap.get(keyChar);
137 | }
138 |
139 | //STEP2 找到DictSegment,判断词的匹配状态,是否继续递归,还是返回结果
140 | if(ds != null){
141 | if(length > 1){
142 | //词未匹配完,继续往下搜索
143 | return ds.match(charArray, begin + 1 , length - 1 , searchHit);
144 | }else if (length == 1){
145 |
146 | //搜索最后一个char
147 | if(ds.nodeState == 1){
148 | //添加HIT状态为完全匹配
149 | searchHit.setMatch();
150 | }
151 | if(ds.hasNextNode()){
152 | //添加HIT状态为前缀匹配
153 | searchHit.setPrefix();
154 | //记录当前位置的DictSegment
155 | searchHit.setMatchedDictSegment(ds);
156 | }
157 | return searchHit;
158 | }
159 |
160 | }
161 | //STEP3 没有找到DictSegment, 将HIT设置为不匹配
162 | return searchHit;
163 | }
164 |
165 | /**
166 | * 加载填充词典片段
167 | * @param charArray
168 | */
169 | void fillSegment(char[] charArray){
170 | this.fillSegment(charArray, 0 , charArray.length , 1);
171 | }
172 |
173 | /**
174 | * 屏蔽词典中的一个词
175 | * @param charArray
176 | */
177 | void disableSegment(char[] charArray){
178 | this.fillSegment(charArray, 0 , charArray.length , 0);
179 | }
180 |
181 | /**
182 | * 加载填充词典片段
183 | * @param charArray
184 | * @param begin
185 | * @param length
186 | * @param enabled
187 | */
188 | private synchronized void fillSegment(char[] charArray , int begin , int length , int enabled){
189 | //获取字典表中的汉字对象
190 | Character beginChar = new Character(charArray[begin]);
191 | /*Character keyChar = charMap.get(beginChar);
192 | //字典中没有该字,则将其添加入字典
193 | if(keyChar == null){
194 | charMap.put(beginChar, beginChar);
195 | keyChar = beginChar;
196 | }*/
197 |
198 | //搜索当前节点的存储,查询对应keyChar的keyChar,如果没有则创建
199 | DictSegment ds = lookforSegment(beginChar , enabled);
200 | if(ds != null){
201 | //处理keyChar对应的segment
202 | if(length > 1){
203 | //词元还没有完全加入词典树
204 | ds.fillSegment(charArray, begin + 1, length - 1 , enabled);
205 | }else if (length == 1){
206 | //已经是词元的最后一个char,设置当前节点状态为enabled,
207 | //enabled=1表明一个完整的词,enabled=0表示从词典中屏蔽当前词
208 | ds.nodeState = enabled;
209 | }
210 | }
211 |
212 | }
213 |
214 | /**
215 | * 查找本节点下对应的keyChar的segment *
216 | * @param keyChar
217 | * @param create =1如果没有找到,则创建新的segment ; =0如果没有找到,不创建,返回null
218 | * @return
219 | */
220 | private DictSegment lookforSegment(Character keyChar , int create){
221 |
222 | DictSegment ds = null;
223 |
224 | if(this.storeSize <= ARRAY_LENGTH_LIMIT){
225 | //获取数组容器,如果数组未创建则创建数组
226 | DictSegment[] segmentArray = getChildrenArray();
227 | //搜寻数组
228 | DictSegment keySegment = new DictSegment(keyChar);
229 | int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize, keySegment);
230 | if(position >= 0){
231 | ds = segmentArray[position];
232 | }
233 |
234 | //遍历数组后没有找到对应的segment
235 | if(ds == null && create == 1){
236 | ds = keySegment;
237 | if(this.storeSize < ARRAY_LENGTH_LIMIT){
238 | //数组容量未满,使用数组存储
239 | segmentArray[this.storeSize] = ds;
240 | //segment数目+1
241 | this.storeSize++;
242 | Arrays.sort(segmentArray , 0 , this.storeSize);
243 |
244 | }else{
245 | //数组容量已满,切换Map存储
246 | //获取Map容器,如果Map未创建,则创建Map
247 | Map<Character, DictSegment> segmentMap = getChildrenMap();
248 | //将数组中的segment迁移到Map中
249 | migrate(segmentArray , segmentMap);
250 | //存储新的segment
251 | segmentMap.put(keyChar, ds);
252 | //segment数目+1 , 必须在释放数组前执行storeSize++ , 确保极端情况下,不会取到空的数组
253 | this.storeSize++;
254 | //释放当前的数组引用
255 | this.childrenArray = null;
256 | }
257 |
258 | }
259 |
260 | }else{
261 | //获取Map容器,如果Map未创建,则创建Map
262 | Map<Character, DictSegment> segmentMap = getChildrenMap();
263 | //搜索Map
264 | ds = (DictSegment)segmentMap.get(keyChar);
265 | if(ds == null && create == 1){
266 | //构造新的segment
267 | ds = new DictSegment(keyChar);
268 | segmentMap.put(keyChar , ds);
269 | //当前节点存储segment数目+1
270 | this.storeSize ++;
271 | }
272 | }
273 |
274 | return ds;
275 | }
276 |
277 |
278 | /**
279 | * 获取数组容器
280 | * 线程同步方法
281 | */
282 | private DictSegment[] getChildrenArray(){
283 | if(this.childrenArray == null){
284 | synchronized(this){
285 | if(this.childrenArray == null){
286 | this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT];
287 | }
288 | }
289 | }
290 | return this.childrenArray;
291 | }
292 |
293 | /**
294 | * 获取Map容器
295 | * 线程同步方法
296 | */
297 | private Map<Character, DictSegment> getChildrenMap(){
298 | if(this.childrenMap == null){
299 | synchronized(this){
300 | if(this.childrenMap == null){
301 | this.childrenMap = new HashMap<Character, DictSegment>(ARRAY_LENGTH_LIMIT * 2,0.8f);
302 | }
303 | }
304 | }
305 | return this.childrenMap;
306 | }
307 |
308 | /**
309 | * 将数组中的segment迁移到Map中
310 | * @param segmentArray
311 | */
312 | private void migrate(DictSegment[] segmentArray , Map<Character, DictSegment> segmentMap){
313 | for(DictSegment segment : segmentArray){
314 | if(segment != null){
315 | segmentMap.put(segment.nodeChar, segment);
316 | }
317 | }
318 | }
319 |
320 | /**
321 | * 实现Comparable接口
322 | * @param o
323 | * @return int
324 | */
325 | public int compareTo(DictSegment o) {
326 | //对当前节点存储的char进行比较
327 | return this.nodeChar.compareTo(o.nodeChar);
328 | }
329 |
330 | }
331 |
--------------------------------------------------------------------------------
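A sketch of how this dictionary trie is used: fill it with words, then `match` reports whether a character sequence is a complete word, a prefix of longer entries, or both, and `disableSegment` masks a word without dropping its children. `DictSegment` and these methods are package-private, so this hypothetical `DictSegmentDemo` would have to live in `org.wltea.analyzer.dic`.

```java
package org.wltea.analyzer.dic;

public class DictSegmentDemo {
    public static void main(String[] args) {
        DictSegment root = new DictSegment((char) 0);
        root.fillSegment("中国".toCharArray());
        root.fillSegment("中国人".toCharArray());

        Hit full = root.match("中国".toCharArray());
        System.out.println(full.isMatch() + " " + full.isPrefix());     // true true: a word, and a prefix of 中国人

        Hit prefix = root.match("中".toCharArray());
        System.out.println(prefix.isMatch() + " " + prefix.isPrefix()); // false true: only a prefix

        root.disableSegment("中国".toCharArray());
        System.out.println(root.match("中国".toCharArray()).isMatch());  // false: word masked, children kept
    }
}
```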
/mlcsseg-ik/src/main/java/org/wltea/analyzer/dic/Dictionary.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | *
25 | */
26 | package org.wltea.analyzer.dic;
27 |
28 | import java.io.BufferedReader;
29 | import java.io.IOException;
30 | import java.io.InputStream;
31 | import java.io.InputStreamReader;
32 | import java.util.Collection;
33 | import java.util.List;
34 | import org.wltea.analyzer.cfg.Configuration;
35 | import org.wltea.analyzer.cfg.DefaultConfig;
36 |
37 | /**
38 | * Dictionary manager class, singleton pattern
39 | */
40 | public class Dictionary {
41 |
42 |
43 | /*
44 | * Singleton instance of the dictionary
45 | */
46 | private volatile static Dictionary singleton;
47 |
48 | /*
49 | * Main dictionary
50 | */
51 | private DictSegment _MainDict;
52 |
53 | /*
54 | * Stopword dictionary
55 | */
56 | //private DictSegment _StopWordDict;
57 | /*
58 | * Quantifier dictionary
59 | */
60 | private DictSegment _QuantifierDict;
61 | /*
62 | * Single-character dictionary with word frequencies
63 | */
64 | private DictCharNode _CharFreqDict;
65 | /*
66 | * Configuration object
67 | */
68 | private Configuration cfg;
69 |
70 | private Dictionary(Configuration cfg){
71 | this.cfg = cfg;
72 | //create the main dictionary instance
73 | _MainDict = new DictSegment((char)0);
74 | this.loadMainDict(_MainDict);
75 |
76 | /*_StopWordDict = new DictSegment((char)0);
77 | this.loadStopWordDict(_StopWordDict);*/
78 |
79 | this.loadQuantifierDict();
80 | this.loadCharFreqDict();
81 |
82 | }
83 |
84 | /**
85 | * Dictionary initialization.
86 | * Because IK Analyzer initializes its dictionaries through static methods of the Dictionary class,
87 | * the dictionaries are only loaded when the Dictionary class is actually used,
88 | * which lengthens the first segmentation request.
89 | * This method lets an application initialize the dictionaries during its own startup phase.
90 | * @return Dictionary
91 | */
92 | public static Dictionary initial(Configuration cfg){
93 | if(singleton == null){
94 | synchronized(Dictionary.class){
95 | if(singleton == null){
96 | singleton = new Dictionary(cfg);
97 | return singleton;
98 | }
99 | }
100 | }
101 | return singleton;
102 | }
103 |
104 | /**
105 | * Load the dictionaries configured in Solr into the main dictionary and swap it in
106 | * @param inputStreamList input streams of the extra dictionaries
107 | * @return
108 | */
109 | public static synchronized Dictionary addDic2MainDic(List<InputStream> inputStreamList)
110 | {
111 | if(singleton == null)
112 | {
113 | Configuration cfg = DefaultConfig.getInstance();
114 | Dictionary.initial(cfg);
115 | }
116 |
117 | DictSegment mainDicTemp = new DictSegment((char)0);
118 |
119 | System.out.println("begin load MainDict :");
120 | singleton.loadMainDict(mainDicTemp);
121 |
122 | System.out.println("begin loadSolrMainDict by List:");
123 | for(InputStream is : inputStreamList)
124 | {
125 | singleton.loadWords2DictSegment(is, mainDicTemp);
126 | }
127 |
128 | singleton._MainDict = mainDicTemp;
129 | System.out.println("*********************************");
130 | System.out.println("end switch!!");
131 | System.out.println("*********************************");
132 |
133 | mainDicTemp = null;
134 |
135 | return singleton;
136 | }
137 |
138 | /**
139 | * Get the dictionary singleton instance
140 | * @return Dictionary the singleton instance
141 | */
142 | public static Dictionary getSingleton(){
143 | if(singleton == null){
144 | throw new IllegalStateException("Dictionary has not been initialized; call initial() first");
145 | }
146 | return singleton;
147 | }
148 |
149 | /**
150 | * Load a batch of new terms
151 | * @param words collection of terms to add
152 | */
153 | public void addWords(Collection<String> words){
154 | if(words != null){
155 | for(String word : words){
156 | if (word != null) {
157 | //add the terms to the in-memory main dictionary
158 | singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray());
159 | }
160 | }
161 | }
162 | }
163 |
164 | /**
165 | * Remove (disable) a batch of terms
166 | * @param words
167 | */
168 | public void disableWords(Collection<String> words){
169 | if(words != null){
170 | for(String word : words){
171 | if (word != null) {
172 | //disable the terms
173 | singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray());
174 | }
175 | }
176 | }
177 | }
178 |
179 | /**
180 | * Look up a match in the main dictionary
181 | * @param charArray
182 | * @return Hit describing the match result
183 | */
184 | public Hit matchInMainDict(char[] charArray){
185 | return singleton._MainDict.match(charArray);
186 | }
187 |
188 | /**
189 | * Look up a match in the main dictionary
190 | * @param charArray
191 | * @param begin
192 | * @param length
193 | * @return Hit describing the match result
194 | */
195 | public Hit matchInMainDict(char[] charArray , int begin, int length){
196 | return singleton._MainDict.match(charArray, begin, length);
197 | }
198 |
199 | /**
200 | * Look up a match in the quantifier dictionary
201 | * @param charArray
202 | * @param begin
203 | * @param length
204 | * @return Hit describing the match result
205 | */
206 | public Hit matchInQuantifierDict(char[] charArray , int begin, int length){
207 | return singleton._QuantifierDict.match(charArray, begin, length);
208 | }
209 |
210 | /**
211 | * Take the DictSegment from a previous Hit and continue matching from there
212 | * @param charArray
213 | * @param currentIndex
214 | * @param matchedHit
215 | * @return Hit
216 | */
217 | public Hit matchWithHit(char[] charArray , int currentIndex , Hit matchedHit){
218 | DictSegment ds = matchedHit.getMatchedDictSegment();
219 | return ds.match(charArray, currentIndex, 1 , matchedHit);
220 | }
221 |
222 |
223 | /**
224 | * Check whether the given chars form a stopword
225 | * @param charArray
226 | * @param begin
227 | * @param length
228 | * @return boolean
229 | */
230 | /*public boolean isStopWord(char[] charArray , int begin, int length){
231 | return singleton._StopWordDict.match(charArray, begin, length).isMatch();
232 | }*/
233 |
234 | /**
235 | * Load the main dictionary and the extension dictionaries
236 | */
237 | private void loadMainDict(DictSegment dstDicSegment){
238 |
239 | //read the main dictionary file
240 | InputStream inputStream = this.getClass().getClassLoader().getResourceAsStream("main2012.dic");
241 | if(inputStream == null){
242 | throw new RuntimeException("Main Dictionary not found!!!");
243 | }
244 |
245 | //System.out.println("loading main dictionary");
246 | this.loadWords2DictSegment(inputStream,dstDicSegment);
247 |
248 | //System.out.println("loading extension dictionaries");
249 | this.loadExtDict(dstDicSegment);
250 |
251 | }
252 |
253 | /**
254 | * Load the user-configured extension dictionaries into the main dictionary
255 | */
256 | private void loadExtDict(DictSegment dstDicSegment){
257 | //load the extension dictionary configuration
258 | List<String> extDictFiles = cfg.getExtDictionarys();
259 | if(extDictFiles != null){
260 | InputStream is = null;
261 | for(String extDictName : extDictFiles){
262 | //read the extension dictionary file
263 | //System.out.println("loading extension dictionary:" + extDictName);
264 | is = this.getClass().getClassLoader().getResourceAsStream(extDictName);
265 | //if the extension dictionary cannot be found, skip it
266 | if(is == null){
267 | continue;
268 | }
269 | loadWords2DictSegment(is,dstDicSegment);
270 | }
271 | }
272 | }
273 |
274 | /**
275 | *
276 | * @param is input stream of dictionary data
277 | * @param dstDicSegment target dictionary segment
278 | */
279 |
280 | private void loadWords2DictSegment(InputStream is,DictSegment dstDicSegment) {
281 |
282 | if(is != null)
283 | {
284 | try {
285 | BufferedReader br = new BufferedReader(new InputStreamReader(is,
286 | "UTF-8"));
287 | String theWord = null;
288 | do {
289 | theWord = br.readLine();
290 | if (theWord != null ) {
291 | String line = theWord.trim();
292 | if (!line.isEmpty() && !line.startsWith("#")){
293 | String[] words = line.split("[\\s=,>]+");
294 | for(String w :words)
295 | dstDicSegment.fillSegment(w.toLowerCase().toCharArray());
296 | }
297 | }
298 | } while (theWord != null);
299 |
300 | } catch (IOException ioe) {
301 | System.err.println("Dictionary loading exception. ClassName: " + dstDicSegment.getClass().getName());
302 | ioe.printStackTrace();
303 |
304 | } finally {
305 | try {
306 | if (is != null) {
307 | is.close();
308 | is = null;
309 | }
310 | } catch (IOException e) {
311 | e.printStackTrace();
312 | }
313 | }
314 | }
315 | }
316 |
317 | /**
318 | * Load the quantifier dictionary
319 | */
320 | private void loadQuantifierDict(){
321 | //create a quantifier dictionary instance
322 | _QuantifierDict = new DictSegment((char)0);
323 | //read the quantifier dictionary file
324 | InputStream is = this.getClass().getClassLoader().getResourceAsStream("quantifier.dic");
325 | if(is == null){
326 | throw new RuntimeException("Quantifier Dictionary not found!!!");
327 | }
328 | loadWords2DictSegment(is, _QuantifierDict);
329 | }
330 |
331 | private void loadCharFreqDict(){
332 | _CharFreqDict = new DictCharNode();
333 | //read the character-frequency dictionary file
334 | InputStream is = this.getClass().getClassLoader().getResourceAsStream("chars.dic");
335 | if(is == null){
336 | throw new RuntimeException("Chars Dictionary not found!!!");
337 | }
338 | try { //this could be factored out into a shared interface or helper method
339 | BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
340 | String theWord = null;
341 | do {
342 | theWord = br.readLine();
343 | if (theWord != null && !"".equals(theWord.trim())) {
344 | String[] w = theWord.split(" ");
345 | if(w.length == 2)
346 | {
347 | _CharFreqDict.addChar(w[0].charAt(0), (float)(Math.log10(Integer.parseInt(w[1])+5)));
348 | }
349 | /*else
350 | {
351 | _CharFreqDict.addChar(w[0].charAt(0), 0); //what default weight should a char with no frequency data get? should it be added at all?
352 | }*/
353 | }
354 | } while (theWord != null);
355 |
356 | } catch (IOException ioe) {
357 | System.err.println("Chars Dictionary loading exception.");
358 | ioe.printStackTrace();
359 | }finally{
360 | try {
361 | if(is != null){
362 | is.close();
363 | is = null;
364 | }
365 | } catch (IOException e) {
366 | e.printStackTrace();
367 | }
368 | }
369 | }
370 |
371 | public float getCharFreq(Character key)
372 | {
373 | return _CharFreqDict.getCharFreq(key);
374 | }
375 |
376 | }
377 |
--------------------------------------------------------------------------------
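Dictionary is a double-checked-locking singleton, and addDic2MainDic rebuilds the main dictionary into a temporary DictSegment before swapping the reference, which is how the Solr factories push reloaded word lists in without blocking lookups. A rough usage sketch follows; it is not part of the project, it assumes main2012.dic and the other resources are on the classpath, and the in-memory stream simply stands in for a Solr-managed dictionary file.

    import java.io.ByteArrayInputStream;
    import java.io.InputStream;
    import java.nio.charset.StandardCharsets;
    import java.util.Arrays;

    import org.wltea.analyzer.cfg.DefaultConfig;
    import org.wltea.analyzer.dic.Dictionary;
    import org.wltea.analyzer.dic.Hit;

    // Initialize the singleton eagerly, then hot-swap in an extra word list the same way
    // the Solr tokenizer factories do through addDic2MainDic.
    public class DictionaryUsage {
        public static void main(String[] args) {
            Dictionary dic = Dictionary.initial(DefaultConfig.getInstance());

            // one in-memory "dictionary file" containing a single custom word
            InputStream extra = new ByteArrayInputStream("七匹狼\n".getBytes(StandardCharsets.UTF_8));
            Dictionary.addDic2MainDic(Arrays.<InputStream>asList(extra));

            char[] text = "七匹狼".toCharArray();
            Hit hit = dic.matchInMainDict(text, 0, text.length);
            System.out.println("match? " + hit.isMatch());
        }
    }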
/mlcsseg-ik/src/main/java/org/wltea/analyzer/dic/Hit.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * IK 中文分词 版本 5.0
4 | * IK Analyzer release 5.0
5 | *
6 | * Licensed to the Apache Software Foundation (ASF) under one or more
7 | * contributor license agreements. See the NOTICE file distributed with
8 | * this work for additional information regarding copyright ownership.
9 | * The ASF licenses this file to You under the Apache License, Version 2.0
10 | * (the "License"); you may not use this file except in compliance with
11 | * the License. You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | *
21 | * 源代码由林良益(linliangyi2005@gmail.com)提供
22 | * 版权声明 2012,乌龙茶工作室
23 | * provided by Linliangyi and copyright 2012 by Oolong studio
24 | *
25 | */
26 | package org.wltea.analyzer.dic;
27 |
28 | /**
29 | * Represents a single dictionary-match hit
30 | */
31 | public class Hit {
32 | //no match
33 | private static final int UNMATCH = 0x00000000;
34 | //full match
35 | private static final int MATCH = 0x00000001;
36 | //prefix match
37 | private static final int PREFIX = 0x00000010;
38 |
39 |
40 | //current state of this hit, defaults to unmatched
41 | private int hitState = UNMATCH;
42 |
43 | //the dictionary branch node reached so far during matching
44 | private DictSegment matchedDictSegment;
45 | /*
46 | * start position of the matched word segment
47 | */
48 | private int begin;
49 | /*
50 | * end position of the matched word segment
51 | */
52 | private int end;
53 |
54 |
55 | /**
56 | * Whether this hit is a full match
57 | */
58 | public boolean isMatch() {
59 | return (this.hitState & MATCH) > 0;
60 | }
61 | /**
62 | *
63 | */
64 | public void setMatch() {
65 | this.hitState = this.hitState | MATCH;
66 | }
67 |
68 | /**
69 | * Whether this hit is a prefix of a dictionary word
70 | */
71 | public boolean isPrefix() {
72 | return (this.hitState & PREFIX) > 0;
73 | }
74 | /**
75 | *
76 | */
77 | public void setPrefix() {
78 | this.hitState = this.hitState | PREFIX;
79 | }
80 | /**
81 | * Whether this hit is not a match
82 | */
83 | public boolean isUnmatch() {
84 | return this.hitState == UNMATCH ;
85 | }
86 | /**
87 | *
88 | */
89 | public void setUnmatch() {
90 | this.hitState = UNMATCH;
91 | }
92 |
93 | public DictSegment getMatchedDictSegment() {
94 | return matchedDictSegment;
95 | }
96 |
97 | public void setMatchedDictSegment(DictSegment matchedDictSegment) {
98 | this.matchedDictSegment = matchedDictSegment;
99 | }
100 |
101 | public int getBegin() {
102 | return begin;
103 | }
104 |
105 | public void setBegin(int begin) {
106 | this.begin = begin;
107 | }
108 |
109 | public int getEnd() {
110 | return end;
111 | }
112 |
113 | public void setEnd(int end) {
114 | this.end = end;
115 | }
116 |
117 | }
118 |
--------------------------------------------------------------------------------
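Hit keeps its state in independent bit flags, so a hit can simultaneously be a complete word and the prefix of a longer entry. A tiny illustration, not project code:

    import org.wltea.analyzer.dic.Hit;

    // Demonstrates that MATCH and PREFIX are independent bits in hitState.
    public class HitStates {
        public static void main(String[] args) {
            Hit hit = new Hit();
            hit.setMatch();
            hit.setPrefix();
            System.out.println(hit.isMatch());    // true
            System.out.println(hit.isPrefix());   // true
            System.out.println(hit.isUnmatch());  // false

            hit.setUnmatch();                     // resets the state to UNMATCH
            System.out.println(hit.isMatch());    // false
        }
    }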
/mlcsseg-ik/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0.1
3 | * IK Analyzer release 5.0.1
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 |
25 | *
26 | */
27 | package org.wltea.analyzer.lucene;
28 |
29 | import java.io.IOException;
30 | import java.io.Reader;
31 |
32 | import org.apache.lucene.analysis.Tokenizer;
33 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
34 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
35 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
36 |
37 | import org.wltea.analyzer.core.IKSegmenter;
38 | import org.wltea.analyzer.core.Lexeme;
39 |
40 | /**
41 | * IK tokenizer: Lucene Tokenizer adapter class, compatible with Lucene 4.0
42 | */
43 | public final class IKTokenizer extends Tokenizer {
44 |
45 | // IK segmenter implementation
46 | private IKSegmenter _IKImplement;
47 |
48 | // term text attribute
49 | private final CharTermAttribute termAtt;
50 | // term offset attribute
51 | private final OffsetAttribute offsetAtt;
52 | // term type attribute (see the type constants in org.wltea.analyzer.core.Lexeme)
53 | private final TypeAttribute typeAtt;
54 | // end position of the last lexeme
55 | private int endPosition;
56 |
57 | /**
58 | * Constructor for the Lucene 4.0 Tokenizer adapter
59 | *
60 | * @param in
61 | * @param useSmart
62 | */
63 |
64 | public IKTokenizer(Reader in, boolean useSmart) {
65 | super(in);
66 | offsetAtt = addAttribute(OffsetAttribute.class);
67 | termAtt = addAttribute(CharTermAttribute.class);
68 | typeAtt = addAttribute(TypeAttribute.class);
69 | _IKImplement = new IKSegmenter(input, useSmart);
70 | }
71 |
72 | /*
73 | * (non-Javadoc)
74 | *
75 | * @see org.apache.lucene.analysis.TokenStream#incrementToken()
76 | */
77 | @Override
78 | public boolean incrementToken() throws IOException {
79 | // clear all token attributes
80 | clearAttributes();
81 | Lexeme nextLexeme = _IKImplement.next();
82 | if (nextLexeme != null) {
83 | // convert the Lexeme into attributes
84 | // set the term text
85 | termAtt.append(nextLexeme.getLexemeText());
86 | // set the term length
87 | termAtt.setLength(nextLexeme.getLength());
88 | // set the term offsets
89 | offsetAtt.setOffset(nextLexeme.getBeginPosition(),
90 | nextLexeme.getEndPosition());
91 | // record the end position of the last token
92 | endPosition = nextLexeme.getEndPosition();
93 | // record the lexeme type
94 | typeAtt.setType(nextLexeme.getLexemeTypeString());
95 | // return true to signal that another token is available
96 | return true;
97 | }
98 | // return false to signal that no more tokens are available
99 | return false;
100 | }
101 |
102 | /*
103 | * (non-Javadoc)
104 | *
105 | * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
106 | */
107 | @Override
108 | public void reset() throws IOException {
109 | super.reset();
110 | _IKImplement.reset(input);
111 | }
112 |
113 | @Override
114 | public final void end() {
115 | // set final offset
116 | int finalOffset = correctOffset(this.endPosition);
117 | offsetAtt.setOffset(finalOffset, finalOffset);
118 | }
119 | }
120 |
--------------------------------------------------------------------------------
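Outside of Solr, the tokenizer can be driven directly through the standard Lucene 4.x TokenStream contract: reset(), then incrementToken() until it returns false, then end() and close(). A sketch, with an arbitrary sample sentence:

    import java.io.StringReader;

    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.wltea.analyzer.lucene.IKTokenizer;

    // Consume IKTokenizer directly and print each term with its character offsets.
    public class IKTokenizerUsage {
        public static void main(String[] args) throws Exception {
            IKTokenizer tokenizer = new IKTokenizer(new StringReader("这是一个中文分词的例子"), true);
            CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
            OffsetAttribute offset = tokenizer.getAttribute(OffsetAttribute.class);
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                System.out.println(term.toString() + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
            }
            tokenizer.end();
            tokenizer.close();
        }
    }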
/mlcsseg-ik/src/main/java/org/wltea/analyzer/lucene/IKTokenizerFactory.java:
--------------------------------------------------------------------------------
1 | package org.wltea.analyzer.lucene;
2 |
3 | import java.io.IOException;
4 | import java.io.InputStream;
5 | import java.io.Reader;
6 | import java.util.List;
7 | import java.util.Map;
8 | import org.apache.lucene.analysis.Tokenizer;
9 | import org.apache.lucene.analysis.util.ResourceLoader;
10 | import org.apache.lucene.util.AttributeSource.AttributeFactory;
11 | import org.wltea.analyzer.dic.Dictionary;
12 |
13 | import com.mlcs.search.mlcsseg.lucene.ReloadableTokenizerFactory;
14 | import com.mlcs.search.mlcsseg.lucene.ReloaderRegister;
15 |
16 | public class IKTokenizerFactory extends ReloadableTokenizerFactory {
17 |
18 |
19 | public IKTokenizerFactory(Map<String, String> args) {
20 | super(args);
21 |
22 | useSmart = getBoolean(args, "useSmart", false);
23 | System.out.println(":::ik:construction::::::::::::::::::::::::::" + conf);
24 | }
25 | private boolean useSmart = false;
26 |
27 | private boolean useSmart() {
28 | return useSmart;
29 | }
30 |
31 |
32 | // this is the hook through which Solr obtains the tokenizer
33 | public Tokenizer create(AttributeFactory attributeFactory, Reader in) { // called many times
34 | return new IKTokenizer(in, this.useSmart()); // initializes the dictionary, segmenter and arbitrator
35 | }
36 |
37 | public void inform(ResourceLoader loader) throws IOException { // initialized once at startup
38 | System.out.println(":::ik:::inform::::::::::::::::::::::::" + conf);
39 | ReloaderRegister.register(this, loader, conf);
40 | }
41 |
42 |
43 |
44 | @Override
45 | public void update(List<InputStream> inputStreams) {
46 | Dictionary.addDic2MainDic(inputStreams);
47 | }
48 |
49 |
50 | }
51 |
--------------------------------------------------------------------------------
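In Solr the factory is declared on a fieldType in schema.xml, and inform() registers it with ReloaderRegister for scheduled dictionary reloads. The sketch below constructs it by hand instead; the "conf" argument name is an assumption based on the conf field printed above, and only "useSmart" is read directly in this class.

    import java.io.StringReader;
    import java.util.HashMap;
    import java.util.Map;

    import org.apache.lucene.analysis.Tokenizer;
    import org.wltea.analyzer.lucene.IKTokenizerFactory;

    // Hand-built factory arguments, mirroring what Solr would pass from schema.xml attributes.
    public class IKFactoryUsage {
        public static void main(String[] args) throws Exception {
            Map<String, String> factoryArgs = new HashMap<String, String>();
            factoryArgs.put("useSmart", "true");
            factoryArgs.put("conf", "ik.conf");   // assumed attribute name for the reload descriptor
            IKTokenizerFactory factory = new IKTokenizerFactory(factoryArgs);
            Tokenizer tokenizer = factory.create(new StringReader("中文分词测试"));
            // the tokenizer is then consumed through the usual reset()/incrementToken() loop
        }
    }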
/mlcsseg-ik/src/main/resources/quantifier.dic:
--------------------------------------------------------------------------------
1 | 丈
2 | 下
3 | 世
4 | 世纪
5 | 两
6 | 个
7 | 中
8 | 串
9 | 亩
10 | 人
11 | 介
12 | 付
13 | 代
14 | 件
15 | 任
16 | 份
17 | 伏
18 | 伙
19 | 位
20 | 位数
21 | 例
22 | 倍
23 | 像素
24 | 元
25 | 克
26 | 克拉
27 | 公亩
28 | 公克
29 | 公分
30 | 公升
31 | 公尺
32 | 公担
33 | 公斤
34 | 公里
35 | 公顷
36 | 具
37 | 册
38 | 出
39 | 刀
40 | 分
41 | 分钟
42 | 分米
43 | 划
44 | 列
45 | 则
46 | 刻
47 | 剂
48 | 剑
49 | 副
50 | 加仑
51 | 勺
52 | 包
53 | 匙
54 | 匹
55 | 区
56 | 千克
57 | 千米
58 | 升
59 | 卷
60 | 厅
61 | 厘
62 | 厘米
63 | 双
64 | 发
65 | 口
66 | 句
67 | 只
68 | 台
69 | 叶
70 | 号
71 | 名
72 | 吨
73 | 听
74 | 员
75 | 周
76 | 周年
77 | 品
78 | 回
79 | 团
80 | 圆
81 | 圈
82 | 地
83 | 场
84 | 块
85 | 坪
86 | 堆
87 | 声
88 | 壶
89 | 处
90 | 夜
91 | 大
92 | 天
93 | 头
94 | 套
95 | 女
96 | 孔
97 | 字
98 | 宗
99 | 室
100 | 家
101 | 寸
102 | 对
103 | 封
104 | 尊
105 | 小时
106 | 尺
107 | 尾
108 | 局
109 | 层
110 | 届
111 | 岁
112 | 师
113 | 帧
114 | 幅
115 | 幕
116 | 幢
117 | 平方
118 | 平方公尺
119 | 平方公里
120 | 平方分米
121 | 平方厘米
122 | 平方码
123 | 平方米
124 | 平方英寸
125 | 平方英尺
126 | 平方英里
127 | 平米
128 | 年
129 | 年代
130 | 年级
131 | 度
132 | 座
133 | 式
134 | 引
135 | 张
136 | 成
137 | 战
138 | 截
139 | 户
140 | 房
141 | 所
142 | 扇
143 | 手
144 | 打
145 | 批
146 | 把
147 | 折
148 | 担
149 | 拍
150 | 招
151 | 拨
152 | 拳
153 | 指
154 | 掌
155 | 排
156 | 撮
157 | 支
158 | 文
159 | 斗
160 | 斤
161 | 方
162 | 族
163 | 日
164 | 时
165 | 曲
166 | 月
167 | 月份
168 | 期
169 | 本
170 | 朵
171 | 村
172 | 束
173 | 条
174 | 来
175 | 杯
176 | 枚
177 | 枝
178 | 枪
179 | 架
180 | 柄
181 | 柜
182 | 栋
183 | 栏
184 | 株
185 | 样
186 | 根
187 | 格
188 | 案
189 | 桌
190 | 档
191 | 桩
192 | 桶
193 | 梯
194 | 棵
195 | 楼
196 | 次
197 | 款
198 | 步
199 | 段
200 | 毛
201 | 毫
202 | 毫升
203 | 毫米
204 | 毫克
205 | 池
206 | 洲
207 | 派
208 | 海里
209 | 滴
210 | 炮
211 | 点
212 | 点钟
213 | 片
214 | 版
215 | 环
216 | 班
217 | 瓣
218 | 瓶
219 | 生
220 | 男
221 | 画
222 | 界
223 | 盆
224 | 盎司
225 | 盏
226 | 盒
227 | 盘
228 | 相
229 | 眼
230 | 石
231 | 码
232 | 碗
233 | 碟
234 | 磅
235 | 种
236 | 科
237 | 秒
238 | 秒钟
239 | 窝
240 | 立方公尺
241 | 立方分米
242 | 立方厘米
243 | 立方码
244 | 立方米
245 | 立方英寸
246 | 立方英尺
247 | 站
248 | 章
249 | 笔
250 | 等
251 | 筐
252 | 筒
253 | 箱
254 | 篇
255 | 篓
256 | 篮
257 | 簇
258 | 米
259 | 类
260 | 粒
261 | 级
262 | 组
263 | 维
264 | 缕
265 | 缸
266 | 罐
267 | 网
268 | 群
269 | 股
270 | 脚
271 | 船
272 | 艇
273 | 艘
274 | 色
275 | 节
276 | 英亩
277 | 英寸
278 | 英尺
279 | 英里
280 | 行
281 | 袋
282 | 角
283 | 言
284 | 课
285 | 起
286 | 趟
287 | 路
288 | 车
289 | 转
290 | 轮
291 | 辆
292 | 辈
293 | 连
294 | 通
295 | 遍
296 | 部
297 | 里
298 | 重
299 | 针
300 | 钟
301 | 钱
302 | 锅
303 | 门
304 | 间
305 | 队
306 | 阶段
307 | 隅
308 | 集
309 | 页
310 | 顶
311 | 顷
312 | 项
313 | 顿
314 | 颗
315 | 餐
316 | 首
--------------------------------------------------------------------------------
/mlcsseg-ik/src/test/java/org/wltea/analyzer/test/TestIk.java:
--------------------------------------------------------------------------------
1 | package org.wltea.analyzer.test;
2 |
3 | import java.io.IOException;
4 | import java.io.StringReader;
5 |
6 | import org.wltea.analyzer.core.IKSegmenter;
7 |
8 | public class TestIk {
9 | public static void main(String[] args) throws IOException {
10 | IKSegmenter ik = new IKSegmenter(new StringReader(""), true);
11 | ik.next();
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
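TestIk only advances the segmenter once over an empty reader. A slightly more useful smoke test, sketched here rather than taken from the repo, iterates next() over a real sentence and prints each Lexeme with its offsets:

    import java.io.IOException;
    import java.io.StringReader;

    import org.wltea.analyzer.core.IKSegmenter;
    import org.wltea.analyzer.core.Lexeme;

    // Iterate the IK segmenter over an arbitrary sentence and print every lexeme.
    public class TestIkLoop {
        public static void main(String[] args) throws IOException {
            IKSegmenter ik = new IKSegmenter(new StringReader("中华人民共和国国歌"), true);
            Lexeme lexeme;
            while ((lexeme = ik.next()) != null) {
                System.out.println(lexeme.getLexemeText()
                        + " [" + lexeme.getBeginPosition() + "," + lexeme.getEndPosition() + ")");
            }
        }
    }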
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 | <modelVersion>4.0.0</modelVersion>
4 |
5 | <groupId>com.mlcs.search</groupId>
6 | <artifactId>mlcsseg</artifactId>
7 | <version>4.6.0-SNAPSHOT</version>
8 | <packaging>pom</packaging>
9 |
10 | <name>mlcsseg</name>
11 | <url>http://maven.apache.org</url>
12 |
13 | <properties>
14 | <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
15 | </properties>
16 |
17 | <modules>
18 | <module>mlcsseg-common</module>
19 | <module>mlcsseg-ik</module>
20 | <module>mlcsseg-filter</module>
21 | <module>mlcsseg-ansj</module>
22 | </modules>
23 |
24 | <dependencies>
25 | <dependency>
26 | <groupId>org.apache.lucene</groupId>
27 | <artifactId>lucene-analyzers-common</artifactId>
28 | <version>4.6.1</version>
29 | </dependency>
30 | </dependencies>
31 |
32 | </project>
--------------------------------------------------------------------------------
/test1/conf/admin-extra.html:
--------------------------------------------------------------------------------
1 |
17 |
18 |
32 |
--------------------------------------------------------------------------------
/test1/conf/admin-extra.menu-bottom.html:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/test1/conf/admin-extra.menu-top.html:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/test1/conf/extDic.txt:
--------------------------------------------------------------------------------
1 |
2 | 七匹狼
3 | 秋装
4 | 伊莲娜
5 | 格男仕
6 | 李东垣
7 | 卡扎菲
8 | 大舒服
9 | 惠国吉
10 | 楠
11 | 木
12 | 金
13 | 丝
--------------------------------------------------------------------------------
/test1/conf/extDic1.txt:
--------------------------------------------------------------------------------
1 |
2 | 古妃奇
3 | 简直笨
4 | 并发编程
5 | 穆定喜
--------------------------------------------------------------------------------
/test1/conf/ik.conf:
--------------------------------------------------------------------------------
1 | lastupdate=11223
2 | files=extDic.txt,extDic1.txt,synonyms.txt,isynonyms.txt
--------------------------------------------------------------------------------
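ik.conf appears to be a plain key=value descriptor: files lists the dictionary files to load from the core's conf directory, and lastupdate looks like a change marker that is bumped to signal a reload (the actual polling lives in ReloaderRegister/ScheduledExecutor, which are not shown here). A hedged sketch of reading such a file with java.util.Properties, under those assumptions:

    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.util.Arrays;
    import java.util.List;
    import java.util.Properties;

    // Reads an ik.conf-style descriptor; the interpretation of the keys is an assumption,
    // not taken from the project's reload code.
    public class IkConfReader {
        public static void main(String[] args) throws IOException {
            Properties props = new Properties();
            props.load(new InputStreamReader(new FileInputStream("test1/conf/ik.conf"), "UTF-8"));
            long lastUpdate = Long.parseLong(props.getProperty("lastupdate", "0").trim());
            List<String> files = Arrays.asList(props.getProperty("files", "").split(","));
            System.out.println("lastupdate=" + lastUpdate + ", files=" + files);
        }
    }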
/test1/conf/isynonyms.txt:
--------------------------------------------------------------------------------
1 | # The ASF licenses this file to You under the Apache License, Version 2.0
2 | # (the "License"); you may not use this file except in compliance with
3 | # the License. You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | #-----------------------------------------------------------------------
14 | #some test synonym mappings unlikely to appear in real input text
15 | aaafoo => aaabar
16 | bbbfoo => bbbfoo bbbbar
17 | cccfoo => cccbar cccbaz
18 | fooaaa,baraaa,bazaaa
19 |
20 | # Some synonym groups specific to this example
21 | GB,gib,gigabyte,gigabytes
22 | MB,mib,megabyte,megabytes
23 | Television, Televisions, TV, TVs
24 | #notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
25 | #after us won't split it into two words.
26 |
27 | # Synonym mappings can be used for spelling correction too
28 | 男式=>男
29 |
--------------------------------------------------------------------------------
/test1/conf/schema.xml:
--------------------------------------------------------------------------------
1 |
2 |
18 |
19 |
47 |
48 |
49 |
65 |
66 |
67 |
92 |
93 |
99 |
100 |
101 |
102 |
103 |
104 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
121 | id
122 |
123 |
124 |
125 |
126 |
133 |
134 |
140 |
141 |
144 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
184 |
185 |
188 |
189 |
190 |
191 |
192 |
193 |
203 |
204 |
205 |
206 |
207 |
208 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
265 |
266 |
267 |
278 |
279 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
305 |
306 |
307 |
308 |
309 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
337 |
338 |
339 |
340 |
345 |
354 |
355 |
356 |
357 |
362 |
367 |
368 |
369 |
--------------------------------------------------------------------------------
/test1/conf/stop.conf:
--------------------------------------------------------------------------------
1 | lastupdate=111221
2 | files=stopwords.txt
--------------------------------------------------------------------------------
/test1/conf/stopwords.txt:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 和
16 |
--------------------------------------------------------------------------------
/test1/conf/synonym.conf:
--------------------------------------------------------------------------------
1 | lastupdate=12
2 | files=synonyms.txt
--------------------------------------------------------------------------------
/test1/conf/synonym2.conf:
--------------------------------------------------------------------------------
1 | lastupdate=12
2 | files=isynonyms.txt
--------------------------------------------------------------------------------
/test1/conf/synonyms.txt:
--------------------------------------------------------------------------------
1 | # The ASF licenses this file to You under the Apache License, Version 2.0
2 | # (the "License"); you may not use this file except in compliance with
3 | # the License. You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | #-----------------------------------------------------------------------
14 | #some test synonym mappings unlikely to appear in real input text
15 | aaafoo => aaabar
16 | bbbfoo => bbbfoo bbbbar
17 | cccfoo => cccbar cccbaz
18 | fooaaa,baraaa,bazaaa
19 |
20 | # Some synonym groups specific to this example
21 | GB,gib,gigabyte,gigabytes
22 | MB,mib,megabyte,megabytes
23 | Television, Televisions, TV, TVs
24 | #notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
25 | #after us won't split it into two words.
26 |
27 | # Synonym mappings can be used for spelling correction too
28 | pixima => pixma
29 | 李东垣 => 李东阳
30 | 卡扎菲,卡扎渣,卡炸飞
31 | 穆定喜 => 木丁西
--------------------------------------------------------------------------------
/test1/conf/update-script.js:
--------------------------------------------------------------------------------
1 | /*
2 | This is a basic skeleton JavaScript update processor.
3 |
4 | In order for this to be executed, it must be properly wired into solrconfig.xml; by default it is commented out in
5 | the example solrconfig.xml and must be uncommented to be enabled.
6 |
7 | See http://wiki.apache.org/solr/ScriptUpdateProcessor for more details.
8 | */
9 |
10 | function processAdd(cmd) {
11 |
12 | doc = cmd.solrDoc; // org.apache.solr.common.SolrInputDocument
13 | id = doc.getFieldValue("id");
14 | logger.info("update-script#processAdd: id=" + id);
15 |
16 | // Set a field value:
17 | // doc.setField("foo_s", "whatever");
18 |
19 | // Get a configuration parameter:
20 | // config_param = params.get('config_param'); // "params" only exists if processor configured with
21 |
22 | // Get a request parameter:
23 | // some_param = req.getParams().get("some_param")
24 |
25 | // Add a field of field names that match a pattern:
26 | // - Potentially useful to determine the fields/attributes represented in a result set, via faceting on field_name_ss
27 | // field_names = doc.getFieldNames().toArray();
28 | // for(i=0; i < field_names.length; i++) {
29 | // field_name = field_names[i];
30 | // if (/attr_.*/.test(field_name)) { doc.addField("attribute_ss", field_names[i]); }
31 | // }
32 |
33 | }
34 |
35 | function processDelete(cmd) {
36 | // no-op
37 | }
38 |
39 | function processMergeIndexes(cmd) {
40 | // no-op
41 | }
42 |
43 | function processCommit(cmd) {
44 | // no-op
45 | }
46 |
47 | function processRollback(cmd) {
48 | // no-op
49 | }
50 |
51 | function finish() {
52 | // no-op
53 | }
54 |
--------------------------------------------------------------------------------
/test2/conf/admin-extra.html:
--------------------------------------------------------------------------------
1 |
17 |
18 |
32 |
--------------------------------------------------------------------------------
/test2/conf/admin-extra.menu-bottom.html:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/test2/conf/admin-extra.menu-top.html:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/test2/conf/ansj.conf:
--------------------------------------------------------------------------------
1 | lastupdate=1226
2 | files=extDic.txt,extDic1.txt
--------------------------------------------------------------------------------
/test2/conf/extDic.txt:
--------------------------------------------------------------------------------
1 |
2 | 七匹狼
3 | 秋装
4 | 伊莲娜
5 | 格男仕
6 | 李东垣
7 | 卡扎菲
8 | 大舒服
9 |
--------------------------------------------------------------------------------
/test2/conf/extDic1.txt:
--------------------------------------------------------------------------------
1 |
2 | 古妃奇
3 | 简直笨
4 |
--------------------------------------------------------------------------------
/test2/conf/isynonyms.txt:
--------------------------------------------------------------------------------
1 | # The ASF licenses this file to You under the Apache License, Version 2.0
2 | # (the "License"); you may not use this file except in compliance with
3 | # the License. You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | #-----------------------------------------------------------------------
14 | #some test synonym mappings unlikely to appear in real input text
15 | aaafoo => aaabar
16 | bbbfoo => bbbfoo bbbbar
17 | cccfoo => cccbar cccbaz
18 | fooaaa,baraaa,bazaaa
19 |
20 | # Some synonym groups specific to this example
21 | GB,gib,gigabyte,gigabytes
22 | MB,mib,megabyte,megabytes
23 | Television, Televisions, TV, TVs
24 | #notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
25 | #after us won't split it into two words.
26 |
27 | # Synonym mappings can be used for spelling correction too
28 | 男式=>男
29 |
--------------------------------------------------------------------------------
/test2/conf/schema.xml:
--------------------------------------------------------------------------------
1 |
2 |
18 |
19 |
47 |
48 |
49 |
65 |
66 |
67 |
92 |
93 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
122 | id
123 |
124 |
125 |
126 |
127 |
134 |
135 |
141 |
142 |
145 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
185 |
186 |
189 |
190 |
191 |
192 |
193 |
194 |
204 |
205 |
206 |
207 |
208 |
209 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
266 |
267 |
268 |
279 |
280 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
306 |
307 |
308 |
309 |
310 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
339 |
348 |
349 |
350 |
351 |
356 |
361 |
362 |
363 |
--------------------------------------------------------------------------------
/test2/conf/stop.conf:
--------------------------------------------------------------------------------
1 | lastupdate=11122
2 | files=stopwords.txt
--------------------------------------------------------------------------------
/test2/conf/stopwords.txt:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 和
16 | 你
17 |
--------------------------------------------------------------------------------
/test2/conf/synonym.conf:
--------------------------------------------------------------------------------
1 | lastupdate=1
2 | files=synonyms.txt
--------------------------------------------------------------------------------
/test2/conf/synonym2.conf:
--------------------------------------------------------------------------------
1 | lastupdate=12
2 | files=isynonyms.txt
--------------------------------------------------------------------------------
/test2/conf/synonyms.txt:
--------------------------------------------------------------------------------
1 | # The ASF licenses this file to You under the Apache License, Version 2.0
2 | # (the "License"); you may not use this file except in compliance with
3 | # the License. You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | #-----------------------------------------------------------------------
14 | #some test synonym mappings unlikely to appear in real input text
15 | aaafoo => aaabar
16 | bbbfoo => bbbfoo bbbbar
17 | cccfoo => cccbar cccbaz
18 | fooaaa,baraaa,bazaaa
19 |
20 | # Some synonym groups specific to this example
21 | GB,gib,gigabyte,gigabytes
22 | MB,mib,megabyte,megabytes
23 | Television, Televisions, TV, TVs
24 | #notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
25 | #after us won't split it into two words.
26 |
27 | # Synonym mappings can be used for spelling correction too
28 | pixima => pixma
29 | 李东垣 => 李东阳
30 | 卡扎菲,卡扎渣,卡炸飞
31 |
--------------------------------------------------------------------------------
/test2/conf/update-script.js:
--------------------------------------------------------------------------------
1 | /*
2 | This is a basic skeleton JavaScript update processor.
3 |
4 | In order for this to be executed, it must be properly wired into solrconfig.xml; by default it is commented out in
5 | the example solrconfig.xml and must be uncommented to be enabled.
6 |
7 | See http://wiki.apache.org/solr/ScriptUpdateProcessor for more details.
8 | */
9 |
10 | function processAdd(cmd) {
11 |
12 | doc = cmd.solrDoc; // org.apache.solr.common.SolrInputDocument
13 | id = doc.getFieldValue("id");
14 | logger.info("update-script#processAdd: id=" + id);
15 |
16 | // Set a field value:
17 | // doc.setField("foo_s", "whatever");
18 |
19 | // Get a configuration parameter:
20 | // config_param = params.get('config_param'); // "params" only exists if processor configured with
21 |
22 | // Get a request parameter:
23 | // some_param = req.getParams().get("some_param")
24 |
25 | // Add a field of field names that match a pattern:
26 | // - Potentially useful to determine the fields/attributes represented in a result set, via faceting on field_name_ss
27 | // field_names = doc.getFieldNames().toArray();
28 | // for(i=0; i < field_names.length; i++) {
29 | // field_name = field_names[i];
30 | // if (/attr_.*/.test(field_name)) { doc.addField("attribute_ss", field_names[i]); }
31 | // }
32 |
33 | }
34 |
35 | function processDelete(cmd) {
36 | // no-op
37 | }
38 |
39 | function processMergeIndexes(cmd) {
40 | // no-op
41 | }
42 |
43 | function processCommit(cmd) {
44 | // no-op
45 | }
46 |
47 | function processRollback(cmd) {
48 | // no-op
49 | }
50 |
51 | function finish() {
52 | // no-op
53 | }
54 |
--------------------------------------------------------------------------------
|