├── lib └── nlp-lang-1.7.8.jar ├── .gitignore ├── .travis.yml ├── src ├── test │ ├── resources │ │ └── log4j.properties │ └── java │ │ └── org │ │ └── elasticsearch │ │ └── index │ │ └── analysis │ │ ├── PinyinAlphabetTokenizerTest.java │ │ └── PinyinAnalysisTest.java └── main │ ├── java │ └── org │ │ └── elasticsearch │ │ ├── index │ │ └── analysis │ │ │ ├── ConfigErrorException.java │ │ │ ├── PinyinAnalyzer.java │ │ │ ├── TermItem.java │ │ │ ├── PinyinTokenizerFactory.java │ │ │ ├── MultiplePinyinTokenizerFactory.java │ │ │ ├── PinyinTokenFilterFactory.java │ │ │ ├── MultiplePinyinTokenFilterFactory.java │ │ │ ├── PinyinAnalyzerProvider.java │ │ │ ├── PinyinAbbreviationsTokenizerFactory.java │ │ │ ├── PinyinAlphabetTokenizer.java │ │ │ ├── PinyinTokenFilter.java │ │ │ ├── PinyinTokenizer.java │ │ │ ├── MultiplePinyinTokenFilter.java │ │ │ └── MultiplePinyinTokenizer.java │ │ ├── plugin │ │ └── analysis │ │ │ └── pinyin │ │ │ └── AnalysisPinyinPlugin.java │ │ └── analysis │ │ └── PinyinConfig.java │ ├── assemblies │ └── plugin.xml │ └── resources │ ├── plugin-descriptor.properties │ └── pinyin_alphabet.dict ├── LICENSE.txt ├── README.md └── pom.xml /lib/nlp-lang-1.7.8.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RickyHuo/elasticsearch-analysis-pinyin/HEAD/lib/nlp-lang-1.7.8.jar -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /data 2 | /work 3 | /logs 4 | /.idea 5 | /target 6 | .DS_Store 7 | *.iml 8 | /.project 9 | /.settings 10 | /.classpath 11 | /*.ipr 12 | /*.iws 13 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | language: java 3 | jdk: 4 | - oraclejdk8 5 | install: true 6 | script: 7 | - sudo apt-get update && sudo apt-get install oracle-java8-installer 8 | - java -version 9 | - mvn clean package 10 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, out 2 | 3 | log4j.appender.out=org.apache.log4j.ConsoleAppender 4 | log4j.appender.out.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.out.layout.conversionPattern=[%d{ISO8601}][%-5p][%-25c] %m%n 6 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/ConfigErrorException.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | /** 4 | * Created by medcl on 16/8/22. 
5 | */ 6 | public class ConfigErrorException extends RuntimeException { 7 | private final String message; 8 | 9 | public ConfigErrorException(String message) { 10 | this.message=message; 11 | } 12 | public String getMessage() { 13 | return this.message; 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/PinyinAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.elasticsearch.analysis.PinyinConfig; 5 | import org.elasticsearch.common.settings.Settings; 6 | 7 | /** 8 | * Created by IntelliJ IDEA. 9 | * User: Medcl' 10 | * Date: 12-5-22 11 | * Time: 10:39 AM 12 | */ 13 | public final class PinyinAnalyzer extends Analyzer { 14 | 15 | private PinyinConfig config; 16 | 17 | public PinyinAnalyzer(PinyinConfig config) { 18 | this.config=config; 19 | } 20 | 21 | @Override 22 | protected TokenStreamComponents createComponents(String fieldName) { 23 | return new TokenStreamComponents(new PinyinTokenizer(config)); 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/TermItem.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | /** 4 | * Created by IntelliJ IDEA. 5 | * User: Medcl' 6 | * Date: 12-5-21 7 | * Time: 5:53 PM 8 | */ 9 | 10 | public class TermItem implements Comparable<TermItem> { 11 | String term; 12 | int startOffset; 13 | int endOffset; 14 | int position; 15 | public TermItem(String term,int startOffset,int endOffset,int position){ 16 | this.term=term; 17 | this.startOffset=startOffset; 18 | this.endOffset=endOffset; 19 | this.position=position; 20 | } 21 | 22 | @Override 23 | public String toString() { 24 | return term; 25 | } 26 | 27 | @Override 28 | public int compareTo(TermItem o) { 29 | return this.position-o.position; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/PinyinTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.elasticsearch.analysis.PinyinConfig; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | 9 | public class PinyinTokenizerFactory extends AbstractTokenizerFactory { 10 | 11 | private PinyinConfig config; 12 | 13 | public PinyinTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { 14 | super(indexSettings, name, settings); 15 | config=new PinyinConfig(settings); 16 | } 17 | 18 | @Override 19 | public Tokenizer create() { 20 | return new PinyinTokenizer(config); 21 | } 22 | } 23 | 24 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/MultiplePinyinTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.elasticsearch.analysis.PinyinConfig; 5 | import org.elasticsearch.common.settings.Settings; 6 | import 
org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | 9 | public class MultiplePinyinTokenizerFactory extends AbstractTokenizerFactory { 10 | 11 | private PinyinConfig config; 12 | 13 | public MultiplePinyinTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { 14 | super(indexSettings, name, settings); 15 | config=new PinyinConfig(settings); 16 | } 17 | 18 | @Override 19 | public Tokenizer create() { 20 | return new MultiplePinyinTokenizer(config); 21 | } 22 | } 23 | 24 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/PinyinTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | 4 | import org.apache.lucene.analysis.TokenStream; 5 | import org.elasticsearch.analysis.PinyinConfig; 6 | import org.elasticsearch.common.settings.Settings; 7 | import org.elasticsearch.env.Environment; 8 | import org.elasticsearch.index.IndexSettings; 9 | 10 | public class PinyinTokenFilterFactory extends AbstractTokenFilterFactory { 11 | private PinyinConfig config; 12 | 13 | 14 | public PinyinTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { 15 | super(indexSettings, name, settings); 16 | config=new PinyinConfig(settings); 17 | } 18 | 19 | @Override 20 | public TokenStream create(TokenStream tokenStream) { 21 | return new PinyinTokenFilter(tokenStream, config); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/MultiplePinyinTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | 4 | import org.apache.lucene.analysis.TokenStream; 5 | import org.elasticsearch.analysis.PinyinConfig; 6 | import org.elasticsearch.common.settings.Settings; 7 | import org.elasticsearch.env.Environment; 8 | import org.elasticsearch.index.IndexSettings; 9 | 10 | public class MultiplePinyinTokenFilterFactory extends AbstractTokenFilterFactory { 11 | private PinyinConfig config; 12 | 13 | 14 | public MultiplePinyinTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { 15 | super(indexSettings, name, settings); 16 | config=new PinyinConfig(settings); 17 | } 18 | 19 | @Override 20 | public TokenStream create(TokenStream tokenStream) { 21 | return new MultiplePinyinTokenFilter(tokenStream, config); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/PinyinAnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.elasticsearch.analysis.PinyinConfig; 4 | import org.elasticsearch.common.inject.Inject; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | 9 | /** 10 | */ 11 | public class PinyinAnalyzerProvider extends AbstractIndexAnalyzerProvider<PinyinAnalyzer> { 12 | 13 | private final PinyinAnalyzer analyzer; 14 | private PinyinConfig config; 15 | 16 | @Inject 17 | public PinyinAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { 18 | 
super(indexSettings, name, settings); 19 | config=new PinyinConfig(settings); 20 | analyzer = new PinyinAnalyzer(config); 21 | } 22 | 23 | @Override 24 | public PinyinAnalyzer get() { 25 | return this.analyzer; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/test/java/org/elasticsearch/index/analysis/PinyinAlphabetTokenizerTest.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import java.util.Arrays; 4 | 5 | import org.junit.Assert; 6 | import org.junit.Test; 7 | 8 | /** 9 | * 拼音串切分,很难做到最好,认为取最少切分是最好的 10 | * 11 | * @author shenyanchao 12 | * @since 2018-10-08 12:22 13 | */ 14 | public class PinyinAlphabetTokenizerTest { 15 | 16 | @Test 17 | public void walk() throws Exception { 18 | 19 | Assert.assertEquals(Arrays.asList("xian").toString(), PinyinAlphabetTokenizer.walk("xian").toString()); 20 | Assert.assertEquals(Arrays.asList("wo", "shi", "liang").toString(), 21 | PinyinAlphabetTokenizer.walk("woshiliang").toString()); 22 | 23 | Assert.assertEquals(Arrays.asList("zhong", "hua", "ren", "min", "gong", "he", "guo").toString(), 24 | PinyinAlphabetTokenizer.walk("zhonghuarenmingongheguo").toString()); 25 | Assert.assertEquals( 26 | Arrays.asList("5", "zhong", "hua", "ren", "89", "min", "gong", "he", "guo", "234").toString(), 27 | PinyinAlphabetTokenizer.walk("5zhonghuaren89mingongheguo234").toString()); 28 | } 29 | 30 | } -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/PinyinAbbreviationsTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.elasticsearch.analysis.PinyinConfig; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | 9 | public class PinyinAbbreviationsTokenizerFactory extends AbstractTokenizerFactory { 10 | 11 | public PinyinAbbreviationsTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { 12 | super(indexSettings, name, settings); 13 | } 14 | 15 | @Override 16 | public Tokenizer create() { 17 | PinyinConfig config=new PinyinConfig(); 18 | config.keepFirstLetter=true; 19 | config.keepFullPinyin=false; 20 | config.keepNoneChinese=false; 21 | config.keepNoneChineseTogether=true; 22 | config.noneChinesePinyinTokenize=false; 23 | config.keepOriginal=false; 24 | config.lowercase=true; 25 | config.trimWhitespace=true; 26 | config.keepNoneChineseInFirstLetter=true; 27 | return new PinyinTokenizer(config); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/assemblies/plugin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | plugin 4 | 5 | zip 6 | 7 | false 8 | 9 | 10 | ${project.basedir}/src/main/resources/plugin-descriptor.properties 11 | 12 | true 13 | 14 | 15 | 16 | 17 | / 18 | true 19 | true 20 | 21 | org.elasticsearch:elasticsearch 22 | 23 | 24 | 25 | / 26 | true 27 | true 28 | 29 | org.apache.lucene:lucene-pinyin 30 | 31 | 32 | 33 | 34 | 35 | ${basedir}/lib/ 36 | / 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/plugin/analysis/pinyin/AnalysisPinyinPlugin.java: 
-------------------------------------------------------------------------------- 1 | package org.elasticsearch.plugin.analysis.pinyin; 2 | 3 | 4 | import org.apache.lucene.analysis.Analyzer; 5 | import org.elasticsearch.index.analysis.*; 6 | import org.elasticsearch.indices.analysis.AnalysisModule; 7 | import org.elasticsearch.plugins.AnalysisPlugin; 8 | import org.elasticsearch.plugins.Plugin; 9 | 10 | import java.util.Collections; 11 | import java.util.HashMap; 12 | import java.util.Map; 13 | 14 | 15 | public class AnalysisPinyinPlugin extends Plugin implements AnalysisPlugin { 16 | 17 | @Override 18 | public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() { 19 | Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> extra = new HashMap<>(); 20 | extra.put("pinyin", PinyinTokenizerFactory::new); 21 | extra.put("multiple_pinyin", MultiplePinyinTokenizerFactory::new); 22 | extra.put("pinyin_first_letter", PinyinAbbreviationsTokenizerFactory::new); 23 | return extra; 24 | } 25 | 26 | @Override 27 | public Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> getTokenFilters() { 28 | Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> extra = new HashMap<>(); 29 | extra.put("pinyin", PinyinTokenFilterFactory::new); 30 | extra.put("multiple_pinyin", MultiplePinyinTokenFilterFactory::new); 31 | return extra; 32 | } 33 | 34 | @Override 35 | public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() { 36 | return Collections.singletonMap("pinyin", PinyinAnalyzerProvider::new); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/resources/plugin-descriptor.properties: -------------------------------------------------------------------------------- 1 | # Elasticsearch plugin descriptor file 2 | # This file must exist as 'plugin-descriptor.properties' at 3 | # the root directory of all plugins. 4 | # 5 | # A plugin can be 'site', 'jvm', or both. 6 | # 7 | ### example site plugin for "foo": 8 | # 9 | # foo.zip <-- zip file for the plugin, with this structure: 10 | # _site/ <-- the contents that will be served 11 | # plugin-descriptor.properties <-- example contents below: 12 | # 13 | # site=true 14 | # description=My cool plugin 15 | # version=1.0 16 | # 17 | ### example jvm plugin for "foo" 18 | # 19 | # foo.zip <-- zip file for the plugin, with this structure: 20 | # <arbitrary name1>.jar <-- classes, resources, dependencies 21 | # <arbitrary nameN>.jar <-- any number of jars 22 | # plugin-descriptor.properties <-- example contents below: 23 | # 24 | # jvm=true 25 | # classname=foo.bar.BazPlugin 26 | # description=My cool plugin 27 | # version=2.0.0-rc1 28 | # elasticsearch.version=2.0 29 | # java.version=1.7 30 | # 31 | ### mandatory elements for all plugins: 32 | # 33 | # 'description': simple summary of the plugin 34 | description=${project.description} 35 | # 36 | # 'version': plugin's version 37 | version=${project.version} 38 | # 39 | # 'name': the plugin name 40 | name=${elasticsearch.plugin.name} 41 | 42 | # 43 | # 'classname': the name of the class to load, fully-qualified. 44 | classname=${elasticsearch.plugin.classname} 45 | # 46 | # 'java.version' version of java the code is built against 47 | # use the system property java.specification.version 48 | # version string must be a sequence of nonnegative decimal integers 49 | # separated by "."'s and may have leading zeros 50 | java.version=${maven.compiler.target} 51 | # 52 | # 'elasticsearch.version' version of elasticsearch compiled against 53 | # You will have to release a new version of the plugin for each new 54 | # elasticsearch release. This version is checked when the plugin 55 | # is loaded so Elasticsearch will refuse to start in the presence of 56 | # plugins with the incorrect elasticsearch.version. 
57 | elasticsearch.version=${elasticsearch.version} 58 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/analysis/PinyinConfig.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.analysis; 2 | 3 | import org.elasticsearch.common.settings.Settings; 4 | 5 | /** 6 | * Created by medcl on 15/11/26. 7 | */ 8 | public class PinyinConfig { 9 | 10 | public boolean lowercase=true; 11 | public boolean trimWhitespace=true; 12 | public boolean keepNoneChinese=true; 13 | public boolean keepNoneChineseInFirstLetter =true; 14 | public boolean keepNoneChineseInJoinedFullPinyin =false; 15 | public boolean keepOriginal=false; 16 | public boolean keepFirstLetter=true; 17 | public boolean keepSeparateFirstLetter=false; 18 | public boolean keepNoneChineseTogether=true; 19 | public boolean noneChinesePinyinTokenize =true; 20 | public int LimitFirstLetterLength=16; 21 | public boolean keepFullPinyin=true; 22 | public boolean keepJoinedFullPinyin =false; 23 | public boolean removeDuplicateTerm=false; 24 | public boolean fixedPinyinOffset =false; 25 | // after 6.0, offset is strictly constrained, overlapped tokens are not allowed, with this parameter, overlapped token will allowed by ignore offset, please note, all position related query or highlight will become incorrect, you should use multi fields and specify different settings for different query purpose. if you need offset, please set it to false. default: true. 26 | public boolean ignorePinyinOffset =true; 27 | 28 | public PinyinConfig() {} 29 | public PinyinConfig(Settings settings) { 30 | this.keepFirstLetter=settings.getAsBoolean("keep_first_letter",true); 31 | this.keepSeparateFirstLetter=settings.getAsBoolean("keep_separate_first_letter",false); 32 | this.keepFullPinyin=settings.getAsBoolean("keep_full_pinyin", true); 33 | this.keepJoinedFullPinyin =settings.getAsBoolean("keep_joined_full_pinyin", false); 34 | this.keepNoneChinese=settings.getAsBoolean("keep_none_chinese",true); 35 | this.keepNoneChineseTogether=settings.getAsBoolean("keep_none_chinese_together",true); 36 | this.noneChinesePinyinTokenize =settings.getAsBoolean("none_chinese_pinyin_tokenize",true); 37 | this.keepOriginal=settings.getAsBoolean("keep_original", false); 38 | this.LimitFirstLetterLength=settings.getAsInt("limit_first_letter_length", 16); 39 | this.lowercase=settings.getAsBoolean("lowercase", true); 40 | this.trimWhitespace=settings.getAsBoolean("trim_whitespace", true); 41 | this.keepNoneChineseInFirstLetter =settings.getAsBoolean("keep_none_chinese_in_first_letter", true); 42 | this.keepNoneChineseInJoinedFullPinyin =settings.getAsBoolean("keep_none_chinese_in_joined_full_pinyin", false); 43 | this.removeDuplicateTerm =settings.getAsBoolean("remove_duplicated_term", false); 44 | this.fixedPinyinOffset =settings.getAsBoolean("fixed_pinyin_offset", false); 45 | this.ignorePinyinOffset =settings.getAsBoolean("ignore_pinyin_offset", true); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/resources/pinyin_alphabet.dict: -------------------------------------------------------------------------------- 1 | a 2 | ai 3 | an 4 | ang 5 | ao 6 | b 7 | ba 8 | bai 9 | ban 10 | bang 11 | bao 12 | bei 13 | ben 14 | beng 15 | bi 16 | bian 17 | biao 18 | bie 19 | bin 20 | bing 21 | bo 22 | bu 23 | c 24 | ca 25 | cai 26 | can 27 | cang 28 | cao 29 | ce 30 | cen 31 | ceng 32 | ch 33 | cha 34 | chai 35 | 
chan 36 | chang 37 | chao 38 | che 39 | chen 40 | cheng 41 | chi 42 | chong 43 | chou 44 | chu 45 | chua 46 | chuai 47 | chuan 48 | chuang 49 | chui 50 | chun 51 | chuo 52 | ci 53 | cong 54 | cou 55 | cu 56 | cuan 57 | cui 58 | cun 59 | cuo 60 | d 61 | da 62 | dai 63 | dan 64 | dang 65 | dao 66 | de 67 | dei 68 | den 69 | deng 70 | di 71 | dia 72 | dian 73 | diao 74 | die 75 | ding 76 | diu 77 | dong 78 | dou 79 | du 80 | duan 81 | dui 82 | dun 83 | duo 84 | e 85 | er 86 | f 87 | fa 88 | fan 89 | fang 90 | fei 91 | fen 92 | feng 93 | fiao 94 | fo 95 | fou 96 | fu 97 | g 98 | ga 99 | gai 100 | gan 101 | gang 102 | gao 103 | ge 104 | gei 105 | gen 106 | geng 107 | gong 108 | gou 109 | gu 110 | gua 111 | guai 112 | guan 113 | guang 114 | gui 115 | gun 116 | guo 117 | h 118 | ha 119 | hai 120 | han 121 | hang 122 | hao 123 | he 124 | hei 125 | hen 126 | heng 127 | hong 128 | hou 129 | hu 130 | hua 131 | huai 132 | huan 133 | huang 134 | hui 135 | hun 136 | huo 137 | i 138 | j 139 | ja 140 | ji 141 | jia 142 | jian 143 | jiang 144 | jiao 145 | jie 146 | jin 147 | jing 148 | jiong 149 | jiu 150 | ju 151 | juan 152 | jue 153 | jun 154 | k 155 | ka 156 | kai 157 | kan 158 | kang 159 | kao 160 | ke 161 | kei 162 | ken 163 | keng 164 | kong 165 | kou 166 | ku 167 | kua 168 | kuai 169 | kuan 170 | kuang 171 | kui 172 | kun 173 | kuo 174 | l 175 | la 176 | lai 177 | lan 178 | lang 179 | lao 180 | le 181 | lei 182 | leng 183 | li 184 | lia 185 | lian 186 | liang 187 | liao 188 | lie 189 | lin 190 | ling 191 | liu 192 | lo 193 | long 194 | lou 195 | lu 196 | luan 197 | lun 198 | luo 199 | lv 200 | lve 201 | lü 202 | lüe 203 | m 204 | ma 205 | mai 206 | man 207 | mang 208 | mao 209 | me 210 | mei 211 | men 212 | meng 213 | mi 214 | mian 215 | miao 216 | mie 217 | min 218 | ming 219 | miu 220 | mo 221 | mou 222 | mu 223 | n 224 | na 225 | nai 226 | nan 227 | nang 228 | nao 229 | ne 230 | nei 231 | nen 232 | neng 233 | ni 234 | nian 235 | niang 236 | niao 237 | nie 238 | nin 239 | ning 240 | niu 241 | nong 242 | nou 243 | nu 244 | nuan 245 | nun 246 | nuo 247 | nv 248 | nve 249 | nü 250 | nüe 251 | o 252 | p 253 | pa 254 | pai 255 | pan 256 | pang 257 | pao 258 | pei 259 | pen 260 | peng 261 | pi 262 | pian 263 | piao 264 | pie 265 | pin 266 | ping 267 | po 268 | pou 269 | pu 270 | q 271 | qi 272 | qia 273 | qian 274 | qiang 275 | qiao 276 | qie 277 | qin 278 | qing 279 | qiong 280 | qiu 281 | qu 282 | quan 283 | que 284 | qun 285 | r 286 | ran 287 | rang 288 | rao 289 | re 290 | ren 291 | reng 292 | ri 293 | rong 294 | rou 295 | ru 296 | ruan 297 | rui 298 | run 299 | ruo 300 | s 301 | sa 302 | sai 303 | san 304 | sang 305 | sao 306 | se 307 | sen 308 | seng 309 | sh 310 | sha 311 | shai 312 | shan 313 | shang 314 | shao 315 | she 316 | shei 317 | shen 318 | sheng 319 | shi 320 | shou 321 | shu 322 | shua 323 | shuai 324 | shuan 325 | shuang 326 | shui 327 | shun 328 | shuo 329 | si 330 | song 331 | sou 332 | su 333 | suan 334 | sui 335 | sun 336 | suo 337 | t 338 | ta 339 | tai 340 | tan 341 | tang 342 | tao 343 | te 344 | teng 345 | ti 346 | tian 347 | tiao 348 | tie 349 | ting 350 | tong 351 | tou 352 | tu 353 | tuan 354 | tui 355 | tun 356 | tuo 357 | u 358 | v 359 | w 360 | wa 361 | wai 362 | wan 363 | wang 364 | wei 365 | wen 366 | weng 367 | wo 368 | wu 369 | x 370 | xi 371 | xia 372 | xian 373 | xiang 374 | xiao 375 | xie 376 | xin 377 | xing 378 | xiong 379 | xiu 380 | xu 381 | xuan 382 | xue 383 | xun 384 | y 385 | ya 386 | yai 387 | yan 388 | yang 389 | yao 390 | ye 391 | yi 392 | yin 393 | 
ying 394 | yo 395 | yong 396 | you 397 | yu 398 | yuan 399 | yue 400 | yun 401 | z 402 | za 403 | zai 404 | zan 405 | zang 406 | zao 407 | ze 408 | zei 409 | zen 410 | zeng 411 | zh 412 | zha 413 | zhai 414 | zhan 415 | zhang 416 | zhao 417 | zhe 418 | zhei 419 | zhen 420 | zheng 421 | zhi 422 | zhong 423 | zhou 424 | zhu 425 | zhua 426 | zhuai 427 | zhuan 428 | zhuang 429 | zhui 430 | zhun 431 | zhuo 432 | zi 433 | zong 434 | zou 435 | zu 436 | zuan 437 | zui 438 | zun 439 | zuo 440 | ü 441 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/PinyinAlphabetTokenizer.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.InputStream; 5 | import java.io.InputStreamReader; 6 | import java.util.*; 7 | 8 | /** 9 | * Created by medcl on 16/10/13. 10 | */ 11 | public class PinyinAlphabetTokenizer { 12 | 13 | private static final int PINYIN_MAX_LENGTH = 6; 14 | 15 | public static List walk(String text) { 16 | return segPinyinStr(text); 17 | } 18 | 19 | private static List segPinyinStr(String content) { 20 | String pinyinStr = content; 21 | pinyinStr = pinyinStr.toLowerCase(); 22 | // 按非letter切分 23 | List pinyinStrList = splitByNoletter(pinyinStr); 24 | List pinyinList = new ArrayList<>(); 25 | for (String pinyinText : pinyinStrList) { 26 | if (pinyinText.length() == 1) { 27 | pinyinList.add(pinyinText); 28 | } else { 29 | List forward = positiveMaxMatch(pinyinText, PINYIN_MAX_LENGTH); 30 | if (forward.size() == 1) { // 前向只切出1个的话,没有必要再做逆向分词 31 | pinyinList.addAll(forward); 32 | } else { 33 | // 分别正向、逆向最大匹配,选出最短的作为最优结果 34 | List backward = reverseMaxMatch(pinyinText, PINYIN_MAX_LENGTH); 35 | if (forward.size() <= backward.size()) { 36 | pinyinList.addAll(forward); 37 | } else { 38 | pinyinList.addAll(backward); 39 | } 40 | } 41 | } 42 | } 43 | return pinyinList; 44 | } 45 | 46 | private static List splitByNoletter(String pinyinStr) { 47 | List pinyinStrList = new ArrayList<>(); 48 | StringBuffer sb = new StringBuffer(); 49 | boolean lastWord = true; 50 | for (char c : pinyinStr.toCharArray()) { 51 | if ((c > 96 && c < 123) || (c > 64 && c < 91)) { 52 | if (!lastWord){ 53 | pinyinStrList.add(sb.toString()); 54 | sb.setLength(0); 55 | } 56 | sb.append(c); 57 | lastWord = true; 58 | } else { 59 | if (lastWord & sb.length()>0) { 60 | pinyinStrList.add(sb.toString()); 61 | sb.setLength(0); 62 | } 63 | sb.append(c); 64 | lastWord = false; 65 | } 66 | } 67 | if (sb.length() > 0) { 68 | pinyinStrList.add(sb.toString()); 69 | } 70 | return pinyinStrList; 71 | 72 | } 73 | 74 | private static List positiveMaxMatch(String pinyinText, int maxLength) { 75 | 76 | List pinyinList = new ArrayList<>(); 77 | StringBuffer noMatchBuffer = new StringBuffer(); 78 | for (int start = 0; start < pinyinText.length(); ) { 79 | int end = start + maxLength; 80 | if (end > pinyinText.length()) { 81 | end = pinyinText.length(); 82 | } 83 | if (start == end) { 84 | break; 85 | } 86 | String sixStr = pinyinText.substring(start, end); 87 | boolean match = false; 88 | for (int j = 0; j < sixStr.length(); j++) { 89 | String guess = sixStr.substring(0, sixStr.length() - j); 90 | if (PinyinAlphabetDict.getInstance().match(guess)) { 91 | pinyinList.add(guess); 92 | start += guess.length(); 93 | match = true; 94 | break; 95 | } 96 | } 97 | if (!match) { //没命中,向后移动一位 98 | noMatchBuffer.append(sixStr.substring(0, 1)); 99 | 
start++; 100 | }else { // 命中,加上之前没命中的,并清空 101 | if (noMatchBuffer.length() > 0) { 102 | pinyinList.add(noMatchBuffer.toString()); 103 | noMatchBuffer.setLength(0); 104 | } 105 | } 106 | } 107 | if (noMatchBuffer.length() > 0) { 108 | pinyinList.add(noMatchBuffer.toString()); 109 | noMatchBuffer.setLength(0); 110 | } 111 | 112 | return pinyinList; 113 | } 114 | 115 | private static List reverseMaxMatch(String pinyinText, int maxLength) { 116 | List pinyinList = new ArrayList<>(); 117 | StringBuffer noMatchBuffer = new StringBuffer(); 118 | for (int end = pinyinText.length(); end >= 0; ) { 119 | int start = end - maxLength; 120 | if (start < 0) { 121 | start = 0; 122 | } 123 | if (start == end) { 124 | break; 125 | } 126 | boolean match = false; 127 | String sixStr = pinyinText.substring(start, end); 128 | for (int j = 0; j < sixStr.length(); j++) { 129 | String guess = sixStr.substring(j); 130 | if (PinyinAlphabetDict.getInstance().match(guess)) { 131 | pinyinList.add(guess); 132 | end -= guess.length(); 133 | match = true; 134 | break; 135 | } 136 | } 137 | if (!match) { //一个也没命中 138 | noMatchBuffer.append(sixStr.substring(sixStr.length() - 1)); 139 | end--; 140 | } else { 141 | if (noMatchBuffer.length() > 0) { 142 | pinyinList.add(noMatchBuffer.toString()); 143 | noMatchBuffer.setLength(0); 144 | } 145 | } 146 | } 147 | 148 | if (noMatchBuffer.length() > 0) { 149 | pinyinList.add(noMatchBuffer.toString()); 150 | noMatchBuffer.setLength(0); 151 | } 152 | // reverse 保持切词顺序 153 | Collections.reverse(pinyinList); 154 | return pinyinList; 155 | } 156 | 157 | 158 | } 159 | 160 | class PinyinAlphabetDict { 161 | 162 | private static final String fileName = "/pinyin_alphabet.dict"; 163 | 164 | private Set alphabet = new HashSet(); 165 | 166 | private static PinyinAlphabetDict instance; 167 | 168 | private PinyinAlphabetDict() { 169 | InputStream in = PinyinAlphabetDict.class.getResourceAsStream(fileName); 170 | BufferedReader reader = new BufferedReader(new InputStreamReader(in)); 171 | try { 172 | String line; 173 | while (null != (line = reader.readLine())) { 174 | if (line.trim().length() > 0) { 175 | alphabet.add(line); 176 | } 177 | } 178 | } catch (Exception ex) { 179 | throw new RuntimeException("read pinyin dic error.", ex); 180 | } finally { 181 | try { 182 | reader.close(); 183 | } catch (Exception ignored) { 184 | } 185 | } 186 | } 187 | 188 | public static PinyinAlphabetDict getInstance() { 189 | if (instance == null) { 190 | synchronized (PinyinAlphabetDict.class) { 191 | if (instance == null) { 192 | instance = new PinyinAlphabetDict(); 193 | } 194 | } 195 | } 196 | return instance; 197 | } 198 | 199 | public boolean match(String c) { 200 | return alphabet.contains(c); 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 
15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 
135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 
194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | Pinyin Analysis for Elasticsearch
2 | ==================================
3 | 
4 | This Pinyin Analysis plugin converts Chinese characters to Pinyin. It integrates the NLP-Lang tools (https://github.com/NLPchina/nlp-lang).
5 | 
6 | --------------------------------------------------
7 | | Pinyin Analysis Plugin | Elasticsearch |
8 | --------------------------------------------------
9 | | master | 6.x -> master |
10 | --------------------------------------------------
11 | | 6.3.0 | 6.3.0 |
12 | --------------------------------------------------
13 | | 6.2.4 | 6.2.4 |
14 | --------------------------------------------------
15 | | 6.1.4 | 6.1.4 |
16 | --------------------------------------------------
17 | | 5.6.9 | 5.6.9 |
18 | --------------------------------------------------
19 | | 5.5.3 | 5.5.3 |
20 | --------------------------------------------------
21 | | 5.4.3 | 5.4.3 |
22 | --------------------------------------------------
23 | | 5.3.3 | 5.3.3 |
24 | --------------------------------------------------
25 | | 5.2.2 | 5.2.2 |
26 | --------------------------------------------------
27 | | 5.1.2 | 5.1.2 |
28 | --------------------------------------------------
29 | | 1.8.1 | 2.4.1 |
30 | --------------------------------------------------
31 | | 1.7.5 | 2.3.5 |
32 | --------------------------------------------------
33 | | 1.6.1 | 2.2.1 |
34 | --------------------------------------------------
35 | | 1.5.0 | 2.1.0 |
36 | --------------------------------------------------
37 | | 1.4.0 | 2.0.x |
38 | --------------------------------------------------
39 | | 1.3.0 | 1.6.x |
40 | --------------------------------------------------
41 | | 1.2.2 | 1.0.x |
42 | --------------------------------------------------
43 | 
44 | The plugin includes the analyzer `pinyin`, the tokenizer `pinyin`, and the token filter `pinyin`.
45 | 
46 | **Optional Parameters**
47 | * `keep_first_letter` when this option is enabled, e.g. `刘德华`>`ldh`, default: true
48 | * `keep_separate_first_letter` when this option is enabled, first letters are kept as separate terms, e.g. `刘德华`>`l`,`d`,`h`, default: false. NOTE: query results may become too fuzzy, because single-letter terms occur very frequently
49 | * `limit_first_letter_length` set the max length of the first_letter result, default: 16
50 | * `keep_full_pinyin` when this option is enabled, e.g. `刘德华`> [`liu`,`de`,`hua`], default: true
51 | * `keep_joined_full_pinyin` when this option is enabled, e.g. `刘德华`> [`liudehua`], default: false
52 | * `keep_none_chinese` keep non-Chinese letters and numbers in the result, default: true
53 | * `keep_none_chinese_together` keep non-Chinese letters together, default: true, e.g. `DJ音乐家` -> `DJ`,`yin`,`yue`,`jia`; when set to `false`: `DJ音乐家` -> `D`,`J`,`yin`,`yue`,`jia`. NOTE: `keep_none_chinese` must be enabled first
54 | * `keep_none_chinese_in_first_letter` keep non-Chinese letters in the first-letter result, e.g. `刘德华AT2016`->`ldhat2016`, default: true
55 | * `keep_none_chinese_in_joined_full_pinyin` keep non-Chinese letters in the joined full pinyin, e.g. `刘德华2016`->`liudehua2016`, default: false
56 | * `none_chinese_pinyin_tokenize` break non-Chinese letters into separate pinyin terms when they form valid pinyin, default: true, e.g. `liudehuaalibaba13zhuanghan` -> `liu`,`de`,`hua`,`a`,`li`,`ba`,`ba`,`13`,`zhuang`,`han`. NOTE: `keep_none_chinese` and `keep_none_chinese_together` must be enabled first
57 | * `keep_original` when this option is enabled, the original input is kept as well, default: false
58 | * `lowercase` lowercase non-Chinese letters, default: true
59 | * `trim_whitespace` default: true
60 | * `remove_duplicated_term` when this option is enabled, duplicate terms are removed to save index space, e.g. `de的`>`de`, default: false. NOTE: position-related queries may be affected
61 | * `ignore_pinyin_offset` since 6.0, offsets are strictly checked and overlapping tokens are not allowed; with this parameter, overlapping tokens are allowed by ignoring the offset. Please note that all position-related queries and highlighting will then be incorrect; you should use multi-fields and specify different settings for each query purpose. If you need offsets, set it to false. Default: true.
62 | 
63 | 
64 | 
65 | 1.Create an index with a custom pinyin analyzer
66 | 
 67 | PUT /medcl/ 
 68 | {
 69 |     "index" : {
 70 |         "analysis" : {
 71 |             "analyzer" : {
 72 |                 "pinyin_analyzer" : {
 73 |                     "tokenizer" : "my_pinyin"
 74 |                     }
 75 |             },
 76 |             "tokenizer" : {
 77 |                 "my_pinyin" : {
 78 |                     "type" : "pinyin",
 79 |                     "keep_separate_first_letter" : false,
 80 |                     "keep_full_pinyin" : true,
 81 |                     "keep_original" : true,
 82 |                     "limit_first_letter_length" : 16,
 83 |                     "lowercase" : true,
 84 |                     "remove_duplicated_term" : true
 85 |                 }
 86 |             }
 87 |         }
 88 |     }
 89 | }
 90 | 
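As a further illustration of the parameters listed above, here is a hypothetical variant (the index name `medcl2` and tokenizer name `joined_pinyin` are only examples, not part of the original docs) that keeps only the joined full pinyin, so that `刘德华2016` would be indexed as `liudehua2016`:

PUT /medcl2/
{
    "index" : {
        "analysis" : {
            "analyzer" : {
                "joined_pinyin_analyzer" : {
                    "tokenizer" : "joined_pinyin"
                }
            },
            "tokenizer" : {
                "joined_pinyin" : {
                    "type" : "pinyin",
                    "keep_first_letter" : false,
                    "keep_full_pinyin" : false,
                    "keep_joined_full_pinyin" : true,
                    "keep_none_chinese_in_joined_full_pinyin" : true,
                    "lowercase" : true
                }
            }
        }
    }
}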
91 | 92 | 2.Test the analyzer by analyzing a Chinese name, such as 刘德华 93 |
 94 | GET /medcl/_analyze
 95 | {
 96 |   "text": ["刘德华"],
 97 |   "analyzer": "pinyin_analyzer"
 98 | }
99 |
100 | {
101 |   "tokens" : [
102 |     {
103 |       "token" : "liu",
104 |       "start_offset" : 0,
105 |       "end_offset" : 1,
106 |       "type" : "word",
107 |       "position" : 0
108 |     },
109 |     {
110 |       "token" : "de",
111 |       "start_offset" : 1,
112 |       "end_offset" : 2,
113 |       "type" : "word",
114 |       "position" : 1
115 |     },
116 |     {
117 |       "token" : "hua",
118 |       "start_offset" : 2,
119 |       "end_offset" : 3,
120 |       "type" : "word",
121 |       "position" : 2
122 |     },
123 |     {
124 |       "token" : "刘德华",
125 |       "start_offset" : 0,
126 |       "end_offset" : 3,
127 |       "type" : "word",
128 |       "position" : 3
129 |     },
130 |     {
131 |       "token" : "ldh",
132 |       "start_offset" : 0,
133 |       "end_offset" : 3,
134 |       "type" : "word",
135 |       "position" : 4
136 |     }
137 |   ]
138 | }
139 | 
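The tokens above come from the tokenizer settings defined in step 1. To experiment with the bundled `pinyin` tokenizer and its default settings without creating an index first, a request along these lines should also work (a quick sketch, not from the original docs):

GET /_analyze
{
  "text": ["刘德华"],
  "tokenizer": "pinyin"
}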
140 | 141 | 3.Create a mapping 142 |
143 | POST /medcl/folks/_mapping 
144 | {
145 |     "folks": {
146 |         "properties": {
147 |             "name": {
148 |                 "type": "keyword",
149 |                 "fields": {
150 |                     "pinyin": {
151 |                         "type": "text",
152 |                         "store": false,
153 |                         "term_vector": "with_offsets",
154 |                         "analyzer": "pinyin_analyzer",
155 |                         "boost": 10
156 |                     }
157 |                 }
158 |             }
159 |         }
160 |     }
161 | }
162 | 
163 | 164 | 4.Indexing 165 |
166 | POST /medcl/folks/andy 
167 | {"name":"刘德华"}
168 | 
169 | 170 | 5.Let's search 171 |
172 | curl http://localhost:9200/medcl/folks/_search?q=name:%E5%88%98%E5%BE%B7%E5%8D%8E
173 | curl http://localhost:9200/medcl/folks/_search?q=name.pinyin:%e5%88%98%e5%be%b7
174 | curl http://localhost:9200/medcl/folks/_search?q=name.pinyin:liu
175 | curl http://localhost:9200/medcl/folks/_search?q=name.pinyin:ldh
176 | curl http://localhost:9200/medcl/folks/_search?q=name.pinyin:de+hua
177 | 
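The URI searches above can also be written as request-body queries, for example (a sketch assuming the index and mapping created in the previous steps):

GET /medcl/folks/_search
{
  "query": {
    "match": {
      "name.pinyin": "liu de hua"
    }
  }
}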
178 | 179 | 6.Using the pinyin token filter 180 |
181 | PUT /medcl1/ 
182 | {
183 |     "index" : {
184 |         "analysis" : {
185 |             "analyzer" : {
186 |                 "user_name_analyzer" : {
187 |                     "tokenizer" : "whitespace",
188 |                     "filter" : "pinyin_first_letter_and_full_pinyin_filter"
189 |                 }
190 |             },
191 |             "filter" : {
192 |                 "pinyin_first_letter_and_full_pinyin_filter" : {
193 |                     "type" : "pinyin",
194 |                     "keep_first_letter" : true,
195 |                     "keep_full_pinyin" : false,
196 |                     "keep_none_chinese" : true,
197 |                     "keep_original" : false,
198 |                     "limit_first_letter_length" : 16,
199 |                     "lowercase" : true,
200 |                     "trim_whitespace" : true,
201 |                     "keep_none_chinese_in_first_letter" : true
202 |                 }
203 |             }
204 |         }
205 |     }
206 | }
207 | 
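To use this filter-based analyzer on a field, attach it in a mapping just as in step 3. A minimal sketch (the type name `users` and field name `name` are hypothetical, not part of the original docs):

POST /medcl1/users/_mapping
{
    "users": {
        "properties": {
            "name": {
                "type": "text",
                "analyzer": "user_name_analyzer"
            }
        }
    }
}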
208 | 209 | Token test: 刘德华 张学友 郭富城 黎明 四大天王 210 |
211 | GET /medcl/_analyze
212 | {
213 |   "text": ["刘德华 张学友 郭富城 黎明 四大天王"],
214 |   "analyzer": "user_name_analyzer"
215 | }
216 | 
217 |
218 | {
219 |   "tokens" : [
220 |     {
221 |       "token" : "ldh",
222 |       "start_offset" : 0,
223 |       "end_offset" : 3,
224 |       "type" : "word",
225 |       "position" : 0
226 |     },
227 |     {
228 |       "token" : "zxy",
229 |       "start_offset" : 4,
230 |       "end_offset" : 7,
231 |       "type" : "word",
232 |       "position" : 1
233 |     },
234 |     {
235 |       "token" : "gfc",
236 |       "start_offset" : 8,
237 |       "end_offset" : 11,
238 |       "type" : "word",
239 |       "position" : 2
240 |     },
241 |     {
242 |       "token" : "lm",
243 |       "start_offset" : 12,
244 |       "end_offset" : 14,
245 |       "type" : "word",
246 |       "position" : 3
247 |     },
248 |     {
249 |       "token" : "sdtw",
250 |       "start_offset" : 15,
251 |       "end_offset" : 19,
252 |       "type" : "word",
253 |       "position" : 4
254 |     }
255 |   ]
256 | }
257 | 
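Given a mapping like the sketch above, first-letter searches should then work against `medcl1` as well, e.g. (hypothetical document and query, not from the original docs):

POST /medcl1/users/mike
{"name":"张学友"}

curl http://localhost:9200/medcl1/users/_search?q=name:zxy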
258 | 259 | 260 | 7.Using pinyin in phrase queries 261 | - option 1 262 |
263 |     PUT /medcl/
264 |     {
265 |         "index" : {
266 |             "analysis" : {
267 |                 "analyzer" : {
268 |                     "pinyin_analyzer" : {
269 |                         "tokenizer" : "my_pinyin"
270 |                         }
271 |                 },
272 |                 "tokenizer" : {
273 |                     "my_pinyin" : {
274 |                         "type" : "pinyin",
275 |                         "keep_first_letter":false,
276 |                         "keep_separate_first_letter" : false,
277 |                         "keep_full_pinyin" : true,
278 |                         "keep_original" : false,
279 |                         "limit_first_letter_length" : 16,
280 |                         "lowercase" : true
281 |                     }
282 |                 }
283 |             }
284 |         }
285 |     }
286 |     GET /medcl/folks/_search
287 |     {
288 |       "query": {"match_phrase": {
289 |         "name.pinyin": "刘德华"
290 |       }}
291 |     }
292 | 
293 |     
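    This works because, with first letters disabled and `keep_full_pinyin` enabled, `刘德华` is indexed as the consecutive tokens `liu`,`de`,`hua`, and the phrase query analyzes to the same consecutive sequence. A plain-pinyin phrase should therefore match as well, e.g. (a sketch, not from the original docs):

    GET /medcl/folks/_search
    {
      "query": {"match_phrase": {
        "name.pinyin": "liu de hua"
      }}
    }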
294 | 295 | - option 2 296 |
297 | 
298 |     PUT /medcl/
299 |     {
300 |         "index" : {
301 |             "analysis" : {
302 |                 "analyzer" : {
303 |                     "pinyin_analyzer" : {
304 |                         "tokenizer" : "my_pinyin"
305 |                         }
306 |                 },
307 |                 "tokenizer" : {
308 |                     "my_pinyin" : {
309 |                         "type" : "pinyin",
310 |                         "keep_first_letter":false,
311 |                         "keep_separate_first_letter" : true,
312 |                         "keep_full_pinyin" : false,
313 |                         "keep_original" : false,
314 |                         "limit_first_letter_length" : 16,
315 |                         "lowercase" : true
316 |                     }
317 |                 }
318 |             }
319 |         }
320 |     }
321 | 
322 |     POST /medcl/folks/andy
323 |     {"name":"刘德华"}
324 | 
325 |     GET /medcl/folks/_search
326 |     {
327 |       "query": {"match_phrase": {
328 |         "name.pinyin": "刘德h"
329 |       }}
330 |     }
331 | 
332 |     GET /medcl/folks/_search
333 |     {
334 |       "query": {"match_phrase": {
335 |         "name.pinyin": "刘dh"
336 |       }}
337 |     }
338 | 
339 |     GET /medcl/folks/_search
340 |     {
341 |       "query": {"match_phrase": {
342 |         "name.pinyin": "dh"
343 |       }}
344 |     }
345 | 
346 |     
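    Option 2 indexes each character as its single first letter (`刘德华` -> `l`,`d`,`h` at consecutive positions), which is why mixed queries such as `刘德h` or `刘dh` can still match as phrases. The token stream can be verified with something like (a sketch, not from the original docs):

    GET /medcl/_analyze
    {
      "text": ["刘德h"],
      "analyzer": "pinyin_analyzer"
    }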
347 | 348 | 8.That's all, have fun. 349 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/PinyinTokenFilter.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | *

11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | *

13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import org.apache.lucene.analysis.TokenFilter; 21 | import org.apache.lucene.analysis.TokenStream; 22 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 23 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 24 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 25 | import org.elasticsearch.analysis.PinyinConfig; 26 | import org.nlpcn.commons.lang.pinyin.Pinyin; 27 | 28 | import java.io.IOException; 29 | import java.util.ArrayList; 30 | import java.util.Collections; 31 | import java.util.HashSet; 32 | import java.util.List; 33 | 34 | public class PinyinTokenFilter extends TokenFilter { 35 | 36 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 37 | private boolean done = true; 38 | private boolean processedCandidate = false; 39 | private boolean processedFullPinyinLetter = false; 40 | private boolean processedFirstLetter = false; 41 | private boolean processedOriginal = false; 42 | private boolean processedSortCandidate = false; 43 | protected int position = 0; 44 | protected int lastOffset = 0; 45 | private PinyinConfig config; 46 | List candidate; 47 | private HashSet termsFilter; 48 | 49 | protected int candidateOffset = 0; 50 | StringBuilder firstLetters; 51 | StringBuilder fullPinyinLetters; 52 | String source; 53 | private int lastIncrementPosition = 0; 54 | 55 | private PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class); 56 | 57 | public PinyinTokenFilter(TokenStream in, PinyinConfig config) { 58 | super(in); 59 | this.config = config; 60 | //validate config 61 | if (!(config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin)) { 62 | throw new ConfigErrorException("pinyin config error, can't disable separate_first_letter, first_letter and full_pinyin at the same time."); 63 | } 64 | candidate = new ArrayList<>(); 65 | firstLetters = new StringBuilder(); 66 | termsFilter = new HashSet<>(); 67 | fullPinyinLetters = new StringBuilder(); 68 | } 69 | 70 | //TODO refactor, merge code 71 | @Override 72 | public final boolean incrementToken() throws IOException { 73 | 74 | 75 | if (!done) { 76 | if (readTerm()) return true; 77 | } 78 | 79 | if (done) { 80 | resetVariable(); 81 | if (!input.incrementToken()) { 82 | return false; 83 | } 84 | done = false; 85 | } 86 | readTerm(); 87 | return true; 88 | } 89 | 90 | private boolean readTerm() { 91 | if (!processedCandidate) { 92 | processedCandidate = true; 93 | lastOffset = termAtt.length(); 94 | source = termAtt.toString(); 95 | if (config.trimWhitespace) { 96 | source = source.trim(); 97 | } 98 | 99 | List pinyinList = Pinyin.pinyin(source); 100 | if (pinyinList.size() == 0) return false; 101 | 102 | StringBuilder buff = new StringBuilder(); 103 | int buffStartPosition = 0; 104 | int buffSize = 0; 105 | position = 0; 106 | 107 | for (int i = 0; i < source.length(); i++) { 108 | char c = source.charAt(i); 109 | 110 | //keep original alphabet 111 | if (c < 128) { 112 | if (buff.length() <= 0) { 113 | buffStartPosition = i; 114 | } 115 | if ((c > 96 && c < 123) || (c > 64 && c < 91) || (c > 47 && c < 58)) 
{ 116 | if (config.keepNoneChinese) { 117 | if (config.keepNoneChinese) { 118 | if (config.keepNoneChineseTogether) { 119 | buff.append(c); 120 | buffSize++; 121 | } else { 122 | addCandidate(new TermItem(String.valueOf(c), i, i + 1, buffStartPosition)); 123 | } 124 | } 125 | } 126 | if (config.keepNoneChineseInFirstLetter) { 127 | firstLetters.append(c); 128 | } 129 | if (config.keepNoneChineseInJoinedFullPinyin) { 130 | fullPinyinLetters.append(c); 131 | } 132 | } 133 | } else { 134 | //clean previous temp 135 | if (buff.length() > 0) { 136 | buffSize = parseBuff(buff, buffSize, buffStartPosition); 137 | } 138 | 139 | String pinyin = pinyinList.get(i); 140 | if (pinyin != null && pinyin.length() > 0) { 141 | position++; 142 | firstLetters.append(pinyin.charAt(0)); 143 | if (config.keepSeparateFirstLetter & pinyin.length() > 1) { 144 | addCandidate(new TermItem(String.valueOf(pinyin.charAt(0)), i, i + 1, position)); 145 | } 146 | if (config.keepFullPinyin) { 147 | addCandidate(new TermItem(pinyin, i, i + 1, position)); 148 | } 149 | if (config.keepJoinedFullPinyin) { 150 | fullPinyinLetters.append(pinyin); 151 | } 152 | } 153 | } 154 | 155 | lastOffset = i; 156 | 157 | } 158 | 159 | //clean previous temp 160 | if (buff.length() > 0) { 161 | buffSize = parseBuff(buff, buffSize, buffStartPosition); 162 | } 163 | } 164 | 165 | 166 | if (config.keepOriginal && !processedOriginal) { 167 | processedOriginal = true; 168 | addCandidate(new TermItem(source, 0, source.length(), 1)); 169 | } 170 | 171 | if (config.keepJoinedFullPinyin && !processedFullPinyinLetter && fullPinyinLetters.length() > 0) { 172 | processedFullPinyinLetter = true; 173 | addCandidate(new TermItem(fullPinyinLetters.toString(), 0, source.length(), 1)); 174 | fullPinyinLetters.setLength(0); 175 | } 176 | 177 | 178 | if (config.keepFirstLetter && firstLetters.length() > 0 && !processedFirstLetter) { 179 | processedFirstLetter = true; 180 | String fl; 181 | if (firstLetters.length() > config.LimitFirstLetterLength && config.LimitFirstLetterLength > 0) { 182 | fl = firstLetters.substring(0, config.LimitFirstLetterLength); 183 | } else { 184 | fl = firstLetters.toString(); 185 | } 186 | if (config.lowercase) { 187 | fl = fl.toLowerCase(); 188 | } 189 | if (!(config.keepSeparateFirstLetter && fl.length() <= 1)) { 190 | addCandidate(new TermItem(fl, 0, fl.length(), 1)); 191 | } 192 | } 193 | 194 | if (!processedSortCandidate) { 195 | processedSortCandidate = true; 196 | Collections.sort(candidate); 197 | } 198 | 199 | if (candidateOffset < candidate.size()) { 200 | TermItem item = candidate.get(candidateOffset); 201 | candidateOffset++; 202 | setTerm(item.term, item.startOffset, item.endOffset, item.position); 203 | return true; 204 | } 205 | 206 | done = true; 207 | return false; 208 | } 209 | 210 | 211 | void addCandidate(TermItem item) { 212 | 213 | String term = item.term; 214 | if (config.lowercase) { 215 | term = term.toLowerCase(); 216 | } 217 | 218 | if (config.trimWhitespace) { 219 | term = term.trim(); 220 | } 221 | item.term = term; 222 | 223 | if (term.length() == 0) { 224 | return; 225 | } 226 | 227 | //remove same term with same position 228 | String fr=term+item.position; 229 | 230 | //remove same term, regardless position 231 | if (config.removeDuplicateTerm) { 232 | fr=term; 233 | } 234 | 235 | if (termsFilter.contains(fr)) { 236 | return; 237 | } 238 | termsFilter.add(fr); 239 | 240 | candidate.add(item); 241 | } 242 | 243 | 244 | void setTerm(String term, int startOffset, int endOffset, int position) { 245 | if 
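/* setTerm() copies a candidate into the token attributes. Note that unlike the
   PinyinTokenizer variant, this filter sets no OffsetAttribute: only the term text
   and the position increment (relative to lastIncrementPosition) are emitted. */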
(config.lowercase) { 246 | term = term.toLowerCase(); 247 | } 248 | 249 | if (config.trimWhitespace) { 250 | term = term.trim(); 251 | } 252 | 253 | //ignore empty term 254 | if(term.length()==0){ 255 | return; 256 | } 257 | 258 | termAtt.setEmpty(); 259 | termAtt.append(term); 260 | if (startOffset < 0) { 261 | startOffset = 0; 262 | } 263 | if (endOffset < startOffset) { 264 | endOffset = startOffset + term.length(); 265 | } 266 | 267 | int offset = position - lastIncrementPosition; 268 | if (offset < 0) { 269 | offset = 0; 270 | } 271 | positionAttr.setPositionIncrement(offset); 272 | 273 | lastIncrementPosition = position; 274 | } 275 | 276 | private int parseBuff(StringBuilder buff, int buffSize, int buffPosition) { 277 | if (config.keepNoneChinese) { 278 | if (config.noneChinesePinyinTokenize) { 279 | List result = PinyinAlphabetTokenizer.walk(buff.toString()); 280 | int start = (lastOffset - buffSize + 1); 281 | for (int i = 0; i < result.size(); i++) { 282 | int end; 283 | String t = result.get(i); 284 | if (config.fixedPinyinOffset) { 285 | end = start + 1; 286 | } else { 287 | end = start + t.length(); 288 | } 289 | addCandidate(new TermItem(result.get(i), start, end, ++position)); 290 | start = end; 291 | } 292 | } else if (config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || !config.keepNoneChineseInJoinedFullPinyin) { 293 | addCandidate(new TermItem(buff.toString(), lastOffset - buffSize, lastOffset, ++position)); 294 | } 295 | } 296 | 297 | buff.setLength(0); 298 | buffSize = 0; 299 | return buffSize; 300 | } 301 | 302 | @Override 303 | public final void end() throws IOException { 304 | super.end(); 305 | } 306 | 307 | void resetVariable() { 308 | position = 0; 309 | lastOffset = 0; 310 | candidate.clear(); 311 | this.processedCandidate = false; 312 | this.processedFirstLetter = false; 313 | this.processedFullPinyinLetter = false; 314 | this.processedOriginal = false; 315 | firstLetters.setLength(0); 316 | fullPinyinLetters.setLength(0); 317 | source = null; 318 | candidateOffset = 0; 319 | termsFilter.clear(); 320 | lastIncrementPosition = 0; 321 | } 322 | 323 | @Override 324 | public void reset() throws IOException { 325 | super.reset(); 326 | this.done = true; 327 | resetVariable(); 328 | } 329 | 330 | 331 | } -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | elasticsearch-analysis-pinyin 6 | 4.0.0 7 | org.elasticsearch 8 | elasticsearch-analysis-pinyin 9 | ${elasticsearch.version} 10 | jar 11 | Pinyin Analysis for Elasticsearch 12 | 2012 13 | 14 | 15 | 5.4.2 16 | 1.8 17 | ${project.basedir}/src/main/assemblies/plugin.xml 18 | analysis-pinyin 19 | org.elasticsearch.plugin.analysis.pinyin.AnalysisPinyinPlugin 20 | true 21 | false 22 | true 23 | 4E899B30 24 | true 25 | 26 | 27 | 28 | 29 | The Apache Software License, Version 2.0 30 | http://www.apache.org/licenses/LICENSE-2.0.txt 31 | repo 32 | 33 | 34 | 35 | 36 | 37 | Medcl 38 | medcl@elastic.co 39 | elastic 40 | http://www.elastic.co 41 | 42 | 43 | 44 | 45 | scm:git:git@github.com:medcl/elasticsearch-analysis-pinyin.git 46 | scm:git:git@github.com:medcl/elasticsearch-analysis-pinyin.git 47 | 48 | http://github.com/medcl/elasticsearch-analysis-pinyin 49 | 50 | 51 | 52 | org.sonatype.oss 53 | oss-parent 54 | 9 55 | 56 | 57 | 58 | 59 | oss.sonatype.org 60 | https://oss.sonatype.org/content/repositories/snapshots 61 | 62 | 63 | oss.sonatype.org 64 | 
https://oss.sonatype.org/service/local/staging/deploy/maven2/ 65 | 66 | 67 | 68 | 69 | 70 | oss.sonatype.org 71 | OSS Sonatype 72 | 73 | true 74 | 75 | 76 | true 77 | 78 | http://oss.sonatype.org/content/repositories/releases/ 79 | 80 | 81 | 82 | 83 | 84 | org.nlpcn 85 | nlp-lang 86 | 1.7 87 | ${basedir}/lib/nlp-lang-1.7.8.jar 88 | system 89 | 90 | 91 | 92 | org.elasticsearch 93 | elasticsearch 94 | ${elasticsearch.version} 95 | compile 96 | 97 | 98 | 99 | log4j 100 | log4j 101 | 1.2.16 102 | runtime 103 | 104 | 105 | 106 | org.hamcrest 107 | hamcrest-core 108 | 1.3.RC2 109 | test 110 | 111 | 112 | 113 | org.hamcrest 114 | hamcrest-library 115 | 1.3.RC2 116 | test 117 | 118 | 119 | 120 | org.powermock 121 | powermock-module-junit4 122 | 1.6.2 123 | test 124 | 125 | 126 | 127 | org.powermock 128 | powermock-api-mockito 129 | 1.6.2 130 | test 131 | 132 | 133 | 134 | nl.jqno.equalsverifier 135 | equalsverifier 136 | 1.7.5 137 | test 138 | 139 | 140 | 141 | com.openpojo 142 | openpojo 143 | 0.8.1 144 | test 145 | 146 | 147 | 148 | junit 149 | junit 150 | 4.9 151 | test 152 | 153 | 154 | 155 | 156 | 157 | 158 | org.apache.maven.plugins 159 | maven-compiler-plugin 160 | 3.5.1 161 | 162 | ${maven.compiler.target} 163 | ${maven.compiler.target} 164 | 165 | 166 | 167 | org.apache.maven.plugins 168 | maven-surefire-plugin 169 | 2.19.1 170 | 171 | 172 | org.apache.maven.plugins 173 | maven-source-plugin 174 | 2.1.2 175 | 176 | 177 | attach-sources 178 | 179 | jar 180 | 181 | 182 | 183 | 184 | 185 | maven-assembly-plugin 186 | 2.3 187 | 188 | false 189 | ${project.build.directory}/releases/ 190 | 191 | ${basedir}/src/main/assemblies/plugin.xml 192 | 193 | 194 | 195 | 196 | package 197 | 198 | single 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | disable-java8-doclint 209 | 210 | [1.8,) 211 | 212 | 213 | -Xdoclint:none 214 | 215 | 216 | 217 | release 218 | 219 | 220 | 221 | org.sonatype.plugins 222 | nexus-staging-maven-plugin 223 | 1.6.3 224 | true 225 | 226 | oss 227 | https://oss.sonatype.org/ 228 | true 229 | 230 | 231 | 232 | org.apache.maven.plugins 233 | maven-release-plugin 234 | 2.1 235 | 236 | true 237 | false 238 | release 239 | deploy 240 | 241 | 242 | 243 | org.apache.maven.plugins 244 | maven-compiler-plugin 245 | 3.5.1 246 | 247 | ${maven.compiler.target} 248 | ${maven.compiler.target} 249 | 250 | 251 | 252 | org.apache.maven.plugins 253 | maven-gpg-plugin 254 | 1.5 255 | 256 | 257 | sign-artifacts 258 | verify 259 | 260 | sign 261 | 262 | 263 | 264 | 265 | 266 | org.apache.maven.plugins 267 | maven-source-plugin 268 | 2.2.1 269 | 270 | 271 | attach-sources 272 | 273 | jar-no-fork 274 | 275 | 276 | 277 | 278 | 279 | org.apache.maven.plugins 280 | maven-javadoc-plugin 281 | 2.9 282 | 283 | 284 | attach-javadocs 285 | 286 | jar 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/PinyinTokenizer.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 5 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 6 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 7 | import org.elasticsearch.analysis.PinyinConfig; 8 | import org.nlpcn.commons.lang.pinyin.Pinyin; 9 | 10 | import java.io.IOException; 11 | 
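/*
 * Usage sketch for the tokenizer defined below. Illustrative only: it uses the
 * PinyinConfig fields exercised by this repo's tests, and flags not set here keep
 * whatever defaults PinyinConfig assigns.
 *
 *   PinyinConfig config = new PinyinConfig();
 *   config.keepFirstLetter = true;   // emit the aggregated first letters, e.g. "ldh"
 *   config.keepFullPinyin = true;    // emit "liu", "de", "hua"
 *   Tokenizer tokenizer = new PinyinTokenizer(config);
 *   tokenizer.setReader(new java.io.StringReader("刘德华"));
 *   tokenizer.reset();
 *   CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
 *   while (tokenizer.incrementToken()) {
 *       System.out.println(term.toString());
 *   }
 *   tokenizer.end();
 *   tokenizer.close();
 */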
import java.util.ArrayList; 12 | import java.util.Collections; 13 | import java.util.HashSet; 14 | import java.util.List; 15 | 16 | 17 | public class PinyinTokenizer extends Tokenizer { 18 | 19 | 20 | private static final int DEFAULT_BUFFER_SIZE = 256; 21 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 22 | private boolean done = false; 23 | private boolean processedCandidate = false; 24 | private boolean processedSortCandidate = false; 25 | private boolean processedFirstLetter = false; 26 | private boolean processedFullPinyinLetter = false; 27 | private boolean processedOriginal = false; 28 | protected int position = 0; 29 | protected int lastOffset = 0; 30 | private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); 31 | private PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class); 32 | private PinyinConfig config; 33 | ArrayList candidate; 34 | protected int candidateOffset = 0; //indicate candidates process offset 35 | private HashSet termsFilter; 36 | StringBuilder firstLetters; 37 | StringBuilder fullPinyinLetters; 38 | 39 | private int lastIncrementPosition = 0; 40 | 41 | String source; 42 | 43 | public PinyinTokenizer(PinyinConfig config) { 44 | this(DEFAULT_BUFFER_SIZE); 45 | this.config = config; 46 | 47 | //validate config 48 | if (!(config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin)) { 49 | throw new ConfigErrorException("pinyin config error, can't disable separate_first_letter, first_letter and full_pinyin at the same time."); 50 | } 51 | candidate = new ArrayList<>(); 52 | termsFilter = new HashSet<>(); 53 | firstLetters = new StringBuilder(); 54 | fullPinyinLetters = new StringBuilder(); 55 | } 56 | 57 | public PinyinTokenizer(int bufferSize) { 58 | super(); 59 | termAtt.resizeBuffer(bufferSize); 60 | } 61 | 62 | void addCandidate(TermItem item) { 63 | 64 | String term = item.term; 65 | if (config.lowercase) { 66 | term = term.toLowerCase(); 67 | } 68 | 69 | if (config.trimWhitespace) { 70 | term = term.trim(); 71 | } 72 | item.term = term; 73 | 74 | if (term.length() == 0) { 75 | return; 76 | } 77 | 78 | //remove same term with same position 79 | String fr=term+item.position; 80 | 81 | //remove same term, regardless position 82 | if (config.removeDuplicateTerm) { 83 | fr=term; 84 | } 85 | 86 | if (termsFilter.contains(fr)) { 87 | return; 88 | } 89 | termsFilter.add(fr); 90 | 91 | candidate.add(item); 92 | } 93 | 94 | 95 | void setTerm(String term, int startOffset, int endOffset, int position) { 96 | if (config.lowercase) { 97 | term = term.toLowerCase(); 98 | } 99 | 100 | if (config.trimWhitespace) { 101 | term = term.trim(); 102 | } 103 | 104 | //ignore empty term 105 | if(term.length()==0){ 106 | return; 107 | } 108 | 109 | termAtt.setEmpty(); 110 | termAtt.append(term); 111 | if (startOffset < 0) { 112 | startOffset = 0; 113 | } 114 | if (endOffset < startOffset) { 115 | endOffset = startOffset + term.length(); 116 | } 117 | 118 | if(!config.ignorePinyinOffset){ 119 | offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset)); 120 | } 121 | 122 | int offset = position - lastIncrementPosition; 123 | if (offset < 0) { 124 | offset = 0; 125 | } 126 | positionAttr.setPositionIncrement(offset); 127 | 128 | lastIncrementPosition = position; 129 | } 130 | 131 | @Override 132 | public final boolean incrementToken() throws IOException { 133 | 134 | clearAttributes(); 135 | 136 | if (!done) { 137 | 138 | //combine text 
together to get right pinyin 139 | if (!processedCandidate) { 140 | processedCandidate = true; 141 | int upto = 0; 142 | char[] buffer = termAtt.buffer(); 143 | while (true) { 144 | final int length = input.read(buffer, upto, buffer.length - upto); 145 | if (length == -1) break; 146 | upto += length; 147 | if (upto == buffer.length) 148 | buffer = termAtt.resizeBuffer(1 + buffer.length); 149 | } 150 | termAtt.setLength(upto); 151 | source = termAtt.toString(); 152 | 153 | List pinyinList = Pinyin.pinyin(source); 154 | if (pinyinList.size() == 0) return false; 155 | 156 | StringBuilder buff = new StringBuilder(); 157 | int buffStartPosition = 0; 158 | int buffSize = 0; 159 | 160 | position = 0; 161 | 162 | for (int i = 0; i < source.length(); i++) { 163 | char c = source.charAt(i); 164 | //keep original alphabet 165 | if (c < 128) { 166 | if (buff.length() <= 0) { 167 | buffStartPosition = i+1; 168 | } 169 | if ((c > 96 && c < 123) || (c > 64 && c < 91) || (c > 47 && c < 58)) { 170 | if (config.keepNoneChinese) { 171 | if (config.keepNoneChinese) { 172 | if (config.keepNoneChineseTogether) { 173 | buff.append(c); 174 | buffSize++; 175 | } else { 176 | addCandidate(new TermItem(String.valueOf(c), i, i + 1, buffStartPosition)); 177 | } 178 | } 179 | } 180 | if (config.keepNoneChineseInFirstLetter) { 181 | firstLetters.append(c); 182 | } 183 | if (config.keepNoneChineseInJoinedFullPinyin) { 184 | fullPinyinLetters.append(c); 185 | } 186 | } 187 | } else { 188 | 189 | //clean previous temp 190 | if (buff.length() > 0) { 191 | buffSize = parseBuff(buff, buffSize, buffStartPosition); 192 | } 193 | 194 | String pinyin = pinyinList.get(i); 195 | if (pinyin != null && pinyin.length() > 0) { 196 | position++; 197 | firstLetters.append(pinyin.charAt(0)); 198 | if (config.keepSeparateFirstLetter & pinyin.length() > 1) { 199 | addCandidate(new TermItem(String.valueOf(pinyin.charAt(0)), i, i + 1, position)); 200 | } 201 | if (config.keepFullPinyin) { 202 | addCandidate(new TermItem(pinyin, i, i + 1, position)); 203 | } 204 | if (config.keepJoinedFullPinyin) { 205 | fullPinyinLetters.append(pinyin); 206 | } 207 | } 208 | } 209 | 210 | lastOffset = i; 211 | 212 | } 213 | 214 | //clean previous temp 215 | if (buff.length() > 0) { 216 | buffSize = parseBuff(buff, buffSize, buffStartPosition); 217 | } 218 | } 219 | 220 | if (config.keepOriginal && !processedOriginal) { 221 | processedOriginal = true; 222 | addCandidate(new TermItem(source, 0, source.length(), 1)); 223 | } 224 | 225 | if (config.keepJoinedFullPinyin && !processedFullPinyinLetter && fullPinyinLetters.length() > 0) { 226 | processedFullPinyinLetter = true; 227 | addCandidate(new TermItem(fullPinyinLetters.toString(), 0, source.length(), 1)); 228 | fullPinyinLetters.setLength(0); 229 | } 230 | 231 | 232 | if (config.keepFirstLetter && firstLetters.length() > 0 && !processedFirstLetter) { 233 | processedFirstLetter = true; 234 | String fl; 235 | if (firstLetters.length() > config.LimitFirstLetterLength && config.LimitFirstLetterLength > 0) { 236 | fl = firstLetters.substring(0, config.LimitFirstLetterLength); 237 | } else { 238 | fl = firstLetters.toString(); 239 | } 240 | if (config.lowercase) { 241 | fl = fl.toLowerCase(); 242 | } 243 | if (!(config.keepSeparateFirstLetter && fl.length() <= 1)) { 244 | addCandidate(new TermItem(fl, 0, fl.length(), 1)); 245 | } 246 | } 247 | 248 | if (!processedSortCandidate) { 249 | processedSortCandidate = true; 250 | Collections.sort(candidate); 251 | } 252 | 253 | if (candidateOffset < candidate.size()) { 254 
| TermItem item = candidate.get(candidateOffset); 255 | candidateOffset++; 256 | setTerm(item.term, item.startOffset, item.endOffset, item.position); 257 | return true; 258 | } 259 | 260 | 261 | done = true; 262 | return false; 263 | } 264 | return false; 265 | } 266 | 267 | private int parseBuff(StringBuilder buff, int buffSize, int buffPosition) { 268 | if (config.keepNoneChinese) { 269 | if (config.noneChinesePinyinTokenize) { 270 | List result = PinyinAlphabetTokenizer.walk(buff.toString()); 271 | int start = (lastOffset - buffSize + 1); 272 | for (int i = 0; i < result.size(); i++) { 273 | int end; 274 | String t = result.get(i); 275 | if (config.fixedPinyinOffset) { 276 | end = start + 1; 277 | } else { 278 | end = start + t.length(); 279 | } 280 | addCandidate(new TermItem(result.get(i), start, end, ++position)); 281 | start = end; 282 | } 283 | } else if (config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || !config.keepNoneChineseInJoinedFullPinyin) { 284 | addCandidate(new TermItem(buff.toString(), lastOffset - buffSize, lastOffset, ++position)); 285 | } 286 | } 287 | 288 | buff.setLength(0); 289 | buffSize = 0; 290 | return buffSize; 291 | } 292 | 293 | @Override 294 | public final void end() throws IOException { 295 | super.end(); 296 | } 297 | 298 | @Override 299 | public void reset() throws IOException { 300 | super.reset(); 301 | position = 0; 302 | candidateOffset = 0; 303 | this.done = false; 304 | this.processedCandidate = false; 305 | this.processedFirstLetter = false; 306 | this.processedFullPinyinLetter = false; 307 | this.processedOriginal = false; 308 | firstLetters.setLength(0); 309 | fullPinyinLetters.setLength(0); 310 | termsFilter.clear(); 311 | candidate.clear(); 312 | source = null; 313 | lastIncrementPosition = 0; 314 | } 315 | 316 | 317 | } -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/MultiplePinyinTokenFilter.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | *
13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import org.apache.lucene.analysis.TokenFilter; 21 | import org.apache.lucene.analysis.TokenStream; 22 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 23 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 24 | import org.elasticsearch.analysis.PinyinConfig; 25 | import org.nlpcn.commons.lang.pinyin.Pinyin; 26 | 27 | import java.io.IOException; 28 | import java.util.*; 29 | 30 | public class MultiplePinyinTokenFilter extends TokenFilter { 31 | 32 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 33 | private boolean done = true; 34 | private boolean processedCandidate = false; 35 | private boolean processedFullPinyinLetter = false; 36 | private boolean processedFirstLetter = false; 37 | private boolean processedOriginal = false; 38 | private boolean processedSortCandidate = false; 39 | protected int position = 0; 40 | protected int lastOffset = 0; 41 | private PinyinConfig config; 42 | List candidate; 43 | private HashSet termsFilter; 44 | 45 | protected int candidateOffset = 0; 46 | List firstLetters; 47 | List fullPinyinLetters; 48 | String source; 49 | private int lastIncrementPosition = 0; 50 | 51 | private PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class); 52 | 53 | public MultiplePinyinTokenFilter(TokenStream in, PinyinConfig config) { 54 | super(in); 55 | this.config = config; 56 | //validate config 57 | if (!(config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin)) { 58 | throw new ConfigErrorException("pinyin config error, can't disable separate_first_letter, first_letter and full_pinyin at the same time."); 59 | } 60 | candidate = new ArrayList<>(); 61 | firstLetters = new LinkedList(); 62 | termsFilter = new HashSet<>(); 63 | fullPinyinLetters = new LinkedList(); 64 | } 65 | 66 | //TODO refactor, merge code 67 | @Override 68 | public final boolean incrementToken() throws IOException { 69 | 70 | 71 | if (!done) { 72 | if (readTerm()) return true; 73 | } 74 | 75 | if (done) { 76 | resetVariable(); 77 | if (!input.incrementToken()) { 78 | return false; 79 | } 80 | done = false; 81 | } 82 | readTerm(); 83 | return true; 84 | } 85 | 86 | private boolean readTerm() { 87 | if (!processedCandidate) { 88 | processedCandidate = true; 89 | lastOffset = termAtt.length(); 90 | source = termAtt.toString(); 91 | if (config.trimWhitespace) { 92 | source = source.trim(); 93 | } 94 | 95 | List pinyinList = Pinyin.multiplePinyin(source); 96 | if (pinyinList.size() == 0) return false; 97 | 98 | StringBuilder buff = new StringBuilder(); 99 | int buffStartPosition = 0; 100 | int buffSize = 0; 101 | position = 0; 102 | 103 | for (int i = 0; i < source.length(); i++) { 104 | char c = source.charAt(i); 105 | 106 | //keep original alphabet 107 | if (c < 128) { 108 | if (buff.length() <= 0) { 109 | buffStartPosition = i; 110 | } 111 | if ((c > 96 && c < 123) || (c > 64 && c < 91) || (c > 47 && c < 58)) { 112 | if (config.keepNoneChinese) { 113 | if (config.keepNoneChinese) { 114 | if (config.keepNoneChineseTogether) { 115 | buff.append(c); 116 | buffSize++; 117 | } else { 118 
| addCandidate(new TermItem(String.valueOf(c), i, i + 1, buffStartPosition)); 119 | } 120 | } 121 | } 122 | if (config.keepNoneChineseInFirstLetter) { 123 | if (firstLetters.size() == 0) { 124 | firstLetters.add(new StringBuilder(c+"")); 125 | } else { 126 | for (int j=0; j 0) { 144 | buffSize = parseBuff(buff, buffSize, buffStartPosition); 145 | } 146 | 147 | String pinyin = pinyinList.get(i); 148 | if (pinyin != null && pinyin.length() > 0) { 149 | String[] pingyinList = pinyin.split(" "); 150 | position++; 151 | if (firstLetters.size() == 0) { 152 | if (pingyinList.length > 1) { 153 | for (String py: pingyinList) { 154 | firstLetters.add(new StringBuilder(py.substring(0, 1))); 155 | } 156 | } 157 | else { 158 | firstLetters.add(new StringBuilder(pinyin.substring(0, 1))); 159 | } 160 | } else { 161 | if (pingyinList.length > 1) { 162 | int lettersSize = firstLetters.size(); 163 | for (int j=0; j 1) { 180 | addCandidate(new TermItem(String.valueOf(pinyin.charAt(0)), i, i + 1, position)); 181 | } 182 | if (config.keepFullPinyin) { 183 | addCandidate(new TermItem(pinyin, i, i + 1, position)); 184 | } 185 | if (config.keepJoinedFullPinyin) { 186 | if (fullPinyinLetters.size() == 0) { 187 | if (pingyinList.length > 1) { 188 | for (String py: pingyinList) { 189 | fullPinyinLetters.add(new StringBuilder(py)); 190 | } 191 | } else { 192 | fullPinyinLetters.add(new StringBuilder(pingyinList[0])); 193 | } 194 | } else { 195 | if (pingyinList.length > 1) { 196 | int fullPinyinSize = fullPinyinLetters.size(); 197 | for (int j=0; j 0) { 223 | buffSize = parseBuff(buff, buffSize, buffStartPosition); 224 | } 225 | } 226 | 227 | 228 | if (config.keepOriginal && !processedOriginal) { 229 | processedOriginal = true; 230 | addCandidate(new TermItem(source, 0, source.length(), 1)); 231 | } 232 | 233 | if (config.keepJoinedFullPinyin && !processedFullPinyinLetter && fullPinyinLetters.size() > 0) { 234 | processedFullPinyinLetter = true; 235 | for (StringBuilder fullPinyinLetter: fullPinyinLetters) { 236 | addCandidate(new TermItem(fullPinyinLetter.toString(), 0, source.length(), 1)); 237 | } 238 | fullPinyinLetters.clear(); 239 | } 240 | 241 | 242 | if (config.keepFirstLetter && firstLetters.size() > 0 && !processedFirstLetter) { 243 | processedFirstLetter = true; 244 | for (StringBuilder firstLetter: firstLetters) { 245 | String fl; 246 | if (firstLetter.length() > config.LimitFirstLetterLength && config.LimitFirstLetterLength > 0) { 247 | fl = firstLetter.substring(0, config.LimitFirstLetterLength); 248 | } else { 249 | fl = firstLetter.toString(); 250 | } 251 | if (config.lowercase) { 252 | fl = fl.toLowerCase(); 253 | } 254 | if (!(config.keepSeparateFirstLetter && fl.length() <= 1)) { 255 | addCandidate(new TermItem(fl, 0, fl.length(), 1)); 256 | } 257 | } 258 | } 259 | 260 | if (!processedSortCandidate) { 261 | processedSortCandidate = true; 262 | Collections.sort(candidate); 263 | } 264 | 265 | if (candidateOffset < candidate.size()) { 266 | TermItem item = candidate.get(candidateOffset); 267 | candidateOffset++; 268 | setTerm(item.term, item.startOffset, item.endOffset, item.position); 269 | return true; 270 | } 271 | 272 | done = true; 273 | return false; 274 | } 275 | 276 | 277 | void addCandidate(TermItem item) { 278 | 279 | String term = item.term; 280 | if (config.lowercase) { 281 | term = term.toLowerCase(); 282 | } 283 | 284 | if (config.trimWhitespace) { 285 | term = term.trim(); 286 | } 287 | item.term = term; 288 | 289 | if (term.length() == 0) { 290 | return; 291 | } 292 | 293 | //remove 
same term with same position 294 | String fr=term+item.position; 295 | 296 | //remove same term, regardless position 297 | if (config.removeDuplicateTerm) { 298 | fr=term; 299 | } 300 | 301 | if (termsFilter.contains(fr)) { 302 | return; 303 | } 304 | termsFilter.add(fr); 305 | 306 | candidate.add(item); 307 | } 308 | 309 | 310 | void setTerm(String term, int startOffset, int endOffset, int position) { 311 | if (config.lowercase) { 312 | term = term.toLowerCase(); 313 | } 314 | 315 | if (config.trimWhitespace) { 316 | term = term.trim(); 317 | } 318 | 319 | //ignore empty term 320 | if(term.length()==0){ 321 | return; 322 | } 323 | 324 | termAtt.setEmpty(); 325 | termAtt.append(term); 326 | if (startOffset < 0) { 327 | startOffset = 0; 328 | } 329 | if (endOffset < startOffset) { 330 | endOffset = startOffset + term.length(); 331 | } 332 | 333 | int offset = position - lastIncrementPosition; 334 | if (offset < 0) { 335 | offset = 0; 336 | } 337 | positionAttr.setPositionIncrement(offset); 338 | 339 | lastIncrementPosition = position; 340 | } 341 | 342 | private int parseBuff(StringBuilder buff, int buffSize, int buffPosition) { 343 | if (config.keepNoneChinese) { 344 | if (config.noneChinesePinyinTokenize) { 345 | List result = PinyinAlphabetTokenizer.walk(buff.toString()); 346 | int start = (lastOffset - buffSize + 1); 347 | for (int i = 0; i < result.size(); i++) { 348 | int end; 349 | String t = result.get(i); 350 | if (config.fixedPinyinOffset) { 351 | end = start + 1; 352 | } else { 353 | end = start + t.length(); 354 | } 355 | addCandidate(new TermItem(result.get(i), start, end, ++position)); 356 | start = end; 357 | } 358 | } else if (config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || !config.keepNoneChineseInJoinedFullPinyin) { 359 | addCandidate(new TermItem(buff.toString(), lastOffset - buffSize, lastOffset, ++position)); 360 | } 361 | } 362 | 363 | buff.setLength(0); 364 | buffSize = 0; 365 | return buffSize; 366 | } 367 | 368 | @Override 369 | public final void end() throws IOException { 370 | super.end(); 371 | } 372 | 373 | void resetVariable() { 374 | position = 0; 375 | lastOffset = 0; 376 | candidate.clear(); 377 | this.processedCandidate = false; 378 | this.processedFirstLetter = false; 379 | this.processedFullPinyinLetter = false; 380 | this.processedOriginal = false; 381 | firstLetters.clear(); 382 | fullPinyinLetters.clear(); 383 | source = null; 384 | candidateOffset = 0; 385 | termsFilter.clear(); 386 | lastIncrementPosition = 0; 387 | } 388 | 389 | @Override 390 | public void reset() throws IOException { 391 | super.reset(); 392 | this.done = true; 393 | resetVariable(); 394 | } 395 | 396 | 397 | } 398 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/MultiplePinyinTokenizer.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 5 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 6 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 7 | import org.elasticsearch.analysis.PinyinConfig; 8 | import org.nlpcn.commons.lang.pinyin.Pinyin; 9 | 10 | import java.io.IOException; 11 | import java.util.*; 12 | 13 | 14 | public class MultiplePinyinTokenizer extends Tokenizer { 15 | 16 | 17 | private static final int 
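/*
 * This tokenizer mirrors PinyinTokenizer but calls Pinyin.multiplePinyin(), which
 * returns every candidate reading of a heteronym as one space-separated string per
 * character. The first-letter and joined-full-pinyin accumulators are therefore
 * List<StringBuilder> rather than a single StringBuilder, and each multi-reading
 * character forks every prefix collected so far. Illustrative walk-through, assuming
 * the dictionary yields the two common readings of 重 ("zhong"/"chong"):
 *
 *   after 重:  firstLetters = ["z", "c"]
 *   after 庆:  firstLetters = ["zq", "cq"]   // each existing prefix extended with "q"
 *
 * so "重庆" can be matched by either "zhongqing" or "chongqing".
 */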
DEFAULT_BUFFER_SIZE = 256; 18 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 19 | private boolean done = false; 20 | private boolean processedCandidate = false; 21 | private boolean processedSortCandidate = false; 22 | private boolean processedFirstLetter = false; 23 | private boolean processedFullPinyinLetter = false; 24 | private boolean processedOriginal = false; 25 | protected int position = 0; 26 | protected int lastOffset = 0; 27 | private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); 28 | private PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class); 29 | private PinyinConfig config; 30 | ArrayList candidate; 31 | protected int candidateOffset = 0; //indicate candidates process offset 32 | private HashSet termsFilter; 33 | List firstLetters; 34 | List fullPinyinLetters; 35 | 36 | private int lastIncrementPosition = 0; 37 | 38 | String source; 39 | 40 | public MultiplePinyinTokenizer(PinyinConfig config) { 41 | this(DEFAULT_BUFFER_SIZE); 42 | this.config = config; 43 | 44 | //validate config 45 | if (!(config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin)) { 46 | throw new ConfigErrorException("pinyin config error, can't disable separate_first_letter, first_letter and full_pinyin at the same time."); 47 | } 48 | candidate = new ArrayList<>(); 49 | termsFilter = new HashSet<>(); 50 | firstLetters = new LinkedList(); 51 | fullPinyinLetters = new LinkedList(); 52 | } 53 | 54 | public MultiplePinyinTokenizer(int bufferSize) { 55 | super(); 56 | termAtt.resizeBuffer(bufferSize); 57 | } 58 | 59 | void addCandidate(TermItem item) { 60 | 61 | String term = item.term; 62 | if (config.lowercase) { 63 | term = term.toLowerCase(); 64 | } 65 | 66 | if (config.trimWhitespace) { 67 | term = term.trim(); 68 | } 69 | item.term = term; 70 | 71 | if (term.length() == 0) { 72 | return; 73 | } 74 | 75 | //remove same term with same position 76 | String fr=term+item.position; 77 | 78 | //remove same term, regardless position 79 | if (config.removeDuplicateTerm) { 80 | fr=term; 81 | } 82 | 83 | if (termsFilter.contains(fr)) { 84 | return; 85 | } 86 | termsFilter.add(fr); 87 | 88 | candidate.add(item); 89 | } 90 | 91 | 92 | void setTerm(String term, int startOffset, int endOffset, int position) { 93 | if (config.lowercase) { 94 | term = term.toLowerCase(); 95 | } 96 | 97 | if (config.trimWhitespace) { 98 | term = term.trim(); 99 | } 100 | 101 | //ignore empty term 102 | if(term.length()==0){ 103 | return; 104 | } 105 | 106 | termAtt.setEmpty(); 107 | termAtt.append(term); 108 | if (startOffset < 0) { 109 | startOffset = 0; 110 | } 111 | if (endOffset < startOffset) { 112 | endOffset = startOffset + term.length(); 113 | } 114 | 115 | if(!config.ignorePinyinOffset){ 116 | offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset)); 117 | } 118 | 119 | int offset = position - lastIncrementPosition; 120 | if (offset < 0) { 121 | offset = 0; 122 | } 123 | positionAttr.setPositionIncrement(offset); 124 | 125 | lastIncrementPosition = position; 126 | } 127 | 128 | @Override 129 | public final boolean incrementToken() throws IOException { 130 | 131 | clearAttributes(); 132 | 133 | if (!done) { 134 | 135 | //combine text together to get right pinyin 136 | if (!processedCandidate) { 137 | processedCandidate = true; 138 | int upto = 0; 139 | char[] buffer = termAtt.buffer(); 140 | while (true) { 141 | final int length = input.read(buffer, upto, 
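/* Drain the reader fully before tokenizing: pinyin conversion needs the complete
   text, and termAtt.resizeBuffer() oversizes internally, so asking for length+1
   does not reallocate on every pass of this loop. */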
buffer.length - upto); 142 | if (length == -1) break; 143 | upto += length; 144 | if (upto == buffer.length) 145 | buffer = termAtt.resizeBuffer(1 + buffer.length); 146 | } 147 | termAtt.setLength(upto); 148 | source = termAtt.toString(); 149 | 150 | List pinyinList = Pinyin.multiplePinyin(source); 151 | if (pinyinList.size() == 0) return false; 152 | 153 | StringBuilder buff = new StringBuilder(); 154 | int buffStartPosition = 0; 155 | int buffSize = 0; 156 | 157 | position = 0; 158 | 159 | for (int i = 0; i < source.length(); i++) { 160 | char c = source.charAt(i); 161 | //keep original alphabet 162 | if (c < 128) { 163 | if (buff.length() <= 0) { 164 | buffStartPosition = i+1; 165 | } 166 | if ((c > 96 && c < 123) || (c > 64 && c < 91) || (c > 47 && c < 58)) { 167 | if (config.keepNoneChinese) { 168 | if (config.keepNoneChinese) { 169 | if (config.keepNoneChineseTogether) { 170 | buff.append(c); 171 | buffSize++; 172 | } else { 173 | addCandidate(new TermItem(String.valueOf(c), i, i + 1, buffStartPosition)); 174 | } 175 | } 176 | } 177 | if (config.keepNoneChineseInFirstLetter) { 178 | if (firstLetters.size() == 0) { 179 | firstLetters.add(new StringBuilder(c+"")); 180 | } else { 181 | for (int j=0; j 0) { 201 | buffSize = parseBuff(buff, buffSize, buffStartPosition); 202 | } 203 | 204 | String pinyin = pinyinList.get(i); 205 | if (pinyin != null && pinyin.length() > 0) { 206 | String[] pingyinList = pinyin.split(" "); 207 | position++; 208 | if (firstLetters.size() == 0) { 209 | if (pingyinList.length > 1) { 210 | for (String py: pingyinList) { 211 | firstLetters.add(new StringBuilder(py.substring(0, 1))); 212 | } 213 | } 214 | else { 215 | firstLetters.add(new StringBuilder(pinyin.substring(0, 1))); 216 | } 217 | } else { 218 | if (pingyinList.length > 1) { 219 | int lettersSize = firstLetters.size(); 220 | for (int j=0; j 1) { 237 | addCandidate(new TermItem(String.valueOf(pinyin.charAt(0)), i, i + 1, position)); 238 | } 239 | if (config.keepFullPinyin) { 240 | addCandidate(new TermItem(pinyin, i, i + 1, position)); 241 | } 242 | if (config.keepJoinedFullPinyin) { 243 | if (fullPinyinLetters.size() == 0) { 244 | if (pingyinList.length > 1) { 245 | for (String py: pingyinList) { 246 | fullPinyinLetters.add(new StringBuilder(py)); 247 | } 248 | } else { 249 | fullPinyinLetters.add(new StringBuilder(pingyinList[0])); 250 | } 251 | } else { 252 | if (pingyinList.length > 1) { 253 | int fullPinyinSize = fullPinyinLetters.size(); 254 | for (int j=0; j 0) { 280 | buffSize = parseBuff(buff, buffSize, buffStartPosition); 281 | } 282 | } 283 | 284 | if (config.keepOriginal && !processedOriginal) { 285 | processedOriginal = true; 286 | addCandidate(new TermItem(source, 0, source.length(), 1)); 287 | } 288 | 289 | if (config.keepJoinedFullPinyin && !processedFullPinyinLetter && fullPinyinLetters.size() > 0) { 290 | processedFullPinyinLetter = true; 291 | for (StringBuilder fullPinyinLetter: fullPinyinLetters) { 292 | addCandidate(new TermItem(fullPinyinLetter.toString(), 0, source.length(), 1)); 293 | } 294 | fullPinyinLetters.clear(); 295 | } 296 | 297 | 298 | if (config.keepFirstLetter && firstLetters.size() > 0 && !processedFirstLetter) { 299 | processedFirstLetter = true; 300 | 301 | for (StringBuilder firstLetter: firstLetters) { 302 | String fl; 303 | if (firstLetter.length() > config.LimitFirstLetterLength && config.LimitFirstLetterLength > 0) { 304 | fl = firstLetter.substring(0, config.LimitFirstLetterLength); 305 | } else { 306 | fl = firstLetter.toString(); 307 | } 308 | if 
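/* Each expanded first-letter variant is truncated to LimitFirstLetterLength,
   case-normalized below, and emitted as its own candidate spanning the whole
   input (offsets 0..length, position 1). */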
(config.lowercase) { 309 | fl = fl.toLowerCase(); 310 | } 311 | if (!(config.keepSeparateFirstLetter && fl.length() <= 1)) { 312 | addCandidate(new TermItem(fl, 0, fl.length(), 1)); 313 | } 314 | } 315 | } 316 | 317 | if (!processedSortCandidate) { 318 | processedSortCandidate = true; 319 | Collections.sort(candidate); 320 | } 321 | 322 | if (candidateOffset < candidate.size()) { 323 | TermItem item = candidate.get(candidateOffset); 324 | candidateOffset++; 325 | setTerm(item.term, item.startOffset, item.endOffset, item.position); 326 | return true; 327 | } 328 | 329 | 330 | done = true; 331 | return false; 332 | } 333 | return false; 334 | } 335 | 336 | private int parseBuff(StringBuilder buff, int buffSize, int buffPosition) { 337 | if (config.keepNoneChinese) { 338 | if (config.noneChinesePinyinTokenize) { 339 | List result = PinyinAlphabetTokenizer.walk(buff.toString()); 340 | int start = (lastOffset - buffSize + 1); 341 | for (int i = 0; i < result.size(); i++) { 342 | int end; 343 | String t = result.get(i); 344 | if (config.fixedPinyinOffset) { 345 | end = start + 1; 346 | } else { 347 | end = start + t.length(); 348 | } 349 | addCandidate(new TermItem(result.get(i), start, end, ++position)); 350 | start = end; 351 | } 352 | } else if (config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || !config.keepNoneChineseInJoinedFullPinyin) { 353 | addCandidate(new TermItem(buff.toString(), lastOffset - buffSize, lastOffset, ++position)); 354 | } 355 | } 356 | 357 | buff.setLength(0); 358 | buffSize = 0; 359 | return buffSize; 360 | } 361 | 362 | @Override 363 | public final void end() throws IOException { 364 | super.end(); 365 | } 366 | 367 | @Override 368 | public void reset() throws IOException { 369 | super.reset(); 370 | position = 0; 371 | candidateOffset = 0; 372 | this.done = false; 373 | this.processedCandidate = false; 374 | this.processedFirstLetter = false; 375 | this.processedFullPinyinLetter = false; 376 | this.processedOriginal = false; 377 | firstLetters.clear(); 378 | fullPinyinLetters.clear(); 379 | termsFilter.clear(); 380 | candidate.clear(); 381 | source = null; 382 | lastIncrementPosition = 0; 383 | } 384 | 385 | 386 | } 387 | -------------------------------------------------------------------------------- /src/test/java/org/elasticsearch/index/analysis/PinyinAnalysisTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to ElasticSearch and Shay Banon under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. ElasticSearch licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | package org.elasticsearch.index.analysis; 21 | 22 | import junit.framework.Assert; 23 | import org.apache.lucene.analysis.Analyzer; 24 | import org.apache.lucene.analysis.core.KeywordAnalyzer; 25 | import org.apache.lucene.analysis.core.WhitespaceAnalyzer; 26 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 27 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 28 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 29 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 30 | import org.elasticsearch.analysis.PinyinConfig; 31 | import org.junit.Test; 32 | import org.nlpcn.commons.lang.pinyin.Pinyin; 33 | 34 | import java.io.IOException; 35 | import java.io.StringReader; 36 | import java.util.ArrayList; 37 | import java.util.HashMap; 38 | import java.util.List; 39 | 40 | /** 41 | */ 42 | 43 | public class PinyinAnalysisTest { 44 | 45 | 46 | @Test 47 | public void testTokenFilter() throws IOException { 48 | PinyinConfig config = new PinyinConfig(); 49 | config.keepFirstLetter = true; 50 | config.keepNoneChinese = true; 51 | config.keepOriginal = false; 52 | config.keepFullPinyin = false; 53 | config.ignorePinyinOffset = false; 54 | 55 | 56 | StringReader sr = new StringReader("刘德华"); 57 | Analyzer analyzer = new StandardAnalyzer(); 58 | PinyinTokenFilter filter = new PinyinTokenFilter(analyzer.tokenStream("f", sr), config); 59 | List pinyin = new ArrayList(); 60 | filter.reset(); 61 | System.out.println(); 62 | while (filter.incrementToken()) { 63 | CharTermAttribute ta = filter.getAttribute(CharTermAttribute.class); 64 | pinyin.add(ta.toString()); 65 | System.out.println(ta.toString()); 66 | } 67 | 68 | Assert.assertEquals(3, pinyin.size()); 69 | Assert.assertEquals("l", pinyin.get(0)); 70 | Assert.assertEquals("d", pinyin.get(1)); 71 | Assert.assertEquals("h", pinyin.get(2)); 72 | 73 | sr = new StringReader("刘德华"); 74 | analyzer = new KeywordAnalyzer(); 75 | filter = new PinyinTokenFilter(analyzer.tokenStream("f", sr), config); 76 | pinyin.clear(); 77 | filter.reset(); 78 | System.out.println(); 79 | while (filter.incrementToken()) { 80 | CharTermAttribute ta = filter.getAttribute(CharTermAttribute.class); 81 | pinyin.add(ta.toString()); 82 | System.out.println(ta.toString()); 83 | } 84 | Assert.assertEquals(1, pinyin.size()); 85 | Assert.assertEquals("ldh", pinyin.get(0)); 86 | 87 | 88 | config = new PinyinConfig(); 89 | config.keepFirstLetter = false; 90 | config.keepNoneChinese = true; 91 | config.keepOriginal = false; 92 | config.keepFullPinyin = true; 93 | config.ignorePinyinOffset = false; 94 | 95 | 96 | sr = new StringReader("刘德华"); 97 | analyzer = new StandardAnalyzer(); 98 | filter = new PinyinTokenFilter(analyzer.tokenStream("f", sr), config); 99 | pinyin = new ArrayList(); 100 | filter.reset(); 101 | System.out.println(); 102 | while (filter.incrementToken()) { 103 | CharTermAttribute ta = filter.getAttribute(CharTermAttribute.class); 104 | pinyin.add(ta.toString()); 105 | System.out.println(ta.toString()); 106 | } 107 | Assert.assertEquals(3, pinyin.size()); 108 | Assert.assertEquals("liu", pinyin.get(0)); 109 | Assert.assertEquals("de", pinyin.get(1)); 110 | Assert.assertEquals("hua", pinyin.get(2)); 111 | 112 | 113 | config = new PinyinConfig(); 114 | config.keepFirstLetter = true; 115 | config.keepNoneChinese = true; 116 | config.keepOriginal = true; 117 | config.keepFullPinyin = true; 118 | config.ignorePinyinOffset = false; 119 | 120 | 121 | sr = new StringReader("刘德华"); 122 | 
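/* keep_original now applies per incoming token: StandardAnalyzer pre-splits the CJK
   text into single-character tokens, so each of 刘/德/华 contributes its full pinyin,
   the original character and its first letter, giving the nine terms asserted below. */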
analyzer = new StandardAnalyzer(); 123 | filter = new PinyinTokenFilter(analyzer.tokenStream("f", sr), config); 124 | pinyin = new ArrayList(); 125 | filter.reset(); 126 | System.out.println(); 127 | while (filter.incrementToken()) { 128 | CharTermAttribute ta = filter.getAttribute(CharTermAttribute.class); 129 | pinyin.add(ta.toString()); 130 | System.out.println(ta.toString()); 131 | } 132 | 133 | Assert.assertEquals(9, pinyin.size()); 134 | Assert.assertEquals("liu", pinyin.get(0)); 135 | Assert.assertEquals("刘", pinyin.get(1)); 136 | Assert.assertEquals("l", pinyin.get(2)); 137 | Assert.assertEquals("de", pinyin.get(3)); 138 | Assert.assertEquals("德", pinyin.get(4)); 139 | Assert.assertEquals("d", pinyin.get(5)); 140 | Assert.assertEquals("hua", pinyin.get(6)); 141 | Assert.assertEquals("华", pinyin.get(7)); 142 | Assert.assertEquals("h", pinyin.get(8)); 143 | 144 | 145 | config = new PinyinConfig(); 146 | config.keepFirstLetter = true; 147 | config.keepNoneChinese = true; 148 | config.keepOriginal = true; 149 | config.keepFullPinyin = true; 150 | config.ignorePinyinOffset = false; 151 | 152 | 153 | sr = new StringReader("刘德华"); 154 | analyzer = new KeywordAnalyzer(); 155 | filter = new PinyinTokenFilter(analyzer.tokenStream("f", sr), config); 156 | pinyin = new ArrayList(); 157 | filter.reset(); 158 | System.out.println(); 159 | while (filter.incrementToken()) { 160 | CharTermAttribute ta = filter.getAttribute(CharTermAttribute.class); 161 | pinyin.add(ta.toString()); 162 | System.out.println(ta.toString()); 163 | } 164 | 165 | Assert.assertEquals(5, pinyin.size()); 166 | Assert.assertEquals("liu", pinyin.get(0)); 167 | Assert.assertEquals("刘德华", pinyin.get(1)); 168 | Assert.assertEquals("ldh", pinyin.get(2)); 169 | Assert.assertEquals("de", pinyin.get(3)); 170 | Assert.assertEquals("hua", pinyin.get(4)); 171 | 172 | 173 | 174 | config = new PinyinConfig(); 175 | config.keepFirstLetter = true; 176 | config.keepNoneChinese = false; 177 | config.keepNoneChineseInFirstLetter = true; 178 | config.keepOriginal = false; 179 | config.keepFullPinyin = false; 180 | config.LimitFirstLetterLength = 5; 181 | config.lowercase = true; 182 | config.ignorePinyinOffset = false; 183 | 184 | 185 | sr = new StringReader("Go的数组是纯粹的值类型,传递一个[N]T的代价是N个T"); 186 | analyzer = new KeywordAnalyzer(); 187 | filter = new PinyinTokenFilter(analyzer.tokenStream("f", sr), config); 188 | pinyin = new ArrayList(); 189 | filter.reset(); 190 | System.out.println(); 191 | while (filter.incrementToken()) { 192 | CharTermAttribute ta = filter.getAttribute(CharTermAttribute.class); 193 | pinyin.add(ta.toString()); 194 | System.out.println(ta.toString()); 195 | } 196 | 197 | Assert.assertEquals(1, pinyin.size()); 198 | Assert.assertEquals("godsz", pinyin.get(0)); 199 | 200 | 201 | config = new PinyinConfig(); 202 | config.keepFirstLetter = true; 203 | config.keepSeparateFirstLetter = true; 204 | config.keepNoneChinese = true; 205 | config.keepNoneChineseInFirstLetter = false; 206 | config.keepOriginal = false; 207 | config.keepFullPinyin = true; 208 | config.LimitFirstLetterLength = 5; 209 | config.lowercase = true; 210 | config.ignorePinyinOffset = false; 211 | 212 | 213 | sr = new StringReader("liu德hua 名字"); 214 | analyzer = new WhitespaceAnalyzer(); 215 | filter = new PinyinTokenFilter(analyzer.tokenStream("f", sr), config); 216 | filter.reset(); 217 | System.out.println(); 218 | pinyin = getTokenFilterResult(filter); 219 | 220 | Assert.assertEquals(9, pinyin.size()); 221 | Assert.assertEquals("liu", pinyin.get(0)); 222 | 
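/* Mixed input "liu德hua 名字" with none-Chinese kept but excluded from first letters:
   the Latin runs are pinyin-tokenized ("liu", "hua"), 德 yields "d"/"de", and the
   second whitespace token 名字 yields "m"/"ming" and "z"/"zi" plus the summary "mz". */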
Assert.assertEquals("d", pinyin.get(1)); 223 | Assert.assertEquals("de", pinyin.get(2)); 224 | Assert.assertEquals("hua", pinyin.get(3)); 225 | Assert.assertEquals("m", pinyin.get(4)); 226 | Assert.assertEquals("ming", pinyin.get(5)); 227 | Assert.assertEquals("z", pinyin.get(6)); 228 | Assert.assertEquals("zi", pinyin.get(7)); 229 | Assert.assertEquals("mz", pinyin.get(8)); 230 | 231 | 232 | config = new PinyinConfig(); 233 | config.keepFirstLetter = true; 234 | config.keepSeparateFirstLetter = true; 235 | config.keepNoneChinese = true; 236 | config.keepNoneChineseInFirstLetter = false; 237 | config.keepOriginal = false; 238 | config.keepFullPinyin = true; 239 | config.LimitFirstLetterLength = 5; 240 | config.lowercase = true; 241 | config.noneChinesePinyinTokenize=true; 242 | config.removeDuplicateTerm=false; 243 | config.ignorePinyinOffset = false; 244 | 245 | 246 | sr = new StringReader("liudehuaalibaba13zhuanghan134"); 247 | analyzer = new WhitespaceAnalyzer(); 248 | filter = new PinyinTokenFilter(analyzer.tokenStream("f", sr), config); 249 | 250 | filter.reset(); 251 | System.out.println(); 252 | 253 | pinyin= getTokenFilterResult(filter); 254 | 255 | Assert.assertEquals(11, pinyin.size()); 256 | Assert.assertEquals("liu", pinyin.get(0)); 257 | Assert.assertEquals("de", pinyin.get(1)); 258 | Assert.assertEquals("hua", pinyin.get(2)); 259 | Assert.assertEquals("a", pinyin.get(3)); 260 | Assert.assertEquals("li", pinyin.get(4)); 261 | Assert.assertEquals("ba", pinyin.get(5)); 262 | Assert.assertEquals("ba", pinyin.get(6)); 263 | Assert.assertEquals("13", pinyin.get(7)); 264 | Assert.assertEquals("zhuang", pinyin.get(8)); 265 | Assert.assertEquals("han", pinyin.get(9)); 266 | Assert.assertEquals("134", pinyin.get(10)); 267 | 268 | 269 | 270 | config = new PinyinConfig(); 271 | config.keepFirstLetter=true; 272 | config.keepFullPinyin=false; 273 | config.keepJoinedFullPinyin =true; 274 | config.keepNoneChinese=false; 275 | config.keepNoneChineseTogether=true; 276 | config.noneChinesePinyinTokenize=true; 277 | config.keepNoneChineseInFirstLetter=true; 278 | config.keepOriginal=false; 279 | config.lowercase=true; 280 | config.trimWhitespace=true; 281 | config.fixedPinyinOffset =true; 282 | config.ignorePinyinOffset = false; 283 | 284 | sr = new StringReader("刘德华"); 285 | analyzer = new WhitespaceAnalyzer(); 286 | filter = new PinyinTokenFilter(analyzer.tokenStream("f", sr), config); 287 | filter.reset(); 288 | pinyin= getTokenFilterResult(filter); 289 | Assert.assertEquals("liudehua", pinyin.get(0)); 290 | Assert.assertEquals("ldh", pinyin.get(1)); 291 | 292 | 293 | } 294 | 295 | private List getTokenFilterResult(PinyinTokenFilter filter) throws IOException { 296 | List pinyin = new ArrayList(); 297 | int pos=0; 298 | while (filter.incrementToken()) { 299 | CharTermAttribute ta = filter.getAttribute(CharTermAttribute.class); 300 | OffsetAttribute offset = filter.getAttribute(OffsetAttribute.class); 301 | PositionIncrementAttribute position = filter.getAttribute(PositionIncrementAttribute.class); 302 | pos=pos+position.getPositionIncrement(); 303 | pinyin.add(ta.toString()); 304 | Assert.assertTrue("startOffset must be non-negative",offset.startOffset()>=0); 305 | Assert.assertTrue("endOffset must be >= startOffset",offset.startOffset()>=0); 306 | System.out.println(ta.toString()+","+offset.startOffset()+","+offset.endOffset()+","+pos); 307 | } 308 | return pinyin; 309 | } 310 | 311 | 312 | @Test 313 | public void TestTokenizer() throws IOException { 314 | String[] s = 315 | {"刘德华" 316 | , 
"劉德華", "刘德华A1", 317 | "讲话频率小,不能发高音", "T波低平或倒置", "β-氨基酸尿", 318 | "DJ音乐家", "人生一大乐事, 哈哈", 319 | }; 320 | 321 | PinyinConfig config = new PinyinConfig(); 322 | config.noneChinesePinyinTokenize=false; 323 | config.keepOriginal=true; 324 | config.ignorePinyinOffset = false; 325 | 326 | HashMap> result = getStringArrayListHashMap(s, config); 327 | 328 | ArrayList re = result.get("刘德华"); 329 | Assert.assertEquals(5, re.size()); 330 | Assert.assertEquals("liu", re.get(0).term); 331 | Assert.assertEquals("刘德华", re.get(1).term); 332 | Assert.assertEquals("ldh", re.get(2).term); 333 | Assert.assertEquals("de", re.get(3).term); 334 | Assert.assertEquals("hua", re.get(4).term); 335 | 336 | 337 | re = result.get("劉德華"); 338 | Assert.assertEquals(5, re.size()); 339 | Assert.assertEquals("liu", re.get(0).term); 340 | Assert.assertEquals("劉德華", re.get(1).term); 341 | Assert.assertEquals("ldh", re.get(2).term); 342 | Assert.assertEquals("de", re.get(3).term); 343 | Assert.assertEquals("hua", re.get(4).term); 344 | 345 | 346 | re = result.get("刘德华A1"); 347 | Assert.assertEquals(6, re.size()); 348 | Assert.assertEquals("liu", re.get(0).term); 349 | Assert.assertEquals("刘德华a1", re.get(1).term); 350 | Assert.assertEquals("ldha1", re.get(2).term); 351 | Assert.assertEquals("de", re.get(3).term); 352 | Assert.assertEquals("hua", re.get(4).term); 353 | Assert.assertEquals("a1", re.get(5).term); 354 | 355 | 356 | re = result.get("讲话频率小,不能发高音"); 357 | Assert.assertEquals(12, re.size()); 358 | Assert.assertEquals("jiang", re.get(0).term); 359 | Assert.assertEquals("讲话频率小,不能发高音", re.get(1).term); 360 | Assert.assertEquals("jhplxbnfgy", re.get(2).term); 361 | Assert.assertEquals("hua", re.get(3).term); 362 | Assert.assertEquals("pin", re.get(4).term); 363 | Assert.assertEquals("lv", re.get(5).term); 364 | Assert.assertEquals("xiao", re.get(6).term); 365 | Assert.assertEquals("bu", re.get(7).term); 366 | Assert.assertEquals("neng", re.get(8).term); 367 | Assert.assertEquals("fa", re.get(9).term); 368 | Assert.assertEquals("gao", re.get(10).term); 369 | Assert.assertEquals("yin", re.get(11).term); 370 | 371 | 372 | re = result.get("T波低平或倒置"); 373 | Assert.assertEquals(9, re.size()); 374 | Assert.assertEquals("t", re.get(0).term); 375 | Assert.assertEquals("t波低平或倒置", re.get(1).term); 376 | Assert.assertEquals("tbdphdz", re.get(2).term); 377 | Assert.assertEquals("bo", re.get(3).term); 378 | Assert.assertEquals("di", re.get(4).term); 379 | Assert.assertEquals("ping", re.get(5).term); 380 | Assert.assertEquals("huo", re.get(6).term); 381 | Assert.assertEquals("dao", re.get(7).term); 382 | Assert.assertEquals("zhi", re.get(8).term); 383 | 384 | 385 | re = result.get("β-氨基酸尿"); 386 | Assert.assertEquals(6, re.size()); 387 | Assert.assertEquals("β-氨基酸尿", re.get(1).term); 388 | Assert.assertEquals("ajsn", re.get(2).term); 389 | Assert.assertEquals("an", re.get(0).term); 390 | Assert.assertEquals("ji", re.get(3).term); 391 | Assert.assertEquals("suan", re.get(4).term); 392 | Assert.assertEquals("niao", re.get(5).term); 393 | 394 | re = result.get("DJ音乐家"); 395 | Assert.assertEquals(6, re.size()); 396 | Assert.assertEquals("dj", re.get(0).term); 397 | Assert.assertEquals("dj音乐家", re.get(1).term); 398 | Assert.assertEquals("djyyj", re.get(2).term); 399 | Assert.assertEquals("yin", re.get(3).term); 400 | Assert.assertEquals("yue", re.get(4).term); 401 | Assert.assertEquals("jia", re.get(5).term); 402 | 403 | 404 | String[] s1 = 405 | {"刘德华", "刘 de 华"}; 406 | config = new PinyinConfig(); 407 | config.keepFirstLetter = true; 408 | 
config.keepSeparateFirstLetter = true; 409 | config.keepNoneChinese = false; 410 | config.keepNoneChineseInFirstLetter = false; 411 | config.keepOriginal = false; 412 | config.keepFullPinyin = true; 413 | config.LimitFirstLetterLength = 5; 414 | config.lowercase = false; 415 | config.ignorePinyinOffset = false; 416 | 417 | 418 | result = getStringArrayListHashMap(s1, config); 419 | 420 | re = result.get("刘德华"); 421 | Assert.assertEquals(7, re.size()); 422 | Assert.assertEquals("l", re.get(0).term); 423 | Assert.assertEquals("liu", re.get(1).term); 424 | Assert.assertEquals("ldh", re.get(2).term); 425 | Assert.assertEquals("d", re.get(3).term); 426 | Assert.assertEquals("de", re.get(4).term); 427 | Assert.assertEquals("h", re.get(5).term); 428 | Assert.assertEquals("hua", re.get(6).term); 429 | 430 | s1 = new String[]{"我的的"}; 431 | config = new PinyinConfig(); 432 | config.keepFirstLetter = true; 433 | config.keepSeparateFirstLetter = true; 434 | config.keepNoneChinese = false; 435 | config.keepNoneChineseInFirstLetter = false; 436 | config.keepOriginal = false; 437 | config.keepFullPinyin = true; 438 | config.LimitFirstLetterLength = 5; 439 | config.removeDuplicateTerm = true; 440 | config.lowercase = false; 441 | config.ignorePinyinOffset = false; 442 | 443 | 444 | result = getStringArrayListHashMap(s1, config); 445 | 446 | re = result.get("我的的"); 447 | Assert.assertEquals(5, re.size()); 448 | Assert.assertEquals("w", re.get(0).term); 449 | Assert.assertEquals("wo", re.get(1).term); 450 | Assert.assertEquals("wdd", re.get(2).term); 451 | Assert.assertEquals("d", re.get(3).term); 452 | Assert.assertEquals("de", re.get(4).term); 453 | 454 | s1 = new String[]{"lu金 s刘德华 张学友 郭富城 黎明 四大lao天王liudehua"}; 455 | config = new PinyinConfig(); 456 | config.keepFirstLetter=true; 457 | config.keepFullPinyin=false; 458 | config.keepNoneChinese=false; 459 | config.keepNoneChineseTogether=true; 460 | config.noneChinesePinyinTokenize=true; 461 | config.keepNoneChineseInFirstLetter=true; 462 | config.keepOriginal=false; 463 | config.lowercase=true; 464 | config.trimWhitespace=true; 465 | config.ignorePinyinOffset = false; 466 | 467 | 468 | result = getStringArrayListHashMap(s1, config); 469 | 470 | re = result.get("lu金 s刘德华 张学友 郭富城 黎明 四大lao天王liudehua"); 471 | Assert.assertEquals("lujsldhzxygfclms", re.get(0).term); 472 | 473 | 474 | s1 = new String[]{"刘德华"}; 475 | config = new PinyinConfig(); 476 | config.keepFirstLetter=true; 477 | config.keepFullPinyin=false; 478 | config.keepJoinedFullPinyin =true; 479 | config.keepNoneChinese=false; 480 | config.keepNoneChineseTogether=true; 481 | config.noneChinesePinyinTokenize=true; 482 | config.keepNoneChineseInFirstLetter=true; 483 | config.keepOriginal=false; 484 | config.lowercase=true; 485 | config.trimWhitespace=true; 486 | config.ignorePinyinOffset = false; 487 | 488 | 489 | result = getStringArrayListHashMap(s1, config); 490 | 491 | re = result.get("刘德华"); 492 | Assert.assertEquals("liudehua", re.get(0).term); 493 | Assert.assertEquals("ldh", re.get(1).term); 494 | 495 | s1 = new String[]{"刘德华"}; 496 | config = new PinyinConfig(); 497 | config.keepFirstLetter=false; 498 | config.keepFullPinyin=false; 499 | config.keepJoinedFullPinyin =true; 500 | config.keepNoneChinese=false; 501 | config.keepNoneChineseTogether=true; 502 | config.noneChinesePinyinTokenize=true; 503 | config.keepNoneChineseInFirstLetter=true; 504 | config.keepOriginal=false; 505 | config.lowercase=true; 506 | config.trimWhitespace=true; 507 | config.ignorePinyinOffset = false; 508 | 509 | 510 | 
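/* With first_letter disabled and keep_joined_full_pinyin enabled, 刘德华 reduces
   to the single joined term "liudehua". */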
result = getStringArrayListHashMap(s1, config); 511 | 512 | re = result.get("刘德华"); 513 | Assert.assertEquals("liudehua", re.get(0).term); 514 | 515 | 516 | s1 = new String[]{"ceshi"}; 517 | config = new PinyinConfig(); 518 | config.keepFirstLetter=false; 519 | config.keepSeparateFirstLetter=false; 520 | config.keepFullPinyin=false; 521 | config.keepJoinedFullPinyin =true; 522 | config.keepNoneChinese=true; 523 | config.keepNoneChineseTogether=true; 524 | config.keepOriginal=true; 525 | config.LimitFirstLetterLength=16; 526 | config.noneChinesePinyinTokenize=true; 527 | config.lowercase=true; 528 | config.ignorePinyinOffset = false; 529 | 530 | 531 | result = getStringArrayListHashMap(s1, config); 532 | 533 | re = result.get("ceshi"); 534 | Assert.assertEquals("ce", re.get(0).term); 535 | Assert.assertEquals("shi", re.get(2).term); 536 | Assert.assertEquals("ceshi", re.get(1).term); 537 | 538 | 539 | 540 | 541 | } 542 | 543 | @Test 544 | public void TestFirstLetters() throws IOException { 545 | String[] s1 = new String[]{"刘德华"}; 546 | PinyinConfig config = new PinyinConfig(); 547 | config.keepFirstLetter = false; 548 | config.keepSeparateFirstLetter = true; 549 | config.keepFullPinyin = false; 550 | config.keepJoinedFullPinyin = false; 551 | config.keepNoneChinese = true; 552 | config.keepNoneChineseTogether = true; 553 | config.keepOriginal = false; 554 | config.LimitFirstLetterLength = 16; 555 | config.noneChinesePinyinTokenize = true; 556 | config.lowercase = true; 557 | config.ignorePinyinOffset = false; 558 | 559 | 560 | HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s1, config); 561 | 562 | ArrayList<TermItem> re = result.get("刘德华"); 563 | Assert.assertEquals("l", re.get(0).term); 564 | Assert.assertEquals("d", re.get(1).term); 565 | Assert.assertEquals("h", re.get(2).term); 566 | 567 | Assert.assertEquals(0, re.get(0).startOffset); 568 | Assert.assertEquals(1, re.get(1).startOffset); 569 | Assert.assertEquals(2, re.get(2).startOffset); 570 | 571 | Assert.assertEquals(1, re.get(0).endOffset); 572 | Assert.assertEquals(2, re.get(1).endOffset); 573 | Assert.assertEquals(3, re.get(2).endOffset); 574 | } 575 | 576 | @Test 577 | public void TestOnlyLetters() throws IOException { 578 | String[] s1 = new String[]{"ldh"}; 579 | PinyinConfig config = new PinyinConfig(); 580 | config.keepFirstLetter=false; 581 | config.keepSeparateFirstLetter=false; 582 | config.keepFullPinyin=true; 583 | config.keepJoinedFullPinyin =false; 584 | config.keepNoneChinese=true; 585 | config.keepNoneChineseTogether=true; 586 | config.keepOriginal=false; 587 | config.LimitFirstLetterLength=16; 588 | config.noneChinesePinyinTokenize=true; 589 | config.lowercase=true; 590 | config.ignorePinyinOffset = false; 591 | 592 | 593 | HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s1, config); 594 | 595 | ArrayList<TermItem> re = result.get("ldh"); 596 | Assert.assertEquals("l", re.get(0).term); 597 | Assert.assertEquals("d", re.get(1).term); 598 | Assert.assertEquals("h", re.get(2).term); 599 | 600 | Assert.assertEquals(0, re.get(0).startOffset); 601 | Assert.assertEquals(1, re.get(1).startOffset); 602 | Assert.assertEquals(2, re.get(2).startOffset); 603 | 604 | Assert.assertEquals(1, re.get(0).endOffset); 605 | Assert.assertEquals(2, re.get(1).endOffset); 606 | Assert.assertEquals(3, re.get(2).endOffset); 607 | 608 | 609 | s1 = new String[]{"liuldhdehua"}; 610 | config = new PinyinConfig(); 611 | config.keepFirstLetter=false; 612 | config.keepSeparateFirstLetter=false; 613 | config.keepFullPinyin=true; 614 | config.keepJoinedFullPinyin =false; 615 | 
config.keepNoneChinese=true; 616 | config.keepNoneChineseTogether=true; 617 | config.keepOriginal=false; 618 | config.LimitFirstLetterLength=16; 619 | config.noneChinesePinyinTokenize=true; 620 | config.lowercase=true; 621 | config.ignorePinyinOffset = false; 622 | 623 | 624 | result = getStringArrayListHashMap(s1, config); 625 | 626 | re = result.get("liuldhdehua"); 627 | Assert.assertEquals("liu", re.get(0).term); 628 | Assert.assertEquals("l", re.get(1).term); 629 | Assert.assertEquals("d", re.get(2).term); 630 | Assert.assertEquals("h", re.get(3).term); 631 | Assert.assertEquals("de", re.get(4).term); 632 | Assert.assertEquals("hua", re.get(5).term); 633 | 634 | s1 = new String[]{"liuldh"}; 635 | config = new PinyinConfig(); 636 | config.keepFirstLetter=false; 637 | config.keepSeparateFirstLetter=false; 638 | config.keepFullPinyin=true; 639 | config.keepJoinedFullPinyin =false; 640 | config.keepNoneChinese=true; 641 | config.keepNoneChineseTogether=true; 642 | config.keepOriginal=false; 643 | config.LimitFirstLetterLength=16; 644 | config.noneChinesePinyinTokenize=true; 645 | config.lowercase=true; 646 | config.ignorePinyinOffset = false; 647 | 648 | 649 | result = getStringArrayListHashMap(s1, config); 650 | 651 | re = result.get("liuldh"); 652 | Assert.assertEquals("liu", re.get(0).term); 653 | Assert.assertEquals("l", re.get(1).term); 654 | Assert.assertEquals("d", re.get(2).term); 655 | Assert.assertEquals("h", re.get(3).term); 656 | 657 | s1 = new String[]{"ldhdehua"}; 658 | config = new PinyinConfig(); 659 | config.keepFirstLetter=false; 660 | config.keepSeparateFirstLetter=false; 661 | config.keepFullPinyin=true; 662 | config.keepJoinedFullPinyin =false; 663 | config.keepNoneChinese=true; 664 | config.keepNoneChineseTogether=true; 665 | config.keepOriginal=false; 666 | config.LimitFirstLetterLength=16; 667 | config.noneChinesePinyinTokenize=true; 668 | config.lowercase=true; 669 | config.ignorePinyinOffset = false; 670 | 671 | 672 | result = getStringArrayListHashMap(s1, config); 673 | 674 | re = result.get("ldhdehua"); 675 | Assert.assertEquals("l", re.get(0).term); 676 | Assert.assertEquals("d", re.get(1).term); 677 | Assert.assertEquals("h", re.get(2).term); 678 | Assert.assertEquals("de", re.get(3).term); 679 | Assert.assertEquals("hua", re.get(4).term); 680 | 681 | s1 = new String[]{"ldh123dehua"}; 682 | config = new PinyinConfig(); 683 | config.keepFirstLetter=false; 684 | config.keepSeparateFirstLetter=false; 685 | config.keepFullPinyin=true; 686 | config.keepJoinedFullPinyin =false; 687 | config.keepNoneChinese=true; 688 | config.keepNoneChineseTogether=true; 689 | config.keepOriginal=false; 690 | config.LimitFirstLetterLength=16; 691 | config.noneChinesePinyinTokenize=true; 692 | config.lowercase=true; 693 | config.ignorePinyinOffset = false; 694 | 695 | 696 | result = getStringArrayListHashMap(s1, config); 697 | 698 | re = result.get("ldh123dehua"); 699 | Assert.assertEquals("l", re.get(0).term); 700 | Assert.assertEquals("d", re.get(1).term); 701 | Assert.assertEquals("h", re.get(2).term); 702 | Assert.assertEquals("123", re.get(3).term); 703 | Assert.assertEquals("de", re.get(4).term); 704 | Assert.assertEquals("hua", re.get(5).term); 705 | } 706 | 707 | @Test 708 | public void TestOnlyFirstLetterTokenizer() throws IOException { 709 | String[] s = 710 | {"刘德华", "β-氨基酸尿", "DJ音乐家" 711 | }; 712 | 713 | PinyinConfig config = new PinyinConfig(); 714 | config.keepFirstLetter = true; 715 | config.keepNoneChinese = true; 716 | config.keepOriginal = false; 717 | 
config.keepFullPinyin = false; 718 | config.keepNoneChineseTogether = false; 719 | config.ignorePinyinOffset = false; 720 | 721 | 722 | HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config); 723 | 724 | ArrayList<TermItem> re = result.get("刘德华"); 725 | Assert.assertEquals(1, re.size()); 726 | Assert.assertEquals("ldh", re.get(0).term); 727 | 728 | re = result.get("β-氨基酸尿"); 729 | Assert.assertEquals(1, re.size()); 730 | Assert.assertEquals("ajsn", re.get(0).term); 731 | 732 | re = result.get("DJ音乐家"); 733 | Assert.assertEquals(3, re.size()); 734 | Assert.assertEquals("d", re.get(0).term); 735 | Assert.assertEquals("djyyj", re.get(1).term); 736 | Assert.assertEquals("j", re.get(2).term); 737 | 738 | 739 | config = new PinyinConfig(); 740 | config.keepFirstLetter = true; 741 | config.keepNoneChinese = false; 742 | config.keepNoneChineseInFirstLetter = false; 743 | config.keepOriginal = false; 744 | config.keepFullPinyin = false; 745 | config.keepNoneChineseTogether = false; 746 | config.ignorePinyinOffset = false; 747 | 748 | 749 | result = getStringArrayListHashMap(s, config); 750 | 751 | re = result.get("DJ音乐家"); 752 | Assert.assertEquals(1, re.size()); 753 | Assert.assertEquals("yyj", re.get(0).term); 754 | 755 | config = new PinyinConfig(); 756 | config.keepFirstLetter = true; 757 | config.keepNoneChinese=true; 758 | config.keepNoneChineseInFirstLetter = true; 759 | config.keepNoneChineseTogether = true; 760 | config.keepOriginal = false; 761 | config.keepFullPinyin = false; 762 | config.noneChinesePinyinTokenize=false; 763 | config.ignorePinyinOffset = false; 764 | 765 | result = getStringArrayListHashMap(s, config); 766 | 767 | re = result.get("DJ音乐家"); 768 | Assert.assertEquals(2, re.size()); 769 | Assert.assertEquals("dj", re.get(0).term); 770 | Assert.assertEquals("djyyj", re.get(1).term); 771 | 772 | } 773 | 774 | @Test 775 | public void TestFullJoinedPinyin() throws IOException{ 776 | String[] s = 777 | {"DJ音乐家" 778 | }; 779 | PinyinConfig config = new PinyinConfig(); 780 | config.keepFirstLetter = false; 781 | config.keepNoneChineseInFirstLetter = false; 782 | config.keepOriginal = false; 783 | config.keepFullPinyin = false; 784 | config.noneChinesePinyinTokenize=false; 785 | config.keepNoneChinese=true; 786 | config.keepJoinedFullPinyin=true; 787 | config.keepNoneChineseTogether = true; 788 | config.keepNoneChineseInJoinedFullPinyin=true; 789 | config.ignorePinyinOffset = false; 790 | 791 | HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config); 792 | 793 | ArrayList<TermItem> re = result.get("DJ音乐家"); 794 | Assert.assertEquals(1, re.size()); 795 | Assert.assertEquals("djyinyuejia", re.get(0).term); 796 | } 797 | 798 | @Test 799 | public void TestMixedPinyinTokenizer() throws IOException { 800 | String[] s = 801 | { 802 | "刘德华", 803 | "刘de华", 804 | "liude华", 805 | " liude 华"}; 806 | 807 | PinyinConfig config = new PinyinConfig(); 808 | config.keepFirstLetter = true; 809 | config.keepSeparateFirstLetter = true; 810 | config.keepNoneChinese = true; 811 | config.keepOriginal = true; 812 | config.keepFullPinyin = true; 813 | config.keepNoneChineseTogether = true; 814 | config.ignorePinyinOffset = false; 815 | 816 | 817 | HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config); 818 | 819 | ArrayList<TermItem> re = result.get("刘德华"); 820 | Assert.assertEquals(8, re.size()); 821 | Assert.assertEquals("l", re.get(0).term); 822 | Assert.assertEquals(0, re.get(0).startOffset); 823 | Assert.assertEquals(1, re.get(0).endOffset); 824 | Assert.assertEquals("liu", re.get(1).term); 825 | Assert.assertEquals(0, 
re.get(1).startOffset); 826 | Assert.assertEquals(1, re.get(1).endOffset); 827 | 828 | Assert.assertEquals("刘德华", re.get(2).term); 829 | Assert.assertEquals(0, re.get(2).startOffset); 830 | Assert.assertEquals(3, re.get(2).endOffset); 831 | Assert.assertEquals("ldh", re.get(3).term); 832 | Assert.assertEquals(0, re.get(3).startOffset); 833 | Assert.assertEquals(3, re.get(3).endOffset); 834 | 835 | Assert.assertEquals("d", re.get(4).term); 836 | Assert.assertEquals(1, re.get(4).startOffset); 837 | Assert.assertEquals(2, re.get(4).endOffset); 838 | Assert.assertEquals("de", re.get(5).term); 839 | Assert.assertEquals(1, re.get(5).startOffset); 840 | Assert.assertEquals(2, re.get(5).endOffset); 841 | Assert.assertEquals("h", re.get(6).term); 842 | Assert.assertEquals(2, re.get(6).startOffset); 843 | Assert.assertEquals(3, re.get(6).endOffset); 844 | Assert.assertEquals("hua", re.get(7).term); 845 | Assert.assertEquals(2, re.get(7).startOffset); 846 | Assert.assertEquals(3, re.get(7).endOffset); 847 | 848 | } 849 | 850 | @Test 851 | public void TestPinyinTokenizerOffsetWithExtraTerms() throws IOException { 852 | String[] s = 853 | { 854 | "ceshi", 855 | "测shi", 856 | "ce试", 857 | "测试", 858 | "1测shi", 859 | }; 860 | 861 | PinyinConfig config = new PinyinConfig(); 862 | config.keepFirstLetter = false; 863 | config.keepSeparateFirstLetter = false; 864 | config.keepNoneChinese = true; 865 | config.keepOriginal = false; 866 | config.keepFullPinyin = true; 867 | config.keepNoneChineseTogether = true; 868 | config.removeDuplicateTerm = true; 869 | config.fixedPinyinOffset=false; 870 | config.keepJoinedFullPinyin=false; 871 | config.ignorePinyinOffset = false; 872 | 873 | 874 | 875 | HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config); 876 | 877 | ArrayList<TermItem> re; 878 | 879 | re = result.get("ceshi"); 880 | Assert.assertEquals(2, re.size()); 881 | Assert.assertEquals("ce", re.get(0).term); 882 | Assert.assertEquals(0, re.get(0).startOffset); 883 | Assert.assertEquals(2, re.get(0).endOffset); 884 | Assert.assertEquals("shi", re.get(1).term); 885 | Assert.assertEquals(2, re.get(1).startOffset); 886 | Assert.assertEquals(5, re.get(1).endOffset); 887 | 888 | re = result.get("测shi"); 889 | Assert.assertEquals(2, re.size()); 890 | Assert.assertEquals("ce", re.get(0).term); 891 | Assert.assertEquals(0, re.get(0).startOffset); 892 | Assert.assertEquals(1, re.get(0).endOffset); 893 | Assert.assertEquals("shi", re.get(1).term); 894 | Assert.assertEquals(1, re.get(1).startOffset); 895 | Assert.assertEquals(4, re.get(1).endOffset); 896 | 897 | re = result.get("ce试"); 898 | Assert.assertEquals(2, re.size()); 899 | Assert.assertEquals("ce", re.get(0).term); 900 | Assert.assertEquals(0, re.get(0).startOffset); 901 | Assert.assertEquals(2, re.get(0).endOffset); 902 | Assert.assertEquals("shi", re.get(1).term); 903 | Assert.assertEquals(2, re.get(1).startOffset); 904 | Assert.assertEquals(3, re.get(1).endOffset); 905 | 906 | re = result.get("测试"); 907 | Assert.assertEquals(2, re.size()); 908 | Assert.assertEquals("ce", re.get(0).term); 909 | Assert.assertEquals(0, re.get(0).startOffset); 910 | Assert.assertEquals(1, re.get(0).endOffset); 911 | Assert.assertEquals("shi", re.get(1).term); 912 | Assert.assertEquals(1, re.get(1).startOffset); 913 | Assert.assertEquals(2, re.get(1).endOffset); 914 | 915 | re = result.get("1测shi"); 916 | Assert.assertEquals(3, re.size()); 917 | Assert.assertEquals("1", re.get(0).term); 918 | Assert.assertEquals(0, re.get(0).startOffset); 919 | Assert.assertEquals(1, re.get(0).endOffset); 920 
| Assert.assertEquals("ce", re.get(1).term); 921 | Assert.assertEquals(1, re.get(1).startOffset); 922 | Assert.assertEquals(2, re.get(1).endOffset); 923 | Assert.assertEquals("shi", re.get(2).term); 924 | Assert.assertEquals(2, re.get(2).startOffset); 925 | Assert.assertEquals(5, re.get(2).endOffset); 926 | 927 | } 928 | 929 | @Test 930 | public void TestPinyinTokenizerOffset() throws IOException { 931 | String[] s = 932 | { 933 | "ceshi", 934 | "测shi", 935 | "ce试", 936 | "测试", 937 | "1测shi", 938 | }; 939 | 940 | PinyinConfig config = new PinyinConfig(); 941 | config.keepFirstLetter = false; 942 | config.keepSeparateFirstLetter = false; 943 | config.keepNoneChinese = true; 944 | config.keepOriginal = false; 945 | config.keepFullPinyin = true; 946 | config.keepNoneChineseTogether = true; 947 | config.fixedPinyinOffset=false; 948 | config.ignorePinyinOffset = false; 949 | 950 | HashMap> result = getStringArrayListHashMap(s, config); 951 | 952 | ArrayList re; 953 | 954 | re = result.get("ceshi"); 955 | Assert.assertEquals(2, re.size()); 956 | Assert.assertEquals("ce", re.get(0).term); 957 | Assert.assertEquals(0, re.get(0).startOffset); 958 | Assert.assertEquals(2, re.get(0).endOffset); 959 | Assert.assertEquals("shi", re.get(1).term); 960 | Assert.assertEquals(2, re.get(1).startOffset); 961 | Assert.assertEquals(5, re.get(1).endOffset); 962 | 963 | re = result.get("测shi"); 964 | Assert.assertEquals(2, re.size()); 965 | Assert.assertEquals("ce", re.get(0).term); 966 | Assert.assertEquals(0, re.get(0).startOffset); 967 | Assert.assertEquals(1, re.get(0).endOffset); 968 | Assert.assertEquals("shi", re.get(1).term); 969 | Assert.assertEquals(1, re.get(1).startOffset); 970 | Assert.assertEquals(4, re.get(1).endOffset); 971 | 972 | re = result.get("ce试"); 973 | Assert.assertEquals(2, re.size()); 974 | Assert.assertEquals("ce", re.get(0).term); 975 | Assert.assertEquals(0, re.get(0).startOffset); 976 | Assert.assertEquals(2, re.get(0).endOffset); 977 | Assert.assertEquals("shi", re.get(1).term); 978 | Assert.assertEquals(2, re.get(1).startOffset); 979 | Assert.assertEquals(3, re.get(1).endOffset); 980 | 981 | re = result.get("测试"); 982 | Assert.assertEquals(2, re.size()); 983 | Assert.assertEquals("ce", re.get(0).term); 984 | Assert.assertEquals(0, re.get(0).startOffset); 985 | Assert.assertEquals(1, re.get(0).endOffset); 986 | Assert.assertEquals("shi", re.get(1).term); 987 | Assert.assertEquals(1, re.get(1).startOffset); 988 | Assert.assertEquals(2, re.get(1).endOffset); 989 | 990 | re = result.get("1测shi"); 991 | Assert.assertEquals(3, re.size()); 992 | Assert.assertEquals("1", re.get(0).term); 993 | Assert.assertEquals(0, re.get(0).startOffset); 994 | Assert.assertEquals(1, re.get(0).endOffset); 995 | Assert.assertEquals("ce", re.get(1).term); 996 | Assert.assertEquals(1, re.get(1).startOffset); 997 | Assert.assertEquals(2, re.get(1).endOffset); 998 | Assert.assertEquals("shi", re.get(2).term); 999 | Assert.assertEquals(2, re.get(2).startOffset); 1000 | Assert.assertEquals(5, re.get(2).endOffset); 1001 | 1002 | } 1003 | 1004 | @Test 1005 | public void TestPinyinTokenizerFixedOffset() throws IOException { 1006 | String[] s = 1007 | { 1008 | "ceshi", 1009 | "测shi", 1010 | // "ce试", 1011 | "测试", 1012 | "1测shi", 1013 | }; 1014 | 1015 | PinyinConfig config = new PinyinConfig(); 1016 | config.keepFirstLetter = false; 1017 | config.keepSeparateFirstLetter = false; 1018 | config.keepNoneChinese = true; 1019 | config.keepOriginal = false; 1020 | config.keepFullPinyin = true; 1021 | 
config.keepNoneChineseTogether = true; 1022 | config.fixedPinyinOffset=true; 1023 | config.ignorePinyinOffset = false; 1024 | 1025 | 1026 | HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config); 1027 | 1028 | ArrayList<TermItem> re; 1029 | 1030 | re = result.get("ceshi"); 1031 | Assert.assertEquals(2, re.size()); 1032 | Assert.assertEquals("ce", re.get(0).term); 1033 | Assert.assertEquals(0, re.get(0).startOffset); 1034 | Assert.assertEquals(1, re.get(0).endOffset); 1035 | Assert.assertEquals("shi", re.get(1).term); 1036 | Assert.assertEquals(1, re.get(1).startOffset); 1037 | Assert.assertEquals(2, re.get(1).endOffset); 1038 | 1039 | re = result.get("测shi"); 1040 | Assert.assertEquals(2, re.size()); 1041 | Assert.assertEquals("ce", re.get(0).term); 1042 | Assert.assertEquals(0, re.get(0).startOffset); 1043 | Assert.assertEquals(1, re.get(0).endOffset); 1044 | Assert.assertEquals("shi", re.get(1).term); 1045 | Assert.assertEquals(1, re.get(1).startOffset); 1046 | Assert.assertEquals(2, re.get(1).endOffset); 1047 | 1048 | // re = result.get("ce试"); 1049 | // Assert.assertEquals(2, re.size()); 1050 | // Assert.assertEquals("ce", re.get(0).term); 1051 | // Assert.assertEquals(0, re.get(0).startOffset); 1052 | // Assert.assertEquals(1, re.get(0).endOffset); 1053 | // Assert.assertEquals("shi", re.get(1).term); 1054 | // Assert.assertEquals(1, re.get(1).startOffset); 1055 | // Assert.assertEquals(2, re.get(1).endOffset); 1056 | 1057 | re = result.get("测试"); 1058 | Assert.assertEquals(2, re.size()); 1059 | Assert.assertEquals("ce", re.get(0).term); 1060 | Assert.assertEquals(0, re.get(0).startOffset); 1061 | Assert.assertEquals(1, re.get(0).endOffset); 1062 | Assert.assertEquals("shi", re.get(1).term); 1063 | Assert.assertEquals(1, re.get(1).startOffset); 1064 | Assert.assertEquals(2, re.get(1).endOffset); 1065 | 1066 | re = result.get("1测shi"); 1067 | Assert.assertEquals(3, re.size()); 1068 | Assert.assertEquals("1", re.get(0).term); 1069 | Assert.assertEquals(0, re.get(0).startOffset); 1070 | Assert.assertEquals(1, re.get(0).endOffset); 1071 | Assert.assertEquals("ce", re.get(1).term); 1072 | Assert.assertEquals(1, re.get(1).startOffset); 1073 | Assert.assertEquals(2, re.get(1).endOffset); 1074 | Assert.assertEquals("shi", re.get(2).term); 1075 | Assert.assertEquals(2, re.get(2).startOffset); 1076 | Assert.assertEquals(3, re.get(2).endOffset); 1077 | 1078 | } 1079 | 1080 | @Test 1081 | public void TestPinyin() { 1082 | List<String> result = Pinyin.pinyin("德"); 1083 | for (int i = 0; i < result.size(); i++) { 1084 | String s = result.get(i); 1085 | System.out.println(s); 1086 | } 1087 | Assert.assertEquals("de", result.get(0)); 1088 | } 1089 | 1090 | private HashMap<String, ArrayList<TermItem>> getStringArrayListHashMap(String[] s, PinyinConfig config) throws IOException { 1091 | HashMap<String, ArrayList<TermItem>> result = new HashMap<>(); 1092 | for (String value : s) { 1093 | System.out.println("\n" + value); 1094 | StringReader sr = new StringReader(value); 1095 | 1096 | PinyinTokenizer tokenizer = new PinyinTokenizer(config); 1097 | tokenizer.setReader(sr); 1098 | 1099 | tokenizer.reset(); 1100 | 1101 | boolean hasnext = tokenizer.incrementToken(); 1102 | 1103 | int pos=0; 1104 | ArrayList<TermItem> re = new ArrayList<>(); 1105 | while (hasnext) { 1106 | CharTermAttribute ta = tokenizer.getAttribute(CharTermAttribute.class); 1107 | PositionIncrementAttribute position = tokenizer.getAttribute(PositionIncrementAttribute.class); 1108 | OffsetAttribute offset = tokenizer.getAttribute(OffsetAttribute.class); 1109 | pos=pos+position.getPositionIncrement(); 1110 | 
System.out.printf("%s: %d -> %d ,%d\n", ta.toString(), offset.startOffset(), offset.endOffset(),pos); 1111 | re.add(new TermItem(ta.toString(),offset.startOffset(),offset.endOffset(),pos)); 1112 | hasnext = tokenizer.incrementToken(); 1113 | } 1114 | result.put(value, re); 1115 | } 1116 | return result; 1117 | } 1118 | 1119 | @Test 1120 | public void TestPinyinFunction() { 1121 | List result = Pinyin.pinyin("貌美如誮"); 1122 | for (int i = 0; i < result.size(); i++) { 1123 | String s = result.get(i); 1124 | System.out.println(s); 1125 | } 1126 | Assert.assertEquals("mao", result.get(0)); 1127 | Assert.assertEquals("mei", result.get(1)); 1128 | Assert.assertEquals("ru", result.get(2)); 1129 | Assert.assertEquals("hua", result.get(3)); 1130 | } 1131 | 1132 | @Test 1133 | public void TestPinyinTokenize(){ 1134 | String str ="liudehuaalibaba13zhuanghan134"; 1135 | List result = PinyinAlphabetTokenizer.walk(str); 1136 | for (int i = 0; i < result.size(); i++) { 1137 | System.out.println(result.get(i)); 1138 | } 1139 | Assert.assertEquals("liu", result.get(0)); 1140 | Assert.assertEquals("de", result.get(1)); 1141 | Assert.assertEquals("hua", result.get(2)); 1142 | Assert.assertEquals("a", result.get(3)); 1143 | Assert.assertEquals("li", result.get(4)); 1144 | Assert.assertEquals("ba", result.get(5)); 1145 | Assert.assertEquals("ba", result.get(6)); 1146 | Assert.assertEquals("13", result.get(7)); 1147 | Assert.assertEquals("zhuang", result.get(8)); 1148 | Assert.assertEquals("han", result.get(9)); 1149 | Assert.assertEquals("134", result.get(10)); 1150 | 1151 | str ="a123"; 1152 | result = PinyinAlphabetTokenizer.walk(str); 1153 | for (int i = 0; i < result.size(); i++) { 1154 | System.out.println(result.get(i)); 1155 | } 1156 | Assert.assertEquals("a", result.get(0)); 1157 | Assert.assertEquals("123", result.get(1)); 1158 | 1159 | str ="liudehua"; 1160 | result = PinyinAlphabetTokenizer.walk(str); 1161 | for (int i = 0; i < result.size(); i++) { 1162 | System.out.println(result.get(i)); 1163 | } 1164 | Assert.assertEquals("liu", result.get(0)); 1165 | Assert.assertEquals("de", result.get(1)); 1166 | Assert.assertEquals("hua", result.get(2)); 1167 | 1168 | 1169 | str ="ceshi"; 1170 | result = PinyinAlphabetTokenizer.walk(str); 1171 | for (int i = 0; i < result.size(); i++) { 1172 | System.out.println(i+": "+result.get(i)); 1173 | } 1174 | Assert.assertEquals("ce", result.get(0)); 1175 | Assert.assertEquals("shi", result.get(1)); 1176 | } 1177 | 1178 | @Test 1179 | public void TestPinyinPosition1() throws IOException { 1180 | String[] s ={ "刘德华"}; 1181 | 1182 | PinyinConfig config = new PinyinConfig(); 1183 | config.keepFirstLetter = true; 1184 | config.keepSeparateFirstLetter = true; 1185 | config.keepNoneChinese = true; 1186 | config.keepOriginal = true; 1187 | config.keepFullPinyin = true; 1188 | config.keepNoneChineseTogether = true; 1189 | config.ignorePinyinOffset = false; 1190 | 1191 | HashMap> result = getStringArrayListHashMap(s, config); 1192 | 1193 | ArrayList re = result.get("刘德华"); 1194 | Assert.assertEquals("l", re.get(0).term); 1195 | Assert.assertEquals(0, re.get(0).startOffset); 1196 | Assert.assertEquals(1, re.get(0).endOffset); 1197 | Assert.assertEquals(1, re.get(0).position); 1198 | Assert.assertEquals("liu", re.get(1).term); 1199 | Assert.assertEquals(0, re.get(1).startOffset); 1200 | Assert.assertEquals(1, re.get(1).endOffset); 1201 | Assert.assertEquals(1, re.get(1).position); 1202 | 1203 | Assert.assertEquals("刘德华", re.get(2).term); 1204 | Assert.assertEquals(0, 
re.get(2).startOffset); 1205 | Assert.assertEquals(3, re.get(2).endOffset); 1206 | Assert.assertEquals(1, re.get(2).position); 1207 | Assert.assertEquals("ldh", re.get(3).term); 1208 | Assert.assertEquals(0, re.get(3).startOffset); 1209 | Assert.assertEquals(3, re.get(3).endOffset); 1210 | Assert.assertEquals(1, re.get(3).position); 1211 | 1212 | Assert.assertEquals("d", re.get(4).term); 1213 | Assert.assertEquals(1, re.get(4).startOffset); 1214 | Assert.assertEquals(2, re.get(4).endOffset); 1215 | Assert.assertEquals(2, re.get(4).position); 1216 | Assert.assertEquals("de", re.get(5).term); 1217 | Assert.assertEquals(1, re.get(5).startOffset); 1218 | Assert.assertEquals(2, re.get(5).endOffset); 1219 | Assert.assertEquals(2, re.get(5).position); 1220 | Assert.assertEquals("h", re.get(6).term); 1221 | Assert.assertEquals(2, re.get(6).startOffset); 1222 | Assert.assertEquals(3, re.get(6).endOffset); 1223 | Assert.assertEquals(3, re.get(6).position); 1224 | Assert.assertEquals("hua", re.get(7).term); 1225 | Assert.assertEquals(2, re.get(7).startOffset); 1226 | Assert.assertEquals(3, re.get(7).endOffset); 1227 | Assert.assertEquals(3, re.get(7).position); 1228 | } 1229 | 1230 | @Test 1231 | public void TestPinyinPosition2() throws IOException { 1232 | String[] s ={ "l德华"}; 1233 | 1234 | PinyinConfig config = new PinyinConfig(); 1235 | config.keepFirstLetter = true; 1236 | config.keepSeparateFirstLetter = true; 1237 | config.keepNoneChinese = true; 1238 | config.keepOriginal = true; 1239 | config.keepFullPinyin = true; 1240 | config.keepNoneChineseTogether = true; 1241 | config.ignorePinyinOffset = false; 1242 | 1243 | 1244 | HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config); 1245 | 1246 | ArrayList<TermItem> re = result.get("l德华"); 1247 | Assert.assertEquals("l", re.get(0).term); 1248 | Assert.assertEquals(0, re.get(0).startOffset); 1249 | Assert.assertEquals(1, re.get(0).endOffset); 1250 | Assert.assertEquals(1, re.get(0).position); 1251 | 1252 | Assert.assertEquals("l德华", re.get(1).term); 1253 | Assert.assertEquals(0, re.get(1).startOffset); 1254 | Assert.assertEquals(3, re.get(1).endOffset); 1255 | Assert.assertEquals(1, re.get(1).position); 1256 | Assert.assertEquals("ldh", re.get(2).term); 1257 | Assert.assertEquals(0, re.get(2).startOffset); 1258 | Assert.assertEquals(3, re.get(2).endOffset); 1259 | Assert.assertEquals(1, re.get(2).position); 1260 | 1261 | Assert.assertEquals("d", re.get(3).term); 1262 | Assert.assertEquals(1, re.get(3).startOffset); 1263 | Assert.assertEquals(2, re.get(3).endOffset); 1264 | Assert.assertEquals(2, re.get(3).position); 1265 | Assert.assertEquals("de", re.get(4).term); 1266 | Assert.assertEquals(1, re.get(4).startOffset); 1267 | Assert.assertEquals(2, re.get(4).endOffset); 1268 | Assert.assertEquals(2, re.get(4).position); 1269 | Assert.assertEquals("h", re.get(5).term); 1270 | Assert.assertEquals(2, re.get(5).startOffset); 1271 | Assert.assertEquals(3, re.get(5).endOffset); 1272 | Assert.assertEquals(3, re.get(5).position); 1273 | Assert.assertEquals("hua", re.get(6).term); 1274 | Assert.assertEquals(2, re.get(6).startOffset); 1275 | Assert.assertEquals(3, re.get(6).endOffset); 1276 | Assert.assertEquals(3, re.get(6).position); 1277 | } 1278 | 1279 | @Test 1280 | public void TestPinyinPosition3() throws IOException { 1281 | String[] s ={ "liude华","liudehua","ldhua","刘de华","刘dehua","DJ音乐家"}; 1282 | 1283 | PinyinConfig config = new PinyinConfig(); 1284 | config.keepFirstLetter = true; 1285 | config.keepSeparateFirstLetter = true; 1286 | config.keepNoneChinese = 
true; 1287 | config.keepOriginal = true; 1288 | config.keepFullPinyin = true; 1289 | config.keepNoneChineseTogether = true; 1290 | config.ignorePinyinOffset = false; 1291 | 1292 | 1293 | HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config); 1294 | 1295 | ArrayList<TermItem> re = result.get("liude华"); 1296 | Assert.assertEquals("liu", re.get(0).term); 1297 | Assert.assertEquals(0, re.get(0).startOffset); 1298 | Assert.assertEquals(3, re.get(0).endOffset); 1299 | Assert.assertEquals(1, re.get(0).position); 1300 | 1301 | Assert.assertEquals("liude华", re.get(1).term); 1302 | Assert.assertEquals(0, re.get(1).startOffset); 1303 | Assert.assertEquals(6, re.get(1).endOffset); 1304 | Assert.assertEquals(1, re.get(1).position); 1305 | 1306 | Assert.assertEquals("liudeh", re.get(2).term); 1307 | Assert.assertEquals(0, re.get(2).startOffset); 1308 | Assert.assertEquals(6, re.get(2).endOffset); 1309 | Assert.assertEquals(1, re.get(2).position); 1310 | 1311 | Assert.assertEquals("de", re.get(3).term); 1312 | Assert.assertEquals(3, re.get(3).startOffset); 1313 | Assert.assertEquals(5, re.get(3).endOffset); 1314 | Assert.assertEquals(2, re.get(3).position); 1315 | 1316 | 1317 | Assert.assertEquals("h", re.get(4).term); 1318 | Assert.assertEquals(5, re.get(4).startOffset); 1319 | Assert.assertEquals(6, re.get(4).endOffset); 1320 | Assert.assertEquals(3, re.get(4).position); 1321 | 1322 | Assert.assertEquals("hua", re.get(5).term); 1323 | Assert.assertEquals(5, re.get(5).startOffset); 1324 | Assert.assertEquals(6, re.get(5).endOffset); 1325 | Assert.assertEquals(3, re.get(5).position); 1326 | 1327 | } 1328 | 1329 | @Test 1330 | public void TestPinyinPosition4() throws IOException { 1331 | String[] s ={ "medcl"}; 1332 | 1333 | PinyinConfig config = new PinyinConfig(); 1334 | config.keepFirstLetter = true; 1335 | config.keepSeparateFirstLetter = true; 1336 | config.keepNoneChinese = true; 1337 | config.keepOriginal = true; 1338 | config.keepFullPinyin = true; 1339 | config.keepNoneChineseTogether = true; 1340 | config.ignorePinyinOffset = false; 1341 | 1342 | 1343 | HashMap<String, ArrayList<TermItem>> result= getStringArrayListHashMap(s, config); 1344 | 1345 | ArrayList<TermItem> re = result.get("medcl"); 1346 | Assert.assertEquals("me", re.get(0).term); 1347 | Assert.assertEquals(0, re.get(0).startOffset); 1348 | Assert.assertEquals(2, re.get(0).endOffset); 1349 | Assert.assertEquals(1, re.get(0).position); 1350 | 1351 | Assert.assertEquals("medcl", re.get(1).term); 1352 | Assert.assertEquals(0, re.get(1).startOffset); 1353 | Assert.assertEquals(5, re.get(1).endOffset); 1354 | Assert.assertEquals(1, re.get(1).position); 1355 | 1356 | config = new PinyinConfig(); 1357 | config.keepFirstLetter = true; 1358 | config.keepSeparateFirstLetter = true; 1359 | config.keepNoneChinese = true; 1360 | config.keepOriginal = true; 1361 | config.keepFullPinyin = true; 1362 | config.keepNoneChineseTogether = false; 1363 | config.keepJoinedFullPinyin = true; 1364 | config.ignorePinyinOffset = false; 1365 | 1366 | 1367 | result = getStringArrayListHashMap(s, config); 1368 | 1369 | re = result.get("medcl"); 1370 | Assert.assertEquals("m", re.get(0).term); 1371 | Assert.assertEquals(0, re.get(0).startOffset); 1372 | Assert.assertEquals(1, re.get(0).endOffset); 1373 | Assert.assertEquals(1, re.get(0).position); 1374 | 1375 | Assert.assertEquals("medcl", re.get(1).term); 1376 | Assert.assertEquals(0, re.get(1).startOffset); 1377 | Assert.assertEquals(5, re.get(1).endOffset); 1378 | Assert.assertEquals(1, re.get(1).position); 1379 | 1380 | 1381 | 1382 | 
Assert.assertEquals("e", re.get(2).term); 1383 | Assert.assertEquals(1, re.get(2).startOffset); 1384 | Assert.assertEquals(2, re.get(2).endOffset); 1385 | Assert.assertEquals(2, re.get(2).position); 1386 | 1387 | Assert.assertEquals("d", re.get(3).term); 1388 | Assert.assertEquals(2, re.get(3).startOffset); 1389 | Assert.assertEquals(3, re.get(3).endOffset); 1390 | Assert.assertEquals(3, re.get(3).position); 1391 | 1392 | 1393 | } 1394 | } 1395 | --------------------------------------------------------------------------------