├── lib └── nlp-lang-1.7.8.jar ├── .gitignore ├── .travis.yml ├── src ├── test │ ├── resources │ │ └── log4j.properties │ └── java │ │ └── org │ │ └── elasticsearch │ │ └── index │ │ └── analysis │ │ ├── PinyinAlphabetTokenizerTest.java │ │ └── PinyinAnalysisTest.java └── main │ ├── java │ └── org │ │ └── elasticsearch │ │ ├── index │ │ └── analysis │ │ │ ├── ConfigErrorException.java │ │ │ ├── PinyinAnalyzer.java │ │ │ ├── TermItem.java │ │ │ ├── PinyinTokenizerFactory.java │ │ │ ├── MultiplePinyinTokenizerFactory.java │ │ │ ├── PinyinTokenFilterFactory.java │ │ │ ├── MultiplePinyinTokenFilterFactory.java │ │ │ ├── PinyinAnalyzerProvider.java │ │ │ ├── PinyinAbbreviationsTokenizerFactory.java │ │ │ ├── PinyinAlphabetTokenizer.java │ │ │ ├── PinyinTokenFilter.java │ │ │ ├── PinyinTokenizer.java │ │ │ ├── MultiplePinyinTokenFilter.java │ │ │ └── MultiplePinyinTokenizer.java │ │ ├── plugin │ │ └── analysis │ │ │ └── pinyin │ │ │ └── AnalysisPinyinPlugin.java │ │ └── analysis │ │ └── PinyinConfig.java │ ├── assemblies │ └── plugin.xml │ └── resources │ ├── plugin-descriptor.properties │ └── pinyin_alphabet.dict ├── LICENSE.txt ├── README.md └── pom.xml /lib/nlp-lang-1.7.8.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RickyHuo/elasticsearch-analysis-pinyin/HEAD/lib/nlp-lang-1.7.8.jar -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /data 2 | /work 3 | /logs 4 | /.idea 5 | /target 6 | .DS_Store 7 | *.iml 8 | /.project 9 | /.settings 10 | /.classpath 11 | /*.ipr 12 | /*.iws 13 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | language: java 3 | jdk: 4 | - oraclejdk8 5 | install: true 6 | script: 7 | - sudo apt-get update && sudo apt-get install oracle-java8-installer 8 | - java -version 9 | - mvn clean package 10 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, out 2 | 3 | log4j.appender.out=org.apache.log4j.ConsoleAppender 4 | log4j.appender.out.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.out.layout.conversionPattern=[%d{ISO8601}][%-5p][%-25c] %m%n 6 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/ConfigErrorException.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | /** 4 | * Created by medcl on 16/8/22. 
5 | */ 6 | public class ConfigErrorException extends RuntimeException { 7 | private final String message; 8 | 9 | public ConfigErrorException(String message) { 10 | this.message=message; 11 | } 12 | public String getMessage() { 13 | return this.message; 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/PinyinAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.elasticsearch.analysis.PinyinConfig; 5 | import org.elasticsearch.common.settings.Settings; 6 | 7 | /** 8 | * Created by IntelliJ IDEA. 9 | * User: Medcl' 10 | * Date: 12-5-22 11 | * Time: 10:39 AM 12 | */ 13 | public final class PinyinAnalyzer extends Analyzer { 14 | 15 | private PinyinConfig config; 16 | 17 | public PinyinAnalyzer(PinyinConfig config) { 18 | this.config=config; 19 | } 20 | 21 | @Override 22 | protected TokenStreamComponents createComponents(String fieldName) { 23 | return new TokenStreamComponents(new PinyinTokenizer(config)); 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/TermItem.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | /** 4 | * Created by IntelliJ IDEA. 5 | * User: Medcl' 6 | * Date: 12-5-21 7 | * Time: 5:53 PM 8 | */ 9 | 10 | public class TermItem implements Comparable<TermItem> { 11 | String term; 12 | int startOffset; 13 | int endOffset; 14 | int position; 15 | public TermItem(String term,int startOffset,int endOffset,int position){ 16 | this.term=term; 17 | this.startOffset=startOffset; 18 | this.endOffset=endOffset; 19 | this.position=position; 20 | } 21 | 22 | @Override 23 | public String toString() { 24 | return term; 25 | } 26 | 27 | @Override 28 | public int compareTo(TermItem o) { 29 | return this.position-o.position; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/PinyinTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.elasticsearch.analysis.PinyinConfig; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | 9 | public class PinyinTokenizerFactory extends AbstractTokenizerFactory { 10 | 11 | private PinyinConfig config; 12 | 13 | public PinyinTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { 14 | super(indexSettings, name, settings); 15 | config=new PinyinConfig(settings); 16 | } 17 | 18 | @Override 19 | public Tokenizer create() { 20 | return new PinyinTokenizer(config); 21 | } 22 | } 23 | 24 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/MultiplePinyinTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.elasticsearch.analysis.PinyinConfig; 5 | import org.elasticsearch.common.settings.Settings; 6 | import 
org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | 9 | public class MultiplePinyinTokenizerFactory extends AbstractTokenizerFactory { 10 | 11 | private PinyinConfig config; 12 | 13 | public MultiplePinyinTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { 14 | super(indexSettings, name, settings); 15 | config=new PinyinConfig(settings); 16 | } 17 | 18 | @Override 19 | public Tokenizer create() { 20 | return new MultiplePinyinTokenizer(config); 21 | } 22 | } 23 | 24 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/PinyinTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | 4 | import org.apache.lucene.analysis.TokenStream; 5 | import org.elasticsearch.analysis.PinyinConfig; 6 | import org.elasticsearch.common.settings.Settings; 7 | import org.elasticsearch.env.Environment; 8 | import org.elasticsearch.index.IndexSettings; 9 | 10 | public class PinyinTokenFilterFactory extends AbstractTokenFilterFactory { 11 | private PinyinConfig config; 12 | 13 | 14 | public PinyinTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { 15 | super(indexSettings, name, settings); 16 | config=new PinyinConfig(settings); 17 | } 18 | 19 | @Override 20 | public TokenStream create(TokenStream tokenStream) { 21 | return new PinyinTokenFilter(tokenStream, config); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/MultiplePinyinTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | 4 | import org.apache.lucene.analysis.TokenStream; 5 | import org.elasticsearch.analysis.PinyinConfig; 6 | import org.elasticsearch.common.settings.Settings; 7 | import org.elasticsearch.env.Environment; 8 | import org.elasticsearch.index.IndexSettings; 9 | 10 | public class MultiplePinyinTokenFilterFactory extends AbstractTokenFilterFactory { 11 | private PinyinConfig config; 12 | 13 | 14 | public MultiplePinyinTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { 15 | super(indexSettings, name, settings); 16 | config=new PinyinConfig(settings); 17 | } 18 | 19 | @Override 20 | public TokenStream create(TokenStream tokenStream) { 21 | return new MultiplePinyinTokenFilter(tokenStream, config); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/PinyinAnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.elasticsearch.analysis.PinyinConfig; 4 | import org.elasticsearch.common.inject.Inject; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | 9 | /** 10 | */ 11 | public class PinyinAnalyzerProvider extends AbstractIndexAnalyzerProvider<PinyinAnalyzer> { 12 | 13 | private final PinyinAnalyzer analyzer; 14 | private PinyinConfig config; 15 | 16 | @Inject 17 | public PinyinAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { 18 | 
super(indexSettings, name, settings); 19 | config=new PinyinConfig(settings); 20 | analyzer = new PinyinAnalyzer(config); 21 | } 22 | 23 | @Override 24 | public PinyinAnalyzer get() { 25 | return this.analyzer; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/test/java/org/elasticsearch/index/analysis/PinyinAlphabetTokenizerTest.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import java.util.Arrays; 4 | 5 | import org.junit.Assert; 6 | import org.junit.Test; 7 | 8 | /** 9 | * 拼音串切分,很难做到最好,认为取最少切分是最好的 10 | * 11 | * @author shenyanchao 12 | * @since 2018-10-08 12:22 13 | */ 14 | public class PinyinAlphabetTokenizerTest { 15 | 16 | @Test 17 | public void walk() throws Exception { 18 | 19 | Assert.assertEquals(Arrays.asList("xian").toString(), PinyinAlphabetTokenizer.walk("xian").toString()); 20 | Assert.assertEquals(Arrays.asList("wo", "shi", "liang").toString(), 21 | PinyinAlphabetTokenizer.walk("woshiliang").toString()); 22 | 23 | Assert.assertEquals(Arrays.asList("zhong", "hua", "ren", "min", "gong", "he", "guo").toString(), 24 | PinyinAlphabetTokenizer.walk("zhonghuarenmingongheguo").toString()); 25 | Assert.assertEquals( 26 | Arrays.asList("5", "zhong", "hua", "ren", "89", "min", "gong", "he", "guo", "234").toString(), 27 | PinyinAlphabetTokenizer.walk("5zhonghuaren89mingongheguo234").toString()); 28 | } 29 | 30 | } -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/PinyinAbbreviationsTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.elasticsearch.analysis.PinyinConfig; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | 9 | public class PinyinAbbreviationsTokenizerFactory extends AbstractTokenizerFactory { 10 | 11 | public PinyinAbbreviationsTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { 12 | super(indexSettings, name, settings); 13 | } 14 | 15 | @Override 16 | public Tokenizer create() { 17 | PinyinConfig config=new PinyinConfig(); 18 | config.keepFirstLetter=true; 19 | config.keepFullPinyin=false; 20 | config.keepNoneChinese=false; 21 | config.keepNoneChineseTogether=true; 22 | config.noneChinesePinyinTokenize=false; 23 | config.keepOriginal=false; 24 | config.lowercase=true; 25 | config.trimWhitespace=true; 26 | config.keepNoneChineseInFirstLetter=true; 27 | return new PinyinTokenizer(config); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/assemblies/plugin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | plugin 4 | 5 | zip 6 | 7 | false 8 | 9 | 10 | ${project.basedir}/src/main/resources/plugin-descriptor.properties 11 | 12 | true 13 | 14 | 15 | 16 | 17 | / 18 | true 19 | true 20 | 21 | org.elasticsearch:elasticsearch 22 | 23 | 24 | 25 | / 26 | true 27 | true 28 | 29 | org.apache.lucene:lucene-pinyin 30 | 31 | 32 | 33 | 34 | 35 | ${basedir}/lib/ 36 | / 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/plugin/analysis/pinyin/AnalysisPinyinPlugin.java: 
-------------------------------------------------------------------------------- 1 | package org.elasticsearch.plugin.analysis.pinyin; 2 | 3 | 4 | import org.apache.lucene.analysis.Analyzer; 5 | import org.elasticsearch.index.analysis.*; 6 | import org.elasticsearch.indices.analysis.AnalysisModule; 7 | import org.elasticsearch.plugins.AnalysisPlugin; 8 | import org.elasticsearch.plugins.Plugin; 9 | 10 | import java.util.Collections; 11 | import java.util.HashMap; 12 | import java.util.Map; 13 | 14 | 15 | public class AnalysisPinyinPlugin extends Plugin implements AnalysisPlugin { 16 | 17 | @Override 18 | public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() { 19 | Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> extra = new HashMap<>(); 20 | extra.put("pinyin", PinyinTokenizerFactory::new); 21 | extra.put("multiple_pinyin", MultiplePinyinTokenizerFactory::new); 22 | extra.put("pinyin_first_letter", PinyinAbbreviationsTokenizerFactory::new); 23 | return extra; 24 | } 25 | 26 | @Override 27 | public Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> getTokenFilters() { 28 | Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> extra = new HashMap<>(); 29 | extra.put("pinyin", PinyinTokenFilterFactory::new); 30 | extra.put("multiple_pinyin", MultiplePinyinTokenFilterFactory::new); 31 | return extra; 32 | } 33 | 34 | @Override 35 | public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() { 36 | return Collections.singletonMap("pinyin", PinyinAnalyzerProvider::new); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/resources/plugin-descriptor.properties: -------------------------------------------------------------------------------- 1 | # Elasticsearch plugin descriptor file 2 | # This file must exist as 'plugin-descriptor.properties' at 3 | # the root directory of all plugins. 4 | # 5 | # A plugin can be 'site', 'jvm', or both. 6 | # 7 | ### example site plugin for "foo": 8 | # 9 | # foo.zip <-- zip file for the plugin, with this structure: 10 | # _site/ <-- the contents that will be served 11 | # plugin-descriptor.properties <-- example contents below: 12 | # 13 | # site=true 14 | # description=My cool plugin 15 | # version=1.0 16 | # 17 | ### example jvm plugin for "foo" 18 | # 19 | # foo.zip <-- zip file for the plugin, with this structure: 20 | # <arbitrary name1>.jar <-- classes, resources, dependencies 21 | # <arbitrary nameN>.jar <-- any number of jars 22 | # plugin-descriptor.properties <-- example contents below: 23 | # 24 | # jvm=true 25 | # classname=foo.bar.BazPlugin 26 | # description=My cool plugin 27 | # version=2.0.0-rc1 28 | # elasticsearch.version=2.0 29 | # java.version=1.7 30 | # 31 | ### mandatory elements for all plugins: 32 | # 33 | # 'description': simple summary of the plugin 34 | description=${project.description} 35 | # 36 | # 'version': plugin's version 37 | version=${project.version} 38 | # 39 | # 'name': the plugin name 40 | name=${elasticsearch.plugin.name} 41 | 42 | # 43 | # 'classname': the name of the class to load, fully-qualified. 44 | classname=${elasticsearch.plugin.classname} 45 | # 46 | # 'java.version' version of java the code is built against 47 | # use the system property java.specification.version 48 | # version string must be a sequence of nonnegative decimal integers 49 | # separated by "."'s and may have leading zeros 50 | java.version=${maven.compiler.target} 51 | # 52 | # 'elasticsearch.version' version of elasticsearch compiled against 53 | # You will have to release a new version of the plugin for each new 54 | # elasticsearch release. This version is checked when the plugin 55 | # is loaded so Elasticsearch will refuse to start in the presence of 56 | # plugins with the incorrect elasticsearch.version. 
57 | elasticsearch.version=${elasticsearch.version} 58 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/analysis/PinyinConfig.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.analysis; 2 | 3 | import org.elasticsearch.common.settings.Settings; 4 | 5 | /** 6 | * Created by medcl on 15/11/26. 7 | */ 8 | public class PinyinConfig { 9 | 10 | public boolean lowercase=true; 11 | public boolean trimWhitespace=true; 12 | public boolean keepNoneChinese=true; 13 | public boolean keepNoneChineseInFirstLetter =true; 14 | public boolean keepNoneChineseInJoinedFullPinyin =false; 15 | public boolean keepOriginal=false; 16 | public boolean keepFirstLetter=true; 17 | public boolean keepSeparateFirstLetter=false; 18 | public boolean keepNoneChineseTogether=true; 19 | public boolean noneChinesePinyinTokenize =true; 20 | public int LimitFirstLetterLength=16; 21 | public boolean keepFullPinyin=true; 22 | public boolean keepJoinedFullPinyin =false; 23 | public boolean removeDuplicateTerm=false; 24 | public boolean fixedPinyinOffset =false; 25 | // after 6.0, offset is strictly constrained, overlapped tokens are not allowed, with this parameter, overlapped token will allowed by ignore offset, please note, all position related query or highlight will become incorrect, you should use multi fields and specify different settings for different query purpose. if you need offset, please set it to false. default: true. 26 | public boolean ignorePinyinOffset =true; 27 | 28 | public PinyinConfig() {} 29 | public PinyinConfig(Settings settings) { 30 | this.keepFirstLetter=settings.getAsBoolean("keep_first_letter",true); 31 | this.keepSeparateFirstLetter=settings.getAsBoolean("keep_separate_first_letter",false); 32 | this.keepFullPinyin=settings.getAsBoolean("keep_full_pinyin", true); 33 | this.keepJoinedFullPinyin =settings.getAsBoolean("keep_joined_full_pinyin", false); 34 | this.keepNoneChinese=settings.getAsBoolean("keep_none_chinese",true); 35 | this.keepNoneChineseTogether=settings.getAsBoolean("keep_none_chinese_together",true); 36 | this.noneChinesePinyinTokenize =settings.getAsBoolean("none_chinese_pinyin_tokenize",true); 37 | this.keepOriginal=settings.getAsBoolean("keep_original", false); 38 | this.LimitFirstLetterLength=settings.getAsInt("limit_first_letter_length", 16); 39 | this.lowercase=settings.getAsBoolean("lowercase", true); 40 | this.trimWhitespace=settings.getAsBoolean("trim_whitespace", true); 41 | this.keepNoneChineseInFirstLetter =settings.getAsBoolean("keep_none_chinese_in_first_letter", true); 42 | this.keepNoneChineseInJoinedFullPinyin =settings.getAsBoolean("keep_none_chinese_in_joined_full_pinyin", false); 43 | this.removeDuplicateTerm =settings.getAsBoolean("remove_duplicated_term", false); 44 | this.fixedPinyinOffset =settings.getAsBoolean("fixed_pinyin_offset", false); 45 | this.ignorePinyinOffset =settings.getAsBoolean("ignore_pinyin_offset", true); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/resources/pinyin_alphabet.dict: -------------------------------------------------------------------------------- 1 | a 2 | ai 3 | an 4 | ang 5 | ao 6 | b 7 | ba 8 | bai 9 | ban 10 | bang 11 | bao 12 | bei 13 | ben 14 | beng 15 | bi 16 | bian 17 | biao 18 | bie 19 | bin 20 | bing 21 | bo 22 | bu 23 | c 24 | ca 25 | cai 26 | can 27 | cang 28 | cao 29 | ce 30 | cen 31 | ceng 32 | ch 33 | cha 34 | chai 35 | 
chan 36 | chang 37 | chao 38 | che 39 | chen 40 | cheng 41 | chi 42 | chong 43 | chou 44 | chu 45 | chua 46 | chuai 47 | chuan 48 | chuang 49 | chui 50 | chun 51 | chuo 52 | ci 53 | cong 54 | cou 55 | cu 56 | cuan 57 | cui 58 | cun 59 | cuo 60 | d 61 | da 62 | dai 63 | dan 64 | dang 65 | dao 66 | de 67 | dei 68 | den 69 | deng 70 | di 71 | dia 72 | dian 73 | diao 74 | die 75 | ding 76 | diu 77 | dong 78 | dou 79 | du 80 | duan 81 | dui 82 | dun 83 | duo 84 | e 85 | er 86 | f 87 | fa 88 | fan 89 | fang 90 | fei 91 | fen 92 | feng 93 | fiao 94 | fo 95 | fou 96 | fu 97 | g 98 | ga 99 | gai 100 | gan 101 | gang 102 | gao 103 | ge 104 | gei 105 | gen 106 | geng 107 | gong 108 | gou 109 | gu 110 | gua 111 | guai 112 | guan 113 | guang 114 | gui 115 | gun 116 | guo 117 | h 118 | ha 119 | hai 120 | han 121 | hang 122 | hao 123 | he 124 | hei 125 | hen 126 | heng 127 | hong 128 | hou 129 | hu 130 | hua 131 | huai 132 | huan 133 | huang 134 | hui 135 | hun 136 | huo 137 | i 138 | j 139 | ja 140 | ji 141 | jia 142 | jian 143 | jiang 144 | jiao 145 | jie 146 | jin 147 | jing 148 | jiong 149 | jiu 150 | ju 151 | juan 152 | jue 153 | jun 154 | k 155 | ka 156 | kai 157 | kan 158 | kang 159 | kao 160 | ke 161 | kei 162 | ken 163 | keng 164 | kong 165 | kou 166 | ku 167 | kua 168 | kuai 169 | kuan 170 | kuang 171 | kui 172 | kun 173 | kuo 174 | l 175 | la 176 | lai 177 | lan 178 | lang 179 | lao 180 | le 181 | lei 182 | leng 183 | li 184 | lia 185 | lian 186 | liang 187 | liao 188 | lie 189 | lin 190 | ling 191 | liu 192 | lo 193 | long 194 | lou 195 | lu 196 | luan 197 | lun 198 | luo 199 | lv 200 | lve 201 | lü 202 | lüe 203 | m 204 | ma 205 | mai 206 | man 207 | mang 208 | mao 209 | me 210 | mei 211 | men 212 | meng 213 | mi 214 | mian 215 | miao 216 | mie 217 | min 218 | ming 219 | miu 220 | mo 221 | mou 222 | mu 223 | n 224 | na 225 | nai 226 | nan 227 | nang 228 | nao 229 | ne 230 | nei 231 | nen 232 | neng 233 | ni 234 | nian 235 | niang 236 | niao 237 | nie 238 | nin 239 | ning 240 | niu 241 | nong 242 | nou 243 | nu 244 | nuan 245 | nun 246 | nuo 247 | nv 248 | nve 249 | nü 250 | nüe 251 | o 252 | p 253 | pa 254 | pai 255 | pan 256 | pang 257 | pao 258 | pei 259 | pen 260 | peng 261 | pi 262 | pian 263 | piao 264 | pie 265 | pin 266 | ping 267 | po 268 | pou 269 | pu 270 | q 271 | qi 272 | qia 273 | qian 274 | qiang 275 | qiao 276 | qie 277 | qin 278 | qing 279 | qiong 280 | qiu 281 | qu 282 | quan 283 | que 284 | qun 285 | r 286 | ran 287 | rang 288 | rao 289 | re 290 | ren 291 | reng 292 | ri 293 | rong 294 | rou 295 | ru 296 | ruan 297 | rui 298 | run 299 | ruo 300 | s 301 | sa 302 | sai 303 | san 304 | sang 305 | sao 306 | se 307 | sen 308 | seng 309 | sh 310 | sha 311 | shai 312 | shan 313 | shang 314 | shao 315 | she 316 | shei 317 | shen 318 | sheng 319 | shi 320 | shou 321 | shu 322 | shua 323 | shuai 324 | shuan 325 | shuang 326 | shui 327 | shun 328 | shuo 329 | si 330 | song 331 | sou 332 | su 333 | suan 334 | sui 335 | sun 336 | suo 337 | t 338 | ta 339 | tai 340 | tan 341 | tang 342 | tao 343 | te 344 | teng 345 | ti 346 | tian 347 | tiao 348 | tie 349 | ting 350 | tong 351 | tou 352 | tu 353 | tuan 354 | tui 355 | tun 356 | tuo 357 | u 358 | v 359 | w 360 | wa 361 | wai 362 | wan 363 | wang 364 | wei 365 | wen 366 | weng 367 | wo 368 | wu 369 | x 370 | xi 371 | xia 372 | xian 373 | xiang 374 | xiao 375 | xie 376 | xin 377 | xing 378 | xiong 379 | xiu 380 | xu 381 | xuan 382 | xue 383 | xun 384 | y 385 | ya 386 | yai 387 | yan 388 | yang 389 | yao 390 | ye 391 | yi 392 | yin 393 | 
ying 394 | yo 395 | yong 396 | you 397 | yu 398 | yuan 399 | yue 400 | yun 401 | z 402 | za 403 | zai 404 | zan 405 | zang 406 | zao 407 | ze 408 | zei 409 | zen 410 | zeng 411 | zh 412 | zha 413 | zhai 414 | zhan 415 | zhang 416 | zhao 417 | zhe 418 | zhei 419 | zhen 420 | zheng 421 | zhi 422 | zhong 423 | zhou 424 | zhu 425 | zhua 426 | zhuai 427 | zhuan 428 | zhuang 429 | zhui 430 | zhun 431 | zhuo 432 | zi 433 | zong 434 | zou 435 | zu 436 | zuan 437 | zui 438 | zun 439 | zuo 440 | ü 441 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/PinyinAlphabetTokenizer.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.InputStream; 5 | import java.io.InputStreamReader; 6 | import java.util.*; 7 | 8 | /** 9 | * Created by medcl on 16/10/13. 10 | */ 11 | public class PinyinAlphabetTokenizer { 12 | 13 | private static final int PINYIN_MAX_LENGTH = 6; 14 | 15 | public static List walk(String text) { 16 | return segPinyinStr(text); 17 | } 18 | 19 | private static List segPinyinStr(String content) { 20 | String pinyinStr = content; 21 | pinyinStr = pinyinStr.toLowerCase(); 22 | // 按非letter切分 23 | List pinyinStrList = splitByNoletter(pinyinStr); 24 | List pinyinList = new ArrayList<>(); 25 | for (String pinyinText : pinyinStrList) { 26 | if (pinyinText.length() == 1) { 27 | pinyinList.add(pinyinText); 28 | } else { 29 | List forward = positiveMaxMatch(pinyinText, PINYIN_MAX_LENGTH); 30 | if (forward.size() == 1) { // 前向只切出1个的话,没有必要再做逆向分词 31 | pinyinList.addAll(forward); 32 | } else { 33 | // 分别正向、逆向最大匹配,选出最短的作为最优结果 34 | List backward = reverseMaxMatch(pinyinText, PINYIN_MAX_LENGTH); 35 | if (forward.size() <= backward.size()) { 36 | pinyinList.addAll(forward); 37 | } else { 38 | pinyinList.addAll(backward); 39 | } 40 | } 41 | } 42 | } 43 | return pinyinList; 44 | } 45 | 46 | private static List splitByNoletter(String pinyinStr) { 47 | List pinyinStrList = new ArrayList<>(); 48 | StringBuffer sb = new StringBuffer(); 49 | boolean lastWord = true; 50 | for (char c : pinyinStr.toCharArray()) { 51 | if ((c > 96 && c < 123) || (c > 64 && c < 91)) { 52 | if (!lastWord){ 53 | pinyinStrList.add(sb.toString()); 54 | sb.setLength(0); 55 | } 56 | sb.append(c); 57 | lastWord = true; 58 | } else { 59 | if (lastWord & sb.length()>0) { 60 | pinyinStrList.add(sb.toString()); 61 | sb.setLength(0); 62 | } 63 | sb.append(c); 64 | lastWord = false; 65 | } 66 | } 67 | if (sb.length() > 0) { 68 | pinyinStrList.add(sb.toString()); 69 | } 70 | return pinyinStrList; 71 | 72 | } 73 | 74 | private static List positiveMaxMatch(String pinyinText, int maxLength) { 75 | 76 | List pinyinList = new ArrayList<>(); 77 | StringBuffer noMatchBuffer = new StringBuffer(); 78 | for (int start = 0; start < pinyinText.length(); ) { 79 | int end = start + maxLength; 80 | if (end > pinyinText.length()) { 81 | end = pinyinText.length(); 82 | } 83 | if (start == end) { 84 | break; 85 | } 86 | String sixStr = pinyinText.substring(start, end); 87 | boolean match = false; 88 | for (int j = 0; j < sixStr.length(); j++) { 89 | String guess = sixStr.substring(0, sixStr.length() - j); 90 | if (PinyinAlphabetDict.getInstance().match(guess)) { 91 | pinyinList.add(guess); 92 | start += guess.length(); 93 | match = true; 94 | break; 95 | } 96 | } 97 | if (!match) { //没命中,向后移动一位 98 | noMatchBuffer.append(sixStr.substring(0, 1)); 99 | 
start++; 100 | }else { // 命中,加上之前没命中的,并清空 101 | if (noMatchBuffer.length() > 0) { 102 | pinyinList.add(noMatchBuffer.toString()); 103 | noMatchBuffer.setLength(0); 104 | } 105 | } 106 | } 107 | if (noMatchBuffer.length() > 0) { 108 | pinyinList.add(noMatchBuffer.toString()); 109 | noMatchBuffer.setLength(0); 110 | } 111 | 112 | return pinyinList; 113 | } 114 | 115 | private static List reverseMaxMatch(String pinyinText, int maxLength) { 116 | List pinyinList = new ArrayList<>(); 117 | StringBuffer noMatchBuffer = new StringBuffer(); 118 | for (int end = pinyinText.length(); end >= 0; ) { 119 | int start = end - maxLength; 120 | if (start < 0) { 121 | start = 0; 122 | } 123 | if (start == end) { 124 | break; 125 | } 126 | boolean match = false; 127 | String sixStr = pinyinText.substring(start, end); 128 | for (int j = 0; j < sixStr.length(); j++) { 129 | String guess = sixStr.substring(j); 130 | if (PinyinAlphabetDict.getInstance().match(guess)) { 131 | pinyinList.add(guess); 132 | end -= guess.length(); 133 | match = true; 134 | break; 135 | } 136 | } 137 | if (!match) { //一个也没命中 138 | noMatchBuffer.append(sixStr.substring(sixStr.length() - 1)); 139 | end--; 140 | } else { 141 | if (noMatchBuffer.length() > 0) { 142 | pinyinList.add(noMatchBuffer.toString()); 143 | noMatchBuffer.setLength(0); 144 | } 145 | } 146 | } 147 | 148 | if (noMatchBuffer.length() > 0) { 149 | pinyinList.add(noMatchBuffer.toString()); 150 | noMatchBuffer.setLength(0); 151 | } 152 | // reverse 保持切词顺序 153 | Collections.reverse(pinyinList); 154 | return pinyinList; 155 | } 156 | 157 | 158 | } 159 | 160 | class PinyinAlphabetDict { 161 | 162 | private static final String fileName = "/pinyin_alphabet.dict"; 163 | 164 | private Set alphabet = new HashSet(); 165 | 166 | private static PinyinAlphabetDict instance; 167 | 168 | private PinyinAlphabetDict() { 169 | InputStream in = PinyinAlphabetDict.class.getResourceAsStream(fileName); 170 | BufferedReader reader = new BufferedReader(new InputStreamReader(in)); 171 | try { 172 | String line; 173 | while (null != (line = reader.readLine())) { 174 | if (line.trim().length() > 0) { 175 | alphabet.add(line); 176 | } 177 | } 178 | } catch (Exception ex) { 179 | throw new RuntimeException("read pinyin dic error.", ex); 180 | } finally { 181 | try { 182 | reader.close(); 183 | } catch (Exception ignored) { 184 | } 185 | } 186 | } 187 | 188 | public static PinyinAlphabetDict getInstance() { 189 | if (instance == null) { 190 | synchronized (PinyinAlphabetDict.class) { 191 | if (instance == null) { 192 | instance = new PinyinAlphabetDict(); 193 | } 194 | } 195 | } 196 | return instance; 197 | } 198 | 199 | public boolean match(String c) { 200 | return alphabet.contains(c); 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 
15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 
135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 
194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | Pinyin Analysis for Elasticsearch
2 | ==================================
3 | 
4 | This Pinyin Analysis plugin converts Chinese characters to Pinyin. It integrates the NLP-Lang tools (https://github.com/NLPchina/nlp-lang).
5 | 
6 | --------------------------------------------------
7 | | Pinyin Analysis Plugin | Elasticsearch |
8 | --------------------------------------------------
9 | | master | 6.x -> master |
10 | --------------------------------------------------
11 | | 6.3.0 | 6.3.0 |
12 | --------------------------------------------------
13 | | 6.2.4 | 6.2.4 |
14 | --------------------------------------------------
15 | | 6.1.4 | 6.1.4 |
16 | --------------------------------------------------
17 | | 5.6.9 | 5.6.9 |
18 | --------------------------------------------------
19 | | 5.5.3 | 5.5.3 |
20 | --------------------------------------------------
21 | | 5.4.3 | 5.4.3 |
22 | --------------------------------------------------
23 | | 5.3.3 | 5.3.3 |
24 | --------------------------------------------------
25 | | 5.2.2 | 5.2.2 |
26 | --------------------------------------------------
27 | | 5.1.2 | 5.1.2 |
28 | --------------------------------------------------
29 | | 1.8.1 | 2.4.1 |
30 | --------------------------------------------------
31 | | 1.7.5 | 2.3.5 |
32 | --------------------------------------------------
33 | | 1.6.1 | 2.2.1 |
34 | --------------------------------------------------
35 | | 1.5.0 | 2.1.0 |
36 | --------------------------------------------------
37 | | 1.4.0 | 2.0.x |
38 | --------------------------------------------------
39 | | 1.3.0 | 1.6.x |
40 | --------------------------------------------------
41 | | 1.2.2 | 1.0.x |
42 | --------------------------------------------------
43 | 
44 | The plugin includes the analyzer `pinyin`, the tokenizer `pinyin`, and the token filter `pinyin`.
45 | 
46 | **Optional Parameters**
47 | * `keep_first_letter` when this option is enabled, e.g. `刘德华`>`ldh`, default: true
48 | * `keep_separate_first_letter` when this option is enabled, first letters are kept as separate terms, e.g. `刘德华`>`l`,`d`,`h`, default: false. NOTE: query results may become too fuzzy, because single-letter terms occur very frequently
49 | * `limit_first_letter_length` set the max length of the first_letter result, default: 16
50 | * `keep_full_pinyin` when this option is enabled, e.g. `刘德华`> [`liu`,`de`,`hua`], default: true
51 | * `keep_joined_full_pinyin` when this option is enabled, e.g. `刘德华`> [`liudehua`], default: false
52 | * `keep_none_chinese` keep non-Chinese letters and numbers in the result, default: true
53 | * `keep_none_chinese_together` keep non-Chinese letters together, default: true, e.g. `DJ音乐家` -> `DJ`,`yin`,`yue`,`jia`; when set to `false`: `DJ音乐家` -> `D`,`J`,`yin`,`yue`,`jia`. NOTE: `keep_none_chinese` must be enabled first
54 | * `keep_none_chinese_in_first_letter` keep non-Chinese letters in the first-letter result, e.g. `刘德华AT2016`->`ldhat2016`, default: true
55 | * `keep_none_chinese_in_joined_full_pinyin` keep non-Chinese letters in the joined full pinyin, e.g. `刘德华2016`->`liudehua2016`, default: false
56 | * `none_chinese_pinyin_tokenize` break non-Chinese letters into separate pinyin terms when they form valid pinyin, default: true, e.g. `liudehuaalibaba13zhuanghan` -> `liu`,`de`,`hua`,`a`,`li`,`ba`,`ba`,`13`,`zhuang`,`han`. NOTE: `keep_none_chinese` and `keep_none_chinese_together` must be enabled first
57 | * `keep_original` when this option is enabled, the original input is kept as well, default: false
58 | * `lowercase` lowercase non-Chinese letters, default: true
59 | * `trim_whitespace` default: true
60 | * `remove_duplicated_term` when this option is enabled, duplicate terms are removed to save index space, e.g. `de的`>`de`, default: false. NOTE: position-related queries may be affected
61 | * `ignore_pinyin_offset` since 6.0, offsets are strictly checked and overlapping tokens are not allowed; with this parameter, overlapping tokens are allowed by ignoring the offset. Please note that all position-related queries and highlighting will then be incorrect; you should use multi-fields and specify different settings for each query purpose. If you need offsets, set it to false. Default: true.
62 | 
63 | 
64 | 
65 | 1.Create an index with a custom pinyin analyzer
66 | 
 67 | PUT /medcl/ 
 68 | {
 69 |     "index" : {
 70 |         "analysis" : {
 71 |             "analyzer" : {
 72 |                 "pinyin_analyzer" : {
 73 |                     "tokenizer" : "my_pinyin"
 74 |                     }
 75 |             },
 76 |             "tokenizer" : {
 77 |                 "my_pinyin" : {
 78 |                     "type" : "pinyin",
 79 |                     "keep_separate_first_letter" : false,
 80 |                     "keep_full_pinyin" : true,
 81 |                     "keep_original" : true,
 82 |                     "limit_first_letter_length" : 16,
 83 |                     "lowercase" : true,
 84 |                     "remove_duplicated_term" : true
 85 |                 }
 86 |             }
 87 |         }
 88 |     }
 89 | }
 90 | 
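As a further illustration of the parameters listed above, here is a hypothetical variant (the index name `medcl2` and tokenizer name `joined_pinyin` are only examples, not part of the original docs) that keeps only the joined full pinyin, so that `刘德华2016` would be indexed as `liudehua2016`:

PUT /medcl2/
{
    "index" : {
        "analysis" : {
            "analyzer" : {
                "joined_pinyin_analyzer" : {
                    "tokenizer" : "joined_pinyin"
                }
            },
            "tokenizer" : {
                "joined_pinyin" : {
                    "type" : "pinyin",
                    "keep_first_letter" : false,
                    "keep_full_pinyin" : false,
                    "keep_joined_full_pinyin" : true,
                    "keep_none_chinese_in_joined_full_pinyin" : true,
                    "lowercase" : true
                }
            }
        }
    }
}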
91 | 92 | 2.Test the analyzer by analyzing a Chinese name, such as 刘德华 93 |
 94 | GET /medcl/_analyze
 95 | {
 96 |   "text": ["刘德华"],
 97 |   "analyzer": "pinyin_analyzer"
 98 | }
99 |
100 | {
101 |   "tokens" : [
102 |     {
103 |       "token" : "liu",
104 |       "start_offset" : 0,
105 |       "end_offset" : 1,
106 |       "type" : "word",
107 |       "position" : 0
108 |     },
109 |     {
110 |       "token" : "de",
111 |       "start_offset" : 1,
112 |       "end_offset" : 2,
113 |       "type" : "word",
114 |       "position" : 1
115 |     },
116 |     {
117 |       "token" : "hua",
118 |       "start_offset" : 2,
119 |       "end_offset" : 3,
120 |       "type" : "word",
121 |       "position" : 2
122 |     },
123 |     {
124 |       "token" : "刘德华",
125 |       "start_offset" : 0,
126 |       "end_offset" : 3,
127 |       "type" : "word",
128 |       "position" : 3
129 |     },
130 |     {
131 |       "token" : "ldh",
132 |       "start_offset" : 0,
133 |       "end_offset" : 3,
134 |       "type" : "word",
135 |       "position" : 4
136 |     }
137 |   ]
138 | }
139 | 
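The tokens above come from the tokenizer settings defined in step 1. To experiment with the bundled `pinyin` tokenizer and its default settings without creating an index first, a request along these lines should also work (a quick sketch, not from the original docs):

GET /_analyze
{
  "text": ["刘德华"],
  "tokenizer": "pinyin"
}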
140 | 141 | 3.Create a mapping 142 |
143 | POST /medcl/folks/_mapping 
144 | {
145 |     "folks": {
146 |         "properties": {
147 |             "name": {
148 |                 "type": "keyword",
149 |                 "fields": {
150 |                     "pinyin": {
151 |                         "type": "text",
152 |                         "store": false,
153 |                         "term_vector": "with_offsets",
154 |                         "analyzer": "pinyin_analyzer",
155 |                         "boost": 10
156 |                     }
157 |                 }
158 |             }
159 |         }
160 |     }
161 | }
162 | 
163 | 164 | 4.Indexing 165 |
166 | POST /medcl/folks/andy 
167 | {"name":"刘德华"}
168 | 
169 | 170 | 5.Let's search 171 |
172 | curl http://localhost:9200/medcl/folks/_search?q=name:%E5%88%98%E5%BE%B7%E5%8D%8E
173 | curl http://localhost:9200/medcl/folks/_search?q=name.pinyin:%e5%88%98%e5%be%b7
174 | curl http://localhost:9200/medcl/folks/_search?q=name.pinyin:liu
175 | curl http://localhost:9200/medcl/folks/_search?q=name.pinyin:ldh
176 | curl http://localhost:9200/medcl/folks/_search?q=name.pinyin:de+hua
177 | 
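The URI searches above can also be written as request-body queries, for example (a sketch assuming the index and mapping created in the previous steps):

GET /medcl/folks/_search
{
  "query": {
    "match": {
      "name.pinyin": "liu de hua"
    }
  }
}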
178 | 179 | 6.Using the pinyin token filter 180 |
181 | PUT /medcl1/ 
182 | {
183 |     "index" : {
184 |         "analysis" : {
185 |             "analyzer" : {
186 |                 "user_name_analyzer" : {
187 |                     "tokenizer" : "whitespace",
188 |                     "filter" : "pinyin_first_letter_and_full_pinyin_filter"
189 |                 }
190 |             },
191 |             "filter" : {
192 |                 "pinyin_first_letter_and_full_pinyin_filter" : {
193 |                     "type" : "pinyin",
194 |                     "keep_first_letter" : true,
195 |                     "keep_full_pinyin" : false,
196 |                     "keep_none_chinese" : true,
197 |                     "keep_original" : false,
198 |                     "limit_first_letter_length" : 16,
199 |                     "lowercase" : true,
200 |                     "trim_whitespace" : true,
201 |                     "keep_none_chinese_in_first_letter" : true
202 |                 }
203 |             }
204 |         }
205 |     }
206 | }
207 | 
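To use this filter-based analyzer on a field, attach it in a mapping just as in step 3. A minimal sketch (the type name `users` and field name `name` are hypothetical, not part of the original docs):

POST /medcl1/users/_mapping
{
    "users": {
        "properties": {
            "name": {
                "type": "text",
                "analyzer": "user_name_analyzer"
            }
        }
    }
}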
208 | 209 | Token test: 刘德华 张学友 郭富城 黎明 四大天王 210 |
211 | GET /medcl/_analyze
212 | {
213 |   "text": ["刘德华 张学友 郭富城 黎明 四大天王"],
214 |   "analyzer": "user_name_analyzer"
215 | }
216 | 
217 |
218 | {
219 |   "tokens" : [
220 |     {
221 |       "token" : "ldh",
222 |       "start_offset" : 0,
223 |       "end_offset" : 3,
224 |       "type" : "word",
225 |       "position" : 0
226 |     },
227 |     {
228 |       "token" : "zxy",
229 |       "start_offset" : 4,
230 |       "end_offset" : 7,
231 |       "type" : "word",
232 |       "position" : 1
233 |     },
234 |     {
235 |       "token" : "gfc",
236 |       "start_offset" : 8,
237 |       "end_offset" : 11,
238 |       "type" : "word",
239 |       "position" : 2
240 |     },
241 |     {
242 |       "token" : "lm",
243 |       "start_offset" : 12,
244 |       "end_offset" : 14,
245 |       "type" : "word",
246 |       "position" : 3
247 |     },
248 |     {
249 |       "token" : "sdtw",
250 |       "start_offset" : 15,
251 |       "end_offset" : 19,
252 |       "type" : "word",
253 |       "position" : 4
254 |     }
255 |   ]
256 | }
257 | 
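Given a mapping like the sketch above, first-letter searches should then work against `medcl1` as well, e.g. (hypothetical document and query, not from the original docs):

POST /medcl1/users/mike
{"name":"张学友"}

curl http://localhost:9200/medcl1/users/_search?q=name:zxy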
258 | 259 | 260 | 7.Using pinyin in phrase queries 261 | - option 1 262 |
263 |     PUT /medcl/
264 |     {
265 |         "index" : {
266 |             "analysis" : {
267 |                 "analyzer" : {
268 |                     "pinyin_analyzer" : {
269 |                         "tokenizer" : "my_pinyin"
270 |                         }
271 |                 },
272 |                 "tokenizer" : {
273 |                     "my_pinyin" : {
274 |                         "type" : "pinyin",
275 |                         "keep_first_letter":false,
276 |                         "keep_separate_first_letter" : false,
277 |                         "keep_full_pinyin" : true,
278 |                         "keep_original" : false,
279 |                         "limit_first_letter_length" : 16,
280 |                         "lowercase" : true
281 |                     }
282 |                 }
283 |             }
284 |         }
285 |     }
286 |     GET /medcl/folks/_search
287 |     {
288 |       "query": {"match_phrase": {
289 |         "name.pinyin": "刘德华"
290 |       }}
291 |     }
292 | 
293 |     
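    This works because, with first letters disabled and `keep_full_pinyin` enabled, `刘德华` is indexed as the consecutive tokens `liu`,`de`,`hua`, and the phrase query analyzes to the same consecutive sequence. A plain-pinyin phrase should therefore match as well, e.g. (a sketch, not from the original docs):

    GET /medcl/folks/_search
    {
      "query": {"match_phrase": {
        "name.pinyin": "liu de hua"
      }}
    }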
294 | 295 | - option 2 296 |
297 | 
298 |     PUT /medcl/
299 |     {
300 |         "index" : {
301 |             "analysis" : {
302 |                 "analyzer" : {
303 |                     "pinyin_analyzer" : {
304 |                         "tokenizer" : "my_pinyin"
305 |                         }
306 |                 },
307 |                 "tokenizer" : {
308 |                     "my_pinyin" : {
309 |                         "type" : "pinyin",
310 |                         "keep_first_letter":false,
311 |                         "keep_separate_first_letter" : true,
312 |                         "keep_full_pinyin" : false,
313 |                         "keep_original" : false,
314 |                         "limit_first_letter_length" : 16,
315 |                         "lowercase" : true
316 |                     }
317 |                 }
318 |             }
319 |         }
320 |     }
321 | 
322 |     POST /medcl/folks/andy
323 |     {"name":"刘德华"}
324 | 
325 |     GET /medcl/folks/_search
326 |     {
327 |       "query": {"match_phrase": {
328 |         "name.pinyin": "刘德h"
329 |       }}
330 |     }
331 | 
332 |     GET /medcl/folks/_search
333 |     {
334 |       "query": {"match_phrase": {
335 |         "name.pinyin": "刘dh"
336 |       }}
337 |     }
338 | 
339 |     GET /medcl/folks/_search
340 |     {
341 |       "query": {"match_phrase": {
342 |         "name.pinyin": "dh"
343 |       }}
344 |     }
345 | 
346 |     
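    Option 2 indexes each character as its single first letter (`刘德华` -> `l`,`d`,`h` at consecutive positions), which is why mixed queries such as `刘德h` or `刘dh` can still match as phrases. The token stream can be verified with something like (a sketch, not from the original docs):

    GET /medcl/_analyze
    {
      "text": ["刘德h"],
      "analyzer": "pinyin_analyzer"
    }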
347 | 348 | 8.That's all, have fun. 349 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/PinyinTokenFilter.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | *

11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | *

13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import org.apache.lucene.analysis.TokenFilter; 21 | import org.apache.lucene.analysis.TokenStream; 22 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 23 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 24 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 25 | import org.elasticsearch.analysis.PinyinConfig; 26 | import org.nlpcn.commons.lang.pinyin.Pinyin; 27 | 28 | import java.io.IOException; 29 | import java.util.ArrayList; 30 | import java.util.Collections; 31 | import java.util.HashSet; 32 | import java.util.List; 33 | 34 | public class PinyinTokenFilter extends TokenFilter { 35 | 36 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 37 | private boolean done = true; 38 | private boolean processedCandidate = false; 39 | private boolean processedFullPinyinLetter = false; 40 | private boolean processedFirstLetter = false; 41 | private boolean processedOriginal = false; 42 | private boolean processedSortCandidate = false; 43 | protected int position = 0; 44 | protected int lastOffset = 0; 45 | private PinyinConfig config; 46 | List candidate; 47 | private HashSet termsFilter; 48 | 49 | protected int candidateOffset = 0; 50 | StringBuilder firstLetters; 51 | StringBuilder fullPinyinLetters; 52 | String source; 53 | private int lastIncrementPosition = 0; 54 | 55 | private PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class); 56 | 57 | public PinyinTokenFilter(TokenStream in, PinyinConfig config) { 58 | super(in); 59 | this.config = config; 60 | //validate config 61 | if (!(config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin)) { 62 | throw new ConfigErrorException("pinyin config error, can't disable separate_first_letter, first_letter and full_pinyin at the same time."); 63 | } 64 | candidate = new ArrayList<>(); 65 | firstLetters = new StringBuilder(); 66 | termsFilter = new HashSet<>(); 67 | fullPinyinLetters = new StringBuilder(); 68 | } 69 | 70 | //TODO refactor, merge code 71 | @Override 72 | public final boolean incrementToken() throws IOException { 73 | 74 | 75 | if (!done) { 76 | if (readTerm()) return true; 77 | } 78 | 79 | if (done) { 80 | resetVariable(); 81 | if (!input.incrementToken()) { 82 | return false; 83 | } 84 | done = false; 85 | } 86 | readTerm(); 87 | return true; 88 | } 89 | 90 | private boolean readTerm() { 91 | if (!processedCandidate) { 92 | processedCandidate = true; 93 | lastOffset = termAtt.length(); 94 | source = termAtt.toString(); 95 | if (config.trimWhitespace) { 96 | source = source.trim(); 97 | } 98 | 99 | List pinyinList = Pinyin.pinyin(source); 100 | if (pinyinList.size() == 0) return false; 101 | 102 | StringBuilder buff = new StringBuilder(); 103 | int buffStartPosition = 0; 104 | int buffSize = 0; 105 | position = 0; 106 | 107 | for (int i = 0; i < source.length(); i++) { 108 | char c = source.charAt(i); 109 | 110 | //keep original alphabet 111 | if (c < 128) { 112 | if (buff.length() <= 0) { 113 | buffStartPosition = i; 114 | } 115 | if ((c > 96 && c < 123) || (c > 64 && c < 91) || (c > 47 && c < 58)) 
{ 116 | if (config.keepNoneChinese) { 117 | if (config.keepNoneChinese) { 118 | if (config.keepNoneChineseTogether) { 119 | buff.append(c); 120 | buffSize++; 121 | } else { 122 | addCandidate(new TermItem(String.valueOf(c), i, i + 1, buffStartPosition)); 123 | } 124 | } 125 | } 126 | if (config.keepNoneChineseInFirstLetter) { 127 | firstLetters.append(c); 128 | } 129 | if (config.keepNoneChineseInJoinedFullPinyin) { 130 | fullPinyinLetters.append(c); 131 | } 132 | } 133 | } else { 134 | //clean previous temp 135 | if (buff.length() > 0) { 136 | buffSize = parseBuff(buff, buffSize, buffStartPosition); 137 | } 138 | 139 | String pinyin = pinyinList.get(i); 140 | if (pinyin != null && pinyin.length() > 0) { 141 | position++; 142 | firstLetters.append(pinyin.charAt(0)); 143 | if (config.keepSeparateFirstLetter & pinyin.length() > 1) { 144 | addCandidate(new TermItem(String.valueOf(pinyin.charAt(0)), i, i + 1, position)); 145 | } 146 | if (config.keepFullPinyin) { 147 | addCandidate(new TermItem(pinyin, i, i + 1, position)); 148 | } 149 | if (config.keepJoinedFullPinyin) { 150 | fullPinyinLetters.append(pinyin); 151 | } 152 | } 153 | } 154 | 155 | lastOffset = i; 156 | 157 | } 158 | 159 | //clean previous temp 160 | if (buff.length() > 0) { 161 | buffSize = parseBuff(buff, buffSize, buffStartPosition); 162 | } 163 | } 164 | 165 | 166 | if (config.keepOriginal && !processedOriginal) { 167 | processedOriginal = true; 168 | addCandidate(new TermItem(source, 0, source.length(), 1)); 169 | } 170 | 171 | if (config.keepJoinedFullPinyin && !processedFullPinyinLetter && fullPinyinLetters.length() > 0) { 172 | processedFullPinyinLetter = true; 173 | addCandidate(new TermItem(fullPinyinLetters.toString(), 0, source.length(), 1)); 174 | fullPinyinLetters.setLength(0); 175 | } 176 | 177 | 178 | if (config.keepFirstLetter && firstLetters.length() > 0 && !processedFirstLetter) { 179 | processedFirstLetter = true; 180 | String fl; 181 | if (firstLetters.length() > config.LimitFirstLetterLength && config.LimitFirstLetterLength > 0) { 182 | fl = firstLetters.substring(0, config.LimitFirstLetterLength); 183 | } else { 184 | fl = firstLetters.toString(); 185 | } 186 | if (config.lowercase) { 187 | fl = fl.toLowerCase(); 188 | } 189 | if (!(config.keepSeparateFirstLetter && fl.length() <= 1)) { 190 | addCandidate(new TermItem(fl, 0, fl.length(), 1)); 191 | } 192 | } 193 | 194 | if (!processedSortCandidate) { 195 | processedSortCandidate = true; 196 | Collections.sort(candidate); 197 | } 198 | 199 | if (candidateOffset < candidate.size()) { 200 | TermItem item = candidate.get(candidateOffset); 201 | candidateOffset++; 202 | setTerm(item.term, item.startOffset, item.endOffset, item.position); 203 | return true; 204 | } 205 | 206 | done = true; 207 | return false; 208 | } 209 | 210 | 211 | void addCandidate(TermItem item) { 212 | 213 | String term = item.term; 214 | if (config.lowercase) { 215 | term = term.toLowerCase(); 216 | } 217 | 218 | if (config.trimWhitespace) { 219 | term = term.trim(); 220 | } 221 | item.term = term; 222 | 223 | if (term.length() == 0) { 224 | return; 225 | } 226 | 227 | //remove same term with same position 228 | String fr=term+item.position; 229 | 230 | //remove same term, regardless position 231 | if (config.removeDuplicateTerm) { 232 | fr=term; 233 | } 234 | 235 | if (termsFilter.contains(fr)) { 236 | return; 237 | } 238 | termsFilter.add(fr); 239 | 240 | candidate.add(item); 241 | } 242 | 243 | 244 | void setTerm(String term, int startOffset, int endOffset, int position) { 245 | if 
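/* setTerm() copies a candidate into the token attributes. Note that unlike the
   PinyinTokenizer variant, this filter sets no OffsetAttribute: only the term text
   and the position increment (relative to lastIncrementPosition) are emitted. */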
(config.lowercase) { 246 | term = term.toLowerCase(); 247 | } 248 | 249 | if (config.trimWhitespace) { 250 | term = term.trim(); 251 | } 252 | 253 | //ignore empty term 254 | if(term.length()==0){ 255 | return; 256 | } 257 | 258 | termAtt.setEmpty(); 259 | termAtt.append(term); 260 | if (startOffset < 0) { 261 | startOffset = 0; 262 | } 263 | if (endOffset < startOffset) { 264 | endOffset = startOffset + term.length(); 265 | } 266 | 267 | int offset = position - lastIncrementPosition; 268 | if (offset < 0) { 269 | offset = 0; 270 | } 271 | positionAttr.setPositionIncrement(offset); 272 | 273 | lastIncrementPosition = position; 274 | } 275 | 276 | private int parseBuff(StringBuilder buff, int buffSize, int buffPosition) { 277 | if (config.keepNoneChinese) { 278 | if (config.noneChinesePinyinTokenize) { 279 | List result = PinyinAlphabetTokenizer.walk(buff.toString()); 280 | int start = (lastOffset - buffSize + 1); 281 | for (int i = 0; i < result.size(); i++) { 282 | int end; 283 | String t = result.get(i); 284 | if (config.fixedPinyinOffset) { 285 | end = start + 1; 286 | } else { 287 | end = start + t.length(); 288 | } 289 | addCandidate(new TermItem(result.get(i), start, end, ++position)); 290 | start = end; 291 | } 292 | } else if (config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || !config.keepNoneChineseInJoinedFullPinyin) { 293 | addCandidate(new TermItem(buff.toString(), lastOffset - buffSize, lastOffset, ++position)); 294 | } 295 | } 296 | 297 | buff.setLength(0); 298 | buffSize = 0; 299 | return buffSize; 300 | } 301 | 302 | @Override 303 | public final void end() throws IOException { 304 | super.end(); 305 | } 306 | 307 | void resetVariable() { 308 | position = 0; 309 | lastOffset = 0; 310 | candidate.clear(); 311 | this.processedCandidate = false; 312 | this.processedFirstLetter = false; 313 | this.processedFullPinyinLetter = false; 314 | this.processedOriginal = false; 315 | firstLetters.setLength(0); 316 | fullPinyinLetters.setLength(0); 317 | source = null; 318 | candidateOffset = 0; 319 | termsFilter.clear(); 320 | lastIncrementPosition = 0; 321 | } 322 | 323 | @Override 324 | public void reset() throws IOException { 325 | super.reset(); 326 | this.done = true; 327 | resetVariable(); 328 | } 329 | 330 | 331 | } -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | elasticsearch-analysis-pinyin 6 | 4.0.0 7 | org.elasticsearch 8 | elasticsearch-analysis-pinyin 9 | ${elasticsearch.version} 10 | jar 11 | Pinyin Analysis for Elasticsearch 12 | 2012 13 | 14 | 15 | 5.4.2 16 | 1.8 17 | ${project.basedir}/src/main/assemblies/plugin.xml 18 | analysis-pinyin 19 | org.elasticsearch.plugin.analysis.pinyin.AnalysisPinyinPlugin 20 | true 21 | false 22 | true 23 | 4E899B30 24 | true 25 | 26 | 27 | 28 | 29 | The Apache Software License, Version 2.0 30 | http://www.apache.org/licenses/LICENSE-2.0.txt 31 | repo 32 | 33 | 34 | 35 | 36 | 37 | Medcl 38 | medcl@elastic.co 39 | elastic 40 | http://www.elastic.co 41 | 42 | 43 | 44 | 45 | scm:git:git@github.com:medcl/elasticsearch-analysis-pinyin.git 46 | scm:git:git@github.com:medcl/elasticsearch-analysis-pinyin.git 47 | 48 | http://github.com/medcl/elasticsearch-analysis-pinyin 49 | 50 | 51 | 52 | org.sonatype.oss 53 | oss-parent 54 | 9 55 | 56 | 57 | 58 | 59 | oss.sonatype.org 60 | https://oss.sonatype.org/content/repositories/snapshots 61 | 62 | 63 | oss.sonatype.org 64 | 
https://oss.sonatype.org/service/local/staging/deploy/maven2/ 65 | 66 | 67 | 68 | 69 | 70 | oss.sonatype.org 71 | OSS Sonatype 72 | 73 | true 74 | 75 | 76 | true 77 | 78 | http://oss.sonatype.org/content/repositories/releases/ 79 | 80 | 81 | 82 | 83 | 84 | org.nlpcn 85 | nlp-lang 86 | 1.7 87 | ${basedir}/lib/nlp-lang-1.7.8.jar 88 | system 89 | 90 | 91 | 92 | org.elasticsearch 93 | elasticsearch 94 | ${elasticsearch.version} 95 | compile 96 | 97 | 98 | 99 | log4j 100 | log4j 101 | 1.2.16 102 | runtime 103 | 104 | 105 | 106 | org.hamcrest 107 | hamcrest-core 108 | 1.3.RC2 109 | test 110 | 111 | 112 | 113 | org.hamcrest 114 | hamcrest-library 115 | 1.3.RC2 116 | test 117 | 118 | 119 | 120 | org.powermock 121 | powermock-module-junit4 122 | 1.6.2 123 | test 124 | 125 | 126 | 127 | org.powermock 128 | powermock-api-mockito 129 | 1.6.2 130 | test 131 | 132 | 133 | 134 | nl.jqno.equalsverifier 135 | equalsverifier 136 | 1.7.5 137 | test 138 | 139 | 140 | 141 | com.openpojo 142 | openpojo 143 | 0.8.1 144 | test 145 | 146 | 147 | 148 | junit 149 | junit 150 | 4.9 151 | test 152 | 153 | 154 | 155 | 156 | 157 | 158 | org.apache.maven.plugins 159 | maven-compiler-plugin 160 | 3.5.1 161 | 162 | ${maven.compiler.target} 163 | ${maven.compiler.target} 164 | 165 | 166 | 167 | org.apache.maven.plugins 168 | maven-surefire-plugin 169 | 2.19.1 170 | 171 | 172 | org.apache.maven.plugins 173 | maven-source-plugin 174 | 2.1.2 175 | 176 | 177 | attach-sources 178 | 179 | jar 180 | 181 | 182 | 183 | 184 | 185 | maven-assembly-plugin 186 | 2.3 187 | 188 | false 189 | ${project.build.directory}/releases/ 190 | 191 | ${basedir}/src/main/assemblies/plugin.xml 192 | 193 | 194 | 195 | 196 | package 197 | 198 | single 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | disable-java8-doclint 209 | 210 | [1.8,) 211 | 212 | 213 | -Xdoclint:none 214 | 215 | 216 | 217 | release 218 | 219 | 220 | 221 | org.sonatype.plugins 222 | nexus-staging-maven-plugin 223 | 1.6.3 224 | true 225 | 226 | oss 227 | https://oss.sonatype.org/ 228 | true 229 | 230 | 231 | 232 | org.apache.maven.plugins 233 | maven-release-plugin 234 | 2.1 235 | 236 | true 237 | false 238 | release 239 | deploy 240 | 241 | 242 | 243 | org.apache.maven.plugins 244 | maven-compiler-plugin 245 | 3.5.1 246 | 247 | ${maven.compiler.target} 248 | ${maven.compiler.target} 249 | 250 | 251 | 252 | org.apache.maven.plugins 253 | maven-gpg-plugin 254 | 1.5 255 | 256 | 257 | sign-artifacts 258 | verify 259 | 260 | sign 261 | 262 | 263 | 264 | 265 | 266 | org.apache.maven.plugins 267 | maven-source-plugin 268 | 2.2.1 269 | 270 | 271 | attach-sources 272 | 273 | jar-no-fork 274 | 275 | 276 | 277 | 278 | 279 | org.apache.maven.plugins 280 | maven-javadoc-plugin 281 | 2.9 282 | 283 | 284 | attach-javadocs 285 | 286 | jar 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/PinyinTokenizer.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 5 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 6 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 7 | import org.elasticsearch.analysis.PinyinConfig; 8 | import org.nlpcn.commons.lang.pinyin.Pinyin; 9 | 10 | import java.io.IOException; 11 | 
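/*
 * Usage sketch for the tokenizer defined below. Illustrative only: it uses the
 * PinyinConfig fields exercised by this repo's tests, and flags not set here keep
 * whatever defaults PinyinConfig assigns.
 *
 *   PinyinConfig config = new PinyinConfig();
 *   config.keepFirstLetter = true;   // emit the aggregated first letters, e.g. "ldh"
 *   config.keepFullPinyin = true;    // emit "liu", "de", "hua"
 *   Tokenizer tokenizer = new PinyinTokenizer(config);
 *   tokenizer.setReader(new java.io.StringReader("刘德华"));
 *   tokenizer.reset();
 *   CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
 *   while (tokenizer.incrementToken()) {
 *       System.out.println(term.toString());
 *   }
 *   tokenizer.end();
 *   tokenizer.close();
 */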
import java.util.ArrayList; 12 | import java.util.Collections; 13 | import java.util.HashSet; 14 | import java.util.List; 15 | 16 | 17 | public class PinyinTokenizer extends Tokenizer { 18 | 19 | 20 | private static final int DEFAULT_BUFFER_SIZE = 256; 21 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 22 | private boolean done = false; 23 | private boolean processedCandidate = false; 24 | private boolean processedSortCandidate = false; 25 | private boolean processedFirstLetter = false; 26 | private boolean processedFullPinyinLetter = false; 27 | private boolean processedOriginal = false; 28 | protected int position = 0; 29 | protected int lastOffset = 0; 30 | private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); 31 | private PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class); 32 | private PinyinConfig config; 33 | ArrayList candidate; 34 | protected int candidateOffset = 0; //indicate candidates process offset 35 | private HashSet termsFilter; 36 | StringBuilder firstLetters; 37 | StringBuilder fullPinyinLetters; 38 | 39 | private int lastIncrementPosition = 0; 40 | 41 | String source; 42 | 43 | public PinyinTokenizer(PinyinConfig config) { 44 | this(DEFAULT_BUFFER_SIZE); 45 | this.config = config; 46 | 47 | //validate config 48 | if (!(config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin)) { 49 | throw new ConfigErrorException("pinyin config error, can't disable separate_first_letter, first_letter and full_pinyin at the same time."); 50 | } 51 | candidate = new ArrayList<>(); 52 | termsFilter = new HashSet<>(); 53 | firstLetters = new StringBuilder(); 54 | fullPinyinLetters = new StringBuilder(); 55 | } 56 | 57 | public PinyinTokenizer(int bufferSize) { 58 | super(); 59 | termAtt.resizeBuffer(bufferSize); 60 | } 61 | 62 | void addCandidate(TermItem item) { 63 | 64 | String term = item.term; 65 | if (config.lowercase) { 66 | term = term.toLowerCase(); 67 | } 68 | 69 | if (config.trimWhitespace) { 70 | term = term.trim(); 71 | } 72 | item.term = term; 73 | 74 | if (term.length() == 0) { 75 | return; 76 | } 77 | 78 | //remove same term with same position 79 | String fr=term+item.position; 80 | 81 | //remove same term, regardless position 82 | if (config.removeDuplicateTerm) { 83 | fr=term; 84 | } 85 | 86 | if (termsFilter.contains(fr)) { 87 | return; 88 | } 89 | termsFilter.add(fr); 90 | 91 | candidate.add(item); 92 | } 93 | 94 | 95 | void setTerm(String term, int startOffset, int endOffset, int position) { 96 | if (config.lowercase) { 97 | term = term.toLowerCase(); 98 | } 99 | 100 | if (config.trimWhitespace) { 101 | term = term.trim(); 102 | } 103 | 104 | //ignore empty term 105 | if(term.length()==0){ 106 | return; 107 | } 108 | 109 | termAtt.setEmpty(); 110 | termAtt.append(term); 111 | if (startOffset < 0) { 112 | startOffset = 0; 113 | } 114 | if (endOffset < startOffset) { 115 | endOffset = startOffset + term.length(); 116 | } 117 | 118 | if(!config.ignorePinyinOffset){ 119 | offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset)); 120 | } 121 | 122 | int offset = position - lastIncrementPosition; 123 | if (offset < 0) { 124 | offset = 0; 125 | } 126 | positionAttr.setPositionIncrement(offset); 127 | 128 | lastIncrementPosition = position; 129 | } 130 | 131 | @Override 132 | public final boolean incrementToken() throws IOException { 133 | 134 | clearAttributes(); 135 | 136 | if (!done) { 137 | 138 | //combine text 
together to get right pinyin 139 | if (!processedCandidate) { 140 | processedCandidate = true; 141 | int upto = 0; 142 | char[] buffer = termAtt.buffer(); 143 | while (true) { 144 | final int length = input.read(buffer, upto, buffer.length - upto); 145 | if (length == -1) break; 146 | upto += length; 147 | if (upto == buffer.length) 148 | buffer = termAtt.resizeBuffer(1 + buffer.length); 149 | } 150 | termAtt.setLength(upto); 151 | source = termAtt.toString(); 152 | 153 | List pinyinList = Pinyin.pinyin(source); 154 | if (pinyinList.size() == 0) return false; 155 | 156 | StringBuilder buff = new StringBuilder(); 157 | int buffStartPosition = 0; 158 | int buffSize = 0; 159 | 160 | position = 0; 161 | 162 | for (int i = 0; i < source.length(); i++) { 163 | char c = source.charAt(i); 164 | //keep original alphabet 165 | if (c < 128) { 166 | if (buff.length() <= 0) { 167 | buffStartPosition = i+1; 168 | } 169 | if ((c > 96 && c < 123) || (c > 64 && c < 91) || (c > 47 && c < 58)) { 170 | if (config.keepNoneChinese) { 171 | if (config.keepNoneChinese) { 172 | if (config.keepNoneChineseTogether) { 173 | buff.append(c); 174 | buffSize++; 175 | } else { 176 | addCandidate(new TermItem(String.valueOf(c), i, i + 1, buffStartPosition)); 177 | } 178 | } 179 | } 180 | if (config.keepNoneChineseInFirstLetter) { 181 | firstLetters.append(c); 182 | } 183 | if (config.keepNoneChineseInJoinedFullPinyin) { 184 | fullPinyinLetters.append(c); 185 | } 186 | } 187 | } else { 188 | 189 | //clean previous temp 190 | if (buff.length() > 0) { 191 | buffSize = parseBuff(buff, buffSize, buffStartPosition); 192 | } 193 | 194 | String pinyin = pinyinList.get(i); 195 | if (pinyin != null && pinyin.length() > 0) { 196 | position++; 197 | firstLetters.append(pinyin.charAt(0)); 198 | if (config.keepSeparateFirstLetter & pinyin.length() > 1) { 199 | addCandidate(new TermItem(String.valueOf(pinyin.charAt(0)), i, i + 1, position)); 200 | } 201 | if (config.keepFullPinyin) { 202 | addCandidate(new TermItem(pinyin, i, i + 1, position)); 203 | } 204 | if (config.keepJoinedFullPinyin) { 205 | fullPinyinLetters.append(pinyin); 206 | } 207 | } 208 | } 209 | 210 | lastOffset = i; 211 | 212 | } 213 | 214 | //clean previous temp 215 | if (buff.length() > 0) { 216 | buffSize = parseBuff(buff, buffSize, buffStartPosition); 217 | } 218 | } 219 | 220 | if (config.keepOriginal && !processedOriginal) { 221 | processedOriginal = true; 222 | addCandidate(new TermItem(source, 0, source.length(), 1)); 223 | } 224 | 225 | if (config.keepJoinedFullPinyin && !processedFullPinyinLetter && fullPinyinLetters.length() > 0) { 226 | processedFullPinyinLetter = true; 227 | addCandidate(new TermItem(fullPinyinLetters.toString(), 0, source.length(), 1)); 228 | fullPinyinLetters.setLength(0); 229 | } 230 | 231 | 232 | if (config.keepFirstLetter && firstLetters.length() > 0 && !processedFirstLetter) { 233 | processedFirstLetter = true; 234 | String fl; 235 | if (firstLetters.length() > config.LimitFirstLetterLength && config.LimitFirstLetterLength > 0) { 236 | fl = firstLetters.substring(0, config.LimitFirstLetterLength); 237 | } else { 238 | fl = firstLetters.toString(); 239 | } 240 | if (config.lowercase) { 241 | fl = fl.toLowerCase(); 242 | } 243 | if (!(config.keepSeparateFirstLetter && fl.length() <= 1)) { 244 | addCandidate(new TermItem(fl, 0, fl.length(), 1)); 245 | } 246 | } 247 | 248 | if (!processedSortCandidate) { 249 | processedSortCandidate = true; 250 | Collections.sort(candidate); 251 | } 252 | 253 | if (candidateOffset < candidate.size()) { 254 
| TermItem item = candidate.get(candidateOffset); 255 | candidateOffset++; 256 | setTerm(item.term, item.startOffset, item.endOffset, item.position); 257 | return true; 258 | } 259 | 260 | 261 | done = true; 262 | return false; 263 | } 264 | return false; 265 | } 266 | 267 | private int parseBuff(StringBuilder buff, int buffSize, int buffPosition) { 268 | if (config.keepNoneChinese) { 269 | if (config.noneChinesePinyinTokenize) { 270 | List result = PinyinAlphabetTokenizer.walk(buff.toString()); 271 | int start = (lastOffset - buffSize + 1); 272 | for (int i = 0; i < result.size(); i++) { 273 | int end; 274 | String t = result.get(i); 275 | if (config.fixedPinyinOffset) { 276 | end = start + 1; 277 | } else { 278 | end = start + t.length(); 279 | } 280 | addCandidate(new TermItem(result.get(i), start, end, ++position)); 281 | start = end; 282 | } 283 | } else if (config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || !config.keepNoneChineseInJoinedFullPinyin) { 284 | addCandidate(new TermItem(buff.toString(), lastOffset - buffSize, lastOffset, ++position)); 285 | } 286 | } 287 | 288 | buff.setLength(0); 289 | buffSize = 0; 290 | return buffSize; 291 | } 292 | 293 | @Override 294 | public final void end() throws IOException { 295 | super.end(); 296 | } 297 | 298 | @Override 299 | public void reset() throws IOException { 300 | super.reset(); 301 | position = 0; 302 | candidateOffset = 0; 303 | this.done = false; 304 | this.processedCandidate = false; 305 | this.processedFirstLetter = false; 306 | this.processedFullPinyinLetter = false; 307 | this.processedOriginal = false; 308 | firstLetters.setLength(0); 309 | fullPinyinLetters.setLength(0); 310 | termsFilter.clear(); 311 | candidate.clear(); 312 | source = null; 313 | lastIncrementPosition = 0; 314 | } 315 | 316 | 317 | } -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/MultiplePinyinTokenFilter.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | *
13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import org.apache.lucene.analysis.TokenFilter; 21 | import org.apache.lucene.analysis.TokenStream; 22 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 23 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 24 | import org.elasticsearch.analysis.PinyinConfig; 25 | import org.nlpcn.commons.lang.pinyin.Pinyin; 26 | 27 | import java.io.IOException; 28 | import java.util.*; 29 | 30 | public class MultiplePinyinTokenFilter extends TokenFilter { 31 | 32 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 33 | private boolean done = true; 34 | private boolean processedCandidate = false; 35 | private boolean processedFullPinyinLetter = false; 36 | private boolean processedFirstLetter = false; 37 | private boolean processedOriginal = false; 38 | private boolean processedSortCandidate = false; 39 | protected int position = 0; 40 | protected int lastOffset = 0; 41 | private PinyinConfig config; 42 | List candidate; 43 | private HashSet termsFilter; 44 | 45 | protected int candidateOffset = 0; 46 | List firstLetters; 47 | List fullPinyinLetters; 48 | String source; 49 | private int lastIncrementPosition = 0; 50 | 51 | private PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class); 52 | 53 | public MultiplePinyinTokenFilter(TokenStream in, PinyinConfig config) { 54 | super(in); 55 | this.config = config; 56 | //validate config 57 | if (!(config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin)) { 58 | throw new ConfigErrorException("pinyin config error, can't disable separate_first_letter, first_letter and full_pinyin at the same time."); 59 | } 60 | candidate = new ArrayList<>(); 61 | firstLetters = new LinkedList(); 62 | termsFilter = new HashSet<>(); 63 | fullPinyinLetters = new LinkedList(); 64 | } 65 | 66 | //TODO refactor, merge code 67 | @Override 68 | public final boolean incrementToken() throws IOException { 69 | 70 | 71 | if (!done) { 72 | if (readTerm()) return true; 73 | } 74 | 75 | if (done) { 76 | resetVariable(); 77 | if (!input.incrementToken()) { 78 | return false; 79 | } 80 | done = false; 81 | } 82 | readTerm(); 83 | return true; 84 | } 85 | 86 | private boolean readTerm() { 87 | if (!processedCandidate) { 88 | processedCandidate = true; 89 | lastOffset = termAtt.length(); 90 | source = termAtt.toString(); 91 | if (config.trimWhitespace) { 92 | source = source.trim(); 93 | } 94 | 95 | List pinyinList = Pinyin.multiplePinyin(source); 96 | if (pinyinList.size() == 0) return false; 97 | 98 | StringBuilder buff = new StringBuilder(); 99 | int buffStartPosition = 0; 100 | int buffSize = 0; 101 | position = 0; 102 | 103 | for (int i = 0; i < source.length(); i++) { 104 | char c = source.charAt(i); 105 | 106 | //keep original alphabet 107 | if (c < 128) { 108 | if (buff.length() <= 0) { 109 | buffStartPosition = i; 110 | } 111 | if ((c > 96 && c < 123) || (c > 64 && c < 91) || (c > 47 && c < 58)) { 112 | if (config.keepNoneChinese) { 113 | if (config.keepNoneChinese) { 114 | if (config.keepNoneChineseTogether) { 115 | buff.append(c); 116 | buffSize++; 117 | } else { 118 
| addCandidate(new TermItem(String.valueOf(c), i, i + 1, buffStartPosition)); 119 | } 120 | } 121 | } 122 | if (config.keepNoneChineseInFirstLetter) { 123 | if (firstLetters.size() == 0) { 124 | firstLetters.add(new StringBuilder(c+"")); 125 | } else { 126 | for (int j=0; j 0) { 144 | buffSize = parseBuff(buff, buffSize, buffStartPosition); 145 | } 146 | 147 | String pinyin = pinyinList.get(i); 148 | if (pinyin != null && pinyin.length() > 0) { 149 | String[] pingyinList = pinyin.split(" "); 150 | position++; 151 | if (firstLetters.size() == 0) { 152 | if (pingyinList.length > 1) { 153 | for (String py: pingyinList) { 154 | firstLetters.add(new StringBuilder(py.substring(0, 1))); 155 | } 156 | } 157 | else { 158 | firstLetters.add(new StringBuilder(pinyin.substring(0, 1))); 159 | } 160 | } else { 161 | if (pingyinList.length > 1) { 162 | int lettersSize = firstLetters.size(); 163 | for (int j=0; j 1) { 180 | addCandidate(new TermItem(String.valueOf(pinyin.charAt(0)), i, i + 1, position)); 181 | } 182 | if (config.keepFullPinyin) { 183 | addCandidate(new TermItem(pinyin, i, i + 1, position)); 184 | } 185 | if (config.keepJoinedFullPinyin) { 186 | if (fullPinyinLetters.size() == 0) { 187 | if (pingyinList.length > 1) { 188 | for (String py: pingyinList) { 189 | fullPinyinLetters.add(new StringBuilder(py)); 190 | } 191 | } else { 192 | fullPinyinLetters.add(new StringBuilder(pingyinList[0])); 193 | } 194 | } else { 195 | if (pingyinList.length > 1) { 196 | int fullPinyinSize = fullPinyinLetters.size(); 197 | for (int j=0; j 0) { 223 | buffSize = parseBuff(buff, buffSize, buffStartPosition); 224 | } 225 | } 226 | 227 | 228 | if (config.keepOriginal && !processedOriginal) { 229 | processedOriginal = true; 230 | addCandidate(new TermItem(source, 0, source.length(), 1)); 231 | } 232 | 233 | if (config.keepJoinedFullPinyin && !processedFullPinyinLetter && fullPinyinLetters.size() > 0) { 234 | processedFullPinyinLetter = true; 235 | for (StringBuilder fullPinyinLetter: fullPinyinLetters) { 236 | addCandidate(new TermItem(fullPinyinLetter.toString(), 0, source.length(), 1)); 237 | } 238 | fullPinyinLetters.clear(); 239 | } 240 | 241 | 242 | if (config.keepFirstLetter && firstLetters.size() > 0 && !processedFirstLetter) { 243 | processedFirstLetter = true; 244 | for (StringBuilder firstLetter: firstLetters) { 245 | String fl; 246 | if (firstLetter.length() > config.LimitFirstLetterLength && config.LimitFirstLetterLength > 0) { 247 | fl = firstLetter.substring(0, config.LimitFirstLetterLength); 248 | } else { 249 | fl = firstLetter.toString(); 250 | } 251 | if (config.lowercase) { 252 | fl = fl.toLowerCase(); 253 | } 254 | if (!(config.keepSeparateFirstLetter && fl.length() <= 1)) { 255 | addCandidate(new TermItem(fl, 0, fl.length(), 1)); 256 | } 257 | } 258 | } 259 | 260 | if (!processedSortCandidate) { 261 | processedSortCandidate = true; 262 | Collections.sort(candidate); 263 | } 264 | 265 | if (candidateOffset < candidate.size()) { 266 | TermItem item = candidate.get(candidateOffset); 267 | candidateOffset++; 268 | setTerm(item.term, item.startOffset, item.endOffset, item.position); 269 | return true; 270 | } 271 | 272 | done = true; 273 | return false; 274 | } 275 | 276 | 277 | void addCandidate(TermItem item) { 278 | 279 | String term = item.term; 280 | if (config.lowercase) { 281 | term = term.toLowerCase(); 282 | } 283 | 284 | if (config.trimWhitespace) { 285 | term = term.trim(); 286 | } 287 | item.term = term; 288 | 289 | if (term.length() == 0) { 290 | return; 291 | } 292 | 293 | //remove 
same term with same position 294 | String fr=term+item.position; 295 | 296 | //remove same term, regardless position 297 | if (config.removeDuplicateTerm) { 298 | fr=term; 299 | } 300 | 301 | if (termsFilter.contains(fr)) { 302 | return; 303 | } 304 | termsFilter.add(fr); 305 | 306 | candidate.add(item); 307 | } 308 | 309 | 310 | void setTerm(String term, int startOffset, int endOffset, int position) { 311 | if (config.lowercase) { 312 | term = term.toLowerCase(); 313 | } 314 | 315 | if (config.trimWhitespace) { 316 | term = term.trim(); 317 | } 318 | 319 | //ignore empty term 320 | if(term.length()==0){ 321 | return; 322 | } 323 | 324 | termAtt.setEmpty(); 325 | termAtt.append(term); 326 | if (startOffset < 0) { 327 | startOffset = 0; 328 | } 329 | if (endOffset < startOffset) { 330 | endOffset = startOffset + term.length(); 331 | } 332 | 333 | int offset = position - lastIncrementPosition; 334 | if (offset < 0) { 335 | offset = 0; 336 | } 337 | positionAttr.setPositionIncrement(offset); 338 | 339 | lastIncrementPosition = position; 340 | } 341 | 342 | private int parseBuff(StringBuilder buff, int buffSize, int buffPosition) { 343 | if (config.keepNoneChinese) { 344 | if (config.noneChinesePinyinTokenize) { 345 | List result = PinyinAlphabetTokenizer.walk(buff.toString()); 346 | int start = (lastOffset - buffSize + 1); 347 | for (int i = 0; i < result.size(); i++) { 348 | int end; 349 | String t = result.get(i); 350 | if (config.fixedPinyinOffset) { 351 | end = start + 1; 352 | } else { 353 | end = start + t.length(); 354 | } 355 | addCandidate(new TermItem(result.get(i), start, end, ++position)); 356 | start = end; 357 | } 358 | } else if (config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || !config.keepNoneChineseInJoinedFullPinyin) { 359 | addCandidate(new TermItem(buff.toString(), lastOffset - buffSize, lastOffset, ++position)); 360 | } 361 | } 362 | 363 | buff.setLength(0); 364 | buffSize = 0; 365 | return buffSize; 366 | } 367 | 368 | @Override 369 | public final void end() throws IOException { 370 | super.end(); 371 | } 372 | 373 | void resetVariable() { 374 | position = 0; 375 | lastOffset = 0; 376 | candidate.clear(); 377 | this.processedCandidate = false; 378 | this.processedFirstLetter = false; 379 | this.processedFullPinyinLetter = false; 380 | this.processedOriginal = false; 381 | firstLetters.clear(); 382 | fullPinyinLetters.clear(); 383 | source = null; 384 | candidateOffset = 0; 385 | termsFilter.clear(); 386 | lastIncrementPosition = 0; 387 | } 388 | 389 | @Override 390 | public void reset() throws IOException { 391 | super.reset(); 392 | this.done = true; 393 | resetVariable(); 394 | } 395 | 396 | 397 | } 398 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/MultiplePinyinTokenizer.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 5 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 6 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 7 | import org.elasticsearch.analysis.PinyinConfig; 8 | import org.nlpcn.commons.lang.pinyin.Pinyin; 9 | 10 | import java.io.IOException; 11 | import java.util.*; 12 | 13 | 14 | public class MultiplePinyinTokenizer extends Tokenizer { 15 | 16 | 17 | private static final int 
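/*
 * This tokenizer mirrors PinyinTokenizer but calls Pinyin.multiplePinyin(), which
 * returns every candidate reading of a heteronym as one space-separated string per
 * character. The first-letter and joined-full-pinyin accumulators are therefore
 * List<StringBuilder> rather than a single StringBuilder, and each multi-reading
 * character forks every prefix collected so far. Illustrative walk-through, assuming
 * the dictionary yields the two common readings of 重 ("zhong"/"chong"):
 *
 *   after 重:  firstLetters = ["z", "c"]
 *   after 庆:  firstLetters = ["zq", "cq"]   // each existing prefix extended with "q"
 *
 * so "重庆" can be matched by either "zhongqing" or "chongqing".
 */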
DEFAULT_BUFFER_SIZE = 256; 18 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 19 | private boolean done = false; 20 | private boolean processedCandidate = false; 21 | private boolean processedSortCandidate = false; 22 | private boolean processedFirstLetter = false; 23 | private boolean processedFullPinyinLetter = false; 24 | private boolean processedOriginal = false; 25 | protected int position = 0; 26 | protected int lastOffset = 0; 27 | private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); 28 | private PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class); 29 | private PinyinConfig config; 30 | ArrayList candidate; 31 | protected int candidateOffset = 0; //indicate candidates process offset 32 | private HashSet termsFilter; 33 | List firstLetters; 34 | List fullPinyinLetters; 35 | 36 | private int lastIncrementPosition = 0; 37 | 38 | String source; 39 | 40 | public MultiplePinyinTokenizer(PinyinConfig config) { 41 | this(DEFAULT_BUFFER_SIZE); 42 | this.config = config; 43 | 44 | //validate config 45 | if (!(config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin)) { 46 | throw new ConfigErrorException("pinyin config error, can't disable separate_first_letter, first_letter and full_pinyin at the same time."); 47 | } 48 | candidate = new ArrayList<>(); 49 | termsFilter = new HashSet<>(); 50 | firstLetters = new LinkedList(); 51 | fullPinyinLetters = new LinkedList(); 52 | } 53 | 54 | public MultiplePinyinTokenizer(int bufferSize) { 55 | super(); 56 | termAtt.resizeBuffer(bufferSize); 57 | } 58 | 59 | void addCandidate(TermItem item) { 60 | 61 | String term = item.term; 62 | if (config.lowercase) { 63 | term = term.toLowerCase(); 64 | } 65 | 66 | if (config.trimWhitespace) { 67 | term = term.trim(); 68 | } 69 | item.term = term; 70 | 71 | if (term.length() == 0) { 72 | return; 73 | } 74 | 75 | //remove same term with same position 76 | String fr=term+item.position; 77 | 78 | //remove same term, regardless position 79 | if (config.removeDuplicateTerm) { 80 | fr=term; 81 | } 82 | 83 | if (termsFilter.contains(fr)) { 84 | return; 85 | } 86 | termsFilter.add(fr); 87 | 88 | candidate.add(item); 89 | } 90 | 91 | 92 | void setTerm(String term, int startOffset, int endOffset, int position) { 93 | if (config.lowercase) { 94 | term = term.toLowerCase(); 95 | } 96 | 97 | if (config.trimWhitespace) { 98 | term = term.trim(); 99 | } 100 | 101 | //ignore empty term 102 | if(term.length()==0){ 103 | return; 104 | } 105 | 106 | termAtt.setEmpty(); 107 | termAtt.append(term); 108 | if (startOffset < 0) { 109 | startOffset = 0; 110 | } 111 | if (endOffset < startOffset) { 112 | endOffset = startOffset + term.length(); 113 | } 114 | 115 | if(!config.ignorePinyinOffset){ 116 | offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset)); 117 | } 118 | 119 | int offset = position - lastIncrementPosition; 120 | if (offset < 0) { 121 | offset = 0; 122 | } 123 | positionAttr.setPositionIncrement(offset); 124 | 125 | lastIncrementPosition = position; 126 | } 127 | 128 | @Override 129 | public final boolean incrementToken() throws IOException { 130 | 131 | clearAttributes(); 132 | 133 | if (!done) { 134 | 135 | //combine text together to get right pinyin 136 | if (!processedCandidate) { 137 | processedCandidate = true; 138 | int upto = 0; 139 | char[] buffer = termAtt.buffer(); 140 | while (true) { 141 | final int length = input.read(buffer, upto, 
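/* Drain the reader fully before tokenizing: pinyin conversion needs the complete
   text, and termAtt.resizeBuffer() oversizes internally, so asking for length+1
   does not reallocate on every pass of this loop. */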
buffer.length - upto); 142 | if (length == -1) break; 143 | upto += length; 144 | if (upto == buffer.length) 145 | buffer = termAtt.resizeBuffer(1 + buffer.length); 146 | } 147 | termAtt.setLength(upto); 148 | source = termAtt.toString(); 149 | 150 | List pinyinList = Pinyin.multiplePinyin(source); 151 | if (pinyinList.size() == 0) return false; 152 | 153 | StringBuilder buff = new StringBuilder(); 154 | int buffStartPosition = 0; 155 | int buffSize = 0; 156 | 157 | position = 0; 158 | 159 | for (int i = 0; i < source.length(); i++) { 160 | char c = source.charAt(i); 161 | //keep original alphabet 162 | if (c < 128) { 163 | if (buff.length() <= 0) { 164 | buffStartPosition = i+1; 165 | } 166 | if ((c > 96 && c < 123) || (c > 64 && c < 91) || (c > 47 && c < 58)) { 167 | if (config.keepNoneChinese) { 168 | if (config.keepNoneChinese) { 169 | if (config.keepNoneChineseTogether) { 170 | buff.append(c); 171 | buffSize++; 172 | } else { 173 | addCandidate(new TermItem(String.valueOf(c), i, i + 1, buffStartPosition)); 174 | } 175 | } 176 | } 177 | if (config.keepNoneChineseInFirstLetter) { 178 | if (firstLetters.size() == 0) { 179 | firstLetters.add(new StringBuilder(c+"")); 180 | } else { 181 | for (int j=0; j 0) { 201 | buffSize = parseBuff(buff, buffSize, buffStartPosition); 202 | } 203 | 204 | String pinyin = pinyinList.get(i); 205 | if (pinyin != null && pinyin.length() > 0) { 206 | String[] pingyinList = pinyin.split(" "); 207 | position++; 208 | if (firstLetters.size() == 0) { 209 | if (pingyinList.length > 1) { 210 | for (String py: pingyinList) { 211 | firstLetters.add(new StringBuilder(py.substring(0, 1))); 212 | } 213 | } 214 | else { 215 | firstLetters.add(new StringBuilder(pinyin.substring(0, 1))); 216 | } 217 | } else { 218 | if (pingyinList.length > 1) { 219 | int lettersSize = firstLetters.size(); 220 | for (int j=0; j 1) { 237 | addCandidate(new TermItem(String.valueOf(pinyin.charAt(0)), i, i + 1, position)); 238 | } 239 | if (config.keepFullPinyin) { 240 | addCandidate(new TermItem(pinyin, i, i + 1, position)); 241 | } 242 | if (config.keepJoinedFullPinyin) { 243 | if (fullPinyinLetters.size() == 0) { 244 | if (pingyinList.length > 1) { 245 | for (String py: pingyinList) { 246 | fullPinyinLetters.add(new StringBuilder(py)); 247 | } 248 | } else { 249 | fullPinyinLetters.add(new StringBuilder(pingyinList[0])); 250 | } 251 | } else { 252 | if (pingyinList.length > 1) { 253 | int fullPinyinSize = fullPinyinLetters.size(); 254 | for (int j=0; j 0) { 280 | buffSize = parseBuff(buff, buffSize, buffStartPosition); 281 | } 282 | } 283 | 284 | if (config.keepOriginal && !processedOriginal) { 285 | processedOriginal = true; 286 | addCandidate(new TermItem(source, 0, source.length(), 1)); 287 | } 288 | 289 | if (config.keepJoinedFullPinyin && !processedFullPinyinLetter && fullPinyinLetters.size() > 0) { 290 | processedFullPinyinLetter = true; 291 | for (StringBuilder fullPinyinLetter: fullPinyinLetters) { 292 | addCandidate(new TermItem(fullPinyinLetter.toString(), 0, source.length(), 1)); 293 | } 294 | fullPinyinLetters.clear(); 295 | } 296 | 297 | 298 | if (config.keepFirstLetter && firstLetters.size() > 0 && !processedFirstLetter) { 299 | processedFirstLetter = true; 300 | 301 | for (StringBuilder firstLetter: firstLetters) { 302 | String fl; 303 | if (firstLetter.length() > config.LimitFirstLetterLength && config.LimitFirstLetterLength > 0) { 304 | fl = firstLetter.substring(0, config.LimitFirstLetterLength); 305 | } else { 306 | fl = firstLetter.toString(); 307 | } 308 | if 
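/* Each expanded first-letter variant is truncated to LimitFirstLetterLength,
   case-normalized below, and emitted as its own candidate spanning the whole
   input (offsets 0..length, position 1). */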
(config.lowercase) { 309 | fl = fl.toLowerCase(); 310 | } 311 | if (!(config.keepSeparateFirstLetter && fl.length() <= 1)) { 312 | addCandidate(new TermItem(fl, 0, fl.length(), 1)); 313 | } 314 | } 315 | } 316 | 317 | if (!processedSortCandidate) { 318 | processedSortCandidate = true; 319 | Collections.sort(candidate); 320 | } 321 | 322 | if (candidateOffset < candidate.size()) { 323 | TermItem item = candidate.get(candidateOffset); 324 | candidateOffset++; 325 | setTerm(item.term, item.startOffset, item.endOffset, item.position); 326 | return true; 327 | } 328 | 329 | 330 | done = true; 331 | return false; 332 | } 333 | return false; 334 | } 335 | 336 | private int parseBuff(StringBuilder buff, int buffSize, int buffPosition) { 337 | if (config.keepNoneChinese) { 338 | if (config.noneChinesePinyinTokenize) { 339 | List result = PinyinAlphabetTokenizer.walk(buff.toString()); 340 | int start = (lastOffset - buffSize + 1); 341 | for (int i = 0; i < result.size(); i++) { 342 | int end; 343 | String t = result.get(i); 344 | if (config.fixedPinyinOffset) { 345 | end = start + 1; 346 | } else { 347 | end = start + t.length(); 348 | } 349 | addCandidate(new TermItem(result.get(i), start, end, ++position)); 350 | start = end; 351 | } 352 | } else if (config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || !config.keepNoneChineseInJoinedFullPinyin) { 353 | addCandidate(new TermItem(buff.toString(), lastOffset - buffSize, lastOffset, ++position)); 354 | } 355 | } 356 | 357 | buff.setLength(0); 358 | buffSize = 0; 359 | return buffSize; 360 | } 361 | 362 | @Override 363 | public final void end() throws IOException { 364 | super.end(); 365 | } 366 | 367 | @Override 368 | public void reset() throws IOException { 369 | super.reset(); 370 | position = 0; 371 | candidateOffset = 0; 372 | this.done = false; 373 | this.processedCandidate = false; 374 | this.processedFirstLetter = false; 375 | this.processedFullPinyinLetter = false; 376 | this.processedOriginal = false; 377 | firstLetters.clear(); 378 | fullPinyinLetters.clear(); 379 | termsFilter.clear(); 380 | candidate.clear(); 381 | source = null; 382 | lastIncrementPosition = 0; 383 | } 384 | 385 | 386 | } 387 | -------------------------------------------------------------------------------- /src/test/java/org/elasticsearch/index/analysis/PinyinAnalysisTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to ElasticSearch and Shay Banon under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. ElasticSearch licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | package org.elasticsearch.index.analysis; 21 | 22 | import junit.framework.Assert; 23 | import org.apache.lucene.analysis.Analyzer; 24 | import org.apache.lucene.analysis.core.KeywordAnalyzer; 25 | import org.apache.lucene.analysis.core.WhitespaceAnalyzer; 26 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 27 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 28 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 29 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 30 | import org.elasticsearch.analysis.PinyinConfig; 31 | import org.junit.Test; 32 | import org.nlpcn.commons.lang.pinyin.Pinyin; 33 | 34 | import java.io.IOException; 35 | import java.io.StringReader; 36 | import java.util.ArrayList; 37 | import java.util.HashMap; 38 | import java.util.List; 39 | 40 | /** 41 | */ 42 | 43 | public class PinyinAnalysisTest { 44 | 45 | 46 | @Test 47 | public void testTokenFilter() throws IOException { 48 | PinyinConfig config = new PinyinConfig(); 49 | config.keepFirstLetter = true; 50 | config.keepNoneChinese = true; 51 | config.keepOriginal = false; 52 | config.keepFullPinyin = false; 53 | config.ignorePinyinOffset = false; 54 | 55 | 56 | StringReader sr = new StringReader("刘德华"); 57 | Analyzer analyzer = new StandardAnalyzer(); 58 | PinyinTokenFilter filter = new PinyinTokenFilter(analyzer.tokenStream("f", sr), config); 59 | List pinyin = new ArrayList(); 60 | filter.reset(); 61 | System.out.println(); 62 | while (filter.incrementToken()) { 63 | CharTermAttribute ta = filter.getAttribute(CharTermAttribute.class); 64 | pinyin.add(ta.toString()); 65 | System.out.println(ta.toString()); 66 | } 67 | 68 | Assert.assertEquals(3, pinyin.size()); 69 | Assert.assertEquals("l", pinyin.get(0)); 70 | Assert.assertEquals("d", pinyin.get(1)); 71 | Assert.assertEquals("h", pinyin.get(2)); 72 | 73 | sr = new StringReader("刘德华"); 74 | analyzer = new KeywordAnalyzer(); 75 | filter = new PinyinTokenFilter(analyzer.tokenStream("f", sr), config); 76 | pinyin.clear(); 77 | filter.reset(); 78 | System.out.println(); 79 | while (filter.incrementToken()) { 80 | CharTermAttribute ta = filter.getAttribute(CharTermAttribute.class); 81 | pinyin.add(ta.toString()); 82 | System.out.println(ta.toString()); 83 | } 84 | Assert.assertEquals(1, pinyin.size()); 85 | Assert.assertEquals("ldh", pinyin.get(0)); 86 | 87 | 88 | config = new PinyinConfig(); 89 | config.keepFirstLetter = false; 90 | config.keepNoneChinese = true; 91 | config.keepOriginal = false; 92 | config.keepFullPinyin = true; 93 | config.ignorePinyinOffset = false; 94 | 95 | 96 | sr = new StringReader("刘德华"); 97 | analyzer = new StandardAnalyzer(); 98 | filter = new PinyinTokenFilter(analyzer.tokenStream("f", sr), config); 99 | pinyin = new ArrayList(); 100 | filter.reset(); 101 | System.out.println(); 102 | while (filter.incrementToken()) { 103 | CharTermAttribute ta = filter.getAttribute(CharTermAttribute.class); 104 | pinyin.add(ta.toString()); 105 | System.out.println(ta.toString()); 106 | } 107 | Assert.assertEquals(3, pinyin.size()); 108 | Assert.assertEquals("liu", pinyin.get(0)); 109 | Assert.assertEquals("de", pinyin.get(1)); 110 | Assert.assertEquals("hua", pinyin.get(2)); 111 | 112 | 113 | config = new PinyinConfig(); 114 | config.keepFirstLetter = true; 115 | config.keepNoneChinese = true; 116 | config.keepOriginal = true; 117 | config.keepFullPinyin = true; 118 | config.ignorePinyinOffset = false; 119 | 120 | 121 | sr = new StringReader("刘德华"); 122 | 
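/* keep_original now applies per incoming token: StandardAnalyzer pre-splits the CJK
   text into single-character tokens, so each of 刘/德/华 contributes its full pinyin,
   the original character and its first letter, giving the nine terms asserted below. */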
analyzer = new StandardAnalyzer(); 123 | filter = new PinyinTokenFilter(analyzer.tokenStream("f", sr), config); 124 | pinyin = new ArrayList(); 125 | filter.reset(); 126 | System.out.println(); 127 | while (filter.incrementToken()) { 128 | CharTermAttribute ta = filter.getAttribute(CharTermAttribute.class); 129 | pinyin.add(ta.toString()); 130 | System.out.println(ta.toString()); 131 | } 132 | 133 | Assert.assertEquals(9, pinyin.size()); 134 | Assert.assertEquals("liu", pinyin.get(0)); 135 | Assert.assertEquals("刘", pinyin.get(1)); 136 | Assert.assertEquals("l", pinyin.get(2)); 137 | Assert.assertEquals("de", pinyin.get(3)); 138 | Assert.assertEquals("德", pinyin.get(4)); 139 | Assert.assertEquals("d", pinyin.get(5)); 140 | Assert.assertEquals("hua", pinyin.get(6)); 141 | Assert.assertEquals("华", pinyin.get(7)); 142 | Assert.assertEquals("h", pinyin.get(8)); 143 | 144 | 145 | config = new PinyinConfig(); 146 | config.keepFirstLetter = true; 147 | config.keepNoneChinese = true; 148 | config.keepOriginal = true; 149 | config.keepFullPinyin = true; 150 | config.ignorePinyinOffset = false; 151 | 152 | 153 | sr = new StringReader("刘德华"); 154 | analyzer = new KeywordAnalyzer(); 155 | filter = new PinyinTokenFilter(analyzer.tokenStream("f", sr), config); 156 | pinyin = new ArrayList(); 157 | filter.reset(); 158 | System.out.println(); 159 | while (filter.incrementToken()) { 160 | CharTermAttribute ta = filter.getAttribute(CharTermAttribute.class); 161 | pinyin.add(ta.toString()); 162 | System.out.println(ta.toString()); 163 | } 164 | 165 | Assert.assertEquals(5, pinyin.size()); 166 | Assert.assertEquals("liu", pinyin.get(0)); 167 | Assert.assertEquals("刘德华", pinyin.get(1)); 168 | Assert.assertEquals("ldh", pinyin.get(2)); 169 | Assert.assertEquals("de", pinyin.get(3)); 170 | Assert.assertEquals("hua", pinyin.get(4)); 171 | 172 | 173 | 174 | config = new PinyinConfig(); 175 | config.keepFirstLetter = true; 176 | config.keepNoneChinese = false; 177 | config.keepNoneChineseInFirstLetter = true; 178 | config.keepOriginal = false; 179 | config.keepFullPinyin = false; 180 | config.LimitFirstLetterLength = 5; 181 | config.lowercase = true; 182 | config.ignorePinyinOffset = false; 183 | 184 | 185 | sr = new StringReader("Go的数组是纯粹的值类型,传递一个[N]T的代价是N个T"); 186 | analyzer = new KeywordAnalyzer(); 187 | filter = new PinyinTokenFilter(analyzer.tokenStream("f", sr), config); 188 | pinyin = new ArrayList(); 189 | filter.reset(); 190 | System.out.println(); 191 | while (filter.incrementToken()) { 192 | CharTermAttribute ta = filter.getAttribute(CharTermAttribute.class); 193 | pinyin.add(ta.toString()); 194 | System.out.println(ta.toString()); 195 | } 196 | 197 | Assert.assertEquals(1, pinyin.size()); 198 | Assert.assertEquals("godsz", pinyin.get(0)); 199 | 200 | 201 | config = new PinyinConfig(); 202 | config.keepFirstLetter = true; 203 | config.keepSeparateFirstLetter = true; 204 | config.keepNoneChinese = true; 205 | config.keepNoneChineseInFirstLetter = false; 206 | config.keepOriginal = false; 207 | config.keepFullPinyin = true; 208 | config.LimitFirstLetterLength = 5; 209 | config.lowercase = true; 210 | config.ignorePinyinOffset = false; 211 | 212 | 213 | sr = new StringReader("liu德hua 名字"); 214 | analyzer = new WhitespaceAnalyzer(); 215 | filter = new PinyinTokenFilter(analyzer.tokenStream("f", sr), config); 216 | filter.reset(); 217 | System.out.println(); 218 | pinyin = getTokenFilterResult(filter); 219 | 220 | Assert.assertEquals(9, pinyin.size()); 221 | Assert.assertEquals("liu", pinyin.get(0)); 222 | 
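/* Mixed input "liu德hua 名字" with none-Chinese kept but excluded from first letters:
   the Latin runs are pinyin-tokenized ("liu", "hua"), 德 yields "d"/"de", and the
   second whitespace token 名字 yields "m"/"ming" and "z"/"zi" plus the summary "mz". */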
Assert.assertEquals("d", pinyin.get(1)); 223 | Assert.assertEquals("de", pinyin.get(2)); 224 | Assert.assertEquals("hua", pinyin.get(3)); 225 | Assert.assertEquals("m", pinyin.get(4)); 226 | Assert.assertEquals("ming", pinyin.get(5)); 227 | Assert.assertEquals("z", pinyin.get(6)); 228 | Assert.assertEquals("zi", pinyin.get(7)); 229 | Assert.assertEquals("mz", pinyin.get(8)); 230 | 231 | 232 | config = new PinyinConfig(); 233 | config.keepFirstLetter = true; 234 | config.keepSeparateFirstLetter = true; 235 | config.keepNoneChinese = true; 236 | config.keepNoneChineseInFirstLetter = false; 237 | config.keepOriginal = false; 238 | config.keepFullPinyin = true; 239 | config.LimitFirstLetterLength = 5; 240 | config.lowercase = true; 241 | config.noneChinesePinyinTokenize=true; 242 | config.removeDuplicateTerm=false; 243 | config.ignorePinyinOffset = false; 244 | 245 | 246 | sr = new StringReader("liudehuaalibaba13zhuanghan134"); 247 | analyzer = new WhitespaceAnalyzer(); 248 | filter = new PinyinTokenFilter(analyzer.tokenStream("f", sr), config); 249 | 250 | filter.reset(); 251 | System.out.println(); 252 | 253 | pinyin= getTokenFilterResult(filter); 254 | 255 | Assert.assertEquals(11, pinyin.size()); 256 | Assert.assertEquals("liu", pinyin.get(0)); 257 | Assert.assertEquals("de", pinyin.get(1)); 258 | Assert.assertEquals("hua", pinyin.get(2)); 259 | Assert.assertEquals("a", pinyin.get(3)); 260 | Assert.assertEquals("li", pinyin.get(4)); 261 | Assert.assertEquals("ba", pinyin.get(5)); 262 | Assert.assertEquals("ba", pinyin.get(6)); 263 | Assert.assertEquals("13", pinyin.get(7)); 264 | Assert.assertEquals("zhuang", pinyin.get(8)); 265 | Assert.assertEquals("han", pinyin.get(9)); 266 | Assert.assertEquals("134", pinyin.get(10)); 267 | 268 | 269 | 270 | config = new PinyinConfig(); 271 | config.keepFirstLetter=true; 272 | config.keepFullPinyin=false; 273 | config.keepJoinedFullPinyin =true; 274 | config.keepNoneChinese=false; 275 | config.keepNoneChineseTogether=true; 276 | config.noneChinesePinyinTokenize=true; 277 | config.keepNoneChineseInFirstLetter=true; 278 | config.keepOriginal=false; 279 | config.lowercase=true; 280 | config.trimWhitespace=true; 281 | config.fixedPinyinOffset =true; 282 | config.ignorePinyinOffset = false; 283 | 284 | sr = new StringReader("刘德华"); 285 | analyzer = new WhitespaceAnalyzer(); 286 | filter = new PinyinTokenFilter(analyzer.tokenStream("f", sr), config); 287 | filter.reset(); 288 | pinyin= getTokenFilterResult(filter); 289 | Assert.assertEquals("liudehua", pinyin.get(0)); 290 | Assert.assertEquals("ldh", pinyin.get(1)); 291 | 292 | 293 | } 294 | 295 | private List getTokenFilterResult(PinyinTokenFilter filter) throws IOException { 296 | List pinyin = new ArrayList(); 297 | int pos=0; 298 | while (filter.incrementToken()) { 299 | CharTermAttribute ta = filter.getAttribute(CharTermAttribute.class); 300 | OffsetAttribute offset = filter.getAttribute(OffsetAttribute.class); 301 | PositionIncrementAttribute position = filter.getAttribute(PositionIncrementAttribute.class); 302 | pos=pos+position.getPositionIncrement(); 303 | pinyin.add(ta.toString()); 304 | Assert.assertTrue("startOffset must be non-negative",offset.startOffset()>=0); 305 | Assert.assertTrue("endOffset must be >= startOffset",offset.startOffset()>=0); 306 | System.out.println(ta.toString()+","+offset.startOffset()+","+offset.endOffset()+","+pos); 307 | } 308 | return pinyin; 309 | } 310 | 311 | 312 | @Test 313 | public void TestTokenizer() throws IOException { 314 | String[] s = 315 | {"刘德华" 316 | , 
"劉德華", "刘德华A1", 317 | "讲话频率小,不能发高音", "T波低平或倒置", "β-氨基酸尿", 318 | "DJ音乐家", "人生一大乐事, 哈哈", 319 | }; 320 | 321 | PinyinConfig config = new PinyinConfig(); 322 | config.noneChinesePinyinTokenize=false; 323 | config.keepOriginal=true; 324 | config.ignorePinyinOffset = false; 325 | 326 | HashMap> result = getStringArrayListHashMap(s, config); 327 | 328 | ArrayList re = result.get("刘德华"); 329 | Assert.assertEquals(5, re.size()); 330 | Assert.assertEquals("liu", re.get(0).term); 331 | Assert.assertEquals("刘德华", re.get(1).term); 332 | Assert.assertEquals("ldh", re.get(2).term); 333 | Assert.assertEquals("de", re.get(3).term); 334 | Assert.assertEquals("hua", re.get(4).term); 335 | 336 | 337 | re = result.get("劉德華"); 338 | Assert.assertEquals(5, re.size()); 339 | Assert.assertEquals("liu", re.get(0).term); 340 | Assert.assertEquals("劉德華", re.get(1).term); 341 | Assert.assertEquals("ldh", re.get(2).term); 342 | Assert.assertEquals("de", re.get(3).term); 343 | Assert.assertEquals("hua", re.get(4).term); 344 | 345 | 346 | re = result.get("刘德华A1"); 347 | Assert.assertEquals(6, re.size()); 348 | Assert.assertEquals("liu", re.get(0).term); 349 | Assert.assertEquals("刘德华a1", re.get(1).term); 350 | Assert.assertEquals("ldha1", re.get(2).term); 351 | Assert.assertEquals("de", re.get(3).term); 352 | Assert.assertEquals("hua", re.get(4).term); 353 | Assert.assertEquals("a1", re.get(5).term); 354 | 355 | 356 | re = result.get("讲话频率小,不能发高音"); 357 | Assert.assertEquals(12, re.size()); 358 | Assert.assertEquals("jiang", re.get(0).term); 359 | Assert.assertEquals("讲话频率小,不能发高音", re.get(1).term); 360 | Assert.assertEquals("jhplxbnfgy", re.get(2).term); 361 | Assert.assertEquals("hua", re.get(3).term); 362 | Assert.assertEquals("pin", re.get(4).term); 363 | Assert.assertEquals("lv", re.get(5).term); 364 | Assert.assertEquals("xiao", re.get(6).term); 365 | Assert.assertEquals("bu", re.get(7).term); 366 | Assert.assertEquals("neng", re.get(8).term); 367 | Assert.assertEquals("fa", re.get(9).term); 368 | Assert.assertEquals("gao", re.get(10).term); 369 | Assert.assertEquals("yin", re.get(11).term); 370 | 371 | 372 | re = result.get("T波低平或倒置"); 373 | Assert.assertEquals(9, re.size()); 374 | Assert.assertEquals("t", re.get(0).term); 375 | Assert.assertEquals("t波低平或倒置", re.get(1).term); 376 | Assert.assertEquals("tbdphdz", re.get(2).term); 377 | Assert.assertEquals("bo", re.get(3).term); 378 | Assert.assertEquals("di", re.get(4).term); 379 | Assert.assertEquals("ping", re.get(5).term); 380 | Assert.assertEquals("huo", re.get(6).term); 381 | Assert.assertEquals("dao", re.get(7).term); 382 | Assert.assertEquals("zhi", re.get(8).term); 383 | 384 | 385 | re = result.get("β-氨基酸尿"); 386 | Assert.assertEquals(6, re.size()); 387 | Assert.assertEquals("β-氨基酸尿", re.get(1).term); 388 | Assert.assertEquals("ajsn", re.get(2).term); 389 | Assert.assertEquals("an", re.get(0).term); 390 | Assert.assertEquals("ji", re.get(3).term); 391 | Assert.assertEquals("suan", re.get(4).term); 392 | Assert.assertEquals("niao", re.get(5).term); 393 | 394 | re = result.get("DJ音乐家"); 395 | Assert.assertEquals(6, re.size()); 396 | Assert.assertEquals("dj", re.get(0).term); 397 | Assert.assertEquals("dj音乐家", re.get(1).term); 398 | Assert.assertEquals("djyyj", re.get(2).term); 399 | Assert.assertEquals("yin", re.get(3).term); 400 | Assert.assertEquals("yue", re.get(4).term); 401 | Assert.assertEquals("jia", re.get(5).term); 402 | 403 | 404 | String[] s1 = 405 | {"刘德华", "刘 de 华"}; 406 | config = new PinyinConfig(); 407 | config.keepFirstLetter = true; 408 | 
config.keepSeparateFirstLetter = true; 409 | config.keepNoneChinese = false; 410 | config.keepNoneChineseInFirstLetter = false; 411 | config.keepOriginal = false; 412 | config.keepFullPinyin = true; 413 | config.LimitFirstLetterLength = 5; 414 | config.lowercase = false; 415 | config.ignorePinyinOffset = false; 416 | 417 | 418 | result = getStringArrayListHashMap(s1, config); 419 | 420 | re = result.get("刘德华"); 421 | Assert.assertEquals(7, re.size()); 422 | Assert.assertEquals("l", re.get(0).term); 423 | Assert.assertEquals("liu", re.get(1).term); 424 | Assert.assertEquals("ldh", re.get(2).term); 425 | Assert.assertEquals("d", re.get(3).term); 426 | Assert.assertEquals("de", re.get(4).term); 427 | Assert.assertEquals("h", re.get(5).term); 428 | Assert.assertEquals("hua", re.get(6).term); 429 | 430 | s1 = new String[]{"我的的"}; 431 | config = new PinyinConfig(); 432 | config.keepFirstLetter = true; 433 | config.keepSeparateFirstLetter = true; 434 | config.keepNoneChinese = false; 435 | config.keepNoneChineseInFirstLetter = false; 436 | config.keepOriginal = false; 437 | config.keepFullPinyin = true; 438 | config.LimitFirstLetterLength = 5; 439 | config.removeDuplicateTerm = true; 440 | config.lowercase = false; 441 | config.ignorePinyinOffset = false; 442 | 443 | 444 | result = getStringArrayListHashMap(s1, config); 445 | 446 | re = result.get("我的的"); 447 | Assert.assertEquals(5, re.size()); 448 | Assert.assertEquals("w", re.get(0).term); 449 | Assert.assertEquals("wo", re.get(1).term); 450 | Assert.assertEquals("wdd", re.get(2).term); 451 | Assert.assertEquals("d", re.get(3).term); 452 | Assert.assertEquals("de", re.get(4).term); 453 | 454 | s1 = new String[]{"lu金 s刘德华 张学友 郭富城 黎明 四大lao天王liudehua"}; 455 | config = new PinyinConfig(); 456 | config.keepFirstLetter=true; 457 | config.keepFullPinyin=false; 458 | config.keepNoneChinese=false; 459 | config.keepNoneChineseTogether=true; 460 | config.noneChinesePinyinTokenize=true; 461 | config.keepNoneChineseInFirstLetter=true; 462 | config.keepOriginal=false; 463 | config.lowercase=true; 464 | config.trimWhitespace=true; 465 | config.ignorePinyinOffset = false; 466 | 467 | 468 | result = getStringArrayListHashMap(s1, config); 469 | 470 | re = result.get("lu金 s刘德华 张学友 郭富城 黎明 四大lao天王liudehua"); 471 | Assert.assertEquals("lujsldhzxygfclms", re.get(0).term); 472 | 473 | 474 | s1 = new String[]{"刘德华"}; 475 | config = new PinyinConfig(); 476 | config.keepFirstLetter=true; 477 | config.keepFullPinyin=false; 478 | config.keepJoinedFullPinyin =true; 479 | config.keepNoneChinese=false; 480 | config.keepNoneChineseTogether=true; 481 | config.noneChinesePinyinTokenize=true; 482 | config.keepNoneChineseInFirstLetter=true; 483 | config.keepOriginal=false; 484 | config.lowercase=true; 485 | config.trimWhitespace=true; 486 | config.ignorePinyinOffset = false; 487 | 488 | 489 | result = getStringArrayListHashMap(s1, config); 490 | 491 | re = result.get("刘德华"); 492 | Assert.assertEquals("liudehua", re.get(0).term); 493 | Assert.assertEquals("ldh", re.get(1).term); 494 | 495 | s1 = new String[]{"刘德华"}; 496 | config = new PinyinConfig(); 497 | config.keepFirstLetter=false; 498 | config.keepFullPinyin=false; 499 | config.keepJoinedFullPinyin =true; 500 | config.keepNoneChinese=false; 501 | config.keepNoneChineseTogether=true; 502 | config.noneChinesePinyinTokenize=true; 503 | config.keepNoneChineseInFirstLetter=true; 504 | config.keepOriginal=false; 505 | config.lowercase=true; 506 | config.trimWhitespace=true; 507 | config.ignorePinyinOffset = false; 508 | 509 | 510 | 
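/* With first_letter disabled and keep_joined_full_pinyin enabled, 刘德华 reduces
   to the single joined term "liudehua". */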
result = getStringArrayListHashMap(s1, config); 511 | 512 | re = result.get("刘德华"); 513 | Assert.assertEquals("liudehua", re.get(0).term); 514 | 515 | 516 | s1 = new String[]{"ceshi"}; 517 | config = new PinyinConfig(); 518 | config.keepFirstLetter=false; 519 | config.keepSeparateFirstLetter=false; 520 | config.keepFullPinyin=false; 521 | config.keepJoinedFullPinyin =true; 522 | config.keepNoneChinese=true; 523 | config.keepNoneChineseTogether=true; 524 | config.keepOriginal=true; 525 | config.LimitFirstLetterLength=16; 526 | config.noneChinesePinyinTokenize=true; 527 | config.lowercase=true; 528 | config.ignorePinyinOffset = false; 529 | 530 | 531 | result = getStringArrayListHashMap(s1, config); 532 | 533 | re = result.get("ceshi"); 534 | Assert.assertEquals("ce", re.get(0).term); 535 | Assert.assertEquals("shi", re.get(2).term); 536 | Assert.assertEquals("ceshi", re.get(1).term); 537 | 538 | 539 | 540 | 541 | } 542 | 543 | @Test 544 | public void TestFirstLetters() throws IOException { 545 | String[] s1 = new String[]{"刘德华"}; 546 | PinyinConfig config = new PinyinConfig(); 547 | config.keepFirstLetter = false; 548 | config.keepSeparateFirstLetter = true; 549 | config.keepFullPinyin = false; 550 | config.keepJoinedFullPinyin = false; 551 | config.keepNoneChinese = true; 552 | config.keepNoneChineseTogether = true; 553 | config.keepOriginal = false; 554 | config.LimitFirstLetterLength = 16; 555 | config.noneChinesePinyinTokenize = true; 556 | config.lowercase = true; 557 | config.ignorePinyinOffset = false; 558 | 559 | 560 | HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s1, config); 561 | 562 | ArrayList<TermItem> re = result.get("刘德华"); 563 | Assert.assertEquals("l", re.get(0).term); 564 | Assert.assertEquals("d", re.get(1).term); 565 | Assert.assertEquals("h", re.get(2).term); 566 | 567 | Assert.assertEquals(0, re.get(0).startOffset); 568 | Assert.assertEquals(1, re.get(1).startOffset); 569 | Assert.assertEquals(2, re.get(2).startOffset); 570 | 571 | Assert.assertEquals(1, re.get(0).endOffset); 572 | Assert.assertEquals(2, re.get(1).endOffset); 573 | Assert.assertEquals(3, re.get(2).endOffset); 574 | } 575 | 576 | @Test 577 | public void TestOnlyLetters() throws IOException { 578 | String[] s1 = new String[]{"ldh"}; 579 | PinyinConfig config = new PinyinConfig(); 580 | config.keepFirstLetter=false; 581 | config.keepSeparateFirstLetter=false; 582 | config.keepFullPinyin=true; 583 | config.keepJoinedFullPinyin =false; 584 | config.keepNoneChinese=true; 585 | config.keepNoneChineseTogether=true; 586 | config.keepOriginal=false; 587 | config.LimitFirstLetterLength=16; 588 | config.noneChinesePinyinTokenize=true; 589 | config.lowercase=true; 590 | config.ignorePinyinOffset = false; 591 | 592 | 593 | HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s1, config); 594 | 595 | ArrayList<TermItem> re = result.get("ldh"); 596 | Assert.assertEquals("l", re.get(0).term); 597 | Assert.assertEquals("d", re.get(1).term); 598 | Assert.assertEquals("h", re.get(2).term); 599 | 600 | Assert.assertEquals(0, re.get(0).startOffset); 601 | Assert.assertEquals(1, re.get(1).startOffset); 602 | Assert.assertEquals(2, re.get(2).startOffset); 603 | 604 | Assert.assertEquals(1, re.get(0).endOffset); 605 | Assert.assertEquals(2, re.get(1).endOffset); 606 | Assert.assertEquals(3, re.get(2).endOffset); 607 | 608 | 609 | s1 = new String[]{"liuldhdehua"}; 610 | config = new PinyinConfig(); 611 | config.keepFirstLetter=false; 612 | config.keepSeparateFirstLetter=false; 613 | config.keepFullPinyin=true; 614 | config.keepJoinedFullPinyin =false; 615 | 
config.keepNoneChinese=true; 616 | config.keepNoneChineseTogether=true; 617 | config.keepOriginal=false; 618 | config.LimitFirstLetterLength=16; 619 | config.noneChinesePinyinTokenize=true; 620 | config.lowercase=true; 621 | config.ignorePinyinOffset = false; 622 | 623 | 624 | result = getStringArrayListHashMap(s1, config); 625 | 626 | re = result.get("liuldhdehua"); 627 | Assert.assertEquals("liu", re.get(0).term); 628 | Assert.assertEquals("l", re.get(1).term); 629 | Assert.assertEquals("d", re.get(2).term); 630 | Assert.assertEquals("h", re.get(3).term); 631 | Assert.assertEquals("de", re.get(4).term); 632 | Assert.assertEquals("hua", re.get(5).term); 633 | 634 | s1 = new String[]{"liuldh"}; 635 | config = new PinyinConfig(); 636 | config.keepFirstLetter=false; 637 | config.keepSeparateFirstLetter=false; 638 | config.keepFullPinyin=true; 639 | config.keepJoinedFullPinyin =false; 640 | config.keepNoneChinese=true; 641 | config.keepNoneChineseTogether=true; 642 | config.keepOriginal=false; 643 | config.LimitFirstLetterLength=16; 644 | config.noneChinesePinyinTokenize=true; 645 | config.lowercase=true; 646 | config.ignorePinyinOffset = false; 647 | 648 | 649 | result = getStringArrayListHashMap(s1, config); 650 | 651 | re = result.get("liuldh"); 652 | Assert.assertEquals("liu", re.get(0).term); 653 | Assert.assertEquals("l", re.get(1).term); 654 | Assert.assertEquals("d", re.get(2).term); 655 | Assert.assertEquals("h", re.get(3).term); 656 | 657 | s1 = new String[]{"ldhdehua"}; 658 | config = new PinyinConfig(); 659 | config.keepFirstLetter=false; 660 | config.keepSeparateFirstLetter=false; 661 | config.keepFullPinyin=true; 662 | config.keepJoinedFullPinyin =false; 663 | config.keepNoneChinese=true; 664 | config.keepNoneChineseTogether=true; 665 | config.keepOriginal=false; 666 | config.LimitFirstLetterLength=16; 667 | config.noneChinesePinyinTokenize=true; 668 | config.lowercase=true; 669 | config.ignorePinyinOffset = false; 670 | 671 | 672 | result = getStringArrayListHashMap(s1, config); 673 | 674 | re = result.get("ldhdehua"); 675 | Assert.assertEquals("l", re.get(0).term); 676 | Assert.assertEquals("d", re.get(1).term); 677 | Assert.assertEquals("h", re.get(2).term); 678 | Assert.assertEquals("de", re.get(3).term); 679 | Assert.assertEquals("hua", re.get(4).term); 680 | 681 | s1 = new String[]{"ldh123dehua"}; 682 | config = new PinyinConfig(); 683 | config.keepFirstLetter=false; 684 | config.keepSeparateFirstLetter=false; 685 | config.keepFullPinyin=true; 686 | config.keepJoinedFullPinyin =false; 687 | config.keepNoneChinese=true; 688 | config.keepNoneChineseTogether=true; 689 | config.keepOriginal=false; 690 | config.LimitFirstLetterLength=16; 691 | config.noneChinesePinyinTokenize=true; 692 | config.lowercase=true; 693 | config.ignorePinyinOffset = false; 694 | 695 | 696 | result = getStringArrayListHashMap(s1, config); 697 | 698 | re = result.get("ldh123dehua"); 699 | Assert.assertEquals("l", re.get(0).term); 700 | Assert.assertEquals("d", re.get(1).term); 701 | Assert.assertEquals("h", re.get(2).term); 702 | Assert.assertEquals("123", re.get(3).term); 703 | Assert.assertEquals("de", re.get(4).term); 704 | Assert.assertEquals("hua", re.get(5).term); 705 | } 706 | 707 | @Test 708 | public void TestOnlyFirstLetterTokenizer() throws IOException { 709 | String[] s = 710 | {"刘德华", "β-氨基酸尿", "DJ音乐家" 711 | }; 712 | 713 | PinyinConfig config = new PinyinConfig(); 714 | config.keepFirstLetter = true; 715 | config.keepNoneChinese = true; 716 | config.keepOriginal = false; 717 | 
config.keepFullPinyin = false; 718 | config.keepNoneChineseTogether = false; 719 | config.ignorePinyinOffset = false; 720 | 721 | 722 | HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config); 723 | 724 | ArrayList<TermItem> re = result.get("刘德华"); 725 | Assert.assertEquals(1, re.size()); 726 | Assert.assertEquals("ldh", re.get(0).term); 727 | 728 | re = result.get("β-氨基酸尿"); 729 | Assert.assertEquals(1, re.size()); 730 | Assert.assertEquals("ajsn", re.get(0).term); 731 | 732 | re = result.get("DJ音乐家"); 733 | Assert.assertEquals(3, re.size()); 734 | Assert.assertEquals("d", re.get(0).term); 735 | Assert.assertEquals("djyyj", re.get(1).term); 736 | Assert.assertEquals("j", re.get(2).term); 737 | 738 | 739 | config = new PinyinConfig(); 740 | config.keepFirstLetter = true; 741 | config.keepNoneChinese = false; 742 | config.keepNoneChineseInFirstLetter = false; 743 | config.keepOriginal = false; 744 | config.keepFullPinyin = false; 745 | config.keepNoneChineseTogether = false; 746 | config.ignorePinyinOffset = false; 747 | 748 | 749 | result = getStringArrayListHashMap(s, config); 750 | 751 | re = result.get("DJ音乐家"); 752 | Assert.assertEquals(1, re.size()); 753 | Assert.assertEquals("yyj", re.get(0).term); 754 | 755 | config = new PinyinConfig(); 756 | config.keepFirstLetter = true; 757 | config.keepNoneChinese=true; 758 | config.keepNoneChineseInFirstLetter = true; 759 | config.keepNoneChineseTogether = true; 760 | config.keepOriginal = false; 761 | config.keepFullPinyin = false; 762 | config.noneChinesePinyinTokenize=false; 763 | config.ignorePinyinOffset = false; 764 | 765 | result = getStringArrayListHashMap(s, config); 766 | 767 | re = result.get("DJ音乐家"); 768 | Assert.assertEquals(2, re.size()); 769 | Assert.assertEquals("dj", re.get(0).term); 770 | Assert.assertEquals("djyyj", re.get(1).term); 771 | 772 | } 773 | 774 | @Test 775 | public void TestFullJoinedPinyin() throws IOException{ 776 | String[] s = 777 | {"DJ音乐家" 778 | }; 779 | PinyinConfig config = new PinyinConfig(); 780 | config.keepFirstLetter = false; 781 | config.keepNoneChineseInFirstLetter = false; 782 | config.keepOriginal = false; 783 | config.keepFullPinyin = false; 784 | config.noneChinesePinyinTokenize=false; 785 | config.keepNoneChinese=true; 786 | config.keepJoinedFullPinyin=true; 787 | config.keepNoneChineseTogether = true; 788 | config.keepNoneChineseInJoinedFullPinyin=true; 789 | config.ignorePinyinOffset = false; 790 | 791 | HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config); 792 | 793 | ArrayList<TermItem> re = result.get("DJ音乐家"); 794 | Assert.assertEquals(1, re.size()); 795 | Assert.assertEquals("djyinyuejia", re.get(0).term); 796 | } 797 | 798 | @Test 799 | public void TestMixedPinyinTokenizer() throws IOException { 800 | String[] s = 801 | { 802 | "刘德华", 803 | "刘de华", 804 | "liude华", 805 | " liude 华"}; 806 | 807 | PinyinConfig config = new PinyinConfig(); 808 | config.keepFirstLetter = true; 809 | config.keepSeparateFirstLetter = true; 810 | config.keepNoneChinese = true; 811 | config.keepOriginal = true; 812 | config.keepFullPinyin = true; 813 | config.keepNoneChineseTogether = true; 814 | config.ignorePinyinOffset = false; 815 | 816 | 817 | HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config); 818 | 819 | ArrayList<TermItem> re = result.get("刘德华"); 820 | Assert.assertEquals(8, re.size()); 821 | Assert.assertEquals("l", re.get(0).term); 822 | Assert.assertEquals(0, re.get(0).startOffset); 823 | Assert.assertEquals(1, re.get(0).endOffset); 824 | Assert.assertEquals("liu", re.get(1).term); 825 | Assert.assertEquals(0, 
re.get(1).startOffset); 826 | Assert.assertEquals(1, re.get(1).endOffset); 827 | 828 | Assert.assertEquals("刘德华", re.get(2).term); 829 | Assert.assertEquals(0, re.get(2).startOffset); 830 | Assert.assertEquals(3, re.get(2).endOffset); 831 | Assert.assertEquals("ldh", re.get(3).term); 832 | Assert.assertEquals(0, re.get(3).startOffset); 833 | Assert.assertEquals(3, re.get(3).endOffset); 834 | 835 | Assert.assertEquals("d", re.get(4).term); 836 | Assert.assertEquals(1, re.get(4).startOffset); 837 | Assert.assertEquals(2, re.get(4).endOffset); 838 | Assert.assertEquals("de", re.get(5).term); 839 | Assert.assertEquals(1, re.get(5).startOffset); 840 | Assert.assertEquals(2, re.get(5).endOffset); 841 | Assert.assertEquals("h", re.get(6).term); 842 | Assert.assertEquals(2, re.get(6).startOffset); 843 | Assert.assertEquals(3, re.get(6).endOffset); 844 | Assert.assertEquals("hua", re.get(7).term); 845 | Assert.assertEquals(2, re.get(7).startOffset); 846 | Assert.assertEquals(3, re.get(7).endOffset); 847 | 848 | } 849 | 850 | @Test 851 | public void TestPinyinTokenizerOffsetWithExtraTerms() throws IOException { 852 | String[] s = 853 | { 854 | "ceshi", 855 | "测shi", 856 | "ce试", 857 | "测试", 858 | "1测shi", 859 | }; 860 | 861 | PinyinConfig config = new PinyinConfig(); 862 | config.keepFirstLetter = false; 863 | config.keepSeparateFirstLetter = false; 864 | config.keepNoneChinese = true; 865 | config.keepOriginal = false; 866 | config.keepFullPinyin = true; 867 | config.keepNoneChineseTogether = true; 868 | config.removeDuplicateTerm = true; 869 | config.fixedPinyinOffset=false; 870 | config.keepJoinedFullPinyin=false; 871 | config.ignorePinyinOffset = false; 872 | 873 | 874 | 875 | HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config); 876 | 877 | ArrayList<TermItem> re; 878 | 879 | re = result.get("ceshi"); 880 | Assert.assertEquals(2, re.size()); 881 | Assert.assertEquals("ce", re.get(0).term); 882 | Assert.assertEquals(0, re.get(0).startOffset); 883 | Assert.assertEquals(2, re.get(0).endOffset); 884 | Assert.assertEquals("shi", re.get(1).term); 885 | Assert.assertEquals(2, re.get(1).startOffset); 886 | Assert.assertEquals(5, re.get(1).endOffset); 887 | 888 | re = result.get("测shi"); 889 | Assert.assertEquals(2, re.size()); 890 | Assert.assertEquals("ce", re.get(0).term); 891 | Assert.assertEquals(0, re.get(0).startOffset); 892 | Assert.assertEquals(1, re.get(0).endOffset); 893 | Assert.assertEquals("shi", re.get(1).term); 894 | Assert.assertEquals(1, re.get(1).startOffset); 895 | Assert.assertEquals(4, re.get(1).endOffset); 896 | 897 | re = result.get("ce试"); 898 | Assert.assertEquals(2, re.size()); 899 | Assert.assertEquals("ce", re.get(0).term); 900 | Assert.assertEquals(0, re.get(0).startOffset); 901 | Assert.assertEquals(2, re.get(0).endOffset); 902 | Assert.assertEquals("shi", re.get(1).term); 903 | Assert.assertEquals(2, re.get(1).startOffset); 904 | Assert.assertEquals(3, re.get(1).endOffset); 905 | 906 | re = result.get("测试"); 907 | Assert.assertEquals(2, re.size()); 908 | Assert.assertEquals("ce", re.get(0).term); 909 | Assert.assertEquals(0, re.get(0).startOffset); 910 | Assert.assertEquals(1, re.get(0).endOffset); 911 | Assert.assertEquals("shi", re.get(1).term); 912 | Assert.assertEquals(1, re.get(1).startOffset); 913 | Assert.assertEquals(2, re.get(1).endOffset); 914 | 915 | re = result.get("1测shi"); 916 | Assert.assertEquals(3, re.size()); 917 | Assert.assertEquals("1", re.get(0).term); 918 | Assert.assertEquals(0, re.get(0).startOffset); 919 | Assert.assertEquals(1, re.get(0).endOffset); 920 
| Assert.assertEquals("ce", re.get(1).term); 921 | Assert.assertEquals(1, re.get(1).startOffset); 922 | Assert.assertEquals(2, re.get(1).endOffset); 923 | Assert.assertEquals("shi", re.get(2).term); 924 | Assert.assertEquals(2, re.get(2).startOffset); 925 | Assert.assertEquals(5, re.get(2).endOffset); 926 | 927 | } 928 | 929 | @Test 930 | public void TestPinyinTokenizerOffset() throws IOException { 931 | String[] s = 932 | { 933 | "ceshi", 934 | "测shi", 935 | "ce试", 936 | "测试", 937 | "1测shi", 938 | }; 939 | 940 | PinyinConfig config = new PinyinConfig(); 941 | config.keepFirstLetter = false; 942 | config.keepSeparateFirstLetter = false; 943 | config.keepNoneChinese = true; 944 | config.keepOriginal = false; 945 | config.keepFullPinyin = true; 946 | config.keepNoneChineseTogether = true; 947 | config.fixedPinyinOffset=false; 948 | config.ignorePinyinOffset = false; 949 | 950 | HashMap> result = getStringArrayListHashMap(s, config); 951 | 952 | ArrayList re; 953 | 954 | re = result.get("ceshi"); 955 | Assert.assertEquals(2, re.size()); 956 | Assert.assertEquals("ce", re.get(0).term); 957 | Assert.assertEquals(0, re.get(0).startOffset); 958 | Assert.assertEquals(2, re.get(0).endOffset); 959 | Assert.assertEquals("shi", re.get(1).term); 960 | Assert.assertEquals(2, re.get(1).startOffset); 961 | Assert.assertEquals(5, re.get(1).endOffset); 962 | 963 | re = result.get("测shi"); 964 | Assert.assertEquals(2, re.size()); 965 | Assert.assertEquals("ce", re.get(0).term); 966 | Assert.assertEquals(0, re.get(0).startOffset); 967 | Assert.assertEquals(1, re.get(0).endOffset); 968 | Assert.assertEquals("shi", re.get(1).term); 969 | Assert.assertEquals(1, re.get(1).startOffset); 970 | Assert.assertEquals(4, re.get(1).endOffset); 971 | 972 | re = result.get("ce试"); 973 | Assert.assertEquals(2, re.size()); 974 | Assert.assertEquals("ce", re.get(0).term); 975 | Assert.assertEquals(0, re.get(0).startOffset); 976 | Assert.assertEquals(2, re.get(0).endOffset); 977 | Assert.assertEquals("shi", re.get(1).term); 978 | Assert.assertEquals(2, re.get(1).startOffset); 979 | Assert.assertEquals(3, re.get(1).endOffset); 980 | 981 | re = result.get("测试"); 982 | Assert.assertEquals(2, re.size()); 983 | Assert.assertEquals("ce", re.get(0).term); 984 | Assert.assertEquals(0, re.get(0).startOffset); 985 | Assert.assertEquals(1, re.get(0).endOffset); 986 | Assert.assertEquals("shi", re.get(1).term); 987 | Assert.assertEquals(1, re.get(1).startOffset); 988 | Assert.assertEquals(2, re.get(1).endOffset); 989 | 990 | re = result.get("1测shi"); 991 | Assert.assertEquals(3, re.size()); 992 | Assert.assertEquals("1", re.get(0).term); 993 | Assert.assertEquals(0, re.get(0).startOffset); 994 | Assert.assertEquals(1, re.get(0).endOffset); 995 | Assert.assertEquals("ce", re.get(1).term); 996 | Assert.assertEquals(1, re.get(1).startOffset); 997 | Assert.assertEquals(2, re.get(1).endOffset); 998 | Assert.assertEquals("shi", re.get(2).term); 999 | Assert.assertEquals(2, re.get(2).startOffset); 1000 | Assert.assertEquals(5, re.get(2).endOffset); 1001 | 1002 | } 1003 | 1004 | @Test 1005 | public void TestPinyinTokenizerFixedOffset() throws IOException { 1006 | String[] s = 1007 | { 1008 | "ceshi", 1009 | "测shi", 1010 | // "ce试", 1011 | "测试", 1012 | "1测shi", 1013 | }; 1014 | 1015 | PinyinConfig config = new PinyinConfig(); 1016 | config.keepFirstLetter = false; 1017 | config.keepSeparateFirstLetter = false; 1018 | config.keepNoneChinese = true; 1019 | config.keepOriginal = false; 1020 | config.keepFullPinyin = true; 1021 | 
config.keepNoneChineseTogether = true; 1022 | config.fixedPinyinOffset=true; 1023 | config.ignorePinyinOffset = false; 1024 | 1025 | 1026 | HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config); 1027 | 1028 | ArrayList<TermItem> re; 1029 | 1030 | re = result.get("ceshi"); 1031 | Assert.assertEquals(2, re.size()); 1032 | Assert.assertEquals("ce", re.get(0).term); 1033 | Assert.assertEquals(0, re.get(0).startOffset); 1034 | Assert.assertEquals(1, re.get(0).endOffset); 1035 | Assert.assertEquals("shi", re.get(1).term); 1036 | Assert.assertEquals(1, re.get(1).startOffset); 1037 | Assert.assertEquals(2, re.get(1).endOffset); 1038 | 1039 | re = result.get("测shi"); 1040 | Assert.assertEquals(2, re.size()); 1041 | Assert.assertEquals("ce", re.get(0).term); 1042 | Assert.assertEquals(0, re.get(0).startOffset); 1043 | Assert.assertEquals(1, re.get(0).endOffset); 1044 | Assert.assertEquals("shi", re.get(1).term); 1045 | Assert.assertEquals(1, re.get(1).startOffset); 1046 | Assert.assertEquals(2, re.get(1).endOffset); 1047 | 1048 | // re = result.get("ce试"); 1049 | // Assert.assertEquals(2, re.size()); 1050 | // Assert.assertEquals("ce", re.get(0).term); 1051 | // Assert.assertEquals(0, re.get(0).startOffset); 1052 | // Assert.assertEquals(1, re.get(0).endOffset); 1053 | // Assert.assertEquals("shi", re.get(1).term); 1054 | // Assert.assertEquals(1, re.get(1).startOffset); 1055 | // Assert.assertEquals(2, re.get(1).endOffset); 1056 | 1057 | re = result.get("测试"); 1058 | Assert.assertEquals(2, re.size()); 1059 | Assert.assertEquals("ce", re.get(0).term); 1060 | Assert.assertEquals(0, re.get(0).startOffset); 1061 | Assert.assertEquals(1, re.get(0).endOffset); 1062 | Assert.assertEquals("shi", re.get(1).term); 1063 | Assert.assertEquals(1, re.get(1).startOffset); 1064 | Assert.assertEquals(2, re.get(1).endOffset); 1065 | 1066 | re = result.get("1测shi"); 1067 | Assert.assertEquals(3, re.size()); 1068 | Assert.assertEquals("1", re.get(0).term); 1069 | Assert.assertEquals(0, re.get(0).startOffset); 1070 | Assert.assertEquals(1, re.get(0).endOffset); 1071 | Assert.assertEquals("ce", re.get(1).term); 1072 | Assert.assertEquals(1, re.get(1).startOffset); 1073 | Assert.assertEquals(2, re.get(1).endOffset); 1074 | Assert.assertEquals("shi", re.get(2).term); 1075 | Assert.assertEquals(2, re.get(2).startOffset); 1076 | Assert.assertEquals(3, re.get(2).endOffset); 1077 | 1078 | } 1079 | 1080 | @Test 1081 | public void TestPinyin() { 1082 | List<String> result = Pinyin.pinyin("德"); 1083 | for (int i = 0; i < result.size(); i++) { 1084 | String s = result.get(i); 1085 | System.out.println(s); 1086 | } 1087 | Assert.assertEquals("de", result.get(0)); 1088 | } 1089 | 1090 | private HashMap<String, ArrayList<TermItem>> getStringArrayListHashMap(String[] s, PinyinConfig config) throws IOException { 1091 | HashMap<String, ArrayList<TermItem>> result = new HashMap<>(); 1092 | for (String value : s) { 1093 | System.out.println("\n" + value); 1094 | StringReader sr = new StringReader(value); 1095 | 1096 | PinyinTokenizer tokenizer = new PinyinTokenizer(config); 1097 | tokenizer.setReader(sr); 1098 | 1099 | tokenizer.reset(); 1100 | 1101 | boolean hasnext = tokenizer.incrementToken(); 1102 | 1103 | int pos=0; 1104 | ArrayList<TermItem> re = new ArrayList<>(); 1105 | while (hasnext) { 1106 | CharTermAttribute ta = tokenizer.getAttribute(CharTermAttribute.class); 1107 | PositionIncrementAttribute position = tokenizer.getAttribute(PositionIncrementAttribute.class); 1108 | OffsetAttribute offset = tokenizer.getAttribute(OffsetAttribute.class); 1109 | pos=pos+position.getPositionIncrement(); 1110 | 
System.out.printf("%s: %d -> %d ,%d\n", ta.toString(), offset.startOffset(), offset.endOffset(),pos); 1111 | re.add(new TermItem(ta.toString(),offset.startOffset(),offset.endOffset(),pos)); 1112 | hasnext = tokenizer.incrementToken(); 1113 | } 1114 | result.put(value, re); 1115 | } 1116 | return result; 1117 | } 1118 | 1119 | @Test 1120 | public void TestPinyinFunction() { 1121 | List result = Pinyin.pinyin("貌美如誮"); 1122 | for (int i = 0; i < result.size(); i++) { 1123 | String s = result.get(i); 1124 | System.out.println(s); 1125 | } 1126 | Assert.assertEquals("mao", result.get(0)); 1127 | Assert.assertEquals("mei", result.get(1)); 1128 | Assert.assertEquals("ru", result.get(2)); 1129 | Assert.assertEquals("hua", result.get(3)); 1130 | } 1131 | 1132 | @Test 1133 | public void TestPinyinTokenize(){ 1134 | String str ="liudehuaalibaba13zhuanghan134"; 1135 | List result = PinyinAlphabetTokenizer.walk(str); 1136 | for (int i = 0; i < result.size(); i++) { 1137 | System.out.println(result.get(i)); 1138 | } 1139 | Assert.assertEquals("liu", result.get(0)); 1140 | Assert.assertEquals("de", result.get(1)); 1141 | Assert.assertEquals("hua", result.get(2)); 1142 | Assert.assertEquals("a", result.get(3)); 1143 | Assert.assertEquals("li", result.get(4)); 1144 | Assert.assertEquals("ba", result.get(5)); 1145 | Assert.assertEquals("ba", result.get(6)); 1146 | Assert.assertEquals("13", result.get(7)); 1147 | Assert.assertEquals("zhuang", result.get(8)); 1148 | Assert.assertEquals("han", result.get(9)); 1149 | Assert.assertEquals("134", result.get(10)); 1150 | 1151 | str ="a123"; 1152 | result = PinyinAlphabetTokenizer.walk(str); 1153 | for (int i = 0; i < result.size(); i++) { 1154 | System.out.println(result.get(i)); 1155 | } 1156 | Assert.assertEquals("a", result.get(0)); 1157 | Assert.assertEquals("123", result.get(1)); 1158 | 1159 | str ="liudehua"; 1160 | result = PinyinAlphabetTokenizer.walk(str); 1161 | for (int i = 0; i < result.size(); i++) { 1162 | System.out.println(result.get(i)); 1163 | } 1164 | Assert.assertEquals("liu", result.get(0)); 1165 | Assert.assertEquals("de", result.get(1)); 1166 | Assert.assertEquals("hua", result.get(2)); 1167 | 1168 | 1169 | str ="ceshi"; 1170 | result = PinyinAlphabetTokenizer.walk(str); 1171 | for (int i = 0; i < result.size(); i++) { 1172 | System.out.println(i+": "+result.get(i)); 1173 | } 1174 | Assert.assertEquals("ce", result.get(0)); 1175 | Assert.assertEquals("shi", result.get(1)); 1176 | } 1177 | 1178 | @Test 1179 | public void TestPinyinPosition1() throws IOException { 1180 | String[] s ={ "刘德华"}; 1181 | 1182 | PinyinConfig config = new PinyinConfig(); 1183 | config.keepFirstLetter = true; 1184 | config.keepSeparateFirstLetter = true; 1185 | config.keepNoneChinese = true; 1186 | config.keepOriginal = true; 1187 | config.keepFullPinyin = true; 1188 | config.keepNoneChineseTogether = true; 1189 | config.ignorePinyinOffset = false; 1190 | 1191 | HashMap> result = getStringArrayListHashMap(s, config); 1192 | 1193 | ArrayList re = result.get("刘德华"); 1194 | Assert.assertEquals("l", re.get(0).term); 1195 | Assert.assertEquals(0, re.get(0).startOffset); 1196 | Assert.assertEquals(1, re.get(0).endOffset); 1197 | Assert.assertEquals(1, re.get(0).position); 1198 | Assert.assertEquals("liu", re.get(1).term); 1199 | Assert.assertEquals(0, re.get(1).startOffset); 1200 | Assert.assertEquals(1, re.get(1).endOffset); 1201 | Assert.assertEquals(1, re.get(1).position); 1202 | 1203 | Assert.assertEquals("刘德华", re.get(2).term); 1204 | Assert.assertEquals(0, 
re.get(2).startOffset); 1205 | Assert.assertEquals(3, re.get(2).endOffset); 1206 | Assert.assertEquals(1, re.get(2).position); 1207 | Assert.assertEquals("ldh", re.get(3).term); 1208 | Assert.assertEquals(0, re.get(3).startOffset); 1209 | Assert.assertEquals(3, re.get(3).endOffset); 1210 | Assert.assertEquals(1, re.get(3).position); 1211 | 1212 | Assert.assertEquals("d", re.get(4).term); 1213 | Assert.assertEquals(1, re.get(4).startOffset); 1214 | Assert.assertEquals(2, re.get(4).endOffset); 1215 | Assert.assertEquals(2, re.get(4).position); 1216 | Assert.assertEquals("de", re.get(5).term); 1217 | Assert.assertEquals(1, re.get(5).startOffset); 1218 | Assert.assertEquals(2, re.get(5).endOffset); 1219 | Assert.assertEquals(2, re.get(5).position); 1220 | Assert.assertEquals("h", re.get(6).term); 1221 | Assert.assertEquals(2, re.get(6).startOffset); 1222 | Assert.assertEquals(3, re.get(6).endOffset); 1223 | Assert.assertEquals(3, re.get(6).position); 1224 | Assert.assertEquals("hua", re.get(7).term); 1225 | Assert.assertEquals(2, re.get(7).startOffset); 1226 | Assert.assertEquals(3, re.get(7).endOffset); 1227 | Assert.assertEquals(3, re.get(7).position); 1228 | } 1229 | 1230 | @Test 1231 | public void TestPinyinPosition2() throws IOException { 1232 | String[] s ={ "l德华"}; 1233 | 1234 | PinyinConfig config = new PinyinConfig(); 1235 | config.keepFirstLetter = true; 1236 | config.keepSeparateFirstLetter = true; 1237 | config.keepNoneChinese = true; 1238 | config.keepOriginal = true; 1239 | config.keepFullPinyin = true; 1240 | config.keepNoneChineseTogether = true; 1241 | config.ignorePinyinOffset = false; 1242 | 1243 | 1244 | HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config); 1245 | 1246 | ArrayList<TermItem> re = result.get("l德华"); 1247 | Assert.assertEquals("l", re.get(0).term); 1248 | Assert.assertEquals(0, re.get(0).startOffset); 1249 | Assert.assertEquals(1, re.get(0).endOffset); 1250 | Assert.assertEquals(1, re.get(0).position); 1251 | 1252 | Assert.assertEquals("l德华", re.get(1).term); 1253 | Assert.assertEquals(0, re.get(1).startOffset); 1254 | Assert.assertEquals(3, re.get(1).endOffset); 1255 | Assert.assertEquals(1, re.get(1).position); 1256 | Assert.assertEquals("ldh", re.get(2).term); 1257 | Assert.assertEquals(0, re.get(2).startOffset); 1258 | Assert.assertEquals(3, re.get(2).endOffset); 1259 | Assert.assertEquals(1, re.get(2).position); 1260 | 1261 | Assert.assertEquals("d", re.get(3).term); 1262 | Assert.assertEquals(1, re.get(3).startOffset); 1263 | Assert.assertEquals(2, re.get(3).endOffset); 1264 | Assert.assertEquals(2, re.get(3).position); 1265 | Assert.assertEquals("de", re.get(4).term); 1266 | Assert.assertEquals(1, re.get(4).startOffset); 1267 | Assert.assertEquals(2, re.get(4).endOffset); 1268 | Assert.assertEquals(2, re.get(4).position); 1269 | Assert.assertEquals("h", re.get(5).term); 1270 | Assert.assertEquals(2, re.get(5).startOffset); 1271 | Assert.assertEquals(3, re.get(5).endOffset); 1272 | Assert.assertEquals(3, re.get(5).position); 1273 | Assert.assertEquals("hua", re.get(6).term); 1274 | Assert.assertEquals(2, re.get(6).startOffset); 1275 | Assert.assertEquals(3, re.get(6).endOffset); 1276 | Assert.assertEquals(3, re.get(6).position); 1277 | } 1278 | 1279 | @Test 1280 | public void TestPinyinPosition3() throws IOException { 1281 | String[] s ={ "liude华","liudehua","ldhua","刘de华","刘dehua","DJ音乐家"}; 1282 | 1283 | PinyinConfig config = new PinyinConfig(); 1284 | config.keepFirstLetter = true; 1285 | config.keepSeparateFirstLetter = true; 1286 | config.keepNoneChinese = 
true; 1287 | config.keepOriginal = true; 1288 | config.keepFullPinyin = true; 1289 | config.keepNoneChineseTogether = true; 1290 | config.ignorePinyinOffset = false; 1291 | 1292 | 1293 | HashMap<String, ArrayList<TermItem>> result = getStringArrayListHashMap(s, config); 1294 | 1295 | ArrayList<TermItem> re = result.get("liude华"); 1296 | Assert.assertEquals("liu", re.get(0).term); 1297 | Assert.assertEquals(0, re.get(0).startOffset); 1298 | Assert.assertEquals(3, re.get(0).endOffset); 1299 | Assert.assertEquals(1, re.get(0).position); 1300 | 1301 | Assert.assertEquals("liude华", re.get(1).term); 1302 | Assert.assertEquals(0, re.get(1).startOffset); 1303 | Assert.assertEquals(6, re.get(1).endOffset); 1304 | Assert.assertEquals(1, re.get(1).position); 1305 | 1306 | Assert.assertEquals("liudeh", re.get(2).term); 1307 | Assert.assertEquals(0, re.get(2).startOffset); 1308 | Assert.assertEquals(6, re.get(2).endOffset); 1309 | Assert.assertEquals(1, re.get(2).position); 1310 | 1311 | Assert.assertEquals("de", re.get(3).term); 1312 | Assert.assertEquals(3, re.get(3).startOffset); 1313 | Assert.assertEquals(5, re.get(3).endOffset); 1314 | Assert.assertEquals(2, re.get(3).position); 1315 | 1316 | 1317 | Assert.assertEquals("h", re.get(4).term); 1318 | Assert.assertEquals(5, re.get(4).startOffset); 1319 | Assert.assertEquals(6, re.get(4).endOffset); 1320 | Assert.assertEquals(3, re.get(4).position); 1321 | 1322 | Assert.assertEquals("hua", re.get(5).term); 1323 | Assert.assertEquals(5, re.get(5).startOffset); 1324 | Assert.assertEquals(6, re.get(5).endOffset); 1325 | Assert.assertEquals(3, re.get(5).position); 1326 | 1327 | } 1328 | 1329 | @Test 1330 | public void TestPinyinPosition4() throws IOException { 1331 | String[] s ={ "medcl"}; 1332 | 1333 | PinyinConfig config = new PinyinConfig(); 1334 | config.keepFirstLetter = true; 1335 | config.keepSeparateFirstLetter = true; 1336 | config.keepNoneChinese = true; 1337 | config.keepOriginal = true; 1338 | config.keepFullPinyin = true; 1339 | config.keepNoneChineseTogether = true; 1340 | config.ignorePinyinOffset = false; 1341 | 1342 | 1343 | HashMap<String, ArrayList<TermItem>> result= getStringArrayListHashMap(s, config); 1344 | 1345 | ArrayList<TermItem> re = result.get("medcl"); 1346 | Assert.assertEquals("me", re.get(0).term); 1347 | Assert.assertEquals(0, re.get(0).startOffset); 1348 | Assert.assertEquals(2, re.get(0).endOffset); 1349 | Assert.assertEquals(1, re.get(0).position); 1350 | 1351 | Assert.assertEquals("medcl", re.get(1).term); 1352 | Assert.assertEquals(0, re.get(1).startOffset); 1353 | Assert.assertEquals(5, re.get(1).endOffset); 1354 | Assert.assertEquals(1, re.get(1).position); 1355 | 1356 | config = new PinyinConfig(); 1357 | config.keepFirstLetter = true; 1358 | config.keepSeparateFirstLetter = true; 1359 | config.keepNoneChinese = true; 1360 | config.keepOriginal = true; 1361 | config.keepFullPinyin = true; 1362 | config.keepNoneChineseTogether = false; 1363 | config.keepJoinedFullPinyin = true; 1364 | config.ignorePinyinOffset = false; 1365 | 1366 | 1367 | result = getStringArrayListHashMap(s, config); 1368 | 1369 | re = result.get("medcl"); 1370 | Assert.assertEquals("m", re.get(0).term); 1371 | Assert.assertEquals(0, re.get(0).startOffset); 1372 | Assert.assertEquals(1, re.get(0).endOffset); 1373 | Assert.assertEquals(1, re.get(0).position); 1374 | 1375 | Assert.assertEquals("medcl", re.get(1).term); 1376 | Assert.assertEquals(0, re.get(1).startOffset); 1377 | Assert.assertEquals(5, re.get(1).endOffset); 1378 | Assert.assertEquals(1, re.get(1).position); 1379 | 1380 | 1381 | 1382 | 
Assert.assertEquals("e", re.get(2).term); 1383 | Assert.assertEquals(1, re.get(2).startOffset); 1384 | Assert.assertEquals(2, re.get(2).endOffset); 1385 | Assert.assertEquals(2, re.get(2).position); 1386 | 1387 | Assert.assertEquals("d", re.get(3).term); 1388 | Assert.assertEquals(2, re.get(3).startOffset); 1389 | Assert.assertEquals(3, re.get(3).endOffset); 1390 | Assert.assertEquals(3, re.get(3).position); 1391 | 1392 | 1393 | } 1394 | } 1395 | --------------------------------------------------------------------------------