├── .gitignore
├── README.md
├── _site
│   └── README.md
├── bin
│   └── build.sh
├── pom.xml
└── src
    ├── main
    │   ├── assemblies
    │   │   └── plugin.xml
    │   ├── dic
    │   │   ├── sougou.dict
    │   │   ├── stopwords.txt
    │   │   └── user.dict
    │   ├── java
    │   │   └── org
    │   │       └── elasticsearch
    │   │           ├── index
    │   │           │   └── analysis
    │   │           │       ├── JiebaAnalysisBinderProcessor.java
    │   │           │       ├── JiebaAnalyzer.java
    │   │           │       ├── JiebaAnalyzerProvider.java
    │   │           │       ├── JiebaTokenFilter.java
    │   │           │       ├── JiebaTokenFilterFactory.java
    │   │           │       ├── OtherTokenizer.java
    │   │           │       └── SentenceTokenizer.java
    │   │           ├── indices
    │   │           │   └── analysis
    │   │           │       ├── JiebaIndicesAnalysis.java
    │   │           │       └── JiebaIndicesAnalysisModule.java
    │   │           └── plugin
    │   │               └── analysis
    │   │                   └── jieba
    │   │                       └── AnalysisJiebaPlugin.java
    │   ├── plugin-metadata
    │   │   └── plugin-security.policy
    │   └── resources
    │       └── plugin-descriptor.properties
    └── test
        └── java
            └── org
                └── elasticsearch
                    └── index
                        └── analysis
                            └── JiebaAnalyzerTest.java
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea/
.gradle/
*.iml
work/
logs/
.DS_Store
build/
target/


## eclipse ignores (use 'gradle eclipse' to build eclipse projects)
.project
.classpath
.settings
*/.project
*/.classpath
*/.settings
*/eclipse-build

## netbeans ignores
nb-configuration.xml
nbactions.xml
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Jieba Chinese Analysis Plugin for Elasticsearch
===============================================

Integrates the Lucene / Jieba Analyzer and supports custom dictionaries.


| Jieba Chinese Analysis Plugin | Elasticsearch | Analyzer |
|-------------------------------|---------------|----------|
| 0.0.2                         | 1.0.0RC2      | 0.0.2    |
| 0.0.3-SNAPSHOT                | 1.3.0         | 1.0.0    |
| 0.0.4                         | 1.5.x         | 1.0.2    |
| 2.3.3                         | 2.3.3         | 1.0.2    |
| 2.3.4                         | 2.3.4         | 1.0.2    |
| 2.3.5                         | 2.3.5         | 1.0.2    |


> This plugin provides a `jieba analyzer`, a `jieba tokenizer`, and a `jieba token filter`, with three modes to choose from.

- index: mainly for index-time analysis; fine-grained segmentation
- search: mainly for query-time analysis; coarse-grained segmentation
- other: full-width to half-width conversion, lowercasing, character tokenization

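Besides the prebuilt `jieba_index`, `jieba_search` and `jieba_other` analyzers that the plugin registers globally, the analyzer and token filter are also bound under the name `jieba`, so a per-index variant can be configured. A minimal sketch — `seg_mode` and `stop` are the settings read by `JiebaAnalyzer`, and `my_index` / `my_jieba` are placeholder names:

```sh
curl -XPUT 127.0.0.1:9200/my_index -d '{
  "settings" : {
    "analysis" : {
      "analyzer" : {
        "my_jieba" : {
          "type" : "jieba",
          "seg_mode" : "search",
          "stop" : true
        }
      }
    }
  }
}';echo
```

The `jieba` token filter accepts the same `seg_mode` setting, so it can also be combined with your own tokenizer in a custom analyzer.
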
Install
-------

## ES 2.x and above

> The plugin version follows the ES version.

**2.3.5**
```sh
./bin/plugin install https://github.com/huaban/elasticsearch-analysis-jieba/releases/download/v2.3.5/elasticsearch-analysis-jieba-2.3.5-bin.zip
```

**2.3.4**
```sh
./bin/plugin install https://github.com/huaban/elasticsearch-analysis-jieba/releases/download/v2.3.4/elasticsearch-analysis-jieba-2.3.4-bin.zip
```

**2.3.3**
```sh
./bin/plugin install https://github.com/huaban/elasticsearch-analysis-jieba/releases/download/v2.3.3/elasticsearch-analysis-jieba-2.3.3-bin.zip
```

## ES versions below 2.x

> Build and install plugin version 0.0.4.

```sh
cd {your_es_path}
mkdir plugins/jieba

# copy the jars:
# copy jieba-analysis-1.0.2.jar and elasticsearch-analysis-jieba-0.0.4.jar into plugins/jieba

# copy the user dictionaries
cp -r data/jieba {your_es_path}/config/
```

Test
----

```sh
curl -XPUT 127.0.0.1:9200/test -d '{
  "settings" : {
    "number_of_shards" : 1,
    "number_of_replicas" : 0
  },
  "mappings" : {
    "test" : {
      "_all" : { "enabled" : false },
      "properties" : {
        "name" : { "type" : "string", "analyzer" : "jieba_index", "search_analyzer" : "jieba_search" }
      }
    }
  }
}';echo


curl 'http://127.0.0.1:9200/test/_analyze?analyzer=jieba_index' -d '中华人民共和国';echo
curl 'http://127.0.0.1:9200/test/_analyze?analyzer=jieba_search' -d '中华人民共和国';echo
curl 'http://127.0.0.1:9200/test/_analyze?analyzer=jieba_other' -d '中华人民共和国 HelLo';echo
```

How to release a version
------------------------


```
github-release release \
    --user huaban \
    --repo elasticsearch-analysis-jieba \
    --tag v2.3.5 \
    --name "v2.3.5" \
    --description "Supports ES v2.3.5"

github-release upload \
    --user huaban \
    --repo elasticsearch-analysis-jieba \
    --tag v2.3.5 \
    --name "elasticsearch-analysis-jieba-2.3.5-bin.zip" \
    --label "plugin.zip" \
    --file target/releases/elasticsearch-analysis-jieba-2.3.5-bin.zip
```


Donate
======

**A braised-chicken meal**

![](http://7xkgzh.com1.z0.glb.clouddn.com/0a9db33a25bce898c088462ddb726e57.png?imageView2/5/w/300/h/300)

**Buy me a drink**

![](http://7xkgzh.com1.z0.glb.clouddn.com/01e2fc2635f7ac26a9e8b21157dc2840.png?imageView2/5/w/300/h/300)

**Or whatever you like**

![](http://7xkgzh.com1.z0.glb.clouddn.com/2344d83c9be4b56cb66f696dcfb25ceb.png?imageView2/5/w/300/h/300)


License
-------

```
This software is licensed under the Apache 2 license, quoted below.

Copyright (C) 2013 libin and Huaban Inc

Licensed under the Apache License, Version 2.0 (the "License"); you may not
use this file except in compliance with the License. You may obtain a copy of
the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
License for the specific language governing permissions and limitations under
the License.
```
--------------------------------------------------------------------------------
/_site/README.md:
--------------------------------------------------------------------------------
Must an action plugin have a _site directory?
--------------------------------------------------------------------------------
/bin/build.sh:
--------------------------------------------------------------------------------
#!/bin/bash

ROOT=$(dirname "$0")
cd "$ROOT/.."
mvn package install -DcreateChecksum=true -DskipTests
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch-analysis-jieba</artifactId>
    <version>2.3.5</version>
    <packaging>jar</packaging>
    <name>elasticsearch-analysis-jieba</name>
    <description>Jieba Chinese Analysis Plugin for Elasticsearch</description>
    <url>http://maven.apache.org</url>

    <licenses>
        <license>
            <name>The Apache Software License, Version 2.0</name>
            <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
            <distribution>repo</distribution>
        </license>
    </licenses>

    <properties>
        <elasticsearch.version>2.3.5</elasticsearch.version>
        <maven.compiler.target>1.7</maven.compiler.target>
        <elasticsearch.assembly.descriptor>${project.basedir}/src/main/assemblies/plugin.xml</elasticsearch.assembly.descriptor>
        <elasticsearch.plugin.name>jieba</elasticsearch.plugin.name>
        <elasticsearch.plugin.classname>org.elasticsearch.plugin.analysis.jieba.AnalysisJiebaPlugin</elasticsearch.plugin.classname>
        <elasticsearch.plugin.jvm>true</elasticsearch.plugin.jvm>
        <elasticsearch.plugin.site>false</elasticsearch.plugin.site>
        <elasticsearch.plugin.isolated>true</elasticsearch.plugin.isolated>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch</artifactId>
            <version>${elasticsearch.version}</version>
        </dependency>
        <dependency>
            <groupId>com.huaban</groupId>
            <artifactId>jieba-analysis</artifactId>
            <version>1.0.2</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-source-plugin</artifactId>
                <executions>
                    <execution>
                        <id>attach-sources</id>
                        <goals>
                            <goal>jar</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <outputDirectory>${project.build.directory}/releases/</outputDirectory>
                    <descriptors>
                        <descriptor>${basedir}/src/main/assemblies/plugin.xml</descriptor>
                    </descriptors>
                </configuration>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>com.carrotsearch.randomizedtesting</groupId>
                <artifactId>junit4-maven-plugin</artifactId>
                <configuration>
                    <dir>${basedir}</dir>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
--------------------------------------------------------------------------------
/src/main/assemblies/plugin.xml:
--------------------------------------------------------------------------------
<?xml version="1.0"?>
<assembly>
    <id>bin</id>
    <formats>
        <format>zip</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <fileSets>
        <fileSet>
            <directory>${project.basedir}/src/main/dic</directory>
            <outputDirectory>/dic</outputDirectory>
        </fileSet>
        <fileSet>
            <directory>${project.basedir}/src/main/plugin-metadata</directory>
            <outputDirectory>/</outputDirectory>
        </fileSet>
        <fileSet>
            <directory>${project.basedir}/_site/</directory>
            <outputDirectory>_site/</outputDirectory>
        </fileSet>
    </fileSets>
    <files>
        <file>
            <source>${project.basedir}/src/main/resources/plugin-descriptor.properties</source>
            <filtered>true</filtered>
        </file>
    </files>
    <dependencySets>
        <dependencySet>
            <outputDirectory>/</outputDirectory>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <excludes>
                <exclude>org.elasticsearch:elasticsearch</exclude>
                <exclude>org.apache.lucene:lucene*</exclude>
                <exclude>commons-lang*</exclude>
            </excludes>
        </dependencySet>
        <dependencySet>
            <outputDirectory>/</outputDirectory>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <includes>
                <include>com.huaban:jieba-analysis</include>
            </includes>
        </dependencySet>
    </dependencySets>
</assembly>
--------------------------------------------------------------------------------
/src/main/dic/stopwords.txt:
--------------------------------------------------------------------------------
////////// Punctuation tokens to remove ////////////////
,
.
`
-
_
=
?
'
|
"
(
)
{
}
[
]
<
>
*
#
&
^
$
@
!
~
:
;
+
/
\
《
》
—
-
,
。
、
:
;
!
·
?
“
”
)
(
【
】
[
]
●
請
還
是
的
惹
但
從
到
和
給
或
在
有
又
了
將
什
麼
雖
嗎
嘛
啊
呢
哈
呵
噢
哦
嗯
吧
哎
喲
呀
唉
啦
唄
兮
乎
矣
哉
就
這
那
他
她
它
們
你
您
我
得
很
!
[
]
{
}
(
)
&
%
$
#
"
'
@
`
~
<
>
也
了
仍
从
以
使
则
却
又
及
对
就
并
很
或
把
的
着
给
而
被
让
在
还
比
等
当
与
于
但
// the line below contains an IDEOGRAPHIC SPACE character (Used as a space in Chinese)
　

//////////////// English Stop Words ////////////////

//////////////// Chinese Stop Words ////////////////

--------------------------------------------------------------------------------
/src/main/dic/user.dict:
--------------------------------------------------------------------------------
小清新 3
百搭 3
显瘦 3
隨身碟 100
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/JiebaAnalysisBinderProcessor.java:
--------------------------------------------------------------------------------
package org.elasticsearch.index.analysis;

public class JiebaAnalysisBinderProcessor extends
		AnalysisModule.AnalysisBinderProcessor {

	@Override
	public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
		tokenFiltersBindings.processTokenFilter("jieba",
				JiebaTokenFilterFactory.class);
		super.processTokenFilters(tokenFiltersBindings);
	}

	@Override
	public void processAnalyzers(AnalyzersBindings analyzersBindings) {
		analyzersBindings.processAnalyzer("jieba", JiebaAnalyzerProvider.class);
		super.processAnalyzers(analyzersBindings);
	}

}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/JiebaAnalyzer.java:
--------------------------------------------------------------------------------
package org.elasticsearch.index.analysis;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;

import com.huaban.analysis.jieba.WordDictionary;

public class JiebaAnalyzer extends Analyzer {
	private final ESLogger log = Loggers.getLogger(JiebaAnalyzer.class);

	private final CharArraySet stopWords;

	private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";

	private static final String STOPWORD_FILE_COMMENT = "//";

	/**
	 * Returns an unmodifiable instance of the default stop-words set.
	 *
	 * @return an unmodifiable instance of the default stop-words set.
	 */
	public static CharArraySet getDefaultStopSet() {
		return DefaultSetHolder.DEFAULT_STOP_SET;
	}

	/**
	 * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer
	 * class accesses the static final set the first time.
	 */
	private static class DefaultSetHolder {
		static final CharArraySet DEFAULT_STOP_SET;

		static {
			try {
				DEFAULT_STOP_SET = loadDefaultStopWordSet();
			} catch (IOException ex) {
				// the default set should always be present as it is part of the
				// distribution (JAR)
				throw new RuntimeException(
						"Unable to load default stopword set", ex);
			}
		}

		static CharArraySet loadDefaultStopWordSet() throws IOException {
			// make sure it is unmodifiable as we expose it in the outer class
			return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(
					IOUtils.getDecodingReader(JiebaAnalyzer.class,
							DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8),
					STOPWORD_FILE_COMMENT));
		}
	}

	private String type;

	private CharArraySet loadStopWords(Path dataPath) {
		try {
			// read the stop-word file explicitly as UTF-8 instead of relying
			// on the platform default charset
			return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(
					Files.newBufferedReader(dataPath.resolve(DEFAULT_STOPWORD_FILE),
							StandardCharsets.UTF_8), STOPWORD_FILE_COMMENT));
		} catch (IOException e) {
			this.log.warn("failed to load {}, falling back to the default stop-word set",
					dataPath.resolve(DEFAULT_STOPWORD_FILE));
			return DefaultSetHolder.DEFAULT_STOP_SET;
		}
	}

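	// The settings-based constructor reads "seg_mode" ("index", "search" or
	// "other", defaulting to "index") and "stop" (defaulting to true) from the
	// analyzer settings, and loads the dictionaries from plugins/jieba/dic.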
	public JiebaAnalyzer(Settings settings, Environment env) {
		this(settings.get("seg_mode", "index"), env.pluginsFile().resolve("jieba/dic"),
				settings.getAsBoolean("stop", true));
	}

	public JiebaAnalyzer(String segMode, Path dataPath, boolean isStop) {
		super();

		this.type = segMode;
		WordDictionary.getInstance().init(dataPath);
		this.stopWords = isStop ? this.loadStopWords(dataPath)
				: CharArraySet.EMPTY_SET;

		this.log.info("Jieba segMode = {}", type);
		this.log.info("JiebaAnalyzer isStop = {}", isStop);
		this.log.info("JiebaAnalyzer stopWords = {}", this.stopWords);
	}

	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
		Tokenizer tokenizer;
		if (type.equals("other")) {
			tokenizer = new OtherTokenizer();
		} else {
			tokenizer = new SentenceTokenizer();
		}
		TokenStream result = new JiebaTokenFilter(type, tokenizer);
		if (!type.equals("other") && !stopWords.isEmpty()) {
			result = new StopFilter(result, stopWords);
		}
		return new TokenStreamComponents(tokenizer, result);
	}
}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/JiebaAnalyzerProvider.java:
--------------------------------------------------------------------------------
package org.elasticsearch.index.analysis;

import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettingsService;

public class JiebaAnalyzerProvider extends
		AbstractIndexAnalyzerProvider<JiebaAnalyzer> {
	private final JiebaAnalyzer analyzer;

	@Inject
	public JiebaAnalyzerProvider(Index index, IndexSettingsService indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
		super(index, indexSettings.getSettings(), name, settings);
		analyzer = new JiebaAnalyzer(settings, env);
	}

	@Override
	public JiebaAnalyzer get() {
		return this.analyzer;
	}
}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/JiebaTokenFilter.java:
--------------------------------------------------------------------------------
package org.elasticsearch.index.analysis;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import com.huaban.analysis.jieba.SegToken;

public final class JiebaTokenFilter extends TokenFilter {

	JiebaSegmenter segmenter;

	private Iterator<SegToken> tokenIter;
	private List<SegToken> array;
	private String type;

	private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
	private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
	private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

	public JiebaTokenFilter(String type, TokenStream input) {
		super(input);
		this.type = type;
		segmenter = new JiebaSegmenter();
	}

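	// incrementToken() pulls one upstream token (normally a sentence from
	// SentenceTokenizer) at a time, segments it with jieba in INDEX or SEARCH
	// mode ("other" only normalizes the characters), and then replays the
	// resulting SegTokens one per call.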
	@Override
	public boolean incrementToken() throws IOException {
		if (tokenIter == null || !tokenIter.hasNext()) {
			if (input.incrementToken()) {
				if (type.equals("index"))
					array = segmenter
							.process(termAtt.toString(), SegMode.INDEX);
				else if (type.equals("other")) {
					array = new ArrayList<SegToken>();
					String token = termAtt.toString();
					char[] ctoken = token.toCharArray();
					for (int i = 0; i < ctoken.length; i++) {
						/* full-width => half-width */
						if (ctoken[i] > 0xFF00 && ctoken[i] < 0xFF5F)
							ctoken[i] = (char) (ctoken[i] - 0xFEE0);

						/* upper case => lower case */
						if (ctoken[i] > 0x40 && ctoken[i] < 0x5b)
							ctoken[i] = (char) (ctoken[i] + 0x20);
					}
					token = String.valueOf(ctoken);
					array.add(new SegToken(token, 0, token.length()));
				} else
					array = segmenter.process(termAtt.toString(),
							SegMode.SEARCH);
				tokenIter = array.iterator();
				if (!tokenIter.hasNext())
					return false;
			} else {
				return false; // no more sentences, end of stream!
			}
		}
		// this filter creates new tokens, so it must clear all attributes first
		clearAttributes();

		SegToken token = tokenIter.next();
		offsetAtt.setOffset(token.startOffset, token.endOffset);
		String tokenString = token.word;
		termAtt.copyBuffer(tokenString.toCharArray(), 0, tokenString.length());
		typeAtt.setType("word");
		return true;
	}

	@Override
	public void reset() throws IOException {
		super.reset();
		tokenIter = null;
	}

}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/JiebaTokenFilterFactory.java:
--------------------------------------------------------------------------------
package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettingsService;

import com.huaban.analysis.jieba.WordDictionary;

public class JiebaTokenFilterFactory extends AbstractTokenFilterFactory {
	private String type;

	@Inject
	public JiebaTokenFilterFactory(Index index,
			IndexSettingsService indexSettings, @Assisted String name,
			@Assisted Settings settings) {
		super(index, indexSettings.getSettings(), name, settings);
		type = settings.get("seg_mode", "index");
		Environment env = new Environment(indexSettings.getSettings());
		WordDictionary.getInstance().init(env.pluginsFile().resolve("jieba/dic"));
	}

	@Override
	public TokenStream create(TokenStream input) {
		return new JiebaTokenFilter(type, input);
	}

}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/OtherTokenizer.java:
--------------------------------------------------------------------------------
package org.elasticsearch.index.analysis;


import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.AttributeFactory;

/**
 * An OtherTokenizer is a tokenizer that does not split the text at all: every
 * character counts as a token character, so the whole input is passed through
 * as a single token. JiebaTokenFilter's "other" mode then normalizes that
 * token (full-width to half-width, upper case to lower case).
 */
public class OtherTokenizer extends CharTokenizer {

	/**
	 * Construct a new OtherTokenizer.
	 */
	public OtherTokenizer() {
		super();
	}

	/**
	 * Construct a new OtherTokenizer using a given
	 * {@link org.apache.lucene.util.AttributeFactory}.
	 *
	 * @param factory
	 *            the attribute factory to use for this {@link Tokenizer}
	 */
	public OtherTokenizer(AttributeFactory factory) {
		super(factory);
	}

	/**
	 * Accepts every character, so the input is never split here.
	 */
	@Override
	protected boolean isTokenChar(int c) {
		return true;
	}
}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/SentenceTokenizer.java:
--------------------------------------------------------------------------------
package org.elasticsearch.index.analysis;

import java.io.IOException;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeFactory;

public final class SentenceTokenizer extends Tokenizer {

	/**
	 * End of sentence punctuation: 。,!?;,!?;
	 */
	private final static String PUNCTUATION = "。,!?;,!?;";
	private final static String SPACES = "  \t\r\n";

	private final StringBuilder buffer = new StringBuilder();

	private int tokenStart = 0, tokenEnd = 0;

	private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
	private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
	private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

	public SentenceTokenizer() {
		super();
	}

	public SentenceTokenizer(AttributeFactory factory) {
		super(factory);
	}

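	// incrementToken() scans forward one character at a time: leading
	// whitespace is skipped, characters are buffered until an end-of-sentence
	// punctuation mark or a pair of consecutive whitespace characters (e.g.
	// CR LF) is seen, and the buffered sentence is emitted as a single token
	// of type "sentence". JiebaTokenFilter then segments each sentence.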
	@Override
	public boolean incrementToken() throws IOException {
		clearAttributes();
		buffer.setLength(0);
		int ci;
		char ch, pch;
		boolean atBegin = true;
		tokenStart = tokenEnd;
		ci = input.read();
		ch = (char) ci;

		while (true) {
			if (ci == -1) {
				break;
			} else if (PUNCTUATION.indexOf(ch) != -1) {
				// end of a sentence
				buffer.append(ch);
				tokenEnd++;
				break;
			} else if (atBegin && SPACES.indexOf(ch) != -1) {
				tokenStart++;
				tokenEnd++;
				ci = input.read();
				ch = (char) ci;
			} else {
				buffer.append(ch);
				atBegin = false;
				tokenEnd++;
				pch = ch;
				ci = input.read();
				ch = (char) ci;
				// two consecutive whitespace characters, such as CR and LF
				if (SPACES.indexOf(ch) != -1 && SPACES.indexOf(pch) != -1) {
					// buffer.append(ch);
					tokenEnd++;
					break;
				}
			}
		}
		if (buffer.length() == 0)
			return false;
		else {
			termAtt.setEmpty().append(buffer);
			offsetAtt.setOffset(correctOffset(tokenStart),
					correctOffset(tokenEnd));
			typeAtt.setType("sentence");
			return true;
		}
	}

	@Override
	public void reset() throws IOException {
		super.reset();
		tokenStart = tokenEnd = 0;
	}

	@Override
	public void end() throws IOException {
		super.end();
		// set final offset
		final int finalOffset = correctOffset(tokenEnd);
		offsetAtt.setOffset(finalOffset, finalOffset);
	}
}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/indices/analysis/JiebaIndicesAnalysis.java:
--------------------------------------------------------------------------------
package org.elasticsearch.indices.analysis;

import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalyzerScope;
import org.elasticsearch.index.analysis.JiebaAnalyzer;
import org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory;

public class JiebaIndicesAnalysis extends AbstractComponent {
	private static final String JIEBA_INDEX = "jieba_index";
	private static final String JIEBA_SEARCH = "jieba_search";
	private static final String JIEBA_OTHER = "jieba_other";

	@Inject
	public JiebaIndicesAnalysis(Settings settings, IndicesAnalysisService indicesAnalysisService, Environment env) {
		super(settings);

		indicesAnalysisService.analyzerProviderFactories().put(JIEBA_INDEX,
				new PreBuiltAnalyzerProviderFactory(JIEBA_INDEX, AnalyzerScope.GLOBAL,
						new JiebaAnalyzer("index", env.pluginsFile().resolve("jieba/dic"), true)));

		indicesAnalysisService.analyzerProviderFactories().put(JIEBA_SEARCH,
				new PreBuiltAnalyzerProviderFactory(JIEBA_SEARCH, AnalyzerScope.GLOBAL,
						new JiebaAnalyzer("search", env.pluginsFile().resolve("jieba/dic"), true)));

		indicesAnalysisService.analyzerProviderFactories().put(JIEBA_OTHER,
				new PreBuiltAnalyzerProviderFactory(JIEBA_OTHER, AnalyzerScope.GLOBAL,
						new JiebaAnalyzer("other", env.pluginsFile().resolve("jieba/dic"), true)));

	}
}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/indices/analysis/JiebaIndicesAnalysisModule.java:
--------------------------------------------------------------------------------
package org.elasticsearch.indices.analysis;

import org.elasticsearch.common.inject.AbstractModule;

/**
 * <p>Title: JiebaIndicesAnalysisModule</p>
 * <p>Description: Guice module that registers {@link JiebaIndicesAnalysis}
 * as an eager singleton, so the prebuilt jieba analyzers are available at
 * node startup.</p>
 * <p>Copyright: Copyright (c) 2016</p>
 * <p>Company: Solvento Soft</p>
 * <p>Created Date: 2016/7/21 4:53 PM</p>
 *
 * @author Rex Chien
 * @version 1.0
 */
public class JiebaIndicesAnalysisModule extends AbstractModule {

	@Override
	protected void configure() {
		bind(JiebaIndicesAnalysis.class).asEagerSingleton();
	}
}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/analysis/jieba/AnalysisJiebaPlugin.java:
--------------------------------------------------------------------------------
package org.elasticsearch.plugin.analysis.jieba;

import org.elasticsearch.common.inject.Module;
import org.elasticsearch.index.analysis.AnalysisModule;
import org.elasticsearch.index.analysis.JiebaAnalysisBinderProcessor;
import org.elasticsearch.indices.analysis.JiebaIndicesAnalysisModule;
import org.elasticsearch.plugins.Plugin;

import java.util.Collection;
import java.util.Collections;

public class AnalysisJiebaPlugin extends Plugin {

	@Override
	public String name() {
		return "analysis-jieba";
	}

	@Override
	public String description() {
		return "jieba analysis";
	}

	@Override
	public Collection<Module> nodeModules() {
		return Collections.<Module>singletonList(new JiebaIndicesAnalysisModule());
	}


	public void onModule(AnalysisModule module) {
		module.addProcessor(new JiebaAnalysisBinderProcessor());
	}
}
--------------------------------------------------------------------------------
/src/main/plugin-metadata/plugin-security.policy:
--------------------------------------------------------------------------------
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

grant {
  permission java.lang.RuntimePermission "getClassLoader";
  permission java.lang.RuntimePermission "setContextClassLoader";
  permission java.io.FilePermission "<<ALL FILES>>", "read,write";
};
--------------------------------------------------------------------------------
/src/main/resources/plugin-descriptor.properties:
--------------------------------------------------------------------------------
# Elasticsearch plugin descriptor file
# This file must exist as 'plugin-descriptor.properties' at
# the root directory of all plugins.
#
# A plugin can be 'site', 'jvm', or both.
#
### example site plugin for "foo":
#
# foo.zip <-- zip file for the plugin, with this structure:
#   _site/ <-- the contents that will be served
#   plugin-descriptor.properties <-- example contents below:
#
# site=true
# description=My cool plugin
# version=1.0
#
### example jvm plugin for "foo"
#
# foo.zip <-- zip file for the plugin, with this structure:
#   <arbitrary name1>.jar <-- classes, resources, dependencies
#   <arbitrary nameN>.jar <-- any number of jars
#   plugin-descriptor.properties <-- example contents below:
#
# jvm=true
# classname=foo.bar.BazPlugin
# description=My cool plugin
# version=2.0.0-rc1
# elasticsearch.version=2.0
# java.version=1.7
#
### mandatory elements for all plugins:
#
# 'description': simple summary of the plugin
description=${project.description}
#
# 'version': plugin's version
version=${project.version}
#
# 'name': the plugin name
name=${elasticsearch.plugin.name}

### mandatory elements for site plugins:
#
# 'site': set to true to indicate contents of the _site/
#  directory in the root of the plugin should be served.
site=${elasticsearch.plugin.site}
#
### mandatory elements for jvm plugins :
#
# 'jvm': true if the 'classname' class should be loaded
#  from jar files in the root directory of the plugin.
#  Note that only jar files in the root directory are
#  added to the classpath for the plugin! If you need
#  other resources, package them into a resources jar.
jvm=${elasticsearch.plugin.jvm}
#
# 'classname': the name of the class to load, fully-qualified.
classname=${elasticsearch.plugin.classname}
#
# 'java.version' version of java the code is built against
# use the system property java.specification.version
# version string must be a sequence of nonnegative decimal integers
# separated by "."'s and may have leading zeros
java.version=${maven.compiler.target}
#
# 'elasticsearch.version' version of elasticsearch compiled against
# You will have to release a new version of the plugin for each new
# elasticsearch release. This version is checked when the plugin
# is loaded so Elasticsearch will refuse to start in the presence of
# plugins with the incorrect elasticsearch.version.
elasticsearch.version=${elasticsearch.version}
#
### deprecated elements for jvm plugins :
#
# 'isolated': true if the plugin should have its own classloader.
# passing false is deprecated, and only intended to support plugins
# that have hard dependencies against each other. If this is
# not specified, then the plugin is isolated by default.
isolated=${elasticsearch.plugin.isolated}
#
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/JiebaAnalyzerTest.java:
--------------------------------------------------------------------------------
package org.elasticsearch.index.analysis;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Path;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.junit.Test;

public class JiebaAnalyzerTest {
	Path dataPath = new File(System.getProperty("basedir"), "src/main/dic").toPath();
	String[] sentences = new String[] {
			"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
			"我不喜欢日本和服。",
			"雷猴回归人间。",
			"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
			"我需要廉租房",
			"永和服装饰品有限公司",
			"我爱北京天安门",
			"abc",
			"隐马尔可夫",
			"雷猴是个好网站",
			"“,”和“SOFTware(软件)”两部分组成",
			"草泥马和欺实马是今年的流行词汇",
			"伊藤洋华堂总府店",
			"中国科学院计算技术研究所",
			"罗密欧与朱丽叶",
			"我购买了道具和服装",
			"PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍",
			"湖北省石首市",
			"湖北省十堰市",
			"总经理完成了这件事情",
			"电脑修好了",
			"做好了这件事情就一了百了了",
			"人们审美的观点是不同的",
			"我们买了一个美的空调",
			"线程初始化时我们要注意",
			"一个分子是由好多原子组织成的",
			"祝你马到功成",
			"他掉进了无底洞里",
			"中国的首都是北京",
			"孙君意",
			"外交部发言人马朝旭",
			"领导人会议和第四届东亚峰会",
			"在过去的这五年",
			"还需要很长的路要走",
			"60周年首都阅兵",
			"你好人们审美的观点是不同的",
			"买水果然后来世博园",
			"买水果然后去世博园",
			"但是后来我才知道你是对的",
			"存在即合理",
			"的的的的的在的的的的就以和和和",
			"I love你,不以为耻,反以为rong",
			"因",
			"",
			"hello你好人们审美的观点是不同的",
			"很好但主要是基于网页形式",
			"hello你好人们审美的观点是不同的",
			"为什么我不能拥有想要的生活",
			"后来我才",
			"此次来中国是为了",
			"使用了它就可以解决一些问题",
			",使用了它就可以解决一些问题",
			"其实使用了它就可以解决一些问题",
			"好人使用了它就可以解决一些问题",
			"是因为和国家",
			"老年搜索还支持",
			"干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ",
			"大", "", "他说的确实在理", "长春市长春节讲话", "结婚的和尚未结婚的", "结合成分子时", "旅游和服务是最好的",
			"这件事情的确是我的错", "供大家参考指正", "哈尔滨政府公布塌桥原因", "我在机场入口处", "邢永臣摄影报道",
			"BP神经网络如何训练才能在分类时增加区分度?", "南京市长江大桥",
			"应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究", "长春市长春药店", "邓颖超生前最喜欢的衣服",
			"胡锦涛是热爱世界和平的政治局常委", "程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪",
			"一次性交多少钱", "两块五一套,三块八一斤,四块七一本,五块六一条", "小和尚留了一个像大和尚一样的和尚头",
			"我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站", "张晓梅去人民医院做了个B超然后去买了件T恤",
			"AT&T是一件不错的公司,给你发offer了吗?", "C++和c#是什么关系?11+122=133,是吗?PI=3.14159",
			"你认识那个和主席握手的的哥吗?他开一辆黑色的士。", "枪杆子中出政权" };

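	// Each test below builds a JiebaAnalyzer directly against src/main/dic
	// (the "basedir" system property is supplied by the Maven build), feeds it
	// raw strings, and prints each emitted term as "term,startOffset,endOffset"
	// for manual inspection; there are no assertions.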
	@Test
	public void test() throws IOException {
		JiebaAnalyzer analyzer = new JiebaAnalyzer("index", dataPath, true);

		for (String sentence : sentences) {
			TokenStream tokenStream = analyzer.tokenStream(null,
					new StringReader(sentence));
			tokenStream.reset();
			while (tokenStream.incrementToken()) {
				CharTermAttribute termAtt = tokenStream
						.getAttribute(CharTermAttribute.class);
				OffsetAttribute offsetAtt = tokenStream
						.getAttribute(OffsetAttribute.class);
				System.out.println(termAtt.toString() + ","
						+ offsetAtt.startOffset() + ","
						+ offsetAtt.endOffset());
			}
			// fully consumed: end() and close() the stream so the analyzer
			// can be reused, per the TokenStream contract
			tokenStream.end();
			tokenStream.close();
		}

		analyzer.close();
	}

	@Test
	public void testSegModeOther() throws IOException {
		// this test exercises the "other" segmentation mode, so build the
		// analyzer with "other" rather than "index"
		JiebaAnalyzer analyzer = new JiebaAnalyzer("other", dataPath, true);

		for (String sentence : sentences) {
			TokenStream tokenStream = analyzer.tokenStream(null,
					new StringReader(sentence));
			tokenStream.reset();
			while (tokenStream.incrementToken()) {
				CharTermAttribute termAtt = tokenStream
						.getAttribute(CharTermAttribute.class);
				OffsetAttribute offsetAtt = tokenStream
						.getAttribute(OffsetAttribute.class);
				System.out.println(termAtt.toString() + ","
						+ offsetAtt.startOffset() + ","
						+ offsetAtt.endOffset());
			}
			tokenStream.end();
			tokenStream.close();
		}

		analyzer.close();
	}

	@Test
	public void testBugSentences() throws IOException {
		String[] bugSentences = new String[] { "干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 " };
		JiebaAnalyzer analyzer = new JiebaAnalyzer("index", dataPath, true);

		for (String sentence : bugSentences) {
			TokenStream tokenStream = analyzer.tokenStream(null,
					new StringReader(sentence));
			tokenStream.reset();
			while (tokenStream.incrementToken()) {
				CharTermAttribute termAtt = tokenStream
						.getAttribute(CharTermAttribute.class);
				OffsetAttribute offsetAtt = tokenStream
						.getAttribute(OffsetAttribute.class);
				System.out.println(termAtt.toString() + ","
						+ offsetAtt.startOffset() + ","
						+ offsetAtt.endOffset());
			}
			tokenStream.end();
			tokenStream.close();
		}

		analyzer.close();
	}

	@Test
	public void testLoadDict() throws IOException {
		JiebaAnalyzer analyzer = new JiebaAnalyzer("index", dataPath, true);

		String[] sentences = new String[] {
				"我剛買了一個 16GB 的 USB 隨身碟",
				"我剛買了一個 16GBUSB 隨身碟",
				"今天有iphone6和nexus5的大拍賣"
		};

		for (String sentence : sentences) {
			TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(sentence));
			tokenStream.reset();
			System.out.println(sentence);
			while (tokenStream.incrementToken()) {
				CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
				OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
				System.out.println(
						termAtt.toString() + "," +
						offsetAtt.startOffset() + "," +
						offsetAtt.endOffset()
				);
			}
			System.out.println();
			tokenStream.end();
			tokenStream.close();
		}

		analyzer.close();
	}
}
--------------------------------------------------------------------------------