├── .gitignore
├── README.md
├── _site
│   └── README.md
├── bin
│   └── build.sh
├── pom.xml
└── src
    ├── main
    │   ├── assemblies
    │   │   └── plugin.xml
    │   ├── dic
    │   │   ├── sougou.dict
    │   │   ├── stopwords.txt
    │   │   └── user.dict
    │   ├── java
    │   │   └── org
    │   │       └── elasticsearch
    │   │           ├── index
    │   │           │   └── analysis
    │   │           │       ├── JiebaAnalysisBinderProcessor.java
    │   │           │       ├── JiebaAnalyzer.java
    │   │           │       ├── JiebaAnalyzerProvider.java
    │   │           │       ├── JiebaTokenFilter.java
    │   │           │       ├── JiebaTokenFilterFactory.java
    │   │           │       ├── OtherTokenizer.java
    │   │           │       └── SentenceTokenizer.java
    │   │           ├── indices
    │   │           │   └── analysis
    │   │           │       ├── JiebaIndicesAnalysis.java
    │   │           │       └── JiebaIndicesAnalysisModule.java
    │   │           └── plugin
    │   │               └── analysis
    │   │                   └── jieba
    │   │                       └── AnalysisJiebaPlugin.java
    │   ├── plugin-metadata
    │   │   └── plugin-security.policy
    │   └── resources
    │       └── plugin-descriptor.properties
    └── test
        └── java
            └── org
                └── elasticsearch
                    └── index
                        └── analysis
                            └── JiebaAnalyzerTest.java
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | .gradle/
3 | *.iml
4 | work/
5 | logs/
6 | .DS_Store
7 | build/
8 | target/
9 |
10 |
11 | ## eclipse ignores (use 'gradle eclipse' to build eclipse projects)
12 | .project
13 | .classpath
14 | .settings
15 | */.project
16 | */.classpath
17 | */.settings
18 | */eclipse-build
19 |
20 | ## netbeans ignores
21 | nb-configuration.xml
22 | nbactions.xml
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Jieba Chinese Word-Segmentation Plugin for ElasticSearch
2 | =========================================================
3 | 
4 | Integrates the Lucene / Jieba Analyzer and supports custom dictionaries.
5 |
6 |
7 | | Jieba Chinese Analysis Plugin | ElasticSearch | jieba-analysis |
8 | |-------------------------------|---------------|----------------|
9 | | 0.0.2 | 1.0.0RC2 | 0.0.2 |
10 | | 0.0.3-SNAPSHOT | 1.3.0 | 1.0.0 |
11 | | 0.0.4 | 1.5.x | 1.0.2 |
12 | | 2.3.3 | 2.3.3 | 1.0.2 |
13 | | 2.3.4 | 2.3.4 | 1.0.2 |
14 | | 2.3.5 | 2.3.5 | 1.0.2 |
15 |
16 |
17 | > This plugin provides a `jieba analyzer`, a `jieba tokenizer`, and a `jieba token filter`, with three modes to choose from (a standalone sketch of the index/search difference follows this list):
18 | 
19 | - index: mainly for indexing; fine-grained segmentation
20 | - search: mainly for query-time analysis; coarse-grained segmentation
21 | - other: converts full-width characters to half-width and upper case to lower case, and splits into single characters
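
> The `index` and `search` modes map onto jieba-analysis `SegMode.INDEX` and `SegMode.SEARCH` (see `JiebaTokenFilter`). A minimal standalone sketch of the difference; `SegModeDemo` is an illustrative name, not a class shipped with the plugin:

```java
import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import com.huaban.analysis.jieba.SegToken;

public class SegModeDemo {
    public static void main(String[] args) {
        JiebaSegmenter segmenter = new JiebaSegmenter();
        String text = "中华人民共和国";

        // INDEX mode: fine-grained tokens, higher recall for indexing
        for (SegToken token : segmenter.process(text, SegMode.INDEX)) {
            System.out.println(token.word + " [" + token.startOffset + "," + token.endOffset + ")");
        }

        // SEARCH mode: coarse-grained tokens, higher precision for queries
        for (SegToken token : segmenter.process(text, SegMode.SEARCH)) {
            System.out.println(token.word + " [" + token.startOffset + "," + token.endOffset + ")");
        }
    }
}
```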
22 |
23 | Installation
24 | ------------
25 |
26 | ## ES 2.x and later
27 |
28 | > The plugin version matches the ES version.
29 |
30 | **2.3.5**
31 | ```sh
32 | ./bin/plugin install https://github.com/huaban/elasticsearch-analysis-jieba/releases/download/v2.3.5/elasticsearch-analysis-jieba-2.3.5-bin.zip
33 | ```
34 |
35 | **2.3.4**
36 | ```sh
37 | ./bin/plugin install https://github.com/huaban/elasticsearch-analysis-jieba/releases/download/v2.3.4/elasticsearch-analysis-jieba-2.3.4-bin.zip
38 | ```
39 |
40 | **2.3.3**
41 | ```sh
42 | ./bin/plugin install https://github.com/huaban/elasticsearch-analysis-jieba/releases/download/v2.3.3/elasticsearch-analysis-jieba-2.3.3-bin.zip
43 | ```
44 |
45 | ## ES versions before 2.x
46 |
47 | > Build and install plugin version 0.0.4 from source.
48 |
49 | ```sh
50 | cd {your_es_path}
51 | mkdir plugins/jieba
52 |
53 | # copy the jars
54 | cp jieba-analysis-1.0.2.jar elasticsearch-analysis-jieba-0.0.4.jar plugins/jieba/
55 | 
56 | # copy the user dictionary
57 | cp -r data/jieba {your_es_path}/config/
58 | ```
59 |
60 | Testing
61 | -------
62 |
63 | ```sh
64 | curl -XPUT 127.0.0.1:9200/test -d '{
65 | "settings" : {
66 | "number_of_shards" : 1,
67 | "number_of_replicas" : 0
68 |
69 | },
70 | "mappings" : {
71 | "test" : {
72 | "_all" : { "enabled" : false },
73 | "properties" : {
74 | "name" : { "type" : "string", "analyzer" : "jieba_index", "search_analyzer" : "jieba_search" }
75 | }
76 | }
77 | }
78 | }';echo
79 |
80 |
81 |
82 | curl 'http://127.0.0.1:9200/test/_analyze?analyzer=jieba_index' -d '中华人民共和国';echo
83 | curl 'http://127.0.0.1:9200/test/_analyze?analyzer=jieba_search' -d '中华人民共和国';echo
84 | curl 'http://127.0.0.1:9200/test/_analyze?analyzer=jieba_other' -d '中华人民共和国 HelLo';echo
85 | ```
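
> The analyzers can also be exercised without a running cluster by driving `JiebaAnalyzer` directly, the same way the unit test in `src/test` does. A minimal sketch, assuming it is run from the project root so that `src/main/dic` resolves; `AnalyzeDemo` is an illustrative name:

```java
import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.index.analysis.JiebaAnalyzer;

public class AnalyzeDemo {
    public static void main(String[] args) throws IOException {
        // "index" selects fine-grained segmentation; true enables the stopword filter
        JiebaAnalyzer analyzer = new JiebaAnalyzer("index", Paths.get("src/main/dic"), true);

        TokenStream stream = analyzer.tokenStream(null, "中华人民共和国");
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString());
        }
        stream.end();
        stream.close();
        analyzer.close();
    }
}
```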
86 |
87 | How to publish a release
88 | ------
89 |
90 |
91 | ```
92 | github-release release \
93 | --user huaban \
94 | --repo elasticsearch-analysis-jieba \
95 | --tag v2.3.5 \
96 | --name "v2.3.5" \
97 | --description "Supports ES v2.3.5"
98 |
99 | github-release upload \
100 | --user huaban \
101 | --repo elasticsearch-analysis-jieba \
102 | --tag v2.3.5 \
103 | --name "elasticsearch-analysis-jieba-2.3.5-bin.zip" \
104 | --label "plugin.zip" \
105 | --file target/releases/elasticsearch-analysis-jieba-2.3.5-bin.zip
106 | ```
107 |
108 |
109 | Donations
110 | =========
111 | 
112 | **A braised-chicken meal**
113 | 
114 | *(payment QR code)*
115 | 
116 | **Buy me a drink**
117 | 
118 | *(payment QR code)*
119 | 
120 | **Or any amount you like**
121 | 
122 | *(payment QR code)*
123 |
124 |
125 | License
126 | -------
127 |
128 | ```
129 | This software is licensed under the Apache 2 license, quoted below.
130 |
131 | Copyright (C) 2013 libin and Huaban Inc
132 |
133 | Licensed under the Apache License, Version 2.0 (the "License"); you may not
134 | use this file except in compliance with the License. You may obtain a copy of
135 | the License at
136 |
137 | http://www.apache.org/licenses/LICENSE-2.0
138 |
139 | Unless required by applicable law or agreed to in writing, software
140 | distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
141 | WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
142 | License for the specific language governing permissions and limitations under
143 | the License.
144 | ```
145 |
--------------------------------------------------------------------------------
/_site/README.md:
--------------------------------------------------------------------------------
1 | Must a plugin have a _site directory?
--------------------------------------------------------------------------------
/bin/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | ROOT=$(dirname "$0")
4 | cd "$ROOT/.." || exit 1
5 | mvn package install -DcreateChecksum=true -DskipTests
6 |
7 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |   <modelVersion>4.0.0</modelVersion>
6 |   <groupId>org.elasticsearch</groupId>
7 |   <artifactId>elasticsearch-analysis-jieba</artifactId>
8 |   <version>2.3.5</version>
9 |   <packaging>jar</packaging>
10 |   <name>elasticsearch-analysis-jieba</name>
11 |   <url>http://maven.apache.org</url>
12 | 
13 |   <licenses>
14 |     <license>
15 |       <name>The Apache Software License, Version 2.0</name>
16 |       <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
17 |       <distribution>repo</distribution>
18 |     </license>
19 |   </licenses>
20 | 
21 |   <properties>
22 |     <elasticsearch.version>2.3.5</elasticsearch.version>
23 |     <maven.compiler.target>1.7</maven.compiler.target>
24 |     <elasticsearch.assembly.descriptor>${project.basedir}/src/main/assemblies/plugin.xml</elasticsearch.assembly.descriptor>
25 |     <elasticsearch.plugin.name>jieba</elasticsearch.plugin.name>
26 |     <elasticsearch.plugin.classname>org.elasticsearch.plugin.analysis.jieba.AnalysisJiebaPlugin</elasticsearch.plugin.classname>
27 |     <elasticsearch.plugin.jvm>true</elasticsearch.plugin.jvm>
28 |     <elasticsearch.plugin.site>false</elasticsearch.plugin.site>
29 |     <elasticsearch.plugin.isolated>true</elasticsearch.plugin.isolated>
30 |   </properties>
31 | 
32 |   <dependencies>
33 |     <dependency>
34 |       <groupId>org.elasticsearch</groupId>
35 |       <artifactId>elasticsearch</artifactId>
36 |       <version>${elasticsearch.version}</version>
37 |     </dependency>
38 |     <dependency>
39 |       <groupId>com.huaban</groupId>
40 |       <artifactId>jieba-analysis</artifactId>
41 |       <version>1.0.2</version>
42 |     </dependency>
43 |     <dependency>
44 |       <groupId>junit</groupId>
45 |       <artifactId>junit</artifactId>
46 |       <version>4.12</version>
47 |     </dependency>
48 |   </dependencies>
49 | 
50 |   <build>
51 |     <plugins>
52 |       <plugin>
53 |         <groupId>org.apache.maven.plugins</groupId>
54 |         <artifactId>maven-compiler-plugin</artifactId>
55 |         <configuration>
56 |           <source>1.7</source>
57 |           <target>1.7</target>
58 |         </configuration>
59 |       </plugin>
60 |       <plugin>
61 |         <groupId>org.apache.maven.plugins</groupId>
62 |         <artifactId>maven-source-plugin</artifactId>
63 |         <executions>
64 |           <execution>
65 |             <id>attach-sources</id>
66 |             <goals>
67 |               <goal>jar</goal>
68 |             </goals>
69 |           </execution>
70 |         </executions>
71 |       </plugin>
72 |       <plugin>
73 |         <artifactId>maven-assembly-plugin</artifactId>
74 |         <configuration>
75 |           <outputDirectory>${project.build.directory}/releases/</outputDirectory>
76 |           <descriptors>
77 |             <descriptor>${basedir}/src/main/assemblies/plugin.xml</descriptor>
78 |           </descriptors>
79 |         </configuration>
80 |         <executions>
81 |           <execution>
82 |             <phase>package</phase>
83 |             <goals>
84 |               <goal>single</goal>
85 |             </goals>
86 |           </execution>
87 |         </executions>
88 |       </plugin>
89 |       <plugin>
90 |         <groupId>com.carrotsearch.randomizedtesting</groupId>
91 |         <artifactId>junit4-maven-plugin</artifactId>
92 |         <configuration>
93 |           <!-- run forked test JVMs from the project base directory -->
94 |           <dir>${basedir}</dir>
95 |         </configuration>
96 |       </plugin>
97 |     </plugins>
98 |   </build>
99 | </project>
--------------------------------------------------------------------------------
/src/main/assemblies/plugin.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <assembly>
3 |   <id>bin</id>
4 |   <formats>
5 |     <format>zip</format>
6 |   </formats>
7 |   <includeBaseDirectory>false</includeBaseDirectory>
8 |   <fileSets>
9 |     <fileSet>
10 |       <directory>${project.basedir}/src/main/dic</directory>
11 |       <outputDirectory>/dic</outputDirectory>
12 |     </fileSet>
13 |     <fileSet>
14 |       <directory>${project.basedir}/src/main/plugin-metadata</directory>
15 |       <outputDirectory>/</outputDirectory>
16 |     </fileSet>
17 |     <fileSet>
18 |       <directory>${project.basedir}/_site/</directory>
19 |       <outputDirectory>_site/</outputDirectory>
20 |     </fileSet>
21 |   </fileSets>
22 |   <files>
23 |     <file>
24 |       <source>${project.basedir}/src/main/resources/plugin-descriptor.properties</source>
25 |       <outputDirectory></outputDirectory>
26 |       <filtered>true</filtered>
27 |     </file>
28 |   </files>
29 |   <dependencySets>
30 |     <dependencySet>
31 |       <outputDirectory>/</outputDirectory>
32 |       <useProjectArtifact>true</useProjectArtifact>
33 |       <useTransitiveFiltering>true</useTransitiveFiltering>
34 |       <excludes>
35 |         <exclude>org.elasticsearch:elasticsearch</exclude>
36 |         <exclude>org.apache.lucene:lucene*</exclude>
37 |         <exclude>commons-lang*</exclude>
38 |       </excludes>
39 |     </dependencySet>
40 |     <dependencySet>
41 |       <outputDirectory>/</outputDirectory>
42 |       <useProjectArtifact>true</useProjectArtifact>
43 |       <useTransitiveFiltering>true</useTransitiveFiltering>
44 |       <includes>
45 |         <include>com.huaban:jieba-analysis</include>
46 |       </includes>
47 |     </dependencySet>
48 |   </dependencySets>
49 | </assembly>
--------------------------------------------------------------------------------
/src/main/dic/stopwords.txt:
--------------------------------------------------------------------------------
1 | ////////// Punctuation tokens to remove ////////////////
2 | ,
3 | .
4 | `
5 | -
6 | _
7 | =
8 | ?
9 | '
10 | |
11 | "
12 | (
13 | )
14 | {
15 | }
16 | [
17 | ]
18 | <
19 | >
20 | *
21 | #
22 | &
23 | ^
24 | $
25 | @
26 | !
27 | ~
28 | :
29 | ;
30 | +
31 | /
32 | \
33 | 《
34 | 》
35 | —
36 | -
37 | ,
38 | 。
39 | 、
40 | :
41 | ;
42 | !
43 | ·
44 | ?
45 | “
46 | ”
47 | )
48 | (
49 | 【
50 | 】
51 | [
52 | ]
53 | ●
54 | 請
55 | 還
56 | 是
57 | 的
58 | 惹
59 | 但
60 | 從
61 | 到
62 | 和
63 | 給
64 | 或
65 | 在
66 | 有
67 | 又
68 | 了
69 | 將
70 | 什
71 | 麼
72 | 雖
73 | 嗎
74 | 嘛
75 | 啊
76 | 呢
77 | 哈
78 | 呵
79 | 噢
80 | 哦
81 | 嗯
82 | 吧
83 | 哎
84 | 喲
85 | 呀
86 | 唉
87 | 啦
88 | 唄
89 | 兮
90 | 乎
91 | 矣
92 | 哉
93 | 就
94 | 這
95 | 那
96 | 他
97 | 她
98 | 它
99 | 們
100 | 你
101 | 您
102 | 我
103 | 得
104 | 很
105 | !
106 | [
107 | ]
108 | {
109 | }
110 | (
111 | )
112 | &
113 | %
114 | $
115 | #
116 | "
117 | '
118 | @
119 | `
120 | ~
121 | <
122 | >
123 | 也
124 | 了
125 | 仍
126 | 从
127 | 以
128 | 使
129 | 则
130 | 却
131 | 又
132 | 及
133 | 对
134 | 就
135 | 并
136 | 很
137 | 或
138 | 把
139 | 的
140 | 着
141 | 给
142 | 而
143 | 被
144 | 让
145 | 在
146 | 还
147 | 比
148 | 等
149 | 当
150 | 与
151 | 于
152 | 但
153 | // the line below contains an IDEOGRAPHIC SPACE character (Used as a space in Chinese)
154 |
155 |
156 | //////////////// English Stop Words ////////////////
157 |
158 | //////////////// Chinese Stop Words ////////////////
159 |
--------------------------------------------------------------------------------
/src/main/dic/user.dict:
--------------------------------------------------------------------------------
1 | 小清新 3
2 | 百搭 3
3 | 显瘦 3
4 | 隨身碟 100
5 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/JiebaAnalysisBinderProcessor.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
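/**
 * Registers the "jieba" analyzer provider and the "jieba" token filter
 * factory with each index's analysis module.
 */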
3 | public class JiebaAnalysisBinderProcessor extends
4 | AnalysisModule.AnalysisBinderProcessor {
5 |
6 | @Override
7 | public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
8 | tokenFiltersBindings.processTokenFilter("jieba",
9 | JiebaTokenFilterFactory.class);
10 | super.processTokenFilters(tokenFiltersBindings);
11 | }
12 |
13 | @Override
14 | public void processAnalyzers(AnalyzersBindings analyzersBindings) {
15 | analyzersBindings.processAnalyzer("jieba", JiebaAnalyzerProvider.class);
16 | super.processAnalyzers(analyzersBindings);
17 | }
18 |
19 | }
20 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/JiebaAnalyzer.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import java.io.IOException;
4 | import java.nio.charset.StandardCharsets;
5 | import java.nio.file.Files;
6 | import java.nio.file.Path;
7 |
8 | import org.apache.lucene.analysis.Analyzer;
9 | import org.apache.lucene.analysis.TokenStream;
10 | import org.apache.lucene.analysis.Tokenizer;
11 | import org.apache.lucene.analysis.core.StopFilter;
12 | import org.apache.lucene.analysis.util.CharArraySet;
13 | import org.apache.lucene.analysis.util.WordlistLoader;
14 | import org.apache.lucene.util.IOUtils;
15 | import org.elasticsearch.common.logging.ESLogger;
16 | import org.elasticsearch.common.logging.Loggers;
17 | import org.elasticsearch.common.settings.Settings;
18 | import org.elasticsearch.env.Environment;
19 |
20 | import com.huaban.analysis.jieba.WordDictionary;
21 |
22 | public class JiebaAnalyzer extends Analyzer {
23 | private final ESLogger log = Loggers.getLogger(JiebaAnalyzer.class);
24 |
25 | private final CharArraySet stopWords;
26 |
27 | private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
28 |
29 | private static final String STOPWORD_FILE_COMMENT = "//";
30 |
31 | /**
32 | * Returns an unmodifiable instance of the default stop-words set.
33 | *
34 | * @return an unmodifiable instance of the default stop-words set.
35 | */
36 | public static CharArraySet getDefaultStopSet() {
37 | return DefaultSetHolder.DEFAULT_STOP_SET;
38 | }
39 |
40 | /**
41 | * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer
42 | * class accesses the static final set the first time.
43 | */
44 | private static class DefaultSetHolder {
45 | static final CharArraySet DEFAULT_STOP_SET;
46 |
47 | static {
48 | try {
49 | DEFAULT_STOP_SET = loadDefaultStopWordSet();
50 | } catch (IOException ex) {
51 | // default set should always be present as it is part of the
52 | // distribution (JAR)
53 | throw new RuntimeException(
54 | "Unable to load default stopword set", ex);
55 | }
56 | }
57 |
58 | static CharArraySet loadDefaultStopWordSet() throws IOException {
59 | // make sure it is unmodifiable as we expose it in the outer class
60 | return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(
61 | IOUtils.getDecodingReader(JiebaAnalyzer.class,
62 | DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8),
63 | STOPWORD_FILE_COMMENT));
64 | }
65 | }
66 |
67 | private String type;
68 |
69 | private CharArraySet loadStopWords(Path dataPath) {
70 | try { // stopwords.txt is UTF-8; read it with an explicit charset
71 | return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(
72 | Files.newBufferedReader(dataPath.resolve(DEFAULT_STOPWORD_FILE), StandardCharsets.UTF_8), STOPWORD_FILE_COMMENT));
73 | } catch (IOException e) {
74 | return DefaultSetHolder.DEFAULT_STOP_SET;
75 | }
76 | }
77 |
78 | public JiebaAnalyzer(Settings settings, Environment env) {
79 | this(settings.get("seg_mode", "index"), env.pluginsFile().resolve("jieba/dic"),
80 | settings.getAsBoolean("stop", true));
81 | }
82 |
83 | public JiebaAnalyzer(String segMode, Path dataPath, boolean isStop) {
84 | super();
85 |
86 | this.type = segMode;
87 | WordDictionary.getInstance().init(dataPath);
88 | this.stopWords = isStop ? this.loadStopWords(dataPath)
89 | : CharArraySet.EMPTY_SET;
90 |
91 | this.log.info("Jieba segMode = {}", type);
92 | this.log.info("JiebaAnalyzer isStop = {}", isStop);
93 | this.log.info("JiebaAnalyzer stopWords = {}", this.stopWords.toString());
94 | }
95 |
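// Analysis chain: tokenizer -> JiebaTokenFilter -> StopFilter.
// "other" mode uses the pass-through OtherTokenizer and skips stopword removal.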
96 | @Override
97 | protected TokenStreamComponents createComponents(String fieldName) {
98 | Tokenizer tokenizer;
99 | if (type.equals("other")) {
100 | tokenizer = new OtherTokenizer();
101 | } else {
102 | tokenizer = new SentenceTokenizer();
103 | }
104 | TokenStream result = new JiebaTokenFilter(type, tokenizer);
105 | if (!type.equals("other") && !stopWords.isEmpty()) {
106 | result = new StopFilter(result, stopWords);
107 | }
108 | return new TokenStreamComponents(tokenizer, result);
109 | }
110 | }
111 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/JiebaAnalyzerProvider.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import org.elasticsearch.common.inject.Inject;
4 | import org.elasticsearch.common.inject.assistedinject.Assisted;
5 | import org.elasticsearch.common.settings.Settings;
6 | import org.elasticsearch.env.Environment;
7 | import org.elasticsearch.index.Index;
8 | import org.elasticsearch.index.settings.IndexSettingsService;
9 |
10 | public class JiebaAnalyzerProvider extends
11 | AbstractIndexAnalyzerProvider<JiebaAnalyzer> {
12 | private final JiebaAnalyzer analyzer;
13 |
14 | @Inject
15 | public JiebaAnalyzerProvider(Index index, IndexSettingsService indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
16 | super(index, indexSettings.getSettings(), name, settings);
17 | analyzer = new JiebaAnalyzer(settings, env);
18 | }
19 |
20 | @Override
21 | public JiebaAnalyzer get() {
22 | return this.analyzer;
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/JiebaTokenFilter.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import java.io.IOException;
4 | import java.util.ArrayList;
5 | import java.util.Iterator;
6 | import java.util.List;
7 |
8 | import org.apache.lucene.analysis.TokenFilter;
9 | import org.apache.lucene.analysis.TokenStream;
10 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
11 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
12 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
13 |
14 | import com.huaban.analysis.jieba.JiebaSegmenter;
15 | import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
16 | import com.huaban.analysis.jieba.SegToken;
17 |
18 | public final class JiebaTokenFilter extends TokenFilter {
19 |
20 | JiebaSegmenter segmenter;
21 |
22 | private Iterator<SegToken> tokenIter;
23 | private List<SegToken> array;
24 | private String type;
25 |
26 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
27 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
28 | private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
29 |
30 | public JiebaTokenFilter(String type, TokenStream input) {
31 | super(input);
32 | this.type = type;
33 | segmenter = new JiebaSegmenter();
34 | }
35 |
36 | @Override
37 | public boolean incrementToken() throws IOException {
38 | if (tokenIter == null || !tokenIter.hasNext()) {
39 | if (input.incrementToken()) {
40 | if (type.equals("index"))
41 | array = segmenter
42 | .process(termAtt.toString(), SegMode.INDEX);
43 | else if (type.equals("other")) {
44 | array = new ArrayList<SegToken>();
45 | String token = termAtt.toString();
46 | char[] ctoken = token.toCharArray();
47 | for (int i = 0; i < ctoken.length; i++) {
48 | /* full-width => half-width: U+FF01..U+FF5E map to ASCII by subtracting 0xFEE0 */
49 | if (ctoken[i] > 0xFF00 && ctoken[i] < 0xFF5F)
50 | ctoken[i] = (char) (ctoken[i] - 0xFEE0);
51 |
52 | /* upper case => lower case: 'A'..'Z' shifted down by 0x20 */
53 | if (ctoken[i] > 0x40 && ctoken[i] < 0x5b)
54 | ctoken[i] = (char) (ctoken[i] + 0x20);
55 | }
56 | token = String.valueOf(ctoken);
57 | array.add(new SegToken(token, 0, token.length()));
58 | } else
59 | array = segmenter.process(termAtt.toString(),
60 | SegMode.SEARCH);
61 | tokenIter = array.iterator();
62 | if (!tokenIter.hasNext())
63 | return false;
64 | } else {
65 | return false; // no more sentences, end of stream!
66 | }
67 | }
68 | // This filter creates new tokens, so it must clear the inherited attributes.
69 | clearAttributes();
70 |
71 | SegToken token = tokenIter.next();
72 | offsetAtt.setOffset(token.startOffset, token.endOffset);
73 | String tokenString = token.word;
74 | termAtt.copyBuffer(tokenString.toCharArray(), 0, tokenString.length());
75 | typeAtt.setType("word");
76 | return true;
77 | }
78 |
79 | @Override
80 | public void reset() throws IOException {
81 | super.reset();
82 | tokenIter = null;
83 | }
84 |
85 | }
86 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/JiebaTokenFilterFactory.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import org.apache.lucene.analysis.TokenStream;
4 | import org.elasticsearch.common.inject.Inject;
5 | import org.elasticsearch.common.inject.assistedinject.Assisted;
6 | import org.elasticsearch.common.settings.Settings;
7 | import org.elasticsearch.env.Environment;
8 | import org.elasticsearch.index.Index;
9 | import org.elasticsearch.index.settings.IndexSettingsService;
10 |
11 | import com.huaban.analysis.jieba.WordDictionary;
12 |
13 | public class JiebaTokenFilterFactory extends AbstractTokenFilterFactory {
14 | private String type;
15 |
16 | @Inject
17 | public JiebaTokenFilterFactory(Index index,
18 | IndexSettingsService indexSettings, @Assisted String name,
19 | @Assisted Settings settings) {
20 | super(index, indexSettings.getSettings(), name, settings);
21 | type = settings.get("seg_mode", "index");
22 | Environment env = new Environment(indexSettings.getSettings());
23 | WordDictionary.getInstance().init(env.pluginsFile().resolve("jieba/dic"));
24 | }
25 |
26 | @Override
27 | public TokenStream create(TokenStream input) {
28 | return new JiebaTokenFilter(type, input);
29 | }
30 |
31 | }
32 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/OtherTokenizer.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 |
4 | import org.apache.lucene.analysis.Tokenizer;
5 | import org.apache.lucene.analysis.util.CharTokenizer;
6 | import org.apache.lucene.util.AttributeFactory;
8 |
9 | /**
10 | * An OtherTokenizer performs no segmentation: every character counts as a
11 | * token character, so the entire input is emitted as a single token, which
12 | * JiebaTokenFilter then normalizes in "other" mode (full-width to
13 | * half-width, upper case to lower case).
14 | *
15 | * As of Lucene 3.1, {@link CharTokenizer} uses an int-based API to normalize
16 | * and detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
17 | * {@link CharTokenizer#normalize(int)} for details.
18 | */
21 |
22 | public class OtherTokenizer extends CharTokenizer {
23 |
24 | /**
25 | * Construct a new OtherTokenizer.
26 | */
27 | public OtherTokenizer() {
28 | super();
29 | }
30 |
31 | /**
32 | * Construct a new OtherTokenizer using a given
33 | * {@link org.apache.lucene.util.AttributeFactory}.
34 | *
35 | * @param factory
36 | * the attribute factory to use for this {@link Tokenizer}
37 | */
38 | public OtherTokenizer(AttributeFactory factory) {
39 | super(factory);
40 | }
41 |
42 | /**
43 | * Accepts every character; the whole input becomes a single token.
44 | */
45 | @Override
46 | protected boolean isTokenChar(int c) {
47 | return true;
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/SentenceTokenizer.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.lucene.analysis.Tokenizer;
6 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
7 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
8 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
9 | import org.apache.lucene.util.AttributeFactory;
10 |
11 | public final class SentenceTokenizer extends Tokenizer {
12 |
13 | /**
14 | * End-of-sentence punctuation: 。,!?;,!?;
15 | */
16 | private final static String PUNCTUATION = "。,!?;,!?;";
17 | private final static String SPACES = " \t\r\n";
18 |
19 | private final StringBuilder buffer = new StringBuilder();
20 |
21 | private int tokenStart = 0, tokenEnd = 0;
22 |
23 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
24 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
25 | private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
26 |
27 | public SentenceTokenizer() {
28 | super();
29 | }
30 |
31 | public SentenceTokenizer(AttributeFactory factory) {
32 | super(factory);
33 | }
34 |
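/**
 * Returns the next "sentence": leading whitespace is skipped, then characters
 * are buffered until end-of-sentence punctuation or two consecutive
 * whitespace characters (e.g. CR followed by LF) are seen. tokenStart and
 * tokenEnd track the character offsets consumed from the input.
 */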
35 | @Override
36 | public boolean incrementToken() throws IOException {
37 | clearAttributes();
38 | buffer.setLength(0);
39 | int ci;
40 | char ch, pch;
41 | boolean atBegin = true;
42 | tokenStart = tokenEnd;
43 | ci = input.read();
44 | ch = (char) ci;
45 |
46 | while (true) {
47 | if (ci == -1) {
48 | break;
49 | } else if (PUNCTUATION.indexOf(ch) != -1) {
50 | // End of a sentence
51 | buffer.append(ch);
52 | tokenEnd++;
53 | break;
54 | } else if (atBegin && SPACES.indexOf(ch) != -1) {
55 | tokenStart++;
56 | tokenEnd++;
57 | ci = input.read();
58 | ch = (char) ci;
59 | } else {
60 | buffer.append(ch);
61 | atBegin = false;
62 | tokenEnd++;
63 | pch = ch;
64 | ci = input.read();
65 | ch = (char) ci;
66 | // two consecutive whitespace characters, e.g. CR followed by LF
67 | if (SPACES.indexOf(ch) != -1 && SPACES.indexOf(pch) != -1) {
68 | // buffer.append(ch);
69 | tokenEnd++;
70 | break;
71 | }
72 | }
73 | }
74 | if (buffer.length() == 0)
75 | return false;
76 | else {
77 | termAtt.setEmpty().append(buffer);
78 | offsetAtt.setOffset(correctOffset(tokenStart),
79 | correctOffset(tokenEnd));
80 | typeAtt.setType("sentence");
81 | return true;
82 | }
83 | }
84 |
85 | @Override
86 | public void reset() throws IOException {
87 | super.reset();
88 | tokenStart = tokenEnd = 0;
89 | }
90 |
91 | @Override
92 | public void end() throws IOException {
93 | super.end(); // set final offset
94 | final int finalOffset = correctOffset(tokenEnd);
95 | offsetAtt.setOffset(finalOffset, finalOffset);
96 | }
97 | }
98 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/indices/analysis/JiebaIndicesAnalysis.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.indices.analysis;
2 |
3 | import org.elasticsearch.common.component.AbstractComponent;
4 | import org.elasticsearch.common.inject.Inject;
5 | import org.elasticsearch.common.settings.Settings;
6 | import org.elasticsearch.env.Environment;
7 | import org.elasticsearch.index.analysis.AnalyzerScope;
8 | import org.elasticsearch.index.analysis.JiebaAnalyzer;
9 | import org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory;
10 |
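/**
 * Registers the prebuilt jieba_index, jieba_search and jieba_other analyzers
 * at node startup, making them available to every index.
 */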
11 | public class JiebaIndicesAnalysis extends AbstractComponent {
12 | private static final String JIEBA_INDEX = "jieba_index";
13 | private static final String JIEBA_SEARCH = "jieba_search";
14 | private static final String JIEBA_OTHER = "jieba_other";
15 |
16 | @Inject
17 | public JiebaIndicesAnalysis(Settings settings, IndicesAnalysisService indicesAnalysisService, Environment env) {
18 | super(settings);
19 |
20 | indicesAnalysisService.analyzerProviderFactories().put(JIEBA_INDEX,
21 | new PreBuiltAnalyzerProviderFactory(JIEBA_INDEX, AnalyzerScope.GLOBAL,
22 | new JiebaAnalyzer("index", env.pluginsFile().resolve("jieba/dic"), true)));
23 |
24 | indicesAnalysisService.analyzerProviderFactories().put(JIEBA_SEARCH,
25 | new PreBuiltAnalyzerProviderFactory(JIEBA_SEARCH, AnalyzerScope.GLOBAL,
26 | new JiebaAnalyzer("search", env.pluginsFile().resolve("jieba/dic"), true)));
27 |
28 | indicesAnalysisService.analyzerProviderFactories().put(JIEBA_OTHER,
29 | new PreBuiltAnalyzerProviderFactory(JIEBA_OTHER, AnalyzerScope.GLOBAL,
30 | new JiebaAnalyzer("other", env.pluginsFile().resolve("jieba/dic"), true)));
31 |
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/indices/analysis/JiebaIndicesAnalysisModule.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.indices.analysis;
2 |
3 | import org.elasticsearch.common.inject.AbstractModule;
4 |
5 | /**
6 | * Title: JiebaIndicesAnalysisModule
7 | * Description:
8 | * Copyright: Copyright (c) 2016
9 | * Company: Solvento Soft
10 | * Created Date: 2016/7/21 下午4:53
11 | *
12 | * @author Rex Chien
13 | * @version 1.0
14 | */
15 | public class JiebaIndicesAnalysisModule extends AbstractModule {
16 |
17 | @Override
18 | protected void configure() {
19 | bind(JiebaIndicesAnalysis.class).asEagerSingleton();
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/analysis/jieba/AnalysisJiebaPlugin.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.analysis.jieba;
2 |
3 | import org.elasticsearch.common.inject.Module;
4 | import org.elasticsearch.index.analysis.AnalysisModule;
5 | import org.elasticsearch.index.analysis.JiebaAnalysisBinderProcessor;
6 | import org.elasticsearch.indices.analysis.JiebaIndicesAnalysisModule;
7 | import org.elasticsearch.plugins.Plugin;
8 |
9 | import java.util.Collection;
10 | import java.util.Collections;
11 |
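/**
 * Plugin entry point: contributes the node-level JiebaIndicesAnalysisModule
 * and registers JiebaAnalysisBinderProcessor with each index's AnalysisModule.
 */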
12 | public class AnalysisJiebaPlugin extends Plugin {
13 |
14 | @Override
15 | public String name() {
16 | return "analysis-jieba";
17 | }
18 |
19 | @Override
20 | public String description() {
21 | return "jieba analysis";
22 | }
23 |
24 | @Override
25 | public Collection<Module> nodeModules() {
26 | return Collections.<Module>singletonList(new JiebaIndicesAnalysisModule());
27 | }
28 |
29 |
30 | public void onModule(AnalysisModule module) {
31 | module.addProcessor(new JiebaAnalysisBinderProcessor());
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/src/main/plugin-metadata/plugin-security.policy:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Elasticsearch under one or more contributor
3 | * license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright
5 | * ownership. Elasticsearch licenses this file to you under
6 | * the Apache License, Version 2.0 (the "License"); you may
7 | * not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing,
13 | * software distributed under the License is distributed on an
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | * KIND, either express or implied. See the License for the
16 | * specific language governing permissions and limitations
17 | * under the License.
18 | */
19 |
20 | grant {
21 | permission java.lang.RuntimePermission "getClassLoader";
22 | permission java.lang.RuntimePermission "setContextClassLoader";
23 | permission java.io.FilePermission "<<ALL FILES>>", "read,write";
24 | };
25 |
--------------------------------------------------------------------------------
/src/main/resources/plugin-descriptor.properties:
--------------------------------------------------------------------------------
1 | # Elasticsearch plugin descriptor file
2 | # This file must exist as 'plugin-descriptor.properties' at
3 | # the root directory of all plugins.
4 | #
5 | # A plugin can be 'site', 'jvm', or both.
6 | #
7 | ### example site plugin for "foo":
8 | #
9 | # foo.zip <-- zip file for the plugin, with this structure:
10 | # _site/ <-- the contents that will be served
11 | # plugin-descriptor.properties <-- example contents below:
12 | #
13 | # site=true
14 | # description=My cool plugin
15 | # version=1.0
16 | #
17 | ### example jvm plugin for "foo"
18 | #
19 | # foo.zip <-- zip file for the plugin, with this structure:
20 | # <arbitrary name1>.jar <-- classes, resources, dependencies
21 | # <arbitrary nameN>.jar <-- any number of jars
22 | # plugin-descriptor.properties <-- example contents below:
23 | #
24 | # jvm=true
25 | # classname=foo.bar.BazPlugin
26 | # description=My cool plugin
27 | # version=2.0.0-rc1
28 | # elasticsearch.version=2.0
29 | # java.version=1.7
30 | #
31 | ### mandatory elements for all plugins:
32 | #
33 | # 'description': simple summary of the plugin
34 | description=${project.description}
35 | #
36 | # 'version': plugin's version
37 | version=${project.version}
38 | #
39 | # 'name': the plugin name
40 | name=${elasticsearch.plugin.name}
41 |
42 | ### mandatory elements for site plugins:
43 | #
44 | # 'site': set to true to indicate contents of the _site/
45 | # directory in the root of the plugin should be served.
46 | site=${elasticsearch.plugin.site}
47 | #
48 | ### mandatory elements for jvm plugins :
49 | #
50 | # 'jvm': true if the 'classname' class should be loaded
51 | # from jar files in the root directory of the plugin.
52 | # Note that only jar files in the root directory are
53 | # added to the classpath for the plugin! If you need
54 | # other resources, package them into a resources jar.
55 | jvm=${elasticsearch.plugin.jvm}
56 | #
57 | # 'classname': the name of the class to load, fully-qualified.
58 | classname=${elasticsearch.plugin.classname}
59 | #
60 | # 'java.version' version of java the code is built against
61 | # use the system property java.specification.version
62 | # version string must be a sequence of nonnegative decimal integers
63 | # separated by "."'s and may have leading zeros
64 | java.version=${maven.compiler.target}
65 | #
66 | # 'elasticsearch.version' version of elasticsearch compiled against
67 | # You will have to release a new version of the plugin for each new
68 | # elasticsearch release. This version is checked when the plugin
69 | # is loaded so Elasticsearch will refuse to start in the presence of
70 | # plugins with the incorrect elasticsearch.version.
71 | elasticsearch.version=${elasticsearch.version}
72 | #
73 | ### deprecated elements for jvm plugins :
74 | #
75 | # 'isolated': true if the plugin should have its own classloader.
76 | # passing false is deprecated, and only intended to support plugins
77 | # that have hard dependencies against each other. If this is
78 | # not specified, then the plugin is isolated by default.
79 | isolated=${elasticsearch.plugin.isolated}
80 | #
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/JiebaAnalyzerTest.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.io.StringReader;
6 | import java.nio.file.Path;
7 |
8 | import org.apache.lucene.analysis.TokenStream;
9 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
10 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
11 | import org.junit.Test;
12 |
13 | public class JiebaAnalyzerTest {
14 | Path dataPath = new File(System.getProperty("basedir"), "src/main/dic").toPath();
15 | String[] sentences = new String[] {
16 | "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
17 | "我不喜欢日本和服。",
18 | "雷猴回归人间。",
19 | "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
20 | "我需要廉租房",
21 | "永和服装饰品有限公司",
22 | "我爱北京天安门",
23 | "abc",
24 | "隐马尔可夫",
25 | "雷猴是个好网站",
26 | "“,”和“SOFTware(软件)”两部分组成",
27 | "草泥马和欺实马是今年的流行词汇",
28 | "伊藤洋华堂总府店",
29 | "中国科学院计算技术研究所",
30 | "罗密欧与朱丽叶",
31 | "我购买了道具和服装",
32 | "PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍",
33 | "湖北省石首市",
34 | "湖北省十堰市",
35 | "总经理完成了这件事情",
36 | "电脑修好了",
37 | "做好了这件事情就一了百了了",
38 | "人们审美的观点是不同的",
39 | "我们买了一个美的空调",
40 | "线程初始化时我们要注意",
41 | "一个分子是由好多原子组织成的",
42 | "祝你马到功成",
43 | "他掉进了无底洞里",
44 | "中国的首都是北京",
45 | "孙君意",
46 | "外交部发言人马朝旭",
47 | "领导人会议和第四届东亚峰会",
48 | "在过去的这五年",
49 | "还需要很长的路要走",
50 | "60周年首都阅兵",
51 | "你好人们审美的观点是不同的",
52 | "买水果然后来世博园",
53 | "买水果然后去世博园",
54 | "但是后来我才知道你是对的",
55 | "存在即合理",
56 | "的的的的的在的的的的就以和和和",
57 | "I love你,不以为耻,反以为rong",
58 | "因",
59 | "",
60 | "hello你好人们审美的观点是不同的",
61 | "很好但主要是基于网页形式",
62 | "hello你好人们审美的观点是不同的",
63 | "为什么我不能拥有想要的生活",
64 | "后来我才",
65 | "此次来中国是为了",
66 | "使用了它就可以解决一些问题",
67 | ",使用了它就可以解决一些问题",
68 | "其实使用了它就可以解决一些问题",
69 | "好人使用了它就可以解决一些问题",
70 | "是因为和国家",
71 | "老年搜索还支持",
72 | "干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ",
73 | "大", "", "他说的确实在理", "长春市长春节讲话", "结婚的和尚未结婚的", "结合成分子时", "旅游和服务是最好的",
74 | "这件事情的确是我的错", "供大家参考指正", "哈尔滨政府公布塌桥原因", "我在机场入口处", "邢永臣摄影报道",
75 | "BP神经网络如何训练才能在分类时增加区分度?", "南京市长江大桥",
76 | "应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究", "长春市长春药店", "邓颖超生前最喜欢的衣服",
77 | "胡锦涛是热爱世界和平的政治局常委", "程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪",
78 | "一次性交多少钱", "两块五一套,三块八一斤,四块七一本,五块六一条", "小和尚留了一个像大和尚一样的和尚头",
79 | "我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站", "张晓梅去人民医院做了个B超然后去买了件T恤",
80 | "AT&T是一件不错的公司,给你发offer了吗?", "C++和c#是什么关系?11+122=133,是吗?PI=3.14159",
81 | "你认识那个和主席握手的的哥吗?他开一辆黑色的士。", "枪杆子中出政权" };
82 |
83 | @Test
84 | public void test() throws IOException {
85 | JiebaAnalyzer analyzer = new JiebaAnalyzer("index", dataPath, true);
86 |
87 | for (String sentence : sentences) {
88 | TokenStream tokenStream = analyzer.tokenStream(null,
89 | new StringReader(sentence));
90 | tokenStream.reset();
91 | while (tokenStream.incrementToken()) {
92 | CharTermAttribute termAtt = tokenStream
93 | .getAttribute(CharTermAttribute.class);
94 | OffsetAttribute offsetAtt = tokenStream
95 | .getAttribute(OffsetAttribute.class);
96 | System.out
97 | .println(termAtt.toString() + ","
98 | + offsetAtt.startOffset() + ","
99 | + offsetAtt.endOffset());
100 | }
101 | tokenStream.close();
102 | }
103 |
104 | analyzer.close();
105 | }
106 |
107 | @Test
108 | public void testSegModeOther() throws IOException {
109 | JiebaAnalyzer analyzer = new JiebaAnalyzer("other", dataPath, true);
110 |
111 | for (String sentence : sentences) {
112 | TokenStream tokenStream = analyzer.tokenStream(null,
113 | new StringReader(sentence));
114 | tokenStream.reset();
115 | while (tokenStream.incrementToken()) {
116 | CharTermAttribute termAtt = tokenStream
117 | .getAttribute(CharTermAttribute.class);
118 | OffsetAttribute offsetAtt = tokenStream
119 | .getAttribute(OffsetAttribute.class);
120 | System.out
121 | .println(termAtt.toString() + ","
122 | + offsetAtt.startOffset() + ","
123 | + offsetAtt.endOffset());
124 | }
125 | tokenStream.close();
126 | }
127 |
128 | analyzer.close();
129 | }
130 |
131 | @Test
132 | public void testBugSentences() throws IOException {
133 | String[] bugSentences = new String[] { "干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 " };
134 | JiebaAnalyzer analyzer = new JiebaAnalyzer("index", dataPath, true);
135 |
136 | for (String sentence : bugSentences) {
137 | TokenStream tokenStream = analyzer.tokenStream(null,
138 | new StringReader(sentence));
139 | tokenStream.reset();
140 | while (tokenStream.incrementToken()) {
141 | CharTermAttribute termAtt = tokenStream
142 | .getAttribute(CharTermAttribute.class);
143 | OffsetAttribute offsetAtt = tokenStream
144 | .getAttribute(OffsetAttribute.class);
145 | System.out
146 | .println(termAtt.toString() + ","
147 | + offsetAtt.startOffset() + ","
148 | + offsetAtt.endOffset());
149 | }
150 | tokenStream.close();
151 | }
152 |
153 | analyzer.close();
154 | }
155 |
156 | @Test
157 | public void testLoadDict() throws IOException {
158 | JiebaAnalyzer analyzer = new JiebaAnalyzer("index", dataPath, true);
159 |
160 | String[] sentences = new String[] {
161 | "我剛買了一個 16GB 的 USB 隨身碟",
162 | "我剛買了一個 16GBUSB 隨身碟",
163 | "今天有iphone6和nexus5的大拍賣"
164 | };
165 |
166 | for (String sentence : sentences) {
167 | TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(sentence));
168 | tokenStream.reset();
169 | System.out.println(sentence);
170 | while (tokenStream.incrementToken()) {
171 | CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
172 | OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
173 | System.out.println(
174 | termAtt.toString() + "," +
175 | offsetAtt.startOffset() + "," +
176 | offsetAtt.endOffset()
177 | );
178 | }
179 | System.out.println();
180 | tokenStream.close();
181 | }
182 |
183 | analyzer.close();
184 | }
185 | }
186 |
--------------------------------------------------------------------------------