├── README.md
├── config
│   ├── hanlp.properties
│   ├── plugin-security.policy
│   └── plugin.properties
├── pom.xml
└── src
    ├── main
    │   ├── java
    │   │   ├── assembly
    │   │   │   ├── plugin-jar.xml
    │   │   │   └── plugin.xml
    │   │   └── org
    │   │       └── elasticsearch
    │   │           ├── plugin
    │   │           │   └── hanlp
    │   │           │       ├── AnalysisHanLPPlugin.java
    │   │           │       ├── analysis
    │   │           │       │   ├── HanLPAnalyzerProvider.java
    │   │           │       │   └── HanLPTokenizerFactory.java
    │   │           │       ├── conf
    │   │           │       │   ├── ConfigHelper.java
    │   │           │       │   ├── DicConfig.java
    │   │           │       │   └── HanLPConfig.java
    │   │           │       └── lucene
    │   │           │           ├── HanLPAnalyzer.java
    │   │           │           ├── HanLPTokenFilter.java
    │   │           │           ├── HanLPTokenizer.java
    │   │           │           ├── PorterStemmer.java
    │   │           │           └── SegmentWrapper.java
    │   │           └── utils
    │   │               └── CommUtils.java
    │   └── resources
    │       └── plugin-descriptor.properties
    └── test
        └── java
            └── org
                └── elasticsearch
                    └── plugin
                        └── hanlp
                            └── conf
                                └── ConfigHelperTest.java
/README.md:
--------------------------------------------------------------------------------
HanLP Analysis for Elasticsearch
=====

A Chinese word-segmentation plugin for Elasticsearch based on [HanLP](https://github.com/hankcs/HanLP). Core features:

1. Compatible with ES 5.x-7.x;
2. Built-in dictionary, usable out of the box with no extra configuration;
3. Support for user-defined dictionaries;
4. Support for hot updates from remote dictionaries (planned);
5. Multiple built-in segmentation modes for different scenarios;
6. Pinyin token filter (planned);
7. Simplified/Traditional Chinese conversion filter (planned).

## Versions
Plugin versions track ES versions: download and install the plugin release that matches your ES version.

- By the time the plugin was finished, the latest ES release was already 6.5.2, so only representative versions have been tested;
- 5.X was tested on 5.0.0 and 5.5.0;
- 6.X was tested on 6.0.0, 6.3.0, 6.4.1 and 6.5.1;
- 7.X was tested on 7.0.0.

## Installation and Usage
### Download and build
`git clone` the code for the version you need, open `pom.xml`, and change `6.5.1` to the target ES version; then run `mvn package` to build the distribution, which ends up in the `target/releases` folder.

Once packaged, install the zip using the offline method described below.
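A minimal end-to-end sketch of the build (the version number and paths are illustrative; use the ES version you actually run):

```
git clone https://github.com/AnyListen/elasticsearch-analysis-hanlp.git
cd elasticsearch-analysis-hanlp
# edit pom.xml: set <elasticsearch.version> to your ES version, e.g. 6.5.1
mvn package
# the plugin zip is written to target/releases/
ls target/releases/elasticsearch-analysis-hanlp-6.5.1.zip
```
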
### Using the default dictionary
- Online install: `.\elasticsearch-plugin install https://github.com/AnyListen/elasticsearch-analysis-hanlp/releases/download/vA.B.C/elasticsearch-analysis-hanlp-A.B.C.zip`
- Offline install: `.\elasticsearch-plugin install file:///FILE_PATH/elasticsearch-analysis-hanlp-A.B.C.zip`

> For the offline install, change `FILE_PATH` to the path of the zip file; A, B and C stand for the digits of the ES version number.

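For instance, with ES 6.5.1 (one of the versions tested above, assuming a matching release asset exists), the online install command becomes:

```
./elasticsearch-plugin install https://github.com/AnyListen/elasticsearch-analysis-hanlp/releases/download/v6.5.1/elasticsearch-analysis-hanlp-6.5.1.zip
```
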
### Using a custom dictionary
The default dictionary is a trimmed-down edition that covers basic needs, but it cannot drive the model-based segmenters such as the perceptron and CRF tokenizers.

HanLP provides a much more [complete data package](http://nlp.hankcs.com/download.php?file=data); download it as needed.

After downloading, unpack the dictionary to any directory, then edit the `hanlp.properties` file **in the plugin installation directory**. Only the first line,
```
root=D:/JavaProjects/HanLP/
```
needs to change: set it to the parent directory of `data`. For example, if the `data` directory is `/Users/hankcs/Documents/data`, then `root=/Users/hankcs/Documents/`.

### Using a custom configuration file
If you use HanLP elsewhere and want to reuse an existing `hanlp.properties` file, just edit the `plugin.properties` file **in the plugin installation directory** and set `configPath` to the location of that `hanlp.properties` file.

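A minimal `plugin.properties` for this setup might look like the following (the path is illustrative):

```
# reuse the hanlp.properties of an existing HanLP installation
configPath=/opt/hanlp/hanlp.properties
```
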
## Built-in analysis components
### Analyzers
- hanlp_index: fine-grained segmentation
- hanlp_smart: standard segmentation
- hanlp_nlp: NLP segmentation with named-entity recognition
- hanlp_per: perceptron segmentation
- hanlp_crf: CRF segmentation
- hanlp: custom configuration

### Tokenizers
- hanlp_index: fine-grained segmentation
- hanlp_smart: standard segmentation
- hanlp_nlp: NLP segmentation with named-entity recognition
- hanlp_per: perceptron segmentation
- hanlp_crf: CRF segmentation
- hanlp: custom configuration

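The built-in analyzers can be tried directly through the `_analyze` API, for example (reusing the sample sentence from the example further below):

```
POST _analyze
{
  "analyzer": "hanlp_smart",
  "text": "张惠妹在上海市举办演唱会啦"
}
```
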
### Custom analyzers
The plugin exposes a fairly rich set of options for defining your own analyzer. The available settings are:

| Setting | Purpose | Default |
| -------- | ------ | :----: |
| algorithm | segmentation algorithm; the Chinese or the English name is accepted: viterbi (维特比), dat (双数组trie树), crf (条件随机场), perceptron (感知机), nshort (N最短路) | viterbi |
| enableIndexMode | index mode (fine-grained segmentation) | false |
| enableCustomDictionary | whether to enable the user dictionary | true |
| customDictionaryPath | user dictionary paths (absolute; separate multiple dictionaries with `;`) | null |
| enableCustomDictionaryForcing | [give the user dictionary top priority](https://github.com/hankcs/HanLP/wiki/FAQ#%E4%B8%BA%E4%BB%80%E4%B9%88%E4%BF%AE%E6%94%B9%E4%BA%86%E8%AF%8D%E5%85%B8%E8%BF%98%E6%98%AF%E6%B2%A1%E6%9C%89%E6%95%88%E6%9E%9C) | false |
| enableStopWord | whether to enable stop-word filtering | false |
| stopWordDictionaryPath | stop-word dictionary path | null |
| enableNumberQuantifierRecognize | whether to recognize numbers and quantifiers | true |
| enableNameRecognize | recognize Chinese person names | true |
| enableTranslatedNameRecognize | whether to recognize transliterated person names | false |
| enableJapaneseNameRecognize | whether to recognize Japanese person names | false |
| enableOrganizationRecognize | recognize organization names | false |
| enablePlaceRecognize | recognize place names | false |
| enableTraditionalChineseMode | accurate Traditional Chinese segmentation | false |

**Example:**
```
# Create a custom analyzer
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "type": "hanlp",
          "algorithm": "viterbi",
          "enableIndexMode": "true",
          "enableCustomDictionary": "true",
          "customDictionaryPath": "",
          "enableCustomDictionaryForcing": "false",
          "enableStopWord": "true",
          "stopWordDictionaryPath": "",
          "enableNumberQuantifierRecognize": "true",
          "enableNameRecognize": "true",
          "enableTranslatedNameRecognize": "true",
          "enableJapaneseNameRecognize": "true",
          "enableOrganizationRecognize": "true",
          "enablePlaceRecognize": "true",
          "enableTraditionalChineseMode": "false"
        }
      }
    }
  }
}

# Test the analyzer
POST my_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "张惠妹在上海市举办演唱会啦"
}
```

## Segmentation speed (for reference only)
> Measured through the `_analyze` API (**1 core, 1 GB, single thread**): a 20,000-character text was segmented with each tokenizer type; the figures below are the times from request to response:

Tokenizer | Time (ms)
--- | ---
`hanlp_smart` | 148
`hanlp_nlp` | 182
`hanlp_per` | 286
`hanlp_crf` | 357

--------------------------------------------------------------------------------
/config/hanlp.properties:
--------------------------------------------------------------------------------
1 | # Root directory for all paths in this file: root + relative path = full path (relative roots are supported, see https://github.com/hankcs/HanLP/pull/254)
2 | # Windows users: always use / as the path separator
3 | #root=D:/JavaProjects/HanLP/
4 |
5 | # The line above is the only one that normally needs editing; uncomment and edit the entries below as needed.
6 |
7 | # Core dictionary path
8 | #CoreDictionaryPath=data/dictionary/CoreNatureDictionary.txt
9 | # Bigram dictionary path
10 | #BiGramDictionaryPath=data/dictionary/CoreNatureDictionary.ngram.txt
11 | # Custom dictionary paths, separated by ;. A leading space means "same directory as the previous entry"; the form "filename POS" gives every word in that dictionary that default part of speech. Priority decreases from left to right.
12 | # All dictionaries are UTF-8 encoded, one word per line, in the format: [word] [POS A] [freq of A] [POS B] [freq of B] ... A word without POS tags gets the dictionary's default POS.
13 | #CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; 现代汉语补充词库.txt; 全国地名大全.txt ns; 人名词典.txt; 机构名词典.txt; 上海地名.txt ns;data/dictionary/person/nrf.txt nrf;
14 | # Stop-word dictionary path
15 | #CoreStopWordDictionaryPath=data/dictionary/stopwords.txt
16 | # Synonym dictionary path
17 | #CoreSynonymDictionaryDictionaryPath=data/dictionary/synonym/CoreSynonym.txt
18 | # Person-name dictionary path
19 | #PersonDictionaryPath=data/dictionary/person/nr.txt
20 | # Person-name transition-matrix path
21 | #PersonDictionaryTrPath=data/dictionary/person/nr.tr.txt
22 | # Root directory of the Simplified/Traditional Chinese dictionaries
23 | #tcDictionaryRoot=data/dictionary/tc
24 | # HMM segmentation model
25 | #HMMSegmentModelPath=data/model/segment/HMMSegmentModel.bin
26 | # Whether segmentation results show the part of speech
27 | #ShowTermNature=true
28 | # IO adapter: implement com.hankcs.hanlp.corpus.io.IIOAdapter to run HanLP on other platforms (Hadoop, Redis, ...)
29 | # The default adapter below is based on the ordinary file system.
30 | #IOAdapter=com.hankcs.hanlp.corpus.io.FileIOAdapter
31 | # Perceptron lexical analyzer
32 | #PerceptronCWSModelPath=data/model/perceptron/pku199801/cws.bin
33 | #PerceptronPOSModelPath=data/model/perceptron/pku199801/pos.bin
34 | #PerceptronNERModelPath=data/model/perceptron/pku199801/ner.bin
35 | # CRF lexical analyzer
36 | #CRFCWSModelPath=data/model/crf/pku199801/cws.txt
37 | #CRFPOSModelPath=data/model/crf/pku199801/pos.txt
38 | #CRFNERModelPath=data/model/crf/pku199801/ner.txt
39 | # For more options, see https://github.com/hankcs/HanLP/blob/master/src/main/java/com/hankcs/hanlp/HanLP.java#L59
--------------------------------------------------------------------------------
/config/plugin-security.policy:
--------------------------------------------------------------------------------
1 | grant {
2 | permission java.io.FilePermission "<<ALL FILES>>", "read,write,delete";
3 | permission java.net.SocketPermission "*", "connect,resolve";
4 | permission java.util.PropertyPermission "*", "read,write";
5 | permission java.lang.RuntimePermission "setContextClassLoader";
6 | permission java.lang.RuntimePermission "getClassLoader";
7 | permission java.lang.RuntimePermission "createClassLoader";
8 | };
--------------------------------------------------------------------------------
/config/plugin.properties:
--------------------------------------------------------------------------------
1 | # path to the HanLP configuration file (hanlp.properties)
2 | #configPath=
3 |
4 | # URL of the remote custom dictionary (hot update; not yet implemented)
5 | #remoteDicUrl=
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch-analysis-hanlp</artifactId>
    <version>${elasticsearch.version}</version>
    <packaging>jar</packaging>
    <name>HanLP Analyzer for Elasticsearch</name>

    <properties>
        <elasticsearch.version>7.0.0</elasticsearch.version>
        <hanlp.version>portable-1.7.3</hanlp.version>
        <maven.compiler.target>1.8</maven.compiler.target>
        <elasticsearch.assembly.descriptor>${project.basedir}/src/main/resources/plugin-descriptor.properties</elasticsearch.assembly.descriptor>
        <elasticsearch.plugin.name>analysis-hanlp</elasticsearch.plugin.name>
        <elasticsearch.plugin.classname>org.elasticsearch.plugin.hanlp.AnalysisHanLPPlugin</elasticsearch.plugin.classname>
        <!-- plugin-descriptor flags; the tag names here are assumed, following the stock ES analysis-plugin pom -->
        <elasticsearch.plugin.jvm>true</elasticsearch.plugin.jvm>
        <elasticsearch.plugin.site>false</elasticsearch.plugin.site>
        <elasticsearch.plugin.isolated>true</elasticsearch.plugin.isolated>
    </properties>

    <dependencies>
        <dependency>
            <groupId>com.hankcs</groupId>
            <artifactId>hanlp</artifactId>
            <version>${hanlp.version}</version>
        </dependency>
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch</artifactId>
            <version>${elasticsearch.version}</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5.1</version>
                <configuration>
                    <source>${maven.compiler.target}</source>
                    <target>${maven.compiler.target}</target>
                    <encoding>utf8</encoding>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <appendAssemblyId>false</appendAssemblyId>
                    <outputDirectory>${project.build.directory}/releases/</outputDirectory>
                    <descriptors>
                        <descriptor>${basedir}/src/main/java/assembly/plugin.xml</descriptor>
                    </descriptors>
                    <archive>
                        <manifest>
                            <mainClass>fully.qualified.MainClass</mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

    <profiles>
        <profile>
            <id>disable-java8-doclint</id>
            <activation>
                <jdk>[1.8,)</jdk>
            </activation>
            <properties>
                <additionalparam>-Xdoclint:none</additionalparam>
            </properties>
        </profile>
    </profiles>
</project>
--------------------------------------------------------------------------------
/src/main/java/assembly/plugin-jar.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">
    <id>jar-with-dependencies</id>
    <formats>
        <format>jar</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <dependencySets>
        <!-- the boolean element names below are assumed; the original tags were stripped -->
        <dependencySet>
            <outputDirectory>/</outputDirectory>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <unpack>true</unpack>
            <scope>runtime</scope>
            <excludes>
                <exclude>org.elasticsearch:elasticsearch</exclude>
            </excludes>
        </dependencySet>
        <dependencySet>
            <outputDirectory>/</outputDirectory>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <unpack>true</unpack>
            <includes>
                <include>com.hankcs:hanlp</include>
            </includes>
        </dependencySet>
    </dependencySets>
</assembly>
--------------------------------------------------------------------------------
/src/main/java/assembly/plugin.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">
    <id>plugin</id>
    <formats>
        <format>zip</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <fileSets>
        <fileSet>
            <directory>${project.basedir}/config</directory>
            <outputDirectory>/</outputDirectory>
        </fileSet>
    </fileSets>
    <files>
        <file>
            <source>${project.basedir}/src/main/resources/plugin-descriptor.properties</source>
            <filtered>true</filtered>
            <outputDirectory>/</outputDirectory>
        </file>
    </files>
    <dependencySets>
        <!-- the boolean element names below are assumed; the original tags were stripped -->
        <dependencySet>
            <outputDirectory>/</outputDirectory>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <unpack>false</unpack>
            <excludes>
                <exclude>org.elasticsearch:elasticsearch</exclude>
            </excludes>
        </dependencySet>
        <dependencySet>
            <outputDirectory>/</outputDirectory>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <unpack>false</unpack>
            <includes>
                <include>com.hankcs:hanlp</include>
            </includes>
        </dependencySet>
    </dependencySets>
</assembly>
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/AnalysisHanLPPlugin.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp;
2 |
3 |
4 | import org.apache.lucene.analysis.Analyzer;
5 | import org.elasticsearch.index.analysis.AnalyzerProvider;
6 | import org.elasticsearch.index.analysis.TokenizerFactory;
7 | import org.elasticsearch.indices.analysis.AnalysisModule;
8 | import org.elasticsearch.plugin.hanlp.analysis.HanLPAnalyzerProvider;
9 | import org.elasticsearch.plugin.hanlp.analysis.HanLPTokenizerFactory;
10 | import org.elasticsearch.plugins.AnalysisPlugin;
11 | import org.elasticsearch.plugins.Plugin;
12 |
13 | import java.util.HashMap;
14 | import java.util.Map;
15 |
16 | /**
17 | * elasticsearch-analysis-hanlp
18 | * elasticsearch-analysis-hanlp
19 | * Created by hezl on 2018-11-20.
20 | */
21 | public class AnalysisHanLPPlugin extends Plugin implements AnalysisPlugin {
22 | public static String PLUGIN_NAME = "analysis-hanlp";
23 |
24 | @Override
25 | public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
26 | Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> extra = new HashMap<>();
27 | extra.put("hanlp_index", HanLPTokenizerFactory::getIndexTokenizerFactory);
28 | extra.put("hanlp_smart", HanLPTokenizerFactory::getSmartTokenizerFactory);
29 | extra.put("hanlp_nlp", HanLPTokenizerFactory::getNLPTokenizerFactory);
30 | extra.put("hanlp_per", HanLPTokenizerFactory::getPerceptronTokenizerFactory);
31 | extra.put("hanlp_crf", HanLPTokenizerFactory::getCRFTokenizerFactory);
32 | extra.put("hanlp", HanLPTokenizerFactory::new);
33 | return extra;
34 | }
35 |
36 | @Override
37 | public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
38 | Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> extra = new HashMap<>();
39 | extra.put("hanlp_index", HanLPAnalyzerProvider::getIndexAnalyzerProvider);
40 | extra.put("hanlp_smart", HanLPAnalyzerProvider::getSmartAnalyzerProvider);
41 | extra.put("hanlp_nlp", HanLPAnalyzerProvider::getNLPAnalyzerProvider);
42 | extra.put("hanlp_per", HanLPAnalyzerProvider::getPerceptronAnalyzerProvider);
43 | extra.put("hanlp_crf", HanLPAnalyzerProvider::getCRFAnalyzerProvider);
44 | extra.put("hanlp", HanLPAnalyzerProvider::new);
45 | return extra;
46 | }
47 |
48 | // @Override
49 | // public Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
50 | // Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> extra = new HashMap<>();
51 | //
52 | // extra.put("py_all", HanLPTokenFilterFactory::getPinyinFilterFactory);
53 | // extra.put("py_mix", HanLPTokenFilterFactory::getPinyinMixFilterFactory);
54 | // extra.put("py_first", HanLPTokenFilterFactory::getPinyinFirstFilterFactory);
55 | // extra.put("py_full", HanLPTokenFilterFactory::getPinyinFullFilterFactory);
56 | //
57 | // extra.put("ts", HanLPTokenFilterFactory::getTSFilterFactory);
58 | // extra.put("t2s", HanLPTokenFilterFactory::getT2SFilterFactory);
59 | // extra.put("s2t", HanLPTokenFilterFactory::getS2TFilterFactory);
60 | //
61 | // return extra;
62 | // }
63 | }
64 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/analysis/HanLPAnalyzerProvider.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.analysis;
2 |
3 |
4 | import com.hankcs.hanlp.seg.Segment;
5 | import org.elasticsearch.common.collect.Tuple;
6 | import org.elasticsearch.common.settings.Settings;
7 | import org.elasticsearch.env.Environment;
8 | import org.elasticsearch.index.IndexSettings;
9 | import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
10 | import org.elasticsearch.plugin.hanlp.conf.ConfigHelper;
11 | import org.elasticsearch.plugin.hanlp.conf.DicConfig;
12 | import org.elasticsearch.plugin.hanlp.conf.HanLPConfig;
13 | import org.elasticsearch.plugin.hanlp.lucene.HanLPAnalyzer;
14 |
15 | import java.util.Set;
16 |
17 | /**
18 | * es-analysis-hanlp
19 | * net.luculent.bigdata.es.plugin.hanlp.analysis
20 | * Created by HEZHILONG on 2018-08-23.
21 | */
22 | public class HanLPAnalyzerProvider extends AbstractIndexAnalyzerProvider<HanLPAnalyzer> {
23 | private final HanLPAnalyzer analyzer;
24 |
25 | public HanLPAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
26 | super(indexSettings, name, settings);
27 | DicConfig.initConfig(env, settings);
28 | Tuple<Segment, Set<String>> tuple = ConfigHelper.getSegmentAndFilter(settings);
29 | analyzer = new HanLPAnalyzer(tuple.v1(), tuple.v2());
30 | }
31 |
32 | public HanLPAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings, HanLPConfig config) {
33 | super(indexSettings, name, settings);
34 | DicConfig.initConfig(env, settings);
35 | analyzer = new HanLPAnalyzer(ConfigHelper.getSegment(config), ConfigHelper.getStopWords(config));
36 | }
37 |
38 | public static HanLPAnalyzerProvider getIndexAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
39 | return new HanLPAnalyzerProvider(indexSettings, env, name, settings, ConfigHelper.INDEX_CONFIG);
40 | }
41 |
42 | public static HanLPAnalyzerProvider getSmartAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
43 | return new HanLPAnalyzerProvider(indexSettings, env, name, settings, ConfigHelper.SMART_CONFIG);
44 | }
45 |
46 | public static HanLPAnalyzerProvider getNLPAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
47 | return new HanLPAnalyzerProvider(indexSettings, env, name, settings, ConfigHelper.NLP_CONFIG);
48 | }
49 |
50 | public static HanLPAnalyzerProvider getPerceptronAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
51 | return new HanLPAnalyzerProvider(indexSettings, env, name, settings, ConfigHelper.PERCEPTRON_CONFIG);
52 | }
53 |
54 | public static HanLPAnalyzerProvider getCRFAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
55 | return new HanLPAnalyzerProvider(indexSettings, env, name, settings, ConfigHelper.CRF_CONFIG);
56 | }
57 |
58 | @Override
59 | public HanLPAnalyzer get() {
60 | return this.analyzer;
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/analysis/HanLPTokenizerFactory.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.analysis;
2 |
3 |
4 | import com.hankcs.hanlp.seg.Segment;
5 | import org.apache.lucene.analysis.Tokenizer;
6 | import org.elasticsearch.common.collect.Tuple;
7 | import org.elasticsearch.common.settings.Settings;
8 | import org.elasticsearch.env.Environment;
9 | import org.elasticsearch.index.IndexSettings;
10 | import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
11 | import org.elasticsearch.plugin.hanlp.conf.ConfigHelper;
12 | import org.elasticsearch.plugin.hanlp.conf.DicConfig;
13 | import org.elasticsearch.plugin.hanlp.conf.HanLPConfig;
14 | import org.elasticsearch.plugin.hanlp.lucene.HanLPTokenizer;
15 |
16 | import java.util.Set;
17 |
18 | /**
19 | * es-analysis-hanlp
20 | * net.luculent.bigdata.es.plugin.hanlp.analysis
21 | * Created by HEZHILONG on 2018-08-23.
22 | */
23 | public class HanLPTokenizerFactory extends AbstractTokenizerFactory {
24 | private Tuple<Segment, Set<String>> tuple;
25 |
26 | public HanLPTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
27 | super(indexSettings, settings);
28 | DicConfig.initConfig(env, settings);
29 | tuple = ConfigHelper.getSegmentAndFilter(settings);
30 | }
31 |
32 | public HanLPTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings, HanLPConfig config) {
33 | super(indexSettings, settings);
34 | DicConfig.initConfig(env, settings);
35 | tuple = Tuple.tuple(ConfigHelper.getSegment(config), ConfigHelper.getStopWords(config));
36 | }
37 |
38 | public static HanLPTokenizerFactory getIndexTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
39 | return new HanLPTokenizerFactory(indexSettings, env, name, settings, ConfigHelper.INDEX_CONFIG);
40 | }
41 |
42 | public static HanLPTokenizerFactory getNLPTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
43 | return new HanLPTokenizerFactory(indexSettings, env, name, settings, ConfigHelper.NLP_CONFIG);
44 | }
45 |
46 | public static HanLPTokenizerFactory getSmartTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
47 | return new HanLPTokenizerFactory(indexSettings, env, name, settings, ConfigHelper.SMART_CONFIG);
48 | }
49 |
50 | public static HanLPTokenizerFactory getPerceptronTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
51 | return new HanLPTokenizerFactory(indexSettings, env, name, settings, ConfigHelper.PERCEPTRON_CONFIG);
52 | }
53 |
54 | public static HanLPTokenizerFactory getCRFTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
55 | return new HanLPTokenizerFactory(indexSettings, env, name, settings, ConfigHelper.CRF_CONFIG);
56 | }
57 |
58 | @Override
59 | public Tokenizer create() {
60 | return new HanLPTokenizer(tuple.v1(), tuple.v2(), true);
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/conf/ConfigHelper.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.conf;
2 |
3 |
4 | import com.hankcs.hanlp.HanLP;
5 | import com.hankcs.hanlp.corpus.io.IOUtil;
6 | import com.hankcs.hanlp.corpus.io.ResourceIOAdapter;
7 | import com.hankcs.hanlp.seg.Segment;
8 | import com.hankcs.hanlp.seg.Viterbi.ViterbiSegment;
9 | import com.hankcs.hanlp.utility.TextUtility;
10 | import org.apache.logging.log4j.Logger;
11 | import org.elasticsearch.SpecialPermission;
12 | import org.elasticsearch.common.collect.Tuple;
13 | import org.elasticsearch.common.logging.Loggers;
14 | import org.elasticsearch.common.settings.Settings;
15 |
16 | import java.io.*;
17 | import java.security.AccessController;
18 | import java.security.PrivilegedAction;
19 | import java.util.HashSet;
20 | import java.util.Set;
21 |
22 | /**
23 | * elasticsearch-analysis-hanlp
24 | * elasticsearch-analysis-hanlp
25 | * Created by hezl on 2018-12-03.
26 | */
27 | public class ConfigHelper {
28 | private static final Logger logger = Loggers.getLogger(ConfigHelper.class, "ConfigHelper");
29 |
30 | public static final HanLPConfig INDEX_CONFIG = new HanLPConfig(){{
31 | setAlgorithm("viterbi");
32 | setEnableIndexMode(true);
33 | //CustomDic
34 | setCustomDictionaryPath("");
35 | setEnableCustomDictionary(true);
36 | setEnableCustomDictionaryForcing(false);
37 | //StopWord
38 | setEnableStopWord(false);
39 | setStopWordDictionaryPath("");
40 | //NLP
41 | setEnableNameRecognize(true);
42 | setEnableJapaneseNameRecognize(false);
43 | setEnableTranslatedNameRecognize(false);
44 | setEnableNumberQuantifierRecognize(true);
45 | setEnableOrganizationRecognize(false);
46 | setEnablePlaceRecognize(false);
47 | setEnableTraditionalChineseMode(false);
48 | }};
49 |
50 | public static final HanLPConfig SMART_CONFIG = new HanLPConfig(){{
51 | setAlgorithm("viterbi");
52 | setEnableIndexMode(false);
53 | //CustomDic
54 | setCustomDictionaryPath("");
55 | setEnableCustomDictionary(true);
56 | setEnableCustomDictionaryForcing(false);
57 | //StopWord
58 | setEnableStopWord(false);
59 | setStopWordDictionaryPath("");
60 | //NLP
61 | setEnableNameRecognize(true);
62 | setEnableJapaneseNameRecognize(false);
63 | setEnableTranslatedNameRecognize(false);
64 | setEnableNumberQuantifierRecognize(true);
65 | setEnableOrganizationRecognize(false);
66 | setEnablePlaceRecognize(false);
67 | setEnableTraditionalChineseMode(false);
68 | }};
69 |
70 | public static final HanLPConfig NLP_CONFIG = new HanLPConfig(){{
71 | setAlgorithm("viterbi");
72 | setEnableIndexMode(false);
73 | //CustomDic
74 | setCustomDictionaryPath("");
75 | setEnableCustomDictionary(true);
76 | setEnableCustomDictionaryForcing(false);
77 | //StopWord
78 | setEnableStopWord(false);
79 | setStopWordDictionaryPath("");
80 | //NLP
81 | setEnableNameRecognize(true);
82 | setEnableJapaneseNameRecognize(true);
83 | setEnableTranslatedNameRecognize(true);
84 | setEnableNumberQuantifierRecognize(true);
85 | setEnableOrganizationRecognize(true);
86 | setEnablePlaceRecognize(true);
87 | setEnableTraditionalChineseMode(false);
88 | }};
89 |
90 | public static final HanLPConfig PERCEPTRON_CONFIG = new HanLPConfig(){{
91 | setAlgorithm("perceptron");
92 | setEnableIndexMode(false);
93 | //CustomDic
94 | setCustomDictionaryPath("");
95 | setEnableCustomDictionary(true);
96 | setEnableCustomDictionaryForcing(false);
97 | //StopWord
98 | setEnableStopWord(false);
99 | setStopWordDictionaryPath("");
100 | //NLP
101 | setEnableNameRecognize(true);
102 | setEnableJapaneseNameRecognize(false);
103 | setEnableTranslatedNameRecognize(false);
104 | setEnableNumberQuantifierRecognize(true);
105 | setEnableOrganizationRecognize(true);
106 | setEnablePlaceRecognize(true);
107 | setEnableTraditionalChineseMode(false);
108 | }};
109 |
110 | public static final HanLPConfig CRF_CONFIG = new HanLPConfig(){{
111 | setAlgorithm("crf");
112 | setEnableIndexMode(false);
113 | //CustomDic
114 | setCustomDictionaryPath("");
115 | setEnableCustomDictionary(true);
116 | setEnableCustomDictionaryForcing(false);
117 | //StopWord
118 | setEnableStopWord(false);
119 | setStopWordDictionaryPath("");
120 | //NLP
121 | setEnableNameRecognize(true);
122 | setEnableJapaneseNameRecognize(false);
123 | setEnableTranslatedNameRecognize(false);
124 | setEnableNumberQuantifierRecognize(true);
125 | setEnableOrganizationRecognize(true);
126 | setEnablePlaceRecognize(true);
127 | setEnableTraditionalChineseMode(false);
128 | }};
129 |
130 | public static Segment getSegment(HanLPConfig config) {
131 | //SpecialPermission.check();
132 | return AccessController.doPrivileged((PrivilegedAction<Segment>) () -> {
133 | Segment segment;
134 | String algorithm = config.getAlgorithm();
135 | if ("crf".equals(algorithm) || "条件随机场".equals(algorithm) ||
136 | "perceptron".equals(algorithm) || "感知机".equals(algorithm)) {
137 | if (HanLP.Config.IOAdapter instanceof ResourceIOAdapter) { // the bundled classpath data ships no CRF/perceptron models, so these segmenters are unavailable
138 | return null;
139 | }
140 | }
141 | if ("viterbi".equals(algorithm) || "维特比".equals(algorithm)) {
142 | String customDictionaryPath = config.getCustomDictionaryPath();
143 | if (TextUtility.isBlank(customDictionaryPath)) {
144 | segment = new ViterbiSegment();
145 | } else {
146 | segment = new ViterbiSegment(customDictionaryPath);
147 | }
148 | } else {
149 | segment = HanLP.newSegment(algorithm);
150 | }
151 | segment.enableIndexMode(config.isEnableIndexMode())
152 | .enableCustomDictionary(config.isEnableCustomDictionary())
153 | .enableCustomDictionaryForcing(config.isEnableCustomDictionaryForcing())
154 | .enableNameRecognize(config.isEnableNameRecognize())
155 | .enableJapaneseNameRecognize(config.isEnableJapaneseNameRecognize())
156 | .enableTranslatedNameRecognize(config.isEnableTranslatedNameRecognize())
157 | .enableNumberQuantifierRecognize(config.isEnableNumberQuantifierRecognize())
158 | .enableOrganizationRecognize(config.isEnableOrganizationRecognize())
159 | .enablePlaceRecognize(config.isEnablePlaceRecognize())
160 | // enableTraditionalChineseMode is not applied here: HanLP's Segment config exposes no traditional-Chinese switch
161 | .enableOffset(true).enablePartOfSpeechTagging(true);
162 | segment.seg("HanLP中文分词工具包!"); // warm-up call that forces lazy dictionary loading
163 | return segment;
164 | });
165 | }
166 |
167 | public static Set<String> getStopWords(HanLPConfig config){
168 | if (!config.isEnableStopWord()){
169 | return null;
170 | }
171 | String filePath = config.getStopWordDictionaryPath();
172 | if (TextUtility.isBlank(filePath)){
173 | filePath = HanLP.Config.CoreStopWordDictionaryPath;
174 | }
175 | final String cfPath = filePath;
176 | try {
177 | //SpecialPermission.check();
178 | byte[] bytes = AccessController.doPrivileged((PrivilegedAction<byte[]>) () -> {
179 | byte[] bs;
180 | if (IOUtil.isResource(cfPath)) {
181 | try {
182 | bs = IOUtil.readBytesFromResource(cfPath);
183 | } catch (IOException e) {
184 | return new byte[0];
185 | }
186 | } else {
187 | bs = IOUtil.readBytes(cfPath);
188 | }
189 | return bs;
190 | });
191 | if (bytes == null || bytes.length <= 0){
192 | return null;
193 | }
194 | Set<String> resultSet = new HashSet<>();
195 | ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(bytes);
196 | InputStreamReader reader = new InputStreamReader(byteArrayInputStream, "UTF-8"); // stop-word dictionaries are UTF-8 encoded
197 | BufferedReader br = new BufferedReader(reader);
198 | String str;
199 | while ((str = br.readLine()) != null) {
200 | resultSet.add(str);
201 | }
202 | br.close();
203 | reader.close();
204 | byteArrayInputStream.close();
205 | return resultSet;
206 | }
207 | catch (Exception ex){
208 | logger.error("failed to load the stop-word dictionary", ex);
209 | }
210 | return null;
211 | }
212 |
213 | public static HanLPConfig getConfig(Settings settings){
214 | HanLPConfig config = new HanLPConfig();
215 | config.setAlgorithm(settings.get("algorithm", "viterbi"));
216 | config.setEnableIndexMode(settings.getAsBoolean("enableIndexMode", false));
217 | //CustomDic
218 | config.setCustomDictionaryPath(settings.get("customDictionaryPath", ""));
219 | config.setEnableCustomDictionary(settings.getAsBoolean("enableCustomDictionary", true));
220 | config.setEnableCustomDictionaryForcing(settings.getAsBoolean("enableCustomDictionaryForcing", false));
221 | //StopWord
222 | config.setEnableStopWord(settings.getAsBoolean("enableStopWord", false));
223 | config.setStopWordDictionaryPath(settings.get("stopWordDictionaryPath", ""));
224 | //NLP
225 | config.setEnableNameRecognize(settings.getAsBoolean("enableNameRecognize", true));
226 | config.setEnableJapaneseNameRecognize(settings.getAsBoolean("enableJapaneseNameRecognize", false));
227 | config.setEnableTranslatedNameRecognize(settings.getAsBoolean("enableTranslatedNameRecognize", false));
228 | config.setEnableNumberQuantifierRecognize(settings.getAsBoolean("enableNumberQuantifierRecognize", true));
229 | config.setEnableOrganizationRecognize(settings.getAsBoolean("enableOrganizationRecognize", false));
230 | config.setEnablePlaceRecognize(settings.getAsBoolean("enablePlaceRecognize", false));
231 | config.setEnableTraditionalChineseMode(settings.getAsBoolean("enableTraditionalChineseMode", false));
232 |
233 | return config;
234 | }
235 |
236 | public static Tuple<Segment, Set<String>> getSegmentAndFilter(Settings settings) {
237 | HanLPConfig config = getConfig(settings);
238 | return Tuple.tuple(getSegment(config), getStopWords(config));
239 | }
240 | }
241 |
242 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/conf/DicConfig.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.conf;
2 |
3 |
4 | import com.hankcs.hanlp.utility.Predefine;
5 | import com.hankcs.hanlp.utility.TextUtility;
6 | import org.apache.logging.log4j.Logger;
7 | import org.elasticsearch.common.logging.Loggers;
8 | import org.elasticsearch.common.settings.Settings;
9 | import org.elasticsearch.env.Environment;
10 |
11 | import java.io.File;
12 | import java.io.FileInputStream;
13 | import java.io.InputStreamReader;
14 | import java.nio.file.Path;
15 | import java.util.Properties;
16 |
17 | /**
18 | * elasticsearch-analysis-hanlp
19 | * elasticsearch-analysis-hanlp
20 | * Created by hezl on 2018-11-20.
21 | */
22 | public class DicConfig {
23 | private static final Logger logger = Loggers.getLogger(DicConfig.class, "DicConfig");
24 | private static Environment env;
25 | private static Settings settings;
26 | private static String configPath;
27 | private static String remoteDicUrl;
28 | private static boolean isInit;
29 |
30 | /**
31 | * Initialize the dictionaries and the remote-update
32 | * settings from the plugin configuration files
33 | */
34 | public static synchronized void initConfig(Environment env, Settings settings) {
35 | if (isInit) {
36 | return;
37 | }
38 | DicConfig.env = env;
39 | DicConfig.settings = settings;
40 | File configFile = getConfigFilePath().toFile();
41 | if (!configFile.exists()) {
42 | return;
43 | }
44 | Properties properties = new Properties();
45 | try (InputStreamReader inputStreamReader = new InputStreamReader(new FileInputStream(configFile), "UTF-8")) { // read the properties file as UTF-8
46 | properties.load(inputStreamReader);
47 | configPath = properties.getProperty("configPath", null);
48 | remoteDicUrl = properties.getProperty("remoteDicUrl", "");
49 | if (TextUtility.isBlank(configPath)) {
50 | if (getDefDicConfigPath().toFile().exists()) {
51 | configPath = getDefDicConfigPath().toAbsolutePath().toString();
52 | Properties cfProp = new Properties();
53 | FileInputStream inputStream = new FileInputStream(configPath);
54 | cfProp.load(inputStream);
55 | if (!cfProp.containsKey("root")){
56 | configPath = null;
57 | }
58 | inputStream.close();
59 | cfProp.clear();
60 | }
61 | }
62 | if (TextUtility.isBlank(configPath)) {
63 | configPath = null;
64 | }
65 | Predefine.HANLP_PROPERTIES_PATH = configPath;
66 | logger.info("HanLP Properties Path: " + Predefine.HANLP_PROPERTIES_PATH);
67 | //todo remote dictionary update
68 | } catch (Exception ex) {
69 | logger.error("failed to load plugin.properties", ex);
70 | } finally {
71 | properties.clear();
72 | }
73 | isInit = true;
74 | }
75 |
76 | private static Path getPluginPath() {
77 | return env.pluginsFile().resolve("analysis-hanlp");
78 | }
79 |
80 | private static Path getDefDicConfigPath() {
81 | return env.pluginsFile().resolve("analysis-hanlp/hanlp.properties").toAbsolutePath();
82 | }
83 |
84 | private static Path getConfigFilePath() {
85 | return env.pluginsFile().resolve("analysis-hanlp/plugin.properties").toAbsolutePath();
86 | }
87 | }
88 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/conf/HanLPConfig.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.conf;
2 |
3 |
4 | /**
5 |  * elasticsearch-analysis-hanlp
6 |  * elasticsearch-analysis-hanlp
7 |  * Created by hezl on 2018-12-03.
8 |  */
9 | public class HanLPConfig {
10 | /**
11 |  * Segmentation algorithm; the Chinese or the English name is accepted. Options:
12 |  *
13 |  * - viterbi (维特比): the best trade-off between speed and accuracy
14 |  * - dat (双数组trie树): extremely fast dictionary segmentation, tens of millions of characters per second
15 |  * - crf (条件随机场): high accuracy for segmentation, POS tagging and NER; suited to demanding NLP tasks
16 |  * - perceptron (感知机): segmentation, POS tagging and NER, with online-learning support
17 |  * - nshort (N最短路): slightly better named-entity recognition, at the cost of speed
18 |  *
19 |  */
20 | private String algorithm;
21 | /**
22 |  * Index mode (finest-grained segmentation)
23 |  */
24 | private boolean enableIndexMode;
25 | /**
26 |  * Whether to enable the user dictionary
27 |  */
28 | private boolean enableCustomDictionary;
29 | /**
30 |  * User dictionary paths (absolute; separate multiple dictionaries with ;)
31 |  */
32 | private String customDictionaryPath;
33 | /**
34 |  * Give the user dictionary top priority
35 |  */
36 | private boolean enableCustomDictionaryForcing;
37 | /**
38 |  * Whether to enable stop-word filtering
39 |  */
40 | private boolean enableStopWord;
41 | /**
42 |  * Stop-word dictionary path
43 |  */
44 | private String stopWordDictionaryPath;
45 | /**
46 |  * Whether to recognize numbers and quantifiers
47 |  */
48 | private boolean enableNumberQuantifierRecognize;
49 | /**
50 |  * Recognize Chinese person names
51 |  */
52 | private boolean enableNameRecognize;
53 | /**
54 |  * Whether to recognize transliterated person names
55 |  */
56 | private boolean enableTranslatedNameRecognize;
57 | /**
58 |  * Whether to recognize Japanese person names
59 |  */
60 | private boolean enableJapaneseNameRecognize;
61 | /**
62 |  * Recognize organization names
63 |  */
64 | private boolean enableOrganizationRecognize;
65 | /**
66 |  * Recognize place names
67 |  */
68 | private boolean enablePlaceRecognize;
69 | /**
70 |  * Accurate Traditional Chinese segmentation
71 |  */
72 | private boolean enableTraditionalChineseMode;
73 |
74 | public String getAlgorithm() {
75 | return algorithm;
76 | }
77 |
78 | public void setAlgorithm(String algorithm) {
79 | this.algorithm = algorithm;
80 | }
81 |
82 | public boolean isEnableIndexMode() {
83 | return enableIndexMode;
84 | }
85 |
86 | public void setEnableIndexMode(boolean enableIndexMode) {
87 | this.enableIndexMode = enableIndexMode;
88 | }
89 |
90 | public boolean isEnableCustomDictionary() {
91 | return enableCustomDictionary;
92 | }
93 |
94 | public void setEnableCustomDictionary(boolean enableCustomDictionary) {
95 | this.enableCustomDictionary = enableCustomDictionary;
96 | }
97 |
98 | public String getCustomDictionaryPath() {
99 | return customDictionaryPath;
100 | }
101 |
102 | public void setCustomDictionaryPath(String customDictionaryPath) {
103 | this.customDictionaryPath = customDictionaryPath;
104 | }
105 |
106 | public boolean isEnableCustomDictionaryForcing() {
107 | return enableCustomDictionaryForcing;
108 | }
109 |
110 | public void setEnableCustomDictionaryForcing(boolean enableCustomDictionaryForcing) {
111 | this.enableCustomDictionaryForcing = enableCustomDictionaryForcing;
112 | }
113 |
114 | public String getStopWordDictionaryPath() {
115 | return stopWordDictionaryPath;
116 | }
117 |
118 | public void setStopWordDictionaryPath(String stopWordDictionaryPath) {
119 | this.stopWordDictionaryPath = stopWordDictionaryPath;
120 | }
121 |
122 | public boolean isEnableNumberQuantifierRecognize() {
123 | return enableNumberQuantifierRecognize;
124 | }
125 |
126 | public void setEnableNumberQuantifierRecognize(boolean enableNumberQuantifierRecognize) {
127 | this.enableNumberQuantifierRecognize = enableNumberQuantifierRecognize;
128 | }
129 |
130 | public boolean isEnableNameRecognize() {
131 | return enableNameRecognize;
132 | }
133 |
134 | public void setEnableNameRecognize(boolean enableNameRecognize) {
135 | this.enableNameRecognize = enableNameRecognize;
136 | }
137 |
138 | public boolean isEnableTranslatedNameRecognize() {
139 | return enableTranslatedNameRecognize;
140 | }
141 |
142 | public void setEnableTranslatedNameRecognize(boolean enableTranslatedNameRecognize) {
143 | this.enableTranslatedNameRecognize = enableTranslatedNameRecognize;
144 | }
145 |
146 | public boolean isEnableJapaneseNameRecognize() {
147 | return enableJapaneseNameRecognize;
148 | }
149 |
150 | public void setEnableJapaneseNameRecognize(boolean enableJapaneseNameRecognize) {
151 | this.enableJapaneseNameRecognize = enableJapaneseNameRecognize;
152 | }
153 |
154 | public boolean isEnableOrganizationRecognize() {
155 | return enableOrganizationRecognize;
156 | }
157 |
158 | public void setEnableOrganizationRecognize(boolean enableOrganizationRecognize) {
159 | this.enableOrganizationRecognize = enableOrganizationRecognize;
160 | }
161 |
162 | public boolean isEnablePlaceRecognize() {
163 | return enablePlaceRecognize;
164 | }
165 |
166 | public void setEnablePlaceRecognize(boolean enablePlaceRecognize) {
167 | this.enablePlaceRecognize = enablePlaceRecognize;
168 | }
169 |
170 | public boolean isEnableTraditionalChineseMode() {
171 | return enableTraditionalChineseMode;
172 | }
173 |
174 | public void setEnableTraditionalChineseMode(boolean enableTraditionalChineseMode) {
175 | this.enableTraditionalChineseMode = enableTraditionalChineseMode;
176 | }
177 |
178 | public boolean isEnableStopWord() {
179 | return enableStopWord;
180 | }
181 |
182 | public void setEnableStopWord(boolean enableStopWord) {
183 | this.enableStopWord = enableStopWord;
184 | }
185 | }
186 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/lucene/HanLPAnalyzer.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.lucene;
2 |
3 | import com.hankcs.hanlp.HanLP;
4 | import com.hankcs.hanlp.seg.Segment;
5 | import org.apache.lucene.analysis.Analyzer;
6 | import org.apache.lucene.analysis.Tokenizer;
7 |
8 | import java.util.Set;
9 |
10 | public class HanLPAnalyzer extends Analyzer {
11 | private boolean enablePorterStemming;
12 | private Set<String> filter;
13 | private Segment segment;
14 |
15 | /**
16 |  * @param filter               stop words
17 |  * @param enablePorterStemming whether to stem tokens (English only)
18 |  */
19 | public HanLPAnalyzer(Segment segment, Set<String> filter, boolean enablePorterStemming) {
20 | this.segment = segment;
21 | this.filter = filter;
22 | this.enablePorterStemming = enablePorterStemming;
23 | }
24 |
25 | /**
26 |  * @param enablePorterStemming whether to stem English tokens (normalizes plural forms and tenses)
27 |  */
28 | public HanLPAnalyzer(Segment segment, boolean enablePorterStemming) {
29 | this.segment = segment;
30 | this.enablePorterStemming = enablePorterStemming;
31 | }
32 |
33 | public HanLPAnalyzer(Segment segment, Set<String> filter) {
34 | this.segment = segment;
35 | this.filter = filter;
36 | this.enablePorterStemming = true;
37 | }
38 |
39 | public HanLPAnalyzer(Segment segment) {
40 | this.segment = segment;
41 | this.enablePorterStemming = true;
42 | }
43 |
44 | public HanLPAnalyzer() {
45 | super();
46 | this.segment = HanLP.newSegment().enableOffset(true).enableIndexMode(true).enablePartOfSpeechTagging(true);
47 | }
48 |
49 | /**
50 |  * Analyzer override: build the tokenization components
51 |  */
52 | @Override
53 | protected TokenStreamComponents createComponents(String fieldName) {
54 | Tokenizer tokenizer = new HanLPTokenizer(this.segment, filter, enablePorterStemming);
55 | return new TokenStreamComponents(tokenizer);
56 | }
57 | }
58 |
59 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/lucene/HanLPTokenFilter.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.lucene;
2 |
3 | import com.hankcs.hanlp.corpus.tag.Nature;
4 | import com.hankcs.hanlp.seg.common.Term;
5 | import org.apache.lucene.analysis.TokenFilter;
6 | import org.apache.lucene.analysis.TokenStream;
7 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
8 |
9 | import java.io.IOException;
10 | import java.util.ArrayList;
11 | import java.util.Iterator;
12 | import java.util.LinkedList;
13 | import java.util.List;
14 | /**
15 | * @author hankcs
16 | */
17 | public class HanLPTokenFilter extends TokenFilter {
18 | private CharTermAttribute termAtt = this.addAttribute(CharTermAttribute.class);
19 | private String mode;
20 | private LinkedList<String> tokensCache = new LinkedList<>();
21 |
22 | public HanLPTokenFilter(String mode, TokenStream input) {
23 | super(input);
24 | this.mode = mode;
25 | }
26 |
27 | @Override
28 | public boolean incrementToken() throws IOException {
29 | if (hasMoreTokenInCache()) {
30 | this.termAtt.setEmpty();
31 | this.termAtt.append(nextTokenLexeme());
32 | return true;
33 | }
34 | if (this.input.incrementToken()) {
35 | char[] text = this.termAtt.buffer();
36 | List<Term> terms;
37 | switch (this.mode) {
38 | //todo implement the pinyin and traditional/simplified conversion modes below
39 |
40 | // case "t2s":
41 | // terms = SegmentHelper.segSentence(text, TsMode.T2S);
42 | // break;
43 | // case "s2t":
44 | // terms = SegmentHelper.segSentence(text, TsMode.S2T);
45 | // break;
46 | // case "ts":
47 | // terms = SegmentHelper.segSentence(text, TsMode.ALL);
48 | // break;
49 | // case "py_first":
50 | // terms = SegmentHelper.segSentence(text, PinyinMode.FIRST_LETTER);
51 | // break;
52 | // case "py_full":
53 | // terms = SegmentHelper.segSentence(text, PinyinMode.FULL_PINYIN);
54 | // break;
55 | // case "py_mix":
56 | // terms = SegmentHelper.segSentence(text, PinyinMode.MIX_PINYIN);
57 | // break;
58 | // case "py_all":
59 | // terms = SegmentHelper.segSentence(text, PinyinMode.MIX_ALL);
60 | // break;
61 | default:
62 | terms = new ArrayList<>();
63 | terms.add(new Term(new String(text, 0, this.termAtt.length()), Nature.nz)); // only termAtt.length() chars of the buffer are valid
64 | break;
65 | }
66 | Iterator<Term> pinyinIterator = terms.iterator();
67 | if (pinyinIterator.hasNext()) {
68 | String pinyinItem = pinyinIterator.next().word;
69 | while (pinyinIterator.hasNext()) {
70 | addTokenToCache(pinyinIterator.next().word);
71 | }
72 | this.termAtt.setEmpty();
73 | this.termAtt.append(pinyinItem);
74 | }
75 | return true;
76 | }
77 | return false;
78 | }
79 |
80 | @Override
81 | public void reset() throws IOException {
82 | super.reset();
83 | tokensCache.clear();
84 | }
85 |
86 | protected boolean hasMoreTokenInCache() {
87 | return !tokensCache.isEmpty();
88 | }
89 |
90 | private String nextTokenLexeme() {
91 | return tokensCache.pollFirst();
92 | }
93 |
94 | private void addTokenToCache(String token) {
95 | if (token != null) {
96 | tokensCache.add(token);
97 | }
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/lucene/HanLPTokenizer.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.lucene;
2 |
3 | import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
4 | import com.hankcs.hanlp.corpus.tag.Nature;
5 | import com.hankcs.hanlp.seg.Segment;
6 | import com.hankcs.hanlp.seg.common.Term;
7 | import com.hankcs.hanlp.utility.TextUtility;
8 | import org.apache.lucene.analysis.Tokenizer;
9 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
10 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
11 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
12 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
13 |
14 | import java.io.BufferedReader;
15 | import java.io.IOException;
16 | import java.util.Set;
17 |
18 | /**
19 | * @author hankcs
20 | */
21 | public class HanLPTokenizer extends Tokenizer {
22 | // current term
23 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
24 | // character offsets
25 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
26 | // position increment
27 | private final PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class);
28 | // part of speech
29 | private TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
30 |
31 | private SegmentWrapper segment;
32 | private BinTrie<String> filter;
33 | private boolean enablePorterStemming;
34 | private final PorterStemmer stemmer = new PorterStemmer();
35 |
36 | /**
37 |  * Running offset within the current document; not cleared on reset() (switching between values of a multi-valued field), cleared in end() (switching fields)
38 |  */
39 | private int totalOffset = 0;
40 |
41 | /**
42 |  * @param segment              one of HanLP's segmenters
43 |  * @param filter               stop words
44 |  * @param enablePorterStemming stem English tokens to their base form
45 |  */
46 | public HanLPTokenizer(Segment segment, Set<String> filter, boolean enablePorterStemming) {
47 | super();
48 | this.segment = new SegmentWrapper(input, segment);
49 | if (filter != null && filter.size() > 0) {
50 | this.filter = new BinTrie<>();
51 | for (String stopWord : filter) {
52 | this.filter.put(stopWord, null);
53 | }
54 | }
55 | this.enablePorterStemming = enablePorterStemming;
56 | }
57 |
58 | @Override
59 | final public boolean incrementToken() throws IOException {
60 | clearAttributes();
61 | int position = 0;
62 | Term term;
63 | boolean un_increased = true;
64 | do {
65 | term = segment.next();
66 | if (term == null) {
67 | break;
68 | }
69 | if (TextUtility.isBlank(term.word)) // skip whitespace-only terms to keep the index efficient
70 | {
71 | continue;
72 | }
73 | if (enablePorterStemming && term.nature == Nature.nx) {
74 | term.word = stemmer.stem(term.word);
75 | }
76 |
77 | if (filter != null && filter.containsKey(term.word)) {
78 | continue;
79 | } else {
80 | ++position;
81 | un_increased = false;
82 | }
83 | }
84 | while (un_increased);
85 |
86 | if (term != null) {
87 | positionAttr.setPositionIncrement(position);
88 | termAtt.setEmpty().append(term.word);
89 | offsetAtt.setOffset(correctOffset(totalOffset + term.offset),
90 | correctOffset(totalOffset + term.offset + term.word.length()));
91 | typeAtt.setType(term.nature == null ? "null" : term.nature.toString());
92 | return true;
93 | } else {
94 | totalOffset += segment.offset;
95 | return false;
96 | }
97 | }
98 |
99 | @Override
100 | public void end() throws IOException {
101 | super.end();
102 | offsetAtt.setOffset(totalOffset, totalOffset);
103 | totalOffset = 0;
104 | }
105 |
106 | /**
107 |  * Must be overridden, otherwise bulk indexing of files will fail
108 |  */
109 | @Override
110 | public void reset() throws IOException {
111 | super.reset();
112 | segment.reset(new BufferedReader(this.input));
113 | }
114 |
115 | }
116 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/lucene/PorterStemmer.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.lucene;
2 |
3 | import org.apache.lucene.util.ArrayUtil;
4 |
5 | import java.io.FileInputStream;
6 | import java.io.IOException;
7 | import java.io.InputStream;
8 |
9 | public class PorterStemmer {
10 | private char[] b;
11 | private int i, /* offset into b */
12 | j, k, k0;
13 | private boolean dirty = false;
14 | private static final int INITIAL_SIZE = 50;
15 |
16 | public PorterStemmer() {
17 | b = new char[INITIAL_SIZE];
18 | i = 0;
19 | }
20 |
21 | /**
22 | * reset() resets the stemmer so it can stem another word. If you invoke the
23 | * stemmer by calling add(char) and then stem(), you must call reset()
24 | * before starting another word.
25 | */
26 | public void reset() {
27 | i = 0;
28 | dirty = false;
29 | }
30 |
31 | /**
32 | * Add a character to the word being stemmed. When you are finished adding
33 | * characters, you can call stem(void) to process the word.
34 | */
35 | public void add(char ch) {
36 | if (b.length <= i) {
37 | b = ArrayUtil.grow(b, i + 1);
38 | }
39 | b[i++] = ch;
40 | }
41 |
42 | /**
43 | * After a word has been stemmed, it can be retrieved by toString(), or a
44 | * reference to the internal buffer can be retrieved by getResultBuffer and
45 | * getResultLength (which is generally more efficient.)
46 | */
47 | @Override
48 | public String toString() {
49 | return new String(b, 0, i);
50 | }
51 |
52 | /**
53 | * Returns the length of the word resulting from the stemming process.
54 | */
55 | public int getResultLength() {
56 | return i;
57 | }
58 |
59 | /**
60 | * Returns a reference to a character buffer containing the results of the
61 | * stemming process. You also need to consult getResultLength() to determine
62 | * the length of the result.
63 | */
64 | public char[] getResultBuffer() {
65 | return b;
66 | }
67 |
68 | /* cons(i) is true <=> b[i] is a consonant. */
69 |
70 | private final boolean cons(int i) {
71 | switch (b[i]) {
72 | case 'a':
73 | case 'e':
74 | case 'i':
75 | case 'o':
76 | case 'u':
77 | return false;
78 | case 'y':
79 | return (i == k0) ? true : !cons(i - 1);
80 | default:
81 | return true;
82 | }
83 | }
84 |
85 | /*
86 | * m() measures the number of consonant sequences between k0 and j. if c is
87 | * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
88 | * presence,
89 | *
90 |  * <c><v> gives 0, <c>vc<v> gives 1, <c>vcvc<v> gives 2, <c>vcvcvc<v> gives 3
91 | * ....
92 | */
93 |
94 | private final int m() {
95 | int n = 0;
96 | int i = k0;
97 | while (true) {
98 | if (i > j)
99 | return n;
100 | if (!cons(i))
101 | break;
102 | i++;
103 | }
104 | i++;
105 | while (true) {
106 | while (true) {
107 | if (i > j)
108 | return n;
109 | if (cons(i))
110 | break;
111 | i++;
112 | }
113 | i++;
114 | n++;
115 | while (true) {
116 | if (i > j)
117 | return n;
118 | if (!cons(i))
119 | break;
120 | i++;
121 | }
122 | i++;
123 | }
124 | }
125 |
126 | /* vowelinstem() is true <=> k0,...j contains a vowel */
127 |
128 | private final boolean vowelinstem() {
129 | int i;
130 | for (i = k0; i <= j; i++)
131 | if (!cons(i))
132 | return true;
133 | return false;
134 | }
135 |
136 | /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
137 |
138 | private final boolean doublec(int j) {
139 | if (j < k0 + 1)
140 | return false;
141 | if (b[j] != b[j - 1])
142 | return false;
143 | return cons(j);
144 | }
145 |
146 | /*
147 | * cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
148 | * and also if the second c is not w,x or y. this is used when trying to
149 | * restore an e at the end of a short word. e.g.
150 | *
151 | * cav(e), lov(e), hop(e), crim(e), but snow, box, tray.
152 | */
153 |
154 | private final boolean cvc(int i) {
155 | if (i < k0 + 2 || !cons(i) || cons(i - 1) || !cons(i - 2))
156 | return false;
157 | else {
158 | int ch = b[i];
159 | if (ch == 'w' || ch == 'x' || ch == 'y')
160 | return false;
161 | }
162 | return true;
163 | }
164 |
165 | private final boolean ends(String s) {
166 | int l = s.length();
167 | int o = k - l + 1;
168 | if (o < k0)
169 | return false;
170 | for (int i = 0; i < l; i++)
171 | if (b[o + i] != s.charAt(i))
172 | return false;
173 | j = k - l;
174 | return true;
175 | }
176 |
177 | /*
178 | * setto(s) sets (j+1),...k to the characters in the string s, readjusting
179 | * k.
180 | */
181 |
182 | void setto(String s) {
183 | int l = s.length();
184 | int o = j + 1;
185 | for (int i = 0; i < l; i++)
186 | b[o + i] = s.charAt(i);
187 | k = j + l;
188 | dirty = true;
189 | }
190 |
191 | /* r(s) is used further down. */
192 |
193 | void r(String s) {
194 | if (m() > 0)
195 | setto(s);
196 | }
197 |
198 | /*
199 | * step1() gets rid of plurals and -ed or -ing. e.g.
200 | *
201 | * caresses -> caress ponies -> poni ties -> ti caress -> caress cats -> cat
202 | *
203 | * feed -> feed agreed -> agree disabled -> disable
204 | *
205 | * matting -> mat mating -> mate meeting -> meet milling -> mill messing ->
206 | * mess
207 | *
208 | * meetings -> meet
209 | */
210 |
211 | private final void step1() {
212 | if (b[k] == 's') {
213 | if (ends("sses"))
214 | k -= 2;
215 | else if (ends("ies"))
216 | setto("i");
217 | else if (b[k - 1] != 's')
218 | k--;
219 | }
220 | if (ends("eed")) {
221 | if (m() > 0)
222 | k--;
223 | } else if ((ends("ed") || ends("ing")) && vowelinstem()) {
224 | k = j;
225 | if (ends("at"))
226 | setto("ate");
227 | else if (ends("bl"))
228 | setto("ble");
229 | else if (ends("iz"))
230 | setto("ize");
231 | else if (doublec(k)) {
232 | int ch = b[k--];
233 | if (ch == 'l' || ch == 's' || ch == 'z')
234 | k++;
235 | } else if (m() == 1 && cvc(k))
236 | setto("e");
237 | }
238 | }
239 |
240 | /* step2() turns terminal y to i when there is another vowel in the stem. */
241 |
242 | private final void step2() {
243 | if (ends("y") && vowelinstem()) {
244 | b[k] = 'i';
245 | dirty = true;
246 | }
247 | }
248 |
249 | /*
250 | * step3() maps double suffices to single ones. so -ization ( = -ize plus
251 | * -ation) maps to -ize etc. note that the string before the suffix must
252 | * give m() > 0.
253 | */
254 |
255 | private final void step3() {
256 | if (k == k0)
257 | return; /* For Bug 1 */
258 | switch (b[k - 1]) {
259 | case 'a':
260 | if (ends("ational")) {
261 | r("ate");
262 | break;
263 | }
264 | if (ends("tional")) {
265 | r("tion");
266 | break;
267 | }
268 | break;
269 | case 'c':
270 | if (ends("enci")) {
271 | r("ence");
272 | break;
273 | }
274 | if (ends("anci")) {
275 | r("ance");
276 | break;
277 | }
278 | break;
279 | case 'e':
280 | if (ends("izer")) {
281 | r("ize");
282 | break;
283 | }
284 | break;
285 | case 'l':
286 | if (ends("bli")) {
287 | r("ble");
288 | break;
289 | }
290 | if (ends("alli")) {
291 | r("al");
292 | break;
293 | }
294 | if (ends("entli")) {
295 | r("ent");
296 | break;
297 | }
298 | if (ends("eli")) {
299 | r("e");
300 | break;
301 | }
302 | if (ends("ousli")) {
303 | r("ous");
304 | break;
305 | }
306 | break;
307 | case 'o':
308 | if (ends("ization")) {
309 | r("ize");
310 | break;
311 | }
312 | if (ends("ation")) {
313 | r("ate");
314 | break;
315 | }
316 | if (ends("ator")) {
317 | r("ate");
318 | break;
319 | }
320 | break;
321 | case 's':
322 | if (ends("alism")) {
323 | r("al");
324 | break;
325 | }
326 | if (ends("iveness")) {
327 | r("ive");
328 | break;
329 | }
330 | if (ends("fulness")) {
331 | r("ful");
332 | break;
333 | }
334 | if (ends("ousness")) {
335 | r("ous");
336 | break;
337 | }
338 | break;
339 | case 't':
340 | if (ends("aliti")) {
341 | r("al");
342 | break;
343 | }
344 | if (ends("iviti")) {
345 | r("ive");
346 | break;
347 | }
348 | if (ends("biliti")) {
349 | r("ble");
350 | break;
351 | }
352 | break;
353 | case 'g':
354 | if (ends("logi")) {
355 | r("log");
356 | break;
357 | }
358 | }
359 | }
360 |
361 | /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
362 |
363 | private final void step4() {
364 | switch (b[k]) {
365 | case 'e':
366 | if (ends("icate")) {
367 | r("ic");
368 | break;
369 | }
370 | if (ends("ative")) {
371 | r("");
372 | break;
373 | }
374 | if (ends("alize")) {
375 | r("al");
376 | break;
377 | }
378 | break;
379 | case 'i':
380 | if (ends("iciti")) {
381 | r("ic");
382 | break;
383 | }
384 | break;
385 | case 'l':
386 | if (ends("ical")) {
387 | r("ic");
388 | break;
389 | }
390 | if (ends("ful")) {
391 | r("");
392 | break;
393 | }
394 | break;
395 | case 's':
396 | if (ends("ness")) {
397 | r("");
398 | break;
399 | }
400 | break;
401 | }
402 | }
403 |
404 | /* step5() takes off -ant, -ence etc., in context vcvc. */
405 |
406 | private final void step5() {
407 | if (k == k0)
408 | return; /* for Bug 1 */
409 | switch (b[k - 1]) {
410 | case 'a':
411 | if (ends("al"))
412 | break;
413 | return;
414 | case 'c':
415 | if (ends("ance"))
416 | break;
417 | if (ends("ence"))
418 | break;
419 | return;
420 | case 'e':
421 | if (ends("er"))
422 | break;
423 | return;
424 | case 'i':
425 | if (ends("ic"))
426 | break;
427 | return;
428 | case 'l':
429 | if (ends("able"))
430 | break;
431 | if (ends("ible"))
432 | break;
433 | return;
434 | case 'n':
435 | if (ends("ant"))
436 | break;
437 | if (ends("ement"))
438 | break;
439 | if (ends("ment"))
440 | break;
441 | /* element etc. not stripped before the m */
442 | if (ends("ent"))
443 | break;
444 | return;
445 | case 'o':
446 | if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't'))
447 | break;
448 | /* j >= 0 fixes Bug 2 */
449 | if (ends("ou"))
450 | break;
451 | return;
452 | /* takes care of -ous */
453 | case 's':
454 | if (ends("ism"))
455 | break;
456 | return;
457 | case 't':
458 | if (ends("ate"))
459 | break;
460 | if (ends("iti"))
461 | break;
462 | return;
463 | case 'u':
464 | if (ends("ous"))
465 | break;
466 | return;
467 | case 'v':
468 | if (ends("ive"))
469 | break;
470 | return;
471 | case 'z':
472 | if (ends("ize"))
473 | break;
474 | return;
475 | default:
476 | return;
477 | }
478 | if (m() > 1)
479 | k = j;
480 | }
481 |
482 | /* step6() removes a final -e if m() > 1. */
483 |
484 | private final void step6() {
485 | j = k;
486 | if (b[k] == 'e') {
487 | int a = m();
488 | if (a > 1 || a == 1 && !cvc(k - 1))
489 | k--;
490 | }
491 | if (b[k] == 'l' && doublec(k) && m() > 1)
492 | k--;
493 | }
494 |
495 | /**
496 | * Stem a word provided as a String. Returns the result as a String.
497 | */
498 | public String stem(String s) {
499 | if (stem(s.toCharArray(), s.length()))
500 | return toString();
501 | else
502 | return s;
503 | }
504 |
505 | /**
506 | * Stem a word contained in a char[]. Returns true if the stemming process
507 | * resulted in a word different from the input. You can retrieve the result
508 | * with getResultLength()/getResultBuffer() or toString().
509 | */
510 | public boolean stem(char[] word) {
511 | return stem(word, word.length);
512 | }
513 |
514 | /**
515 | * Stem a word contained in a portion of a char[] array. Returns true if the
516 | * stemming process resulted in a word different from the input. You can
517 | * retrieve the result with getResultLength()/getResultBuffer() or
518 | * toString().
519 | */
520 | public boolean stem(char[] wordBuffer, int offset, int wordLen) {
521 | reset();
522 | if (b.length < wordLen) {
523 | b = new char[ArrayUtil.oversize(wordLen, Character.BYTES)];
524 | }
525 | System.arraycopy(wordBuffer, offset, b, 0, wordLen);
526 | i = wordLen;
527 | return stem(0);
528 | }
529 |
530 | /**
531 | * Stem a word contained in a leading portion of a char[] array. Returns
532 | * true if the stemming process resulted in a word different from the input.
533 | * You can retrieve the result with getResultLength()/getResultBuffer() or
534 | * toString().
535 | */
536 | public boolean stem(char[] word, int wordLen) {
537 | return stem(word, 0, wordLen);
538 | }
539 |
540 | /**
541 | * Stem the word placed into the Stemmer buffer through calls to add().
542 | * Returns true if the stemming process resulted in a word different from
543 | * the input. You can retrieve the result with
544 | * getResultLength()/getResultBuffer() or toString().
545 | */
546 | public boolean stem() {
547 | return stem(0);
548 | }
549 |
550 | public boolean stem(int i0) {
551 | k = i - 1;
552 | k0 = i0;
553 | if (k > k0 + 1) {
554 | step1();
555 | step2();
556 | step3();
557 | step4();
558 | step5();
559 | step6();
560 | }
561 | // Also, a word is considered dirty if we lopped off letters
562 | // Thanks to Ifigenia Vairelles for pointing this out.
563 | if (i != k + 1)
564 | dirty = true;
565 | i = k + 1;
566 | return dirty;
567 | }
568 |
569 | /**
570 | * Test program for demonstrating the stemmer. It reads one or more files and stems
571 | * each word, writing the result to standard out. Usage: PorterStemmer file-name [file-name ...]
572 | */
573 | public static void main(String[] args) {
574 | PorterStemmer s = new PorterStemmer();
575 |
576 | for (int i = 0; i < args.length; i++) {
577 | try {
578 | InputStream in = new FileInputStream(args[i]);
579 | byte[] buffer = new byte[1024];
580 | int bufferLen, offset, ch;
581 |
582 | bufferLen = in.read(buffer);
583 | offset = 0;
584 | s.reset();
585 |
586 | while (true) {
587 | if (offset < bufferLen)
588 | ch = buffer[offset++];
589 | else { // buffer exhausted: refill from the stream
590 | bufferLen = in.read(buffer);
591 | offset = 0;
592 | if (bufferLen < 0)
593 | ch = -1; // end of stream
594 | else
595 | ch = buffer[offset++];
596 | }
597 |
598 | if (Character.isLetter((char) ch)) { // note: the byte-to-char cast assumes single-byte (ASCII/Latin-1) input
599 | s.add(Character.toLowerCase((char) ch));
600 | } else {
601 | s.stem();
602 | System.out.print(s.toString());
603 | s.reset();
604 | if (ch < 0)
605 | break;
606 | else {
607 | System.out.print((char) ch);
608 | }
609 | }
610 | }
611 |
612 | in.close();
613 | } catch (IOException e) {
614 | System.out.println("error reading " + args[i]);
615 | }
616 | }
617 | }
618 |
619 | }
620 |
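As a quick sanity check of the steps above, the stemmer can be driven directly. A minimal sketch (the sample words are illustrative, not taken from the plugin's tests):

```
PorterStemmer stemmer = new PorterStemmer();
// step3 rewrites "ization" -> "ize", step4 "alize" -> "al",
// and step5 strips the remaining "al" because m() > 1:
System.out.println(stemmer.stem("nationalization")); // nation
// if nothing is removed, stem(String) returns the input unchanged:
System.out.println(stemmer.stem("nation")); // nation
```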
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/lucene/SegmentWrapper.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.lucene;
2 |
3 |
4 | import com.hankcs.hanlp.seg.Segment;
5 | import com.hankcs.hanlp.seg.common.Term;
6 |
7 | import java.io.IOException;
8 | import java.io.Reader;
9 | import java.util.HashSet;
10 | import java.util.Iterator;
11 | import java.util.List;
12 | import java.util.Set;
13 |
14 | /**
15 | * Wraps a segmenter so that it outputs one token at a time
16 | *
17 | * @author hankcs
18 | */
19 | public class SegmentWrapper {
20 | /**
21 | * Input reader
22 | */
23 | private Reader input;
24 | /**
25 | * The underlying segmenter
26 | */
27 | private Segment segment;
28 | /**
29 | * Segmentation result for the current line
30 | */
31 | private Iterator<Term> iterator;
32 | /**
33 | * Term offset; the wrapper reads line by line, so each term.offset must be corrected by the length of the lines already consumed
34 | */
35 | int offset;
36 | /**
37 | * Buffer size
38 | */
39 | private static final int BUFFER_SIZE = 512;
40 | /**
41 | * Buffer
42 | */
43 | private char[] buffer = new char[BUFFER_SIZE];
44 | /**
45 | * Number of characters in the buffer that have not been processed yet
46 | */
47 | private int remainSize = 0;
48 |
49 | /**
50 | * Sentence delimiters
51 | */
52 | private static final Set<Character> delimiterCharSet = new HashSet<Character>() {{
53 | add('\r');
54 | add('\n');
55 | add('。');
56 | add('!');
57 | add('!');
58 | }};
59 |
60 | public SegmentWrapper(Reader reader, Segment segment) {
61 | this.input = reader;
62 | this.segment = segment;
63 | }
64 |
65 | /**
66 | * Reset the wrapper with a new input
67 | *
68 | * @param reader the new input source
69 | */
70 | public void reset(Reader reader) {
71 | input = reader;
72 | offset = 0;
73 | iterator = null; remainSize = 0; // also drop any characters still buffered from the previous input
74 | }
75 |
76 | public Term next() throws IOException {
77 | if (iterator != null && iterator.hasNext()) return iterator.next();
78 | String line = readLine();
79 | if (line == null) return null;
80 | List<Term> termList = segment.seg(line);
81 | if (termList.isEmpty()) return null;
82 | for (Term term : termList) {
83 | term.offset += offset; // shift line-local offsets so they are relative to the whole input
84 | }
85 | offset += line.length();
86 | iterator = termList.iterator();
87 | return iterator.next();
88 | }
89 |
90 | private String readLine() throws IOException {
91 | int writePos = 0; // write position in the buffer; renamed from "offset" to avoid shadowing the field above
92 | int length = BUFFER_SIZE;
93 | if (remainSize > 0) { // leftover from the previous call already sits at the front of the buffer
94 | writePos = remainSize;
95 | length -= remainSize;
96 | }
97 | int n = input.read(buffer, writePos, length);
98 | if (n < 0) {
99 | if (remainSize != 0) {
100 | String lastLine = new String(buffer, 0, remainSize);
101 | remainSize = 0;
102 | return lastLine;
103 | }
104 | return null;
105 | }
106 | n += writePos; // n is now the total number of valid characters in the buffer
107 |
108 | int eos = lastIndexOfEos(buffer, n); // cut after the last sentence delimiter
109 | String line = new String(buffer, 0, eos);
110 | remainSize = n - eos; // keep the incomplete tail for the next call
111 | System.arraycopy(buffer, eos, buffer, 0, remainSize);
112 | return line;
113 | }
114 |
115 | private int lastIndexOfEos(char[] buffer, int length) {
116 | for (int i = length - 1; i > 0; i--) {
117 | if (delimiterCharSet.contains(buffer[i])) {
118 | return i + 1;
119 | }
120 | }
121 | return length;
122 | }
123 | }
124 |
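For illustration, the wrapper can also be exercised outside the tokenizer. A minimal sketch, assuming HanLP's default segment (the demo class and sample text are illustrative, not part of the plugin sources):

```
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import org.elasticsearch.plugin.hanlp.lucene.SegmentWrapper;

import java.io.StringReader;

// Hypothetical standalone driver for SegmentWrapper
public class SegmentWrapperDemo {
    public static void main(String[] args) throws Exception {
        SegmentWrapper wrapper = new SegmentWrapper(
                new StringReader("商品和服务。价格便宜!"), HanLP.newSegment());
        // pull terms one at a time until the input is exhausted
        for (Term term = wrapper.next(); term != null; term = wrapper.next()) {
            // offsets are already relative to the whole input, not the current line
            System.out.println(term.word + " @ " + term.offset);
        }
    }
}
```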
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/utils/CommUtils.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.utils;
2 |
3 | import java.lang.reflect.Field;
4 | import java.util.Collections;
5 | import java.util.HashMap;
6 | import java.util.Map;
7 |
8 | /**
9 | * elasticsearch-analysis-hanlp
10 | * Reflection-based helper for setting process environment variables
11 | * Created by hezl on 2018-11-20.
12 | */
13 | public class CommUtils {
14 |
15 | public static void setEnv(String key, String value){
16 | Map<String, String> newEnv = new HashMap<>();
17 | newEnv.put(key, value);
18 | try{
19 | setEnv(newEnv);
20 | }
21 | catch (Exception ex){
22 | // best effort: swallow failures (e.g. reflective access denied on newer JVMs)
23 | }
24 | }
25 |
26 | @SuppressWarnings("unchecked")
27 | private static void setEnv(Map newEnv) throws Exception {
28 | try {
29 | Class> processEnvironmentClass = Class.forName("java.lang.ProcessEnvironment");
30 | Field theEnvironmentField = processEnvironmentClass.getDeclaredField("theEnvironment");
31 | theEnvironmentField.setAccessible(true);
32 | Map env = (Map) theEnvironmentField.get(null);
33 | env.putAll(newEnv);
34 | Field theCaseInsensitiveEnvironmentField = processEnvironmentClass.getDeclaredField("theCaseInsensitiveEnvironment");
35 | theCaseInsensitiveEnvironmentField.setAccessible(true);
36 | Map ciEnv = (Map) theCaseInsensitiveEnvironmentField.get(null);
37 | ciEnv.putAll(newEnv);
38 | } catch (NoSuchFieldException e) {
39 | Class[] classes = Collections.class.getDeclaredClasses();
40 | Map env = System.getenv();
41 | for (Class cl : classes) {
42 | if ("java.util.Collections$UnmodifiableMap".equals(cl.getName())) {
43 | Field field = cl.getDeclaredField("m");
44 | field.setAccessible(true);
45 | Object obj = field.get(env);
46 | Map map = (Map) obj;
47 | map.putAll(newEnv);
48 | }
49 | }
50 | }
51 | }
52 |
53 | }
54 |
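This is the widely known reflection trick for mutating the process environment from inside the JVM, so that values become visible through System.getenv(). Note that on Java 9+ the reflective access may be denied unless the JVM is started with the corresponding --add-opens options. A usage sketch (the variable name HANLP_ROOT is illustrative, not a documented setting):

```
// Hypothetical usage of CommUtils.setEnv
CommUtils.setEnv("HANLP_ROOT", "/data/hanlp");
System.out.println(System.getenv("HANLP_ROOT")); // should now print /data/hanlp
```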
--------------------------------------------------------------------------------
/src/main/resources/plugin-descriptor.properties:
--------------------------------------------------------------------------------
1 | # Elasticsearch plugin descriptor file
2 | # This file must exist as 'plugin-descriptor.properties' at
3 | # the root directory of all plugins.
4 | ### mandatory elements for all plugins:
5 | #
6 | # 'description': simple summary of the plugin
7 | description=${project.description}
8 | #
9 | # 'version': plugin's version
10 | version=${project.version}
11 | #
12 | # 'name': the plugin name
13 | name=${elasticsearch.plugin.name}
14 | #
15 | # 'classname': the name of the class to load, fully-qualified.
16 | classname=${elasticsearch.plugin.classname}
17 | #
18 | # 'java.version' version of java the code is built against
19 | # use the system property java.specification.version
20 | # version string must be a sequence of nonnegative decimal integers
21 | # separated by "."'s and may have leading zeros
22 | java.version=${maven.compiler.target}
23 | #
24 | # 'elasticsearch.version' version of elasticsearch compiled against
25 | # You will have to release a new version of the plugin for each new
26 | # elasticsearch release. This version is checked when the plugin
27 | # is loaded so Elasticsearch will refuse to start in the presence of
28 | # plugins with the incorrect elasticsearch.version.
29 | elasticsearch.version=${elasticsearch.version}
30 |
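The ${...} placeholders are filled in by Maven resource filtering during `mvn package`. For a 6.5.1 build the resolved file would look roughly like this (the name, description, and java.version values below are illustrative; only the classname is taken from the project layout):

```
description=HanLP Analysis for Elasticsearch
version=6.5.1
name=analysis-hanlp
classname=org.elasticsearch.plugin.hanlp.AnalysisHanLPPlugin
java.version=1.8
elasticsearch.version=6.5.1
```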
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/plugin/hanlp/conf/ConfigHelperTest.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.conf;
2 |
3 | import com.hankcs.hanlp.HanLP;
4 | import org.junit.Test;
5 |
6 | /**
7 | * elasticsearch-analysis-hanlp
8 | * Smoke test for the default HanLP configuration
9 | * Created by hezl on 2018-12-05.
10 | */
11 | public class ConfigHelperTest {
12 |
13 | @Test
14 | public void getConfig() {
15 | System.out.println(HanLP.segment("你和对方但是")); // smoke test: prints the default segmentation, showing the bundled config loads
16 | }
17 | }
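The test only prints the segmentation result; a slightly stricter variant could fail if the bundled dictionaries did not load. A sketch using the JUnit 4 API already imported above (the method name is illustrative):

```
@Test
public void segmentWithDefaultConfig() {
    java.util.List<com.hankcs.hanlp.seg.common.Term> terms = HanLP.segment("你和对方但是");
    // if the dictionaries failed to load, segmentation would come back empty
    org.junit.Assert.assertFalse("expected at least one term", terms.isEmpty());
}
```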
--------------------------------------------------------------------------------