├── README.md
├── config
│   ├── hanlp.properties
│   ├── plugin-security.policy
│   └── plugin.properties
├── pom.xml
└── src
    ├── main
    │   ├── java
    │   │   ├── assembly
    │   │   │   ├── plugin-jar.xml
    │   │   │   └── plugin.xml
    │   │   └── org
    │   │       └── elasticsearch
    │   │           ├── plugin
    │   │           │   └── hanlp
    │   │           │       ├── AnalysisHanLPPlugin.java
    │   │           │       ├── analysis
    │   │           │       │   ├── HanLPAnalyzerProvider.java
    │   │           │       │   └── HanLPTokenizerFactory.java
    │   │           │       ├── conf
    │   │           │       │   ├── ConfigHelper.java
    │   │           │       │   ├── DicConfig.java
    │   │           │       │   └── HanLPConfig.java
    │   │           │       └── lucene
    │   │           │           ├── HanLPAnalyzer.java
    │   │           │           ├── HanLPTokenFilter.java
    │   │           │           ├── HanLPTokenizer.java
    │   │           │           ├── PorterStemmer.java
    │   │           │           └── SegmentWrapper.java
    │   │           └── utils
    │   │               └── CommUtils.java
    │   └── resources
    │       └── plugin-descriptor.properties
    └── test
        └── java
            └── org
                └── elasticsearch
                    └── plugin
                        └── hanlp
                            └── conf
                                └── ConfigHelperTest.java
/README.md:
--------------------------------------------------------------------------------
HanLP Analysis for Elasticsearch
=====

A Chinese word-segmentation plugin for Elasticsearch based on [HanLP](https://github.com/hankcs/HanLP). Core features:

1. Compatible with ES 5.x-7.x;
2. Built-in dictionary, usable out of the box with no extra configuration;
3. Support for user-defined dictionaries;
4. Support for hot updates from remote dictionaries (planned);
5. Multiple built-in segmentation modes for different scenarios;
6. Pinyin token filter (planned);
7. Simplified/Traditional Chinese conversion filter (planned).

## Versions
Plugin versions track ES versions: download and install the plugin release that matches your ES version.

- By the time the plugin was finished, the latest ES release was already 6.5.2, so only representative versions have been tested;
- 5.X was tested on 5.0.0 and 5.5.0;
- 6.X was tested on 6.0.0, 6.3.0, 6.4.1 and 6.5.1;
- 7.X was tested on 7.0.0.

## Installation and Usage
### Download and build
`git clone` the code for the version you need, open `pom.xml`, and change `6.5.1` to the target ES version; then run `mvn package` to build the distribution, which ends up in the `target/releases` folder.

Once packaged, install the zip using the offline method described below.
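A minimal end-to-end sketch of the build (the version number and paths are illustrative; use the ES version you actually run):

```
git clone https://github.com/AnyListen/elasticsearch-analysis-hanlp.git
cd elasticsearch-analysis-hanlp
# edit pom.xml: set <elasticsearch.version> to your ES version, e.g. 6.5.1
mvn package
# the plugin zip is written to target/releases/
ls target/releases/elasticsearch-analysis-hanlp-6.5.1.zip
```
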
### Using the default dictionary
- Online install: `.\elasticsearch-plugin install https://github.com/AnyListen/elasticsearch-analysis-hanlp/releases/download/vA.B.C/elasticsearch-analysis-hanlp-A.B.C.zip`
- Offline install: `.\elasticsearch-plugin install file:///FILE_PATH/elasticsearch-analysis-hanlp-A.B.C.zip`

> For the offline install, change `FILE_PATH` to the path of the zip file; A, B and C stand for the digits of the ES version number.

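For instance, with ES 6.5.1 (one of the versions tested above, assuming a matching release asset exists), the online install command becomes:

```
./elasticsearch-plugin install https://github.com/AnyListen/elasticsearch-analysis-hanlp/releases/download/v6.5.1/elasticsearch-analysis-hanlp-6.5.1.zip
```
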
### Using a custom dictionary
The default dictionary is a trimmed-down edition that covers basic needs, but it cannot drive the model-based segmenters such as the perceptron and CRF tokenizers.

HanLP provides a much more [complete data package](http://nlp.hankcs.com/download.php?file=data); download it as needed.

After downloading, unpack the dictionary to any directory, then edit the `hanlp.properties` file **in the plugin installation directory**. Only the first line,
```
root=D:/JavaProjects/HanLP/
```
needs to change: set it to the parent directory of `data`. For example, if the `data` directory is `/Users/hankcs/Documents/data`, then `root=/Users/hankcs/Documents/`.

### Using a custom configuration file
If you use HanLP elsewhere and want to reuse an existing `hanlp.properties` file, just edit the `plugin.properties` file **in the plugin installation directory** and set `configPath` to the location of that `hanlp.properties` file.

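A minimal `plugin.properties` for this setup might look like the following (the path is illustrative):

```
# reuse the hanlp.properties of an existing HanLP installation
configPath=/opt/hanlp/hanlp.properties
```
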
## Built-in analysis components
### Analyzers
- hanlp_index: fine-grained segmentation
- hanlp_smart: standard segmentation
- hanlp_nlp: NLP segmentation with named-entity recognition
- hanlp_per: perceptron segmentation
- hanlp_crf: CRF segmentation
- hanlp: custom configuration

### Tokenizers
- hanlp_index: fine-grained segmentation
- hanlp_smart: standard segmentation
- hanlp_nlp: NLP segmentation with named-entity recognition
- hanlp_per: perceptron segmentation
- hanlp_crf: CRF segmentation
- hanlp: custom configuration

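The built-in analyzers can be tried directly through the `_analyze` API, for example (reusing the sample sentence from the example further below):

```
POST _analyze
{
  "analyzer": "hanlp_smart",
  "text": "张惠妹在上海市举办演唱会啦"
}
```
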
### Custom analyzers
The plugin exposes a fairly rich set of options for defining your own analyzer. The available settings are:

| Setting | Purpose | Default |
| -------- | ------ | :----: |
| algorithm | segmentation algorithm; the Chinese or the English name is accepted: viterbi (维特比), dat (双数组trie树), crf (条件随机场), perceptron (感知机), nshort (N最短路) | viterbi |
| enableIndexMode | index mode (fine-grained segmentation) | false |
| enableCustomDictionary | whether to enable the user dictionary | true |
| customDictionaryPath | user dictionary paths (absolute; separate multiple dictionaries with `;`) | null |
| enableCustomDictionaryForcing | [give the user dictionary top priority](https://github.com/hankcs/HanLP/wiki/FAQ#%E4%B8%BA%E4%BB%80%E4%B9%88%E4%BF%AE%E6%94%B9%E4%BA%86%E8%AF%8D%E5%85%B8%E8%BF%98%E6%98%AF%E6%B2%A1%E6%9C%89%E6%95%88%E6%9E%9C) | false |
| enableStopWord | whether to enable stop-word filtering | false |
| stopWordDictionaryPath | stop-word dictionary path | null |
| enableNumberQuantifierRecognize | whether to recognize numbers and quantifiers | true |
| enableNameRecognize | recognize Chinese person names | true |
| enableTranslatedNameRecognize | whether to recognize transliterated person names | false |
| enableJapaneseNameRecognize | whether to recognize Japanese person names | false |
| enableOrganizationRecognize | recognize organization names | false |
| enablePlaceRecognize | recognize place names | false |
| enableTraditionalChineseMode | accurate Traditional Chinese segmentation | false |

**Example:**
```
# Create a custom analyzer
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "type": "hanlp",
          "algorithm": "viterbi",
          "enableIndexMode": "true",
          "enableCustomDictionary": "true",
          "customDictionaryPath": "",
          "enableCustomDictionaryForcing": "false",
          "enableStopWord": "true",
          "stopWordDictionaryPath": "",
          "enableNumberQuantifierRecognize": "true",
          "enableNameRecognize": "true",
          "enableTranslatedNameRecognize": "true",
          "enableJapaneseNameRecognize": "true",
          "enableOrganizationRecognize": "true",
          "enablePlaceRecognize": "true",
          "enableTraditionalChineseMode": "false"
        }
      }
    }
  }
}

# Test the analyzer
POST my_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "张惠妹在上海市举办演唱会啦"
}
```

## Segmentation speed (for reference only)
> Measured through the `_analyze` API (**1 core, 1 GB, single thread**): a 20,000-character text was segmented with each tokenizer type; the figures below are the times from request to response:

Tokenizer | Time (ms)
--- | ---
`hanlp_smart` | 148
`hanlp_nlp` | 182
`hanlp_per` | 286
`hanlp_crf` | 357

--------------------------------------------------------------------------------
/config/hanlp.properties:
--------------------------------------------------------------------------------
1 | # Root directory for all paths in this file: root + relative path = full path (relative roots are supported, see https://github.com/hankcs/HanLP/pull/254)
2 | # Windows users: always use / as the path separator
3 | #root=D:/JavaProjects/HanLP/
4 |
5 | # The line above is the only one that normally needs editing; uncomment and edit the entries below as needed.
6 |
7 | # Core dictionary path
8 | #CoreDictionaryPath=data/dictionary/CoreNatureDictionary.txt
9 | # Bigram dictionary path
10 | #BiGramDictionaryPath=data/dictionary/CoreNatureDictionary.ngram.txt
11 | # Custom dictionary paths, separated by ;. A leading space means "same directory as the previous entry"; the form "filename POS" gives every word in that dictionary that default part of speech. Priority decreases from left to right.
12 | # All dictionaries are UTF-8 encoded, one word per line, in the format: [word] [POS A] [freq of A] [POS B] [freq of B] ... A word without POS tags gets the dictionary's default POS.
13 | #CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; 现代汉语补充词库.txt; 全国地名大全.txt ns; 人名词典.txt; 机构名词典.txt; 上海地名.txt ns;data/dictionary/person/nrf.txt nrf;
14 | # Stop-word dictionary path
15 | #CoreStopWordDictionaryPath=data/dictionary/stopwords.txt
16 | # Synonym dictionary path
17 | #CoreSynonymDictionaryDictionaryPath=data/dictionary/synonym/CoreSynonym.txt
18 | # Person-name dictionary path
19 | #PersonDictionaryPath=data/dictionary/person/nr.txt
20 | # Person-name transition-matrix path
21 | #PersonDictionaryTrPath=data/dictionary/person/nr.tr.txt
22 | # Root directory of the Simplified/Traditional Chinese dictionaries
23 | #tcDictionaryRoot=data/dictionary/tc
24 | # HMM segmentation model
25 | #HMMSegmentModelPath=data/model/segment/HMMSegmentModel.bin
26 | # Whether segmentation results show the part of speech
27 | #ShowTermNature=true
28 | # IO adapter: implement com.hankcs.hanlp.corpus.io.IIOAdapter to run HanLP on other platforms (Hadoop, Redis, ...)
29 | # The default adapter below is based on the ordinary file system.
30 | #IOAdapter=com.hankcs.hanlp.corpus.io.FileIOAdapter
31 | # Perceptron lexical analyzer
32 | #PerceptronCWSModelPath=data/model/perceptron/pku199801/cws.bin
33 | #PerceptronPOSModelPath=data/model/perceptron/pku199801/pos.bin
34 | #PerceptronNERModelPath=data/model/perceptron/pku199801/ner.bin
35 | # CRF lexical analyzer
36 | #CRFCWSModelPath=data/model/crf/pku199801/cws.txt
37 | #CRFPOSModelPath=data/model/crf/pku199801/pos.txt
38 | #CRFNERModelPath=data/model/crf/pku199801/ner.txt
39 | # For more options, see https://github.com/hankcs/HanLP/blob/master/src/main/java/com/hankcs/hanlp/HanLP.java#L59
--------------------------------------------------------------------------------
/config/plugin-security.policy:
--------------------------------------------------------------------------------
1 | grant {
2 | permission java.io.FilePermission "<<ALL FILES>>", "read,write,delete";
3 | permission java.net.SocketPermission "*", "connect,resolve";
4 | permission java.util.PropertyPermission "*", "read,write";
5 | permission java.lang.RuntimePermission "setContextClassLoader";
6 | permission java.lang.RuntimePermission "getClassLoader";
7 | permission java.lang.RuntimePermission "createClassLoader";
8 | };
--------------------------------------------------------------------------------
/config/plugin.properties:
--------------------------------------------------------------------------------
1 | # path to the HanLP configuration file (hanlp.properties)
2 | #configPath=
3 |
4 | # URL of the remote custom dictionary (hot update; not yet implemented)
5 | #remoteDicUrl=
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch-analysis-hanlp</artifactId>
    <version>${elasticsearch.version}</version>
    <packaging>jar</packaging>
    <name>HanLP Analyzer for Elasticsearch</name>

    <properties>
        <elasticsearch.version>7.0.0</elasticsearch.version>
        <hanlp.version>portable-1.7.3</hanlp.version>
        <maven.compiler.target>1.8</maven.compiler.target>
        <elasticsearch.assembly.descriptor>${project.basedir}/src/main/resources/plugin-descriptor.properties</elasticsearch.assembly.descriptor>
        <elasticsearch.plugin.name>analysis-hanlp</elasticsearch.plugin.name>
        <elasticsearch.plugin.classname>org.elasticsearch.plugin.hanlp.AnalysisHanLPPlugin</elasticsearch.plugin.classname>
        <!-- plugin-descriptor flags; the tag names here are assumed, following the stock ES analysis-plugin pom -->
        <elasticsearch.plugin.jvm>true</elasticsearch.plugin.jvm>
        <elasticsearch.plugin.site>false</elasticsearch.plugin.site>
        <elasticsearch.plugin.isolated>true</elasticsearch.plugin.isolated>
    </properties>

    <dependencies>
        <dependency>
            <groupId>com.hankcs</groupId>
            <artifactId>hanlp</artifactId>
            <version>${hanlp.version}</version>
        </dependency>
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch</artifactId>
            <version>${elasticsearch.version}</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5.1</version>
                <configuration>
                    <source>${maven.compiler.target}</source>
                    <target>${maven.compiler.target}</target>
                    <encoding>utf8</encoding>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <appendAssemblyId>false</appendAssemblyId>
                    <outputDirectory>${project.build.directory}/releases/</outputDirectory>
                    <descriptors>
                        <descriptor>${basedir}/src/main/java/assembly/plugin.xml</descriptor>
                    </descriptors>
                    <archive>
                        <manifest>
                            <mainClass>fully.qualified.MainClass</mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

    <profiles>
        <profile>
            <id>disable-java8-doclint</id>
            <activation>
                <jdk>[1.8,)</jdk>
            </activation>
            <properties>
                <additionalparam>-Xdoclint:none</additionalparam>
            </properties>
        </profile>
    </profiles>
</project>
--------------------------------------------------------------------------------
/src/main/java/assembly/plugin-jar.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">
    <id>jar-with-dependencies</id>
    <formats>
        <format>jar</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <dependencySets>
        <!-- the boolean element names below are assumed; the original tags were stripped -->
        <dependencySet>
            <outputDirectory>/</outputDirectory>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <unpack>true</unpack>
            <scope>runtime</scope>
            <excludes>
                <exclude>org.elasticsearch:elasticsearch</exclude>
            </excludes>
        </dependencySet>
        <dependencySet>
            <outputDirectory>/</outputDirectory>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <unpack>true</unpack>
            <includes>
                <include>com.hankcs:hanlp</include>
            </includes>
        </dependencySet>
    </dependencySets>
</assembly>
--------------------------------------------------------------------------------
/src/main/java/assembly/plugin.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">
    <id>plugin</id>
    <formats>
        <format>zip</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <fileSets>
        <fileSet>
            <directory>${project.basedir}/config</directory>
            <outputDirectory>/</outputDirectory>
        </fileSet>
    </fileSets>
    <files>
        <file>
            <source>${project.basedir}/src/main/resources/plugin-descriptor.properties</source>
            <filtered>true</filtered>
            <outputDirectory>/</outputDirectory>
        </file>
    </files>
    <dependencySets>
        <!-- the boolean element names below are assumed; the original tags were stripped -->
        <dependencySet>
            <outputDirectory>/</outputDirectory>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <unpack>false</unpack>
            <excludes>
                <exclude>org.elasticsearch:elasticsearch</exclude>
            </excludes>
        </dependencySet>
        <dependencySet>
            <outputDirectory>/</outputDirectory>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <unpack>false</unpack>
            <includes>
                <include>com.hankcs:hanlp</include>
            </includes>
        </dependencySet>
    </dependencySets>
</assembly>
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/AnalysisHanLPPlugin.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp;
2 |
3 |
4 | import org.apache.lucene.analysis.Analyzer;
5 | import org.elasticsearch.index.analysis.AnalyzerProvider;
6 | import org.elasticsearch.index.analysis.TokenizerFactory;
7 | import org.elasticsearch.indices.analysis.AnalysisModule;
8 | import org.elasticsearch.plugin.hanlp.analysis.HanLPAnalyzerProvider;
9 | import org.elasticsearch.plugin.hanlp.analysis.HanLPTokenizerFactory;
10 | import org.elasticsearch.plugins.AnalysisPlugin;
11 | import org.elasticsearch.plugins.Plugin;
12 |
13 | import java.util.HashMap;
14 | import java.util.Map;
15 |
16 | /**
17 | * elasticsearch-analysis-hanlp
18 | * elasticsearch-analysis-hanlp
19 | * Created by hezl on 2018-11-20.
20 | */
21 | public class AnalysisHanLPPlugin extends Plugin implements AnalysisPlugin {
22 | public static String PLUGIN_NAME = "analysis-hanlp";
23 |
24 | @Override
25 | public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
26 | Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> extra = new HashMap<>();
27 | extra.put("hanlp_index", HanLPTokenizerFactory::getIndexTokenizerFactory);
28 | extra.put("hanlp_smart", HanLPTokenizerFactory::getSmartTokenizerFactory);
29 | extra.put("hanlp_nlp", HanLPTokenizerFactory::getNLPTokenizerFactory);
30 | extra.put("hanlp_per", HanLPTokenizerFactory::getPerceptronTokenizerFactory);
31 | extra.put("hanlp_crf", HanLPTokenizerFactory::getCRFTokenizerFactory);
32 | extra.put("hanlp", HanLPTokenizerFactory::new);
33 | return extra;
34 | }
35 |
36 | @Override
37 | public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
38 | Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> extra = new HashMap<>();
39 | extra.put("hanlp_index", HanLPAnalyzerProvider::getIndexAnalyzerProvider);
40 | extra.put("hanlp_smart", HanLPAnalyzerProvider::getSmartAnalyzerProvider);
41 | extra.put("hanlp_nlp", HanLPAnalyzerProvider::getNLPAnalyzerProvider);
42 | extra.put("hanlp_per", HanLPAnalyzerProvider::getPerceptronAnalyzerProvider);
43 | extra.put("hanlp_crf", HanLPAnalyzerProvider::getCRFAnalyzerProvider);
44 | extra.put("hanlp", HanLPAnalyzerProvider::new);
45 | return extra;
46 | }
47 |
48 | // @Override
49 | // public Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
50 | // Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> extra = new HashMap<>();
51 | //
52 | // extra.put("py_all", HanLPTokenFilterFactory::getPinyinFilterFactory);
53 | // extra.put("py_mix", HanLPTokenFilterFactory::getPinyinMixFilterFactory);
54 | // extra.put("py_first", HanLPTokenFilterFactory::getPinyinFirstFilterFactory);
55 | // extra.put("py_full", HanLPTokenFilterFactory::getPinyinFullFilterFactory);
56 | //
57 | // extra.put("ts", HanLPTokenFilterFactory::getTSFilterFactory);
58 | // extra.put("t2s", HanLPTokenFilterFactory::getT2SFilterFactory);
59 | // extra.put("s2t", HanLPTokenFilterFactory::getS2TFilterFactory);
60 | //
61 | // return extra;
62 | // }
63 | }
64 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/analysis/HanLPAnalyzerProvider.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.analysis;
2 |
3 |
4 | import com.hankcs.hanlp.seg.Segment;
5 | import org.elasticsearch.common.collect.Tuple;
6 | import org.elasticsearch.common.settings.Settings;
7 | import org.elasticsearch.env.Environment;
8 | import org.elasticsearch.index.IndexSettings;
9 | import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
10 | import org.elasticsearch.plugin.hanlp.conf.ConfigHelper;
11 | import org.elasticsearch.plugin.hanlp.conf.DicConfig;
12 | import org.elasticsearch.plugin.hanlp.conf.HanLPConfig;
13 | import org.elasticsearch.plugin.hanlp.lucene.HanLPAnalyzer;
14 |
15 | import java.util.Set;
16 |
17 | /**
18 | * es-analysis-hanlp
19 | * net.luculent.bigdata.es.plugin.hanlp.analysis
20 | * Created by HEZHILONG on 2018-08-23.
21 | */
22 | public class HanLPAnalyzerProvider extends AbstractIndexAnalyzerProvider<HanLPAnalyzer> {
23 | private final HanLPAnalyzer analyzer;
24 |
25 | public HanLPAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
26 | super(indexSettings, name, settings);
27 | DicConfig.initConfig(env, settings);
28 | Tuple<Segment, Set<String>> tuple = ConfigHelper.getSegmentAndFilter(settings);
29 | analyzer = new HanLPAnalyzer(tuple.v1(), tuple.v2());
30 | }
31 |
32 | public HanLPAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings, HanLPConfig config) {
33 | super(indexSettings, name, settings);
34 | DicConfig.initConfig(env, settings);
35 | analyzer = new HanLPAnalyzer(ConfigHelper.getSegment(config), ConfigHelper.getStopWords(config));
36 | }
37 |
38 | public static HanLPAnalyzerProvider getIndexAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
39 | return new HanLPAnalyzerProvider(indexSettings, env, name, settings, ConfigHelper.INDEX_CONFIG);
40 | }
41 |
42 | public static HanLPAnalyzerProvider getSmartAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
43 | return new HanLPAnalyzerProvider(indexSettings, env, name, settings, ConfigHelper.SMART_CONFIG);
44 | }
45 |
46 | public static HanLPAnalyzerProvider getNLPAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
47 | return new HanLPAnalyzerProvider(indexSettings, env, name, settings, ConfigHelper.NLP_CONFIG);
48 | }
49 |
50 | public static HanLPAnalyzerProvider getPerceptronAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
51 | return new HanLPAnalyzerProvider(indexSettings, env, name, settings, ConfigHelper.PERCEPTRON_CONFIG);
52 | }
53 |
54 | public static HanLPAnalyzerProvider getCRFAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
55 | return new HanLPAnalyzerProvider(indexSettings, env, name, settings, ConfigHelper.CRF_CONFIG);
56 | }
57 |
58 | @Override
59 | public HanLPAnalyzer get() {
60 | return this.analyzer;
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/analysis/HanLPTokenizerFactory.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.analysis;
2 |
3 |
4 | import com.hankcs.hanlp.seg.Segment;
5 | import org.apache.lucene.analysis.Tokenizer;
6 | import org.elasticsearch.common.collect.Tuple;
7 | import org.elasticsearch.common.settings.Settings;
8 | import org.elasticsearch.env.Environment;
9 | import org.elasticsearch.index.IndexSettings;
10 | import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
11 | import org.elasticsearch.plugin.hanlp.conf.ConfigHelper;
12 | import org.elasticsearch.plugin.hanlp.conf.DicConfig;
13 | import org.elasticsearch.plugin.hanlp.conf.HanLPConfig;
14 | import org.elasticsearch.plugin.hanlp.lucene.HanLPTokenizer;
15 |
16 | import java.util.Set;
17 |
18 | /**
19 | * es-analysis-hanlp
20 | * net.luculent.bigdata.es.plugin.hanlp.analysis
21 | * Created by HEZHILONG on 2018-08-23.
22 | */
23 | public class HanLPTokenizerFactory extends AbstractTokenizerFactory {
24 | private Tuple<Segment, Set<String>> tuple;
25 |
26 | public HanLPTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
27 | super(indexSettings, settings);
28 | DicConfig.initConfig(env, settings);
29 | tuple = ConfigHelper.getSegmentAndFilter(settings);
30 | }
31 |
32 | public HanLPTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings, HanLPConfig config) {
33 | super(indexSettings, settings);
34 | DicConfig.initConfig(env, settings);
35 | tuple = Tuple.tuple(ConfigHelper.getSegment(config), ConfigHelper.getStopWords(config));
36 | }
37 |
38 | public static HanLPTokenizerFactory getIndexTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
39 | return new HanLPTokenizerFactory(indexSettings, env, name, settings, ConfigHelper.INDEX_CONFIG);
40 | }
41 |
42 | public static HanLPTokenizerFactory getNLPTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
43 | return new HanLPTokenizerFactory(indexSettings, env, name, settings, ConfigHelper.NLP_CONFIG);
44 | }
45 |
46 | public static HanLPTokenizerFactory getSmartTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
47 | return new HanLPTokenizerFactory(indexSettings, env, name, settings, ConfigHelper.SMART_CONFIG);
48 | }
49 |
50 | public static HanLPTokenizerFactory getPerceptronTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
51 | return new HanLPTokenizerFactory(indexSettings, env, name, settings, ConfigHelper.PERCEPTRON_CONFIG);
52 | }
53 |
54 | public static HanLPTokenizerFactory getCRFTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
55 | return new HanLPTokenizerFactory(indexSettings, env, name, settings, ConfigHelper.CRF_CONFIG);
56 | }
57 |
58 | @Override
59 | public Tokenizer create() {
60 | return new HanLPTokenizer(tuple.v1(), tuple.v2(), true);
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/conf/ConfigHelper.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.conf;
2 |
3 |
4 | import com.hankcs.hanlp.HanLP;
5 | import com.hankcs.hanlp.corpus.io.IOUtil;
6 | import com.hankcs.hanlp.corpus.io.ResourceIOAdapter;
7 | import com.hankcs.hanlp.seg.Segment;
8 | import com.hankcs.hanlp.seg.Viterbi.ViterbiSegment;
9 | import com.hankcs.hanlp.utility.TextUtility;
10 | import org.apache.logging.log4j.Logger;
11 | import org.elasticsearch.SpecialPermission;
12 | import org.elasticsearch.common.collect.Tuple;
13 | import org.elasticsearch.common.logging.Loggers;
14 | import org.elasticsearch.common.settings.Settings;
15 |
16 | import java.io.*;
17 | import java.security.AccessController;
18 | import java.security.PrivilegedAction;
19 | import java.util.HashSet;
20 | import java.util.Set;
21 |
22 | /**
23 | * elasticsearch-analysis-hanlp
24 | * elasticsearch-analysis-hanlp
25 | * Created by hezl on 2018-12-03.
26 | */
27 | public class ConfigHelper {
28 | private static final Logger logger = Loggers.getLogger(ConfigHelper.class, "ConfigHelper");
29 |
30 | public static final HanLPConfig INDEX_CONFIG = new HanLPConfig(){{
31 | setAlgorithm("viterbi");
32 | setEnableIndexMode(true);
33 | //CustomDic
34 | setCustomDictionaryPath("");
35 | setEnableCustomDictionary(true);
36 | setEnableCustomDictionaryForcing(false);
37 | //StopWord
38 | setEnableStopWord(false);
39 | setStopWordDictionaryPath("");
40 | //NLP
41 | setEnableNameRecognize(true);
42 | setEnableJapaneseNameRecognize(false);
43 | setEnableTranslatedNameRecognize(false);
44 | setEnableNumberQuantifierRecognize(true);
45 | setEnableOrganizationRecognize(false);
46 | setEnablePlaceRecognize(false);
47 | setEnableTraditionalChineseMode(false);
48 | }};
49 |
50 | public static final HanLPConfig SMART_CONFIG = new HanLPConfig(){{
51 | setAlgorithm("viterbi");
52 | setEnableIndexMode(false);
53 | //CustomDic
54 | setCustomDictionaryPath("");
55 | setEnableCustomDictionary(true);
56 | setEnableCustomDictionaryForcing(false);
57 | //StopWord
58 | setEnableStopWord(false);
59 | setStopWordDictionaryPath("");
60 | //NLP
61 | setEnableNameRecognize(true);
62 | setEnableJapaneseNameRecognize(false);
63 | setEnableTranslatedNameRecognize(false);
64 | setEnableNumberQuantifierRecognize(true);
65 | setEnableOrganizationRecognize(false);
66 | setEnablePlaceRecognize(false);
67 | setEnableTraditionalChineseMode(false);
68 | }};
69 |
70 | public static final HanLPConfig NLP_CONFIG = new HanLPConfig(){{
71 | setAlgorithm("viterbi");
72 | setEnableIndexMode(false);
73 | //CustomDic
74 | setCustomDictionaryPath("");
75 | setEnableCustomDictionary(true);
76 | setEnableCustomDictionaryForcing(false);
77 | //StopWord
78 | setEnableStopWord(false);
79 | setStopWordDictionaryPath("");
80 | //NLP
81 | setEnableNameRecognize(true);
82 | setEnableJapaneseNameRecognize(true);
83 | setEnableTranslatedNameRecognize(true);
84 | setEnableNumberQuantifierRecognize(true);
85 | setEnableOrganizationRecognize(true);
86 | setEnablePlaceRecognize(true);
87 | setEnableTraditionalChineseMode(false);
88 | }};
89 |
90 | public static final HanLPConfig PERCEPTRON_CONFIG = new HanLPConfig(){{
91 | setAlgorithm("perceptron");
92 | setEnableIndexMode(false);
93 | //CustomDic
94 | setCustomDictionaryPath("");
95 | setEnableCustomDictionary(true);
96 | setEnableCustomDictionaryForcing(false);
97 | //StopWord
98 | setEnableStopWord(false);
99 | setStopWordDictionaryPath("");
100 | //NLP
101 | setEnableNameRecognize(true);
102 | setEnableJapaneseNameRecognize(false);
103 | setEnableTranslatedNameRecognize(false);
104 | setEnableNumberQuantifierRecognize(true);
105 | setEnableOrganizationRecognize(true);
106 | setEnablePlaceRecognize(true);
107 | setEnableTraditionalChineseMode(false);
108 | }};
109 |
110 | public static final HanLPConfig CRF_CONFIG = new HanLPConfig(){{
111 | setAlgorithm("crf");
112 | setEnableIndexMode(false);
113 | //CustomDic
114 | setCustomDictionaryPath("");
115 | setEnableCustomDictionary(true);
116 | setEnableCustomDictionaryForcing(false);
117 | //StopWord
118 | setEnableStopWord(false);
119 | setStopWordDictionaryPath("");
120 | //NLP
121 | setEnableNameRecognize(true);
122 | setEnableJapaneseNameRecognize(false);
123 | setEnableTranslatedNameRecognize(false);
124 | setEnableNumberQuantifierRecognize(true);
125 | setEnableOrganizationRecognize(true);
126 | setEnablePlaceRecognize(true);
127 | setEnableTraditionalChineseMode(false);
128 | }};
129 |
130 | public static Segment getSegment(HanLPConfig config) {
131 | //SpecialPermission.check();
132 | return AccessController.doPrivileged((PrivilegedAction<Segment>) () -> {
133 | Segment segment;
134 | String algorithm = config.getAlgorithm();
135 | if ("crf".equals(algorithm) || "条件随机场".equals(algorithm) ||
136 | "perceptron".equals(algorithm) || "感知机".equals(algorithm)) {
137 | if (HanLP.Config.IOAdapter instanceof ResourceIOAdapter) { // the bundled classpath data ships no CRF/perceptron models, so these segmenters are unavailable
138 | return null;
139 | }
140 | }
141 | if ("viterbi".equals(algorithm) || "维特比".equals(algorithm)) {
142 | String customDictionaryPath = config.getCustomDictionaryPath();
143 | if (TextUtility.isBlank(customDictionaryPath)) {
144 | segment = new ViterbiSegment();
145 | } else {
146 | segment = new ViterbiSegment(customDictionaryPath);
147 | }
148 | } else {
149 | segment = HanLP.newSegment(algorithm);
150 | }
151 | segment.enableIndexMode(config.isEnableIndexMode())
152 | .enableCustomDictionary(config.isEnableCustomDictionary())
153 | .enableCustomDictionaryForcing(config.isEnableCustomDictionaryForcing())
154 | .enableNameRecognize(config.isEnableNameRecognize())
155 | .enableJapaneseNameRecognize(config.isEnableJapaneseNameRecognize())
156 | .enableTranslatedNameRecognize(config.isEnableTranslatedNameRecognize())
157 | .enableNumberQuantifierRecognize(config.isEnableNumberQuantifierRecognize())
158 | .enableOrganizationRecognize(config.isEnableOrganizationRecognize())
159 | .enablePlaceRecognize(config.isEnablePlaceRecognize())
160 | // enableTraditionalChineseMode is not applied here: HanLP's Segment config exposes no traditional-Chinese switch
161 | .enableOffset(true).enablePartOfSpeechTagging(true);
162 | segment.seg("HanLP中文分词工具包!"); // warm-up call that forces lazy dictionary loading
163 | return segment;
164 | });
165 | }
166 |
167 | public static Set<String> getStopWords(HanLPConfig config){
168 | if (!config.isEnableStopWord()){
169 | return null;
170 | }
171 | String filePath = config.getStopWordDictionaryPath();
172 | if (TextUtility.isBlank(filePath)){
173 | filePath = HanLP.Config.CoreStopWordDictionaryPath;
174 | }
175 | final String cfPath = filePath;
176 | try {
177 | //SpecialPermission.check();
178 | byte[] bytes = AccessController.doPrivileged((PrivilegedAction<byte[]>) () -> {
179 | byte[] bs;
180 | if (IOUtil.isResource(cfPath)) {
181 | try {
182 | bs = IOUtil.readBytesFromResource(cfPath);
183 | } catch (IOException e) {
184 | return new byte[0];
185 | }
186 | } else {
187 | bs = IOUtil.readBytes(cfPath);
188 | }
189 | return bs;
190 | });
191 | if (bytes == null || bytes.length <= 0){
192 | return null;
193 | }
194 | Set<String> resultSet = new HashSet<>();
195 | ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(bytes);
196 | InputStreamReader reader = new InputStreamReader(byteArrayInputStream, "UTF-8"); // stop-word dictionaries are UTF-8 encoded
197 | BufferedReader br = new BufferedReader(reader);
198 | String str;
199 | while ((str = br.readLine()) != null) {
200 | resultSet.add(str);
201 | }
202 | br.close();
203 | reader.close();
204 | byteArrayInputStream.close();
205 | return resultSet;
206 | }
207 | catch (Exception ex){
208 | logger.error("failed to load the stop-word dictionary", ex);
209 | }
210 | return null;
211 | }
212 |
213 | public static HanLPConfig getConfig(Settings settings){
214 | HanLPConfig config = new HanLPConfig();
215 | config.setAlgorithm(settings.get("algorithm", "viterbi"));
216 | config.setEnableIndexMode(settings.getAsBoolean("enableIndexMode", false));
217 | //CustomDic
218 | config.setCustomDictionaryPath(settings.get("customDictionaryPath", ""));
219 | config.setEnableCustomDictionary(settings.getAsBoolean("enableCustomDictionary", true));
220 | config.setEnableCustomDictionaryForcing(settings.getAsBoolean("enableCustomDictionaryForcing", false));
221 | //StopWord
222 | config.setEnableStopWord(settings.getAsBoolean("enableStopWord", false));
223 | config.setStopWordDictionaryPath(settings.get("stopWordDictionaryPath", ""));
224 | //NLP
225 | config.setEnableNameRecognize(settings.getAsBoolean("enableNameRecognize", true));
226 | config.setEnableJapaneseNameRecognize(settings.getAsBoolean("enableJapaneseNameRecognize", false));
227 | config.setEnableTranslatedNameRecognize(settings.getAsBoolean("enableTranslatedNameRecognize", false));
228 | config.setEnableNumberQuantifierRecognize(settings.getAsBoolean("enableNumberQuantifierRecognize", true));
229 | config.setEnableOrganizationRecognize(settings.getAsBoolean("enableOrganizationRecognize", false));
230 | config.setEnablePlaceRecognize(settings.getAsBoolean("enablePlaceRecognize", false));
231 | config.setEnableTraditionalChineseMode(settings.getAsBoolean("enableTraditionalChineseMode", false));
232 |
233 | return config;
234 | }
235 |
236 | public static Tuple<Segment, Set<String>> getSegmentAndFilter(Settings settings) {
237 | HanLPConfig config = getConfig(settings);
238 | return Tuple.tuple(getSegment(config), getStopWords(config));
239 | }
240 | }
241 |
242 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/conf/DicConfig.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.conf;
2 |
3 |
4 | import com.hankcs.hanlp.utility.Predefine;
5 | import com.hankcs.hanlp.utility.TextUtility;
6 | import org.apache.logging.log4j.Logger;
7 | import org.elasticsearch.common.logging.Loggers;
8 | import org.elasticsearch.common.settings.Settings;
9 | import org.elasticsearch.env.Environment;
10 |
11 | import java.io.File;
12 | import java.io.FileInputStream;
13 | import java.io.InputStreamReader;
14 | import java.nio.file.Path;
15 | import java.util.Properties;
16 |
17 | /**
18 | * elasticsearch-analysis-hanlp
19 | * elasticsearch-analysis-hanlp
20 | * Created by hezl on 2018-11-20.
21 | */
22 | public class DicConfig {
23 | private static final Logger logger = Loggers.getLogger(DicConfig.class, "DicConfig");
24 | private static Environment env;
25 | private static Settings settings;
26 | private static String configPath;
27 | private static String remoteDicUrl;
28 | private static boolean isInit;
29 |
30 | /**
31 | * Initialize the dictionaries and the remote-update
32 | * settings from the plugin configuration files
33 | */
34 | public static synchronized void initConfig(Environment env, Settings settings) {
35 | if (isInit) {
36 | return;
37 | }
38 | DicConfig.env = env;
39 | DicConfig.settings = settings;
40 | File configFile = getConfigFilePath().toFile();
41 | if (!configFile.exists()) {
42 | return;
43 | }
44 | Properties properties = new Properties();
45 | try (InputStreamReader inputStreamReader = new InputStreamReader(new FileInputStream(configFile), "UTF-8")) { // read the properties file as UTF-8
46 | properties.load(inputStreamReader);
47 | configPath = properties.getProperty("configPath", null);
48 | remoteDicUrl = properties.getProperty("remoteDicUrl", "");
49 | if (TextUtility.isBlank(configPath)) {
50 | if (getDefDicConfigPath().toFile().exists()) {
51 | configPath = getDefDicConfigPath().toAbsolutePath().toString();
52 | Properties cfProp = new Properties();
53 | FileInputStream inputStream = new FileInputStream(configPath);
54 | cfProp.load(inputStream);
55 | if (!cfProp.containsKey("root")){
56 | configPath = null;
57 | }
58 | inputStream.close();
59 | cfProp.clear();
60 | }
61 | }
62 | if (TextUtility.isBlank(configPath)) {
63 | configPath = null;
64 | }
65 | Predefine.HANLP_PROPERTIES_PATH = configPath;
66 | logger.info("HanLP Properties Path: " + Predefine.HANLP_PROPERTIES_PATH);
67 | //todo remote dictionary update
68 | } catch (Exception ex) {
69 | logger.error("failed to load plugin.properties", ex);
70 | } finally {
71 | properties.clear();
72 | }
73 | isInit = true;
74 | }
75 |
76 | private static Path getPluginPath() {
77 | return env.pluginsFile().resolve("analysis-hanlp");
78 | }
79 |
80 | private static Path getDefDicConfigPath() {
81 | return env.pluginsFile().resolve("analysis-hanlp/hanlp.properties").toAbsolutePath();
82 | }
83 |
84 | private static Path getConfigFilePath() {
85 | return env.pluginsFile().resolve("analysis-hanlp/plugin.properties").toAbsolutePath();
86 | }
87 | }
88 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/conf/HanLPConfig.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.conf;
2 |
3 |
4 | /**
5 |  * elasticsearch-analysis-hanlp
6 |  * elasticsearch-analysis-hanlp
7 |  * Created by hezl on 2018-12-03.
8 |  */
9 | public class HanLPConfig {
10 | /**
11 |  * Segmentation algorithm; the Chinese or the English name is accepted. Options:
12 |  *
13 |  * - viterbi (维特比): the best trade-off between speed and accuracy
14 |  * - dat (双数组trie树): extremely fast dictionary segmentation, tens of millions of characters per second
15 |  * - crf (条件随机场): high accuracy for segmentation, POS tagging and NER; suited to demanding NLP tasks
16 |  * - perceptron (感知机): segmentation, POS tagging and NER, with online-learning support
17 |  * - nshort (N最短路): slightly better named-entity recognition, at the cost of speed
18 |  *
19 |  */
20 | private String algorithm;
21 | /**
22 |  * Index mode (finest-grained segmentation)
23 |  */
24 | private boolean enableIndexMode;
25 | /**
26 |  * Whether to enable the user dictionary
27 |  */
28 | private boolean enableCustomDictionary;
29 | /**
30 |  * User dictionary paths (absolute; separate multiple dictionaries with ;)
31 |  */
32 | private String customDictionaryPath;
33 | /**
34 |  * Give the user dictionary top priority
35 |  */
36 | private boolean enableCustomDictionaryForcing;
37 | /**
38 |  * Whether to enable stop-word filtering
39 |  */
40 | private boolean enableStopWord;
41 | /**
42 |  * Stop-word dictionary path
43 |  */
44 | private String stopWordDictionaryPath;
45 | /**
46 |  * Whether to recognize numbers and quantifiers
47 |  */
48 | private boolean enableNumberQuantifierRecognize;
49 | /**
50 |  * Recognize Chinese person names
51 |  */
52 | private boolean enableNameRecognize;
53 | /**
54 |  * Whether to recognize transliterated person names
55 |  */
56 | private boolean enableTranslatedNameRecognize;
57 | /**
58 |  * Whether to recognize Japanese person names
59 |  */
60 | private boolean enableJapaneseNameRecognize;
61 | /**
62 |  * Recognize organization names
63 |  */
64 | private boolean enableOrganizationRecognize;
65 | /**
66 |  * Recognize place names
67 |  */
68 | private boolean enablePlaceRecognize;
69 | /**
70 |  * Accurate Traditional Chinese segmentation
71 |  */
72 | private boolean enableTraditionalChineseMode;
73 |
74 | public String getAlgorithm() {
75 | return algorithm;
76 | }
77 |
78 | public void setAlgorithm(String algorithm) {
79 | this.algorithm = algorithm;
80 | }
81 |
82 | public boolean isEnableIndexMode() {
83 | return enableIndexMode;
84 | }
85 |
86 | public void setEnableIndexMode(boolean enableIndexMode) {
87 | this.enableIndexMode = enableIndexMode;
88 | }
89 |
90 | public boolean isEnableCustomDictionary() {
91 | return enableCustomDictionary;
92 | }
93 |
94 | public void setEnableCustomDictionary(boolean enableCustomDictionary) {
95 | this.enableCustomDictionary = enableCustomDictionary;
96 | }
97 |
98 | public String getCustomDictionaryPath() {
99 | return customDictionaryPath;
100 | }
101 |
102 | public void setCustomDictionaryPath(String customDictionaryPath) {
103 | this.customDictionaryPath = customDictionaryPath;
104 | }
105 |
106 | public boolean isEnableCustomDictionaryForcing() {
107 | return enableCustomDictionaryForcing;
108 | }
109 |
110 | public void setEnableCustomDictionaryForcing(boolean enableCustomDictionaryForcing) {
111 | this.enableCustomDictionaryForcing = enableCustomDictionaryForcing;
112 | }
113 |
114 | public String getStopWordDictionaryPath() {
115 | return stopWordDictionaryPath;
116 | }
117 |
118 | public void setStopWordDictionaryPath(String stopWordDictionaryPath) {
119 | this.stopWordDictionaryPath = stopWordDictionaryPath;
120 | }
121 |
122 | public boolean isEnableNumberQuantifierRecognize() {
123 | return enableNumberQuantifierRecognize;
124 | }
125 |
126 | public void setEnableNumberQuantifierRecognize(boolean enableNumberQuantifierRecognize) {
127 | this.enableNumberQuantifierRecognize = enableNumberQuantifierRecognize;
128 | }
129 |
130 | public boolean isEnableNameRecognize() {
131 | return enableNameRecognize;
132 | }
133 |
134 | public void setEnableNameRecognize(boolean enableNameRecognize) {
135 | this.enableNameRecognize = enableNameRecognize;
136 | }
137 |
138 | public boolean isEnableTranslatedNameRecognize() {
139 | return enableTranslatedNameRecognize;
140 | }
141 |
142 | public void setEnableTranslatedNameRecognize(boolean enableTranslatedNameRecognize) {
143 | this.enableTranslatedNameRecognize = enableTranslatedNameRecognize;
144 | }
145 |
146 | public boolean isEnableJapaneseNameRecognize() {
147 | return enableJapaneseNameRecognize;
148 | }
149 |
150 | public void setEnableJapaneseNameRecognize(boolean enableJapaneseNameRecognize) {
151 | this.enableJapaneseNameRecognize = enableJapaneseNameRecognize;
152 | }
153 |
154 | public boolean isEnableOrganizationRecognize() {
155 | return enableOrganizationRecognize;
156 | }
157 |
158 | public void setEnableOrganizationRecognize(boolean enableOrganizationRecognize) {
159 | this.enableOrganizationRecognize = enableOrganizationRecognize;
160 | }
161 |
162 | public boolean isEnablePlaceRecognize() {
163 | return enablePlaceRecognize;
164 | }
165 |
166 | public void setEnablePlaceRecognize(boolean enablePlaceRecognize) {
167 | this.enablePlaceRecognize = enablePlaceRecognize;
168 | }
169 |
170 | public boolean isEnableTraditionalChineseMode() {
171 | return enableTraditionalChineseMode;
172 | }
173 |
174 | public void setEnableTraditionalChineseMode(boolean enableTraditionalChineseMode) {
175 | this.enableTraditionalChineseMode = enableTraditionalChineseMode;
176 | }
177 |
178 | public boolean isEnableStopWord() {
179 | return enableStopWord;
180 | }
181 |
182 | public void setEnableStopWord(boolean enableStopWord) {
183 | this.enableStopWord = enableStopWord;
184 | }
185 | }
186 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/lucene/HanLPAnalyzer.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.lucene;
2 |
3 | import com.hankcs.hanlp.HanLP;
4 | import com.hankcs.hanlp.seg.Segment;
5 | import org.apache.lucene.analysis.Analyzer;
6 | import org.apache.lucene.analysis.Tokenizer;
7 |
8 | import java.util.Set;
9 |
10 | public class HanLPAnalyzer extends Analyzer {
11 | private boolean enablePorterStemming;
12 | private Set<String> filter;
13 | private Segment segment;
14 |
15 | /**
16 |  * @param filter               stop words
17 |  * @param enablePorterStemming whether to stem tokens (English only)
18 |  */
19 | public HanLPAnalyzer(Segment segment, Set<String> filter, boolean enablePorterStemming) {
20 | this.segment = segment;
21 | this.filter = filter;
22 | this.enablePorterStemming = enablePorterStemming;
23 | }
24 |
25 | /**
26 |  * @param enablePorterStemming whether to stem English tokens (normalizes plural forms and tenses)
27 |  */
28 | public HanLPAnalyzer(Segment segment, boolean enablePorterStemming) {
29 | this.segment = segment;
30 | this.enablePorterStemming = enablePorterStemming;
31 | }
32 |
33 | public HanLPAnalyzer(Segment segment, Set<String> filter) {
34 | this.segment = segment;
35 | this.filter = filter;
36 | this.enablePorterStemming = true;
37 | }
38 |
39 | public HanLPAnalyzer(Segment segment) {
40 | this.segment = segment;
41 | this.enablePorterStemming = true;
42 | }
43 |
44 | public HanLPAnalyzer() {
45 | super();
46 | this.segment = HanLP.newSegment().enableOffset(true).enableIndexMode(true).enablePartOfSpeechTagging(true);
47 | }
48 |
49 | /**
50 |  * Analyzer override: build the tokenization components
51 |  */
52 | @Override
53 | protected TokenStreamComponents createComponents(String fieldName) {
54 | Tokenizer tokenizer = new HanLPTokenizer(this.segment, filter, enablePorterStemming);
55 | return new TokenStreamComponents(tokenizer);
56 | }
57 | }
58 |
59 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/lucene/HanLPTokenFilter.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.lucene;
2 |
3 | import com.hankcs.hanlp.corpus.tag.Nature;
4 | import com.hankcs.hanlp.seg.common.Term;
5 | import org.apache.lucene.analysis.TokenFilter;
6 | import org.apache.lucene.analysis.TokenStream;
7 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
8 |
9 | import java.io.IOException;
10 | import java.util.ArrayList;
11 | import java.util.Iterator;
12 | import java.util.LinkedList;
13 | import java.util.List;
14 | /**
15 | * @author hankcs
16 | */
17 | public class HanLPTokenFilter extends TokenFilter {
18 | private CharTermAttribute termAtt = this.addAttribute(CharTermAttribute.class);
19 | private String mode;
20 | private LinkedList<String> tokensCache = new LinkedList<>();
21 |
22 | public HanLPTokenFilter(String mode, TokenStream input) {
23 | super(input);
24 | this.mode = mode;
25 | }
26 |
27 | @Override
28 | public boolean incrementToken() throws IOException {
29 | if (hasMoreTokenInCache()) {
30 | this.termAtt.setEmpty();
31 | this.termAtt.append(nextTokenLexeme());
32 | return true;
33 | }
34 | if (this.input.incrementToken()) {
35 | char[] text = this.termAtt.buffer();
36 | List<Term> terms;
37 | switch (this.mode) {
38 | //todo implement the pinyin and traditional/simplified conversion modes below
39 |
40 | // case "t2s":
41 | // terms = SegmentHelper.segSentence(text, TsMode.T2S);
42 | // break;
43 | // case "s2t":
44 | // terms = SegmentHelper.segSentence(text, TsMode.S2T);
45 | // break;
46 | // case "ts":
47 | // terms = SegmentHelper.segSentence(text, TsMode.ALL);
48 | // break;
49 | // case "py_first":
50 | // terms = SegmentHelper.segSentence(text, PinyinMode.FIRST_LETTER);
51 | // break;
52 | // case "py_full":
53 | // terms = SegmentHelper.segSentence(text, PinyinMode.FULL_PINYIN);
54 | // break;
55 | // case "py_mix":
56 | // terms = SegmentHelper.segSentence(text, PinyinMode.MIX_PINYIN);
57 | // break;
58 | // case "py_all":
59 | // terms = SegmentHelper.segSentence(text, PinyinMode.MIX_ALL);
60 | // break;
61 | default:
62 | terms = new ArrayList<>();
63 | terms.add(new Term(new String(text, 0, this.termAtt.length()), Nature.nz)); // only termAtt.length() chars of the buffer are valid
64 | break;
65 | }
66 | Iterator<Term> pinyinIterator = terms.iterator();
67 | if (pinyinIterator.hasNext()) {
68 | String pinyinItem = pinyinIterator.next().word;
69 | while (pinyinIterator.hasNext()) {
70 | addTokenToCache(pinyinIterator.next().word);
71 | }
72 | this.termAtt.setEmpty();
73 | this.termAtt.append(pinyinItem);
74 | }
75 | return true;
76 | }
77 | return false;
78 | }
79 |
80 | @Override
81 | public void reset() throws IOException {
82 | super.reset();
83 | tokensCache.clear();
84 | }
85 |
86 | protected boolean hasMoreTokenInCache() {
87 | return !tokensCache.isEmpty();
88 | }
89 |
90 | private String nextTokenLexeme() {
91 | return tokensCache.pollFirst();
92 | }
93 |
94 | private void addTokenToCache(String token) {
95 | if (token != null) {
96 | tokensCache.add(token);
97 | }
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/lucene/HanLPTokenizer.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.lucene;
2 |
3 | import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
4 | import com.hankcs.hanlp.corpus.tag.Nature;
5 | import com.hankcs.hanlp.seg.Segment;
6 | import com.hankcs.hanlp.seg.common.Term;
7 | import com.hankcs.hanlp.utility.TextUtility;
8 | import org.apache.lucene.analysis.Tokenizer;
9 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
10 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
11 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
12 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
13 |
14 | import java.io.BufferedReader;
15 | import java.io.IOException;
16 | import java.util.Set;
17 |
18 | /**
19 | * @author hankcs
20 | */
21 | public class HanLPTokenizer extends Tokenizer {
22 | // current term
23 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
24 | // character offsets
25 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
26 | // position increment
27 | private final PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class);
28 | // part of speech
29 | private TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
30 |
31 | private SegmentWrapper segment;
32 | private BinTrie<String> filter;
33 | private boolean enablePorterStemming;
34 | private final PorterStemmer stemmer = new PorterStemmer();
35 |
36 | /**
37 |  * Running offset within the current document; not cleared on reset() (switching between values of a multi-valued field), cleared in end() (switching fields)
38 |  */
39 | private int totalOffset = 0;
40 |
41 | /**
42 |  * @param segment              one of HanLP's segmenters
43 |  * @param filter               stop words
44 |  * @param enablePorterStemming stem English tokens to their base form
45 |  */
46 | public HanLPTokenizer(Segment segment, Set<String> filter, boolean enablePorterStemming) {
47 | super();
48 | this.segment = new SegmentWrapper(input, segment);
49 | if (filter != null && filter.size() > 0) {
50 | this.filter = new BinTrie<>();
51 | for (String stopWord : filter) {
52 | this.filter.put(stopWord, null);
53 | }
54 | }
55 | this.enablePorterStemming = enablePorterStemming;
56 | }
57 |
58 | @Override
59 | final public boolean incrementToken() throws IOException {
60 | clearAttributes();
61 | int position = 0;
62 | Term term;
63 | boolean un_increased = true;
64 | do {
65 | term = segment.next();
66 | if (term == null) {
67 | break;
68 | }
69 | if (TextUtility.isBlank(term.word)) // skip whitespace-only terms to keep the index efficient
70 | {
71 | continue;
72 | }
73 | if (enablePorterStemming && term.nature == Nature.nx) {
74 | term.word = stemmer.stem(term.word);
75 | }
76 |
77 | if (filter != null && filter.containsKey(term.word)) {
78 | continue;
79 | } else {
80 | ++position;
81 | un_increased = false;
82 | }
83 | }
84 | while (un_increased);
85 |
86 | if (term != null) {
87 | positionAttr.setPositionIncrement(position);
88 | termAtt.setEmpty().append(term.word);
89 | offsetAtt.setOffset(correctOffset(totalOffset + term.offset),
90 | correctOffset(totalOffset + term.offset + term.word.length()));
91 | typeAtt.setType(term.nature == null ? "null" : term.nature.toString());
92 | return true;
93 | } else {
94 | totalOffset += segment.offset;
95 | return false;
96 | }
97 | }
98 |
99 | @Override
100 | public void end() throws IOException {
101 | super.end();
102 | offsetAtt.setOffset(totalOffset, totalOffset);
103 | totalOffset = 0;
104 | }
105 |
106 | /**
107 |  * Must be overridden, otherwise bulk indexing of files will fail
108 |  */
109 | @Override
110 | public void reset() throws IOException {
111 | super.reset();
112 | segment.reset(new BufferedReader(this.input));
113 | }
114 |
115 | }
116 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/lucene/PorterStemmer.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.lucene;
2 |
3 | import org.apache.lucene.util.ArrayUtil;
4 |
5 | import java.io.FileInputStream;
6 | import java.io.IOException;
7 | import java.io.InputStream;
8 |
9 | public class PorterStemmer {
10 | private char[] b;
11 | private int i, /* offset into b */
12 | j, k, k0;
13 | private boolean dirty = false;
14 | private static final int INITIAL_SIZE = 50;
15 |
16 | public PorterStemmer() {
17 | b = new char[INITIAL_SIZE];
18 | i = 0;
19 | }
20 |
21 | /**
22 | * reset() resets the stemmer so it can stem another word. If you invoke the
23 | * stemmer by calling add(char) and then stem(), you must call reset()
24 | * before starting another word.
25 | */
26 | public void reset() {
27 | i = 0;
28 | dirty = false;
29 | }
30 |
31 | /**
32 | * Add a character to the word being stemmed. When you are finished adding
33 | * characters, you can call stem(void) to process the word.
34 | */
35 | public void add(char ch) {
36 | if (b.length <= i) {
37 | b = ArrayUtil.grow(b, i + 1);
38 | }
39 | b[i++] = ch;
40 | }
41 |
42 | /**
43 | * After a word has been stemmed, it can be retrieved by toString(), or a
44 | * reference to the internal buffer can be retrieved by getResultBuffer and
45 | * getResultLength (which is generally more efficient.)
46 | */
47 | @Override
48 | public String toString() {
49 | return new String(b, 0, i);
50 | }
51 |
52 | /**
53 | * Returns the length of the word resulting from the stemming process.
54 | */
55 | public int getResultLength() {
56 | return i;
57 | }
58 |
59 | /**
60 | * Returns a reference to a character buffer containing the results of the
61 | * stemming process. You also need to consult getResultLength() to determine
62 | * the length of the result.
63 | */
64 | public char[] getResultBuffer() {
65 | return b;
66 | }
67 |
68 | /* cons(i) is true <=> b[i] is a consonant. */
69 |
70 | private final boolean cons(int i) {
71 | switch (b[i]) {
72 | case 'a':
73 | case 'e':
74 | case 'i':
75 | case 'o':
76 | case 'u':
77 | return false;
78 | case 'y':
79 | return (i == k0) ? true : !cons(i - 1);
80 | default:
81 | return true;
82 | }
83 | }
84 |
85 | /*
86 | * m() measures the number of consonant sequences between k0 and j. if c is
87 | * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
88 | * presence,
89 | *
90 |  * <c><v> gives 0, <c>vc<v> gives 1, <c>vcvc<v> gives 2, <c>vcvcvc<v> gives 3
91 | * ....
92 | */
93 |
94 | private final int m() {
95 | int n = 0;
96 | int i = k0;
97 | while (true) {
98 | if (i > j)
99 | return n;
100 | if (!cons(i))
101 | break;
102 | i++;
103 | }
104 | i++;
105 | while (true) {
106 | while (true) {
107 | if (i > j)
108 | return n;
109 | if (cons(i))
110 | break;
111 | i++;
112 | }
113 | i++;
114 | n++;
115 | while (true) {
116 | if (i > j)
117 | return n;
118 | if (!cons(i))
119 | break;
120 | i++;
121 | }
122 | i++;
123 | }
124 | }
125 |
126 | /* vowelinstem() is true <=> k0,...j contains a vowel */
127 |
128 | private final boolean vowelinstem() {
129 | int i;
130 | for (i = k0; i <= j; i++)
131 | if (!cons(i))
132 | return true;
133 | return false;
134 | }
135 |
136 | /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
137 |
138 | private final boolean doublec(int j) {
139 | if (j < k0 + 1)
140 | return false;
141 | if (b[j] != b[j - 1])
142 | return false;
143 | return cons(j);
144 | }
145 |
146 | /*
147 | * cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
148 | * and also if the second c is not w,x or y. this is used when trying to
149 | * restore an e at the end of a short word. e.g.
150 | *
151 | * cav(e), lov(e), hop(e), crim(e), but snow, box, tray.
152 | */
153 |
154 | private final boolean cvc(int i) {
155 | if (i < k0 + 2 || !cons(i) || cons(i - 1) || !cons(i - 2))
156 | return false;
157 | else {
158 | int ch = b[i];
159 | if (ch == 'w' || ch == 'x' || ch == 'y')
160 | return false;
161 | }
162 | return true;
163 | }
164 |
165 | private final boolean ends(String s) {
166 | int l = s.length();
167 | int o = k - l + 1;
168 | if (o < k0)
169 | return false;
170 | for (int i = 0; i < l; i++)
171 | if (b[o + i] != s.charAt(i))
172 | return false;
173 | j = k - l;
174 | return true;
175 | }
176 |
177 | /*
178 | * setto(s) sets (j+1),...k to the characters in the string s, readjusting
179 | * k.
180 | */
181 |
182 | void setto(String s) {
183 | int l = s.length();
184 | int o = j + 1;
185 | for (int i = 0; i < l; i++)
186 | b[o + i] = s.charAt(i);
187 | k = j + l;
188 | dirty = true;
189 | }
190 |
191 | /* r(s) is used further down. */
192 |
193 | void r(String s) {
194 | if (m() > 0)
195 | setto(s);
196 | }
197 |
198 | /*
199 | * step1() gets rid of plurals and -ed or -ing. e.g.
200 | *
201 | * caresses -> caress ponies -> poni ties -> ti caress -> caress cats -> cat
202 | *
203 | * feed -> feed agreed -> agree disabled -> disable
204 | *
205 | * matting -> mat mating -> mate meeting -> meet milling -> mill messing ->
206 | * mess
207 | *
208 | * meetings -> meet
209 | */
210 |
211 | private final void step1() {
212 | if (b[k] == 's') {
213 | if (ends("sses"))
214 | k -= 2;
215 | else if (ends("ies"))
216 | setto("i");
217 | else if (b[k - 1] != 's')
218 | k--;
219 | }
220 | if (ends("eed")) {
221 | if (m() > 0)
222 | k--;
223 | } else if ((ends("ed") || ends("ing")) && vowelinstem()) {
224 | k = j;
225 | if (ends("at"))
226 | setto("ate");
227 | else if (ends("bl"))
228 | setto("ble");
229 | else if (ends("iz"))
230 | setto("ize");
231 | else if (doublec(k)) {
232 | int ch = b[k--];
233 | if (ch == 'l' || ch == 's' || ch == 'z')
234 | k++;
235 | } else if (m() == 1 && cvc(k))
236 | setto("e");
237 | }
238 | }
239 |
240 | /* step2() turns terminal y to i when there is another vowel in the stem. */
241 |
242 | private final void step2() {
243 | if (ends("y") && vowelinstem()) {
244 | b[k] = 'i';
245 | dirty = true;
246 | }
247 | }
248 |
249 | /*
250 | * step3() maps double suffices to single ones. so -ization ( = -ize plus
251 | * -ation) maps to -ize etc. note that the string before the suffix must
252 | * give m() > 0.
253 | */
254 |
255 | private final void step3() {
256 | if (k == k0)
257 | return; /* For Bug 1 */
258 | switch (b[k - 1]) {
259 | case 'a':
260 | if (ends("ational")) {
261 | r("ate");
262 | break;
263 | }
264 | if (ends("tional")) {
265 | r("tion");
266 | break;
267 | }
268 | break;
269 | case 'c':
270 | if (ends("enci")) {
271 | r("ence");
272 | break;
273 | }
274 | if (ends("anci")) {
275 | r("ance");
276 | break;
277 | }
278 | break;
279 | case 'e':
280 | if (ends("izer")) {
281 | r("ize");
282 | break;
283 | }
284 | break;
285 | case 'l':
286 | if (ends("bli")) {
287 | r("ble");
288 | break;
289 | }
290 | if (ends("alli")) {
291 | r("al");
292 | break;
293 | }
294 | if (ends("entli")) {
295 | r("ent");
296 | break;
297 | }
298 | if (ends("eli")) {
299 | r("e");
300 | break;
301 | }
302 | if (ends("ousli")) {
303 | r("ous");
304 | break;
305 | }
306 | break;
307 | case 'o':
308 | if (ends("ization")) {
309 | r("ize");
310 | break;
311 | }
312 | if (ends("ation")) {
313 | r("ate");
314 | break;
315 | }
316 | if (ends("ator")) {
317 | r("ate");
318 | break;
319 | }
320 | break;
321 | case 's':
322 | if (ends("alism")) {
323 | r("al");
324 | break;
325 | }
326 | if (ends("iveness")) {
327 | r("ive");
328 | break;
329 | }
330 | if (ends("fulness")) {
331 | r("ful");
332 | break;
333 | }
334 | if (ends("ousness")) {
335 | r("ous");
336 | break;
337 | }
338 | break;
339 | case 't':
340 | if (ends("aliti")) {
341 | r("al");
342 | break;
343 | }
344 | if (ends("iviti")) {
345 | r("ive");
346 | break;
347 | }
348 | if (ends("biliti")) {
349 | r("ble");
350 | break;
351 | }
352 | break;
353 | case 'g':
354 | if (ends("logi")) {
355 | r("log");
356 | break;
357 | }
358 | }
359 | }
360 |
361 | /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
362 |
363 | private final void step4() {
364 | switch (b[k]) {
365 | case 'e':
366 | if (ends("icate")) {
367 | r("ic");
368 | break;
369 | }
370 | if (ends("ative")) {
371 | r("");
372 | break;
373 | }
374 | if (ends("alize")) {
375 | r("al");
376 | break;
377 | }
378 | break;
379 | case 'i':
380 | if (ends("iciti")) {
381 | r("ic");
382 | break;
383 | }
384 | break;
385 | case 'l':
386 | if (ends("ical")) {
387 | r("ic");
388 | break;
389 | }
390 | if (ends("ful")) {
391 | r("");
392 | break;
393 | }
394 | break;
395 | case 's':
396 | if (ends("ness")) {
397 | r("");
398 | break;
399 | }
400 | break;
401 | }
402 | }
403 |
404 | /* step5() takes off -ant, -ence etc., in context vcvc. */
405 |
406 | private final void step5() {
407 | if (k == k0)
408 | return; /* for Bug 1 */
409 | switch (b[k - 1]) {
410 | case 'a':
411 | if (ends("al"))
412 | break;
413 | return;
414 | case 'c':
415 | if (ends("ance"))
416 | break;
417 | if (ends("ence"))
418 | break;
419 | return;
420 | case 'e':
421 | if (ends("er"))
422 | break;
423 | return;
424 | case 'i':
425 | if (ends("ic"))
426 | break;
427 | return;
428 | case 'l':
429 | if (ends("able"))
430 | break;
431 | if (ends("ible"))
432 | break;
433 | return;
434 | case 'n':
435 | if (ends("ant"))
436 | break;
437 | if (ends("ement"))
438 | break;
439 | if (ends("ment"))
440 | break;
441 | /* element etc. not stripped before the m */
442 | if (ends("ent"))
443 | break;
444 | return;
445 | case 'o':
446 | if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't'))
447 | break;
448 | /* j >= 0 fixes Bug 2 */
449 | if (ends("ou"))
450 | break;
451 | return;
452 | /* takes care of -ous */
453 | case 's':
454 | if (ends("ism"))
455 | break;
456 | return;
457 | case 't':
458 | if (ends("ate"))
459 | break;
460 | if (ends("iti"))
461 | break;
462 | return;
463 | case 'u':
464 | if (ends("ous"))
465 | break;
466 | return;
467 | case 'v':
468 | if (ends("ive"))
469 | break;
470 | return;
471 | case 'z':
472 | if (ends("ize"))
473 | break;
474 | return;
475 | default:
476 | return;
477 | }
478 | if (m() > 1)
479 | k = j;
480 | }
481 |
482 | /* step6() removes a final -e if m() > 1. */
483 |
484 | private final void step6() {
485 | j = k;
486 | if (b[k] == 'e') {
487 | int a = m();
488 | if (a > 1 || a == 1 && !cvc(k - 1))
489 | k--;
490 | }
491 | if (b[k] == 'l' && doublec(k) && m() > 1)
492 | k--;
493 | }
494 |
495 | /**
496 | * Stem a word provided as a String. Returns the result as a String.
497 | */
498 | public String stem(String s) {
499 | if (stem(s.toCharArray(), s.length()))
500 | return toString();
501 | else
502 | return s;
503 | }
504 |
505 | /**
506 | * Stem a word contained in a char[]. Returns true if the stemming process
507 | * resulted in a word different from the input. You can retrieve the result
508 | * with getResultLength()/getResultBuffer() or toString().
509 | */
510 | public boolean stem(char[] word) {
511 | return stem(word, word.length);
512 | }
513 |
514 | /**
515 | * Stem a word contained in a portion of a char[] array. Returns true if the
516 | * stemming process resulted in a word different from the input. You can
517 | * retrieve the result with getResultLength()/getResultBuffer() or
518 | * toString().
519 | */
520 | public boolean stem(char[] wordBuffer, int offset, int wordLen) {
521 | reset();
522 | if (b.length < wordLen) {
523 | b = new char[ArrayUtil.oversize(wordLen, Character.BYTES)];
524 | }
525 | System.arraycopy(wordBuffer, offset, b, 0, wordLen);
526 | i = wordLen;
527 | return stem(0);
528 | }
529 |
530 | /**
531 | * Stem a word contained in a leading portion of a char[] array. Returns
532 | * true if the stemming process resulted in a word different from the input.
533 | * You can retrieve the result with getResultLength()/getResultBuffer() or
534 | * toString().
535 | */
536 | public boolean stem(char[] word, int wordLen) {
537 | return stem(word, 0, wordLen);
538 | }
539 |
540 | /**
541 | * Stem the word placed into the Stemmer buffer through calls to add().
542 | * Returns true if the stemming process resulted in a word different from
543 | * the input. You can retrieve the result with
544 | * getResultLength()/getResultBuffer() or toString().
545 | */
546 | public boolean stem() {
547 | return stem(0);
548 | }
549 |
550 | public boolean stem(int i0) {
551 | k = i - 1;
552 | k0 = i0;
553 | if (k > k0 + 1) {
554 | step1();
555 | step2();
556 | step3();
557 | step4();
558 | step5();
559 | step6();
560 | }
561 | // Also, a word is considered dirty if we lopped off letters
562 | // Thanks to Ifigenia Vairelles for pointing this out.
563 | if (i != k + 1)
564 | dirty = true;
565 | i = k + 1;
566 | return dirty;
567 | }
568 |
569 | /**
570 | * Test program for demonstrating the stemmer. It reads one or more files and stems
571 | * each word, writing the result to standard out. Usage: PorterStemmer file-name [file-name ...]
572 | */
573 | public static void main(String[] args) {
574 | PorterStemmer s = new PorterStemmer();
575 |
576 | for (int i = 0; i < args.length; i++) {
577 | try {
578 | InputStream in = new FileInputStream(args[i]);
579 | byte[] buffer = new byte[1024];
580 | int bufferLen, offset, ch;
581 |
582 | bufferLen = in.read(buffer);
583 | offset = 0;
584 | s.reset();
585 |
586 | while (true) {
587 | if (offset < bufferLen)
588 | ch = buffer[offset++];
589 | else { // buffer exhausted: refill from the stream
590 | bufferLen = in.read(buffer);
591 | offset = 0;
592 | if (bufferLen < 0)
593 | ch = -1; // end of stream
594 | else
595 | ch = buffer[offset++];
596 | }
597 |
598 | if (Character.isLetter((char) ch)) { // note: the byte-to-char cast assumes single-byte (ASCII/Latin-1) input
599 | s.add(Character.toLowerCase((char) ch));
600 | } else {
601 | s.stem();
602 | System.out.print(s.toString());
603 | s.reset();
604 | if (ch < 0)
605 | break;
606 | else {
607 | System.out.print((char) ch);
608 | }
609 | }
610 | }
611 |
612 | in.close();
613 | } catch (IOException e) {
614 | System.out.println("error reading " + args[i]);
615 | }
616 | }
617 | }
618 |
619 | }
620 |
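As a quick sanity check of the steps above, the stemmer can be driven directly. A minimal sketch (the sample words are illustrative, not taken from the plugin's tests):

```
PorterStemmer stemmer = new PorterStemmer();
// step3 rewrites "ization" -> "ize", step4 "alize" -> "al",
// and step5 strips the remaining "al" because m() > 1:
System.out.println(stemmer.stem("nationalization")); // nation
// if nothing is removed, stem(String) returns the input unchanged:
System.out.println(stemmer.stem("nation")); // nation
```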
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/hanlp/lucene/SegmentWrapper.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.lucene;
2 |
3 |
4 | import com.hankcs.hanlp.seg.Segment;
5 | import com.hankcs.hanlp.seg.common.Term;
6 |
7 | import java.io.IOException;
8 | import java.io.Reader;
9 | import java.util.HashSet;
10 | import java.util.Iterator;
11 | import java.util.List;
12 | import java.util.Set;
13 |
14 | /**
15 | * Wraps a segmenter so that it outputs one token at a time
16 | *
17 | * @author hankcs
18 | */
19 | public class SegmentWrapper {
20 | /**
21 | * Input reader
22 | */
23 | private Reader input;
24 | /**
25 | * The underlying segmenter
26 | */
27 | private Segment segment;
28 | /**
29 | * Segmentation result for the current line
30 | */
31 | private Iterator<Term> iterator;
32 | /**
33 | * Term offset; the wrapper reads line by line, so each term.offset must be corrected by the length of the lines already consumed
34 | */
35 | int offset;
36 | /**
37 | * Buffer size
38 | */
39 | private static final int BUFFER_SIZE = 512;
40 | /**
41 | * Buffer
42 | */
43 | private char[] buffer = new char[BUFFER_SIZE];
44 | /**
45 | * Number of characters in the buffer that have not been processed yet
46 | */
47 | private int remainSize = 0;
48 |
49 | /**
50 | * Sentence delimiters
51 | */
52 | private static final Set<Character> delimiterCharSet = new HashSet<Character>() {{
53 | add('\r');
54 | add('\n');
55 | add('。');
56 | add('!');
57 | add('!');
58 | }};
59 |
60 | public SegmentWrapper(Reader reader, Segment segment) {
61 | this.input = reader;
62 | this.segment = segment;
63 | }
64 |
65 | /**
66 | * Reset the wrapper with a new input
67 | *
68 | * @param reader the new input source
69 | */
70 | public void reset(Reader reader) {
71 | input = reader;
72 | offset = 0;
73 | iterator = null; remainSize = 0; // also drop any characters still buffered from the previous input
74 | }
75 |
76 | public Term next() throws IOException {
77 | if (iterator != null && iterator.hasNext()) return iterator.next();
78 | String line = readLine();
79 | if (line == null) return null;
80 | List<Term> termList = segment.seg(line);
81 | if (termList.isEmpty()) return null;
82 | for (Term term : termList) {
83 | term.offset += offset; // shift line-local offsets so they are relative to the whole input
84 | }
85 | offset += line.length();
86 | iterator = termList.iterator();
87 | return iterator.next();
88 | }
89 |
90 | private String readLine() throws IOException {
91 | int writePos = 0; // write position in the buffer; renamed from "offset" to avoid shadowing the field above
92 | int length = BUFFER_SIZE;
93 | if (remainSize > 0) { // leftover from the previous call already sits at the front of the buffer
94 | writePos = remainSize;
95 | length -= remainSize;
96 | }
97 | int n = input.read(buffer, writePos, length);
98 | if (n < 0) {
99 | if (remainSize != 0) {
100 | String lastLine = new String(buffer, 0, remainSize);
101 | remainSize = 0;
102 | return lastLine;
103 | }
104 | return null;
105 | }
106 | n += writePos; // n is now the total number of valid characters in the buffer
107 |
108 | int eos = lastIndexOfEos(buffer, n); // cut after the last sentence delimiter
109 | String line = new String(buffer, 0, eos);
110 | remainSize = n - eos; // keep the incomplete tail for the next call
111 | System.arraycopy(buffer, eos, buffer, 0, remainSize);
112 | return line;
113 | }
114 |
115 | private int lastIndexOfEos(char[] buffer, int length) {
116 | for (int i = length - 1; i > 0; i--) {
117 | if (delimiterCharSet.contains(buffer[i])) {
118 | return i + 1;
119 | }
120 | }
121 | return length;
122 | }
123 | }
124 |
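For illustration, the wrapper can also be exercised outside the tokenizer. A minimal sketch, assuming HanLP's default segment (the demo class and sample text are illustrative, not part of the plugin sources):

```
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import org.elasticsearch.plugin.hanlp.lucene.SegmentWrapper;

import java.io.StringReader;

// Hypothetical standalone driver for SegmentWrapper
public class SegmentWrapperDemo {
    public static void main(String[] args) throws Exception {
        SegmentWrapper wrapper = new SegmentWrapper(
                new StringReader("商品和服务。价格便宜!"), HanLP.newSegment());
        // pull terms one at a time until the input is exhausted
        for (Term term = wrapper.next(); term != null; term = wrapper.next()) {
            // offsets are already relative to the whole input, not the current line
            System.out.println(term.word + " @ " + term.offset);
        }
    }
}
```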
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/utils/CommUtils.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.utils;
2 |
3 | import java.lang.reflect.Field;
4 | import java.util.Collections;
5 | import java.util.HashMap;
6 | import java.util.Map;
7 |
8 | /**
9 | * elasticsearch-analysis-hanlp
10 | * Reflection-based helper for setting process environment variables
11 | * Created by hezl on 2018-11-20.
12 | */
13 | public class CommUtils {
14 |
15 | public static void setEnv(String key, String value){
16 | Map<String, String> newEnv = new HashMap<>();
17 | newEnv.put(key, value);
18 | try{
19 | setEnv(newEnv);
20 | }
21 | catch (Exception ex){
22 | // best effort: swallow failures (e.g. reflective access denied on newer JVMs)
23 | }
24 | }
25 |
26 | @SuppressWarnings("unchecked")
27 | private static void setEnv(Map newEnv) throws Exception {
28 | try {
29 | Class> processEnvironmentClass = Class.forName("java.lang.ProcessEnvironment");
30 | Field theEnvironmentField = processEnvironmentClass.getDeclaredField("theEnvironment");
31 | theEnvironmentField.setAccessible(true);
32 | Map env = (Map) theEnvironmentField.get(null);
33 | env.putAll(newEnv);
34 | Field theCaseInsensitiveEnvironmentField = processEnvironmentClass.getDeclaredField("theCaseInsensitiveEnvironment");
35 | theCaseInsensitiveEnvironmentField.setAccessible(true);
36 | Map ciEnv = (Map) theCaseInsensitiveEnvironmentField.get(null);
37 | ciEnv.putAll(newEnv);
38 | } catch (NoSuchFieldException e) {
39 | Class[] classes = Collections.class.getDeclaredClasses();
40 | Map env = System.getenv();
41 | for (Class cl : classes) {
42 | if ("java.util.Collections$UnmodifiableMap".equals(cl.getName())) {
43 | Field field = cl.getDeclaredField("m");
44 | field.setAccessible(true);
45 | Object obj = field.get(env);
46 | Map map = (Map) obj;
47 | map.putAll(newEnv);
48 | }
49 | }
50 | }
51 | }
52 |
53 | }
54 |
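This is the widely known reflection trick for mutating the process environment from inside the JVM, so that values become visible through System.getenv(). Note that on Java 9+ the reflective access may be denied unless the JVM is started with the corresponding --add-opens options. A usage sketch (the variable name HANLP_ROOT is illustrative, not a documented setting):

```
// Hypothetical usage of CommUtils.setEnv
CommUtils.setEnv("HANLP_ROOT", "/data/hanlp");
System.out.println(System.getenv("HANLP_ROOT")); // should now print /data/hanlp
```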
--------------------------------------------------------------------------------
/src/main/resources/plugin-descriptor.properties:
--------------------------------------------------------------------------------
1 | # Elasticsearch plugin descriptor file
2 | # This file must exist as 'plugin-descriptor.properties' at
3 | # the root directory of all plugins.
4 | ### mandatory elements for all plugins:
5 | #
6 | # 'description': simple summary of the plugin
7 | description=${project.description}
8 | #
9 | # 'version': plugin's version
10 | version=${project.version}
11 | #
12 | # 'name': the plugin name
13 | name=${elasticsearch.plugin.name}
14 | #
15 | # 'classname': the name of the class to load, fully-qualified.
16 | classname=${elasticsearch.plugin.classname}
17 | #
18 | # 'java.version' version of java the code is built against
19 | # use the system property java.specification.version
20 | # version string must be a sequence of nonnegative decimal integers
21 | # separated by "."'s and may have leading zeros
22 | java.version=${maven.compiler.target}
23 | #
24 | # 'elasticsearch.version' version of elasticsearch compiled against
25 | # You will have to release a new version of the plugin for each new
26 | # elasticsearch release. This version is checked when the plugin
27 | # is loaded so Elasticsearch will refuse to start in the presence of
28 | # plugins with the incorrect elasticsearch.version.
29 | elasticsearch.version=${elasticsearch.version}
30 |
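The ${...} placeholders are filled in by Maven resource filtering during `mvn package`. For a 6.5.1 build the resolved file would look roughly like this (the name, description, and java.version values below are illustrative; only the classname is taken from the project layout):

```
description=HanLP Analysis for Elasticsearch
version=6.5.1
name=analysis-hanlp
classname=org.elasticsearch.plugin.hanlp.AnalysisHanLPPlugin
java.version=1.8
elasticsearch.version=6.5.1
```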
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/plugin/hanlp/conf/ConfigHelperTest.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.hanlp.conf;
2 |
3 | import com.hankcs.hanlp.HanLP;
4 | import org.junit.Test;
5 |
6 | /**
7 | * elasticsearch-analysis-hanlp
8 | * Smoke test for the default HanLP configuration
9 | * Created by hezl on 2018-12-05.
10 | */
11 | public class ConfigHelperTest {
12 |
13 | @Test
14 | public void getConfig() {
15 | System.out.println(HanLP.segment("你和对方但是")); // smoke test: prints the default segmentation, showing the bundled config loads
16 | }
17 | }
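The test only prints the segmentation result; a slightly stricter variant could fail if the bundled dictionaries did not load. A sketch using the JUnit 4 API already imported above (the method name is illustrative):

```
@Test
public void segmentWithDefaultConfig() {
    java.util.List<com.hankcs.hanlp.seg.common.Term> terms = HanLP.segment("你和对方但是");
    // if the dictionaries failed to load, segmentation would come back empty
    org.junit.Assert.assertFalse("expected at least one term", terms.isEmpty());
}
```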
--------------------------------------------------------------------------------