├── .gitignore
├── LICENSE
├── README.md
├── config
│   ├── jieba_config.properties
│   └── plugin-descriptor.properties
├── pom.xml
└── src
    ├── main
    │   ├── assemblies
    │   │   └── plugin.xml
    │   ├── java
    │   │   └── com
    │   │       └── github
    │   │           └── hongfuli
    │   │               ├── jieba
    │   │               │   ├── FinalSeg.java
    │   │               │   ├── Token.java
    │   │               │   ├── Tokenizer.java
    │   │               │   ├── elasticsearch
    │   │               │   │   ├── JiebaAnalysisPlugin.java
    │   │               │   │   └── JiebaAnalyzerProvider.java
    │   │               │   └── lucene
    │   │               │       ├── JiebaAnalyzer.java
    │   │               │       ├── JiebaStopTokenFilter.java
    │   │               │       └── JiebaTokenizer.java
    │   │               └── utils
    │   │                   └── MtyStringUtils.java
    │   └── resources
    │       ├── dict.txt
    │       └── finalseg_prob_emit.txt
    └── test
        ├── java
        │   └── com
        │       └── github
        │           └── hongfuli
        │               ├── jieba
        │               │   ├── FinalSegTest.java
        │               │   ├── TokenizerTest.java
        │               │   └── lucene
        │               │       └── JiebaAnalyzerTest.java
        │               └── utils
        │                   └── MtyStringUtilsTest.java
        └── resources
            ├── emit_test.txt
            ├── log4j2.xml
            └── userdict.txt

/.gitignore:
--------------------------------------------------------------------------------
# Compiled class file
*.class

# Log file
*.log

# BlueJ files
*.ctxt

# Mobile Tools for Java (J2ME)
.mtj.tmp/

# Package Files #
*.jar
*.war
*.ear
*.zip
*.tar.gz
*.rar

# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*

.idea/
*.iml
target/
.DS_Store

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "{}"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright {yyyy} {name of copyright owner}

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
A Chinese analysis plugin for [Elasticsearch](https://www.elastic.co/products/elasticsearch) based on [jieba](https://github.com/fxsjy/jieba).

Integrating with Elasticsearch
=======

```bash
git clone git@github.com:hongfuli/elasticsearch-analysis-jieba.git
cd elasticsearch-analysis-jieba
mvn package
```

Unzip target/release/elasticsearch-analysis-jieba-{version}.zip into the plugins directory of your Elasticsearch installation, then restart Elasticsearch.

Create a mapping that uses the analyzer:
```bash
curl -XPOST http://localhost:9200/index/type/_mapping -d'
{
    "properties": {
        "content": {
            "type": "text",
            "analyzer": "jieba",
            "search_analyzer": "jieba"
        }
    }
}'
```


Using the Tokenizer directly
=======
You can use `com.github.hongfuli.jieba.Tokenizer` to segment text directly; its method parameters are identical to [jieba python](https://github.com/fxsjy/jieba).

```java
import com.github.hongfuli.jieba.Tokenizer;

Tokenizer t = new Tokenizer();
t.cut("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", false, true);
```

Integrating with Lucene
=======

```java
import com.github.hongfuli.jieba.lucene.JiebaAnalyzer;

Analyzer analyzer = new JiebaAnalyzer();
try (TokenStream ts = analyzer.tokenStream("field", "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")) {
    StringBuilder b = new StringBuilder();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
    assertNotNull(offsetAtt);
    ts.reset();
    int pos = -1;
    while (ts.incrementToken()) {
        pos += posIncAtt.getPositionIncrement();
        b.append(termAtt);
        b.append(" at pos=");
        b.append(pos);
        if (posLengthAtt != null) {
            b.append(" to pos=");
            b.append(pos + posLengthAtt.getPositionLength());
        }
        b.append(" offsets=");
        b.append(offsetAtt.startOffset());
        b.append('-');
        b.append(offsetAtt.endOffset());
        b.append('\n');
    }
    ts.end();
    return b.toString();
}
```
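
A custom dictionary can also be loaded through the same Java API. A minimal sketch — the word `洒金皮` and the sample sentence come from this repository's test data (`userdict.txt`, `TokenizerTest`), while the dictionary path and class name are placeholders; the file uses jieba's one-entry-per-line `word [freq] [tag]` format:

```java
import com.github.hongfuli.jieba.Tokenizer;

import java.io.FileInputStream;
import java.io.IOException;

public class UserDictDemo {
    public static void main(String[] args) throws IOException {
        Tokenizer t = new Tokenizer();
        // before loading: the built-in dictionary does not know "洒金皮",
        // so the word falls apart into single characters
        System.out.println(t.cut("这个洒金皮的和田玉我很喜欢呢", false, false));

        // "/path/to/userdict.txt" is a placeholder for your own dictionary file
        t.loadUserDict(new FileInputStream("/path/to/userdict.txt"));

        // after loading: "洒金皮" survives as a single token
        System.out.println(t.cut("这个洒金皮的和田玉我很喜欢呢", false, false));
    }
}
```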
--------------------------------------------------------------------------------
/config/jieba_config.properties:
--------------------------------------------------------------------------------
# path to a custom user dictionary file
#user_dict=/home/user_dict.txt

--------------------------------------------------------------------------------
/config/plugin-descriptor.properties:
--------------------------------------------------------------------------------
name=${elasticsearch.plugin.name}
description=${project.description}
version=${project.version}
jvm=${elasticsearch.plugin.jvm}
classname=${elasticsearch.plugin.classname}
java.version=${maven.compile.target}
elasticsearch.version=${elasticsearch.version}

--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.github.hongfuli</groupId>
    <artifactId>elasticsearch-analysis-jieba</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>
    <description>Jieba analyzer for ElasticSearch</description>

    <name>elasticsearch-analysis-jieba</name>
    <url>http://maven.apache.org</url>

    <properties>
        <elasticsearch.version>5.4.1</elasticsearch.version>
        <maven.compile.target>1.8</maven.compile.target>
        <elasticsearch.plugin.name>analysis-jieba</elasticsearch.plugin.name>
        <elasticsearch.plugin.classname>com.github.hongfuli.jieba.elasticsearch.JiebaAnalysisPlugin</elasticsearch.plugin.classname>
        <elasticsearch.plugin.jvm>true</elasticsearch.plugin.jvm>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <skipTests>true</skipTests>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch</artifactId>
            <version>5.4.1</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.10</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-test-framework</artifactId>
            <version>6.5.1</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>2.8.2</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5.1</version>
                <configuration>
                    <source>${maven.compile.target}</source>
                    <target>${maven.compile.target}</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.11</version>
                <configuration>
                    <skipTests>true</skipTests>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <appendAssemblyId>false</appendAssemblyId>
                    <outputDirectory>${project.build.directory}/release</outputDirectory>
                    <descriptors>
                        <descriptor>${basedir}/src/main/assemblies/plugin.xml</descriptor>
                    </descriptors>
                    <archive>
                        <manifest>
                            <mainClass>fully.qualified.MainClass</mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

--------------------------------------------------------------------------------
/src/main/assemblies/plugin.xml:
--------------------------------------------------------------------------------
<?xml version="1.0"?>
<assembly>
    <id>analysis-jieba-release</id>
    <formats>
        <format>zip</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <fileSets>
        <fileSet>
            <directory>${project.basedir}/config</directory>
            <outputDirectory>/</outputDirectory>
            <filtered>true</filtered>
        </fileSet>
    </fileSets>
    <dependencySets>
        <dependencySet>
            <outputDirectory>/</outputDirectory>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <excludes>
                <exclude>org.elasticsearch:elasticsearch</exclude>
                <exclude>org.apache.logging.log4j:log4j-core</exclude>
            </excludes>
        </dependencySet>
    </dependencySets>
</assembly>
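
Before the sources, a sketch of using the analyzer against a plain Lucene index, outside Elasticsearch. This is not part of the repository: it assumes only the public `JiebaAnalyzer` class plus the Lucene 6.x core API that the `org.elasticsearch:elasticsearch` dependency pulls in transitively; the field name, sample text, and class name are illustrative.

```java
import com.github.hongfuli.jieba.lucene.JiebaAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.RAMDirectory;

public class LuceneIntegrationSketch {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new JiebaAnalyzer();
        RAMDirectory dir = new RAMDirectory(); // in-memory index, demo only
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
            Document doc = new Document();
            doc.add(new Field("content", "我爱北京天安门", TextField.TYPE_STORED));
            writer.addDocument(doc);
        }
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // should match if the analyzer emits "北京" as a term (it is in dict.txt)
            TopDocs hits = searcher.search(new TermQuery(new Term("content", "北京")), 10);
            System.out.println("hits: " + hits.totalHits);
        }
    }
}
```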
--------------------------------------------------------------------------------
/src/main/java/com/github/hongfuli/jieba/FinalSeg.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba;

import com.github.hongfuli.utils.MtyStringUtils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by lihongfu on 17/6/8.
 */
public class FinalSeg {
    private static final Double MIN_FLOAT = -3.14e100;
    private Map<Character, Map<Character, Double>> emitP;

    private static final String DEFAULT_EMIT_FILE = "/finalseg_prob_emit.txt";

    private static final Set<Character> STATES = new HashSet<Character>();
    private static final Map<Character, Double> PROB_START = new HashMap<Character, Double>();
    private static final Map<Character, Map<Character, Double>> PROB_TRANS = new HashMap<Character, Map<Character, Double>>();
    private static final Map<Character, Character[]> PREV_STATES = new HashMap<Character, Character[]>();

    private static final Pattern RE_HAN = Pattern.compile("([\\u4E00-\\u9FD5]+)");
    private static final Pattern RE_SKIP = Pattern.compile("(\\d+\\.\\d+|[a-zA-Z0-9]+)");

    static {
        STATES.add('B');
        STATES.add('E');
        STATES.add('M');
        STATES.add('S');

        PROB_START.put('B', -0.26268660809250016);
        PROB_START.put('E', -3.14e+100);
        PROB_START.put('M', -3.14e+100);
        PROB_START.put('S', -1.4652633398537678);

        PROB_TRANS.put('B', new HashMap<Character, Double>() {{
            put('E', -0.510825623765990);
            put('M', -0.916290731874155);
        }});
        PROB_TRANS.put('E', new HashMap<Character, Double>() {{
            put('B', -0.5897149736854513);
            put('S', -0.8085250474669937);
        }});
        PROB_TRANS.put('M', new HashMap<Character, Double>() {{
            put('E', -0.33344856811948514);
            put('M', -1.2603623820268226);
        }});
        PROB_TRANS.put('S', new HashMap<Character, Double>() {{
            put('B', -0.7211965654669841);
            put('S', -0.6658631448798212);
        }});

        PREV_STATES.put('B', new Character[]{'E', 'S'});
        PREV_STATES.put('M', new Character[]{'M', 'B'});
        PREV_STATES.put('S', new Character[]{'E', 'S'});
        PREV_STATES.put('E', new Character[]{'B', 'M'});
    }

    public FinalSeg() throws IOException {
        this(DEFAULT_EMIT_FILE);
    }

    public FinalSeg(String emitFileName) throws IOException {
        this.loadEmitP(emitFileName);
    }

    protected void loadEmitP(String emitPFileName) throws IOException {
        emitP = new HashMap<Character, Map<Character, Double>>();
        for (Character s : STATES) {
            emitP.put(s, new HashMap<Character, Double>(10000));
        }

        Pattern wordPattern = Pattern.compile("'\\\\u(.*?)': (.*?),");

        BufferedReader reader = new BufferedReader(new InputStreamReader(Tokenizer.class.getResourceAsStream(emitPFileName)));
        String line;
        Character currentType = null;
        while ((line = reader.readLine()) != null) {
            line = line.trim();
            if (line.isEmpty())
                continue;

            if (line.length() == 1 && STATES.contains(line.charAt(0))) {
                currentType = line.charAt(0);
                continue;
            } else {
                if (currentType == null) {
                    throw new IllegalStateException("emit probability data must follow one of the BEMS state characters");
                }
            }

            Map<Character, Double> stateP = emitP.get(currentType);
            Matcher matcher = wordPattern.matcher(line);
            if (matcher.find()) {
                String word = matcher.group(1);
                Double p = Double.valueOf(matcher.group(2));
                stateP.put((char) Integer.parseInt(word, 16), p);
            }
        }
    }
    private String viterbi(String obs, Set<Character> states, Map<Character, Double> startP,
                           Map<Character, Map<Character, Double>> transP,
                           Map<Character, Map<Character, Double>> emitP) {
        List<Map<Character, Double>> V = new ArrayList<Map<Character, Double>>(obs.length());
        Map<Character, Double> first = new HashMap<Character, Double>();
        V.add(first);
        Map<Character, String> path = new HashMap<Character, String>();
        for (Character y : states) {
            first.put(y, startP.get(y) + emitP.get(y).getOrDefault(obs.charAt(0), MIN_FLOAT));
            path.put(y, String.valueOf(y));
        }

        for (int i = 1; i < obs.length(); i++) {
            Map<Character, Double> v = new HashMap<Character, Double>();
            V.add(v);

            Map<Character, String> newPath = new HashMap<Character, String>();
            for (Character y : states) {
                double emP = emitP.get(y).getOrDefault(obs.charAt(i), MIN_FLOAT);
                double maxProb = Double.NEGATIVE_INFINITY;
                Character bestY = null;
                for (Character y0 : PREV_STATES.get(y)) {
                    double emP0 = V.get(i - 1).get(y0) + transP.get(y0).getOrDefault(y, MIN_FLOAT) + emP;
                    if (emP0 > maxProb) {
                        maxProb = emP0;
                        bestY = y0;
                    }
                }
                V.get(i).put(y, maxProb);
                newPath.put(y, path.get(bestY) + String.valueOf(y));
            }
            path = newPath;
        }

        Double maxD = null;
        Character finalState = null;
        for (Character y : new Character[]{'E', 'S'}) {
            Double d = V.get(obs.length() - 1).get(y);
            if (maxD == null || d > maxD) {
                maxD = d;
                finalState = y;
            }
        }

        return path.get(finalState);
    }

    private List<String> innerCut(String sentence) {
        List<String> result = new ArrayList<String>();
        String posList = viterbi(sentence, STATES, PROB_START, PROB_TRANS, emitP);
        int begin = 0, nextI = 0;
        for (int i = 0; i < sentence.length(); i++) {
            char ch = sentence.charAt(i);
            char pos = posList.charAt(i);
            if (pos == 'B') {
                begin = i;
            } else if (pos == 'E') {
                result.add(sentence.substring(begin, i + 1));
                nextI = i + 1;
            } else if (pos == 'S') {
                result.add(String.valueOf(ch));
                nextI = i + 1;
            }
        }

        if (nextI < sentence.length()) {
            result.add(sentence.substring(nextI));
        }
        return result;
    }

    public List<String> cut(String sentence) {
        List<String> blocks = MtyStringUtils.splitAndReturnDelimiters(RE_HAN, sentence);
        List<String> result = new ArrayList<String>();
        for (String blk : blocks) {
            if (RE_HAN.matcher(blk).matches()) {
                result.addAll(innerCut(blk));
            } else {
                List<String> tmp = MtyStringUtils.splitAndReturnDelimiters(RE_SKIP, blk);
                for (String x : tmp) {
                    if (!x.isEmpty()) {
                        result.add(x);
                    }
                }
            }
        }
        return result;
    }

    public Map<Character, Map<Character, Double>> getEmitP() {
        return emitP;
    }
}
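
A usage sketch for the class above: `FinalSeg` is the HMM fallback that `Tokenizer` applies to runs of characters not found in the dictionary. Each character receives a hidden state — B(egin), M(iddle), E(nd), S(ingle) — via the Viterbi routine over the bundled start/transition/emission log-probabilities, and the string is cut after every E or S. The class name below is hypothetical and the printed segmentation depends on the bundled model, so it is indicative only:

```java
import com.github.hongfuli.jieba.FinalSeg;

import java.io.IOException;
import java.util.List;

public class FinalSegSketch {
    public static void main(String[] args) throws IOException {
        // loads the bundled /finalseg_prob_emit.txt from the classpath
        FinalSeg seg = new FinalSeg();

        // Han runs go through Viterbi; digit and Latin runs are passed
        // through via RE_SKIP without HMM labelling
        List<String> words = seg.cut("你是喜欢Python还是Java呢");
        System.out.println(words); // e.g. [你, 是, 喜欢, Python, 还是, Java, 呢]
    }
}
```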
--------------------------------------------------------------------------------
/src/main/java/com/github/hongfuli/jieba/Token.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba;

/**
 * Created by lihongfu on 17/6/13.
 */
public class Token {
    public String value;
    public int startPos;
    public int endPos;

    public Token(String value, int startPos, int endPos) {
        this.value = value;
        this.startPos = startPos;
        this.endPos = endPos;
    }

    @Override
    public String toString() {
        return "token: value = " + value + "; startPos = " + startPos + "; endPos = " + endPos;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/hongfuli/jieba/Tokenizer.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba;

import com.github.hongfuli.utils.MtyStringUtils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;

/**
 * Created by lihongfu on 17/5/31.
 */
public class Tokenizer {

    private Map<String, Integer> freq = new HashMap<String, Integer>(349050);
    private FinalSeg finalSeg;
    private long total;

    private static final String DEFAULT_DICT_FILE_NAME = "/dict.txt";

    private static final Pattern RE_HAN_DEFAULT = Pattern.compile("([\\u4E00-\\u9FD5a-zA-Z0-9+#&\\._]+)");
    private static final Pattern RE_SKIP_DEFAULT = Pattern.compile("(\\r\\n|\\s)");
    private static final Pattern RE_HAN_CUT_ALL = Pattern.compile("([\\u4E00-\\u9FD5]+)");
    private static final Pattern RE_SKIP_HAN_CUT_ALL = Pattern.compile("[^a-zA-Z0-9+#\\n]");
    private static final Pattern RE_ENG = Pattern.compile("[a-zA-Z0-9]");

    private static final Pattern RE_USERDICT = Pattern.compile("^(.+?)( [0-9]+)?( [a-z]+)?$");

    public Tokenizer() {
        try {
            initialize();
        } catch (IOException e) {
            // fail fast: a Tokenizer without its dictionary is unusable
            throw new RuntimeException("failed to load the built-in dictionary", e);
        }
    }

    private void initialize() throws IOException {
        this.genPfDict();
        this.finalSeg = new FinalSeg();
    }

    private void genPfDict() throws IOException {
        // dict.txt contains CJK text, so read it explicitly as UTF-8 instead of
        // relying on the platform default charset
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                Tokenizer.class.getResourceAsStream(DEFAULT_DICT_FILE_NAME), Charset.forName("UTF-8")));
        String line;
        while ((line = reader.readLine()) != null) {
            line = line.trim();
            if (line.isEmpty())
                continue;
            String[] wordFreqs = line.split(" ");
            String word = wordFreqs[0];
            int freq = Integer.parseInt(wordFreqs[1]);
            this.freq.put(word, freq);
            this.total += freq;
            // register every prefix of the word with frequency 0 so that the DAG
            // construction can stop scanning as soon as a prefix is unknown
            for (int i = 1; i <= word.length(); i++) {
                String wfrag = word.substring(0, i);
                if (!this.freq.containsKey(wfrag)) {
                    this.freq.put(wfrag, 0);
                }
            }
        }
    }
    private Map<Integer, List<Integer>> getDAG(String sentence) {
        Map<Integer, List<Integer>> DAG = new HashMap<Integer, List<Integer>>();
        int N = sentence.length();
        for (int k = 0; k < N; k++) {
            List<Integer> tmpList = new ArrayList<Integer>();
            int i = k;
            String frag = sentence.substring(k, k + 1);
            while (i < N && this.freq.containsKey(frag)) {
                if (this.freq.get(frag) > 0) {
                    tmpList.add(i);
                }
                i += 1;
                if (i < N) {
                    frag = sentence.substring(k, i + 1);
                }
            }
            if (tmpList.isEmpty()) {
                tmpList.add(k);
            }
            DAG.put(k, tmpList);
        }
        return DAG;
    }

    private static class Pair<E, F> {
        private E first;
        private F second;

        private Pair(E first, F second) {
            this.first = first;
            this.second = second;
        }

        public static <E, F> Pair<E, F> newPair(E first, F second) {
            return new Pair<E, F>(first, second);
        }

        public E getFirst() {
            return first;
        }

        public F getSecond() {
            return second;
        }

        @Override
        public String toString() {
            return "pair value: " + first + " , " + second;
        }
    }

    private void calc(String sentence, Map<Integer, List<Integer>> DAG, Map<Integer, Pair<Double, Integer>> route) {
        int N = sentence.length();
        route.put(N, Pair.newPair(0.0, 0));
        double logTotal = Math.log(this.total);
        for (int idx = N - 1; idx >= 0; idx--) {
            double maxFreq = -Double.MAX_VALUE;
            int maxIdx = idx;
            for (int x : DAG.get(idx)) {
                Integer freq = this.freq.get(sentence.substring(idx, x + 1));
                double logFreq = Math.log(freq == null || freq == 0 ? 1 : freq) - logTotal + route.get(x + 1).getFirst();
                if (logFreq > maxFreq) {
                    maxIdx = x;
                    maxFreq = logFreq;
                }
            }
            route.put(idx, Pair.newPair(maxFreq, maxIdx));
        }
    }

    public List<String> cut(String sentence, boolean cut_all, boolean HMM) {
        Pattern reHan, reSkip;
        if (cut_all) {
            reHan = RE_HAN_CUT_ALL;
            reSkip = RE_SKIP_HAN_CUT_ALL;
        } else {
            reHan = RE_HAN_DEFAULT;
            reSkip = RE_SKIP_DEFAULT;
        }
        CutStrategy cs;
        if (cut_all) {
            cs = new CutAllStrategy();
        } else if (HMM) {
            cs = new CutDAGStrategy();
        } else {
            cs = new CutDAGNoHMMStrategy();
        }
        List<String> blocks = MtyStringUtils.splitAndReturnDelimiters(reHan, sentence);
        List<String> tokens = new ArrayList<String>();
        for (String blk : blocks) {
            if (blk.isEmpty()) {
                continue;
            }
            if (reHan.matcher(blk).matches()) {
                for (String word : cs.cut(blk)) {
                    tokens.add(word);
                }
            } else {
                // In the default mode the skip pattern has a capturing group, so keep
                // the delimiters (mirrors Python jieba's re.split); Pattern.split would
                // silently drop whitespace tokens and shift the offsets computed in
                // tokenize(). In cut_all mode the delimiters are discarded, as in jieba.
                List<String> pieces = cut_all
                        ? Arrays.asList(reSkip.split(blk))
                        : MtyStringUtils.splitAndReturnDelimiters(reSkip, blk);
                for (String x : pieces) {
                    if (x.isEmpty()) {
                        continue;
                    }
                    if (reSkip.matcher(x).matches()) {
                        tokens.add(x);
                    } else if (!cut_all) {
                        // split a non-Han, non-whitespace run into single characters
                        for (String c : x.split("(?!^)")) {
                            tokens.add(c);
                        }
                    } else {
                        tokens.add(x);
                    }
                }
            }
        }
        return tokens;
    }

    public List<String> cutForSearch(String sentence, boolean HMM) {
        List<String> frags = this.cut(sentence, false, HMM);
        List<String> result = new ArrayList<String>();
        for (String w : frags) {
            if (w.length() > 2) {
                for (int i = 0; i < w.length() - 1; i++) {
                    String gram2 = w.substring(i, i + 2);
                    if (freq.getOrDefault(gram2, 0) > 0) {
                        result.add(gram2);
                    }
                }
            }
            if (w.length() > 3) {
                for (int i = 0; i < w.length() - 2; i++) {
                    String gram3 = w.substring(i, i + 3);
                    if (freq.getOrDefault(gram3, 0) > 0) {
                        result.add(gram3);
                    }
                }
            }
            result.add(w);
        }
        return result;
    }

    public List<String> cutForSearch(String sentence) {
        return cutForSearch(sentence, true);
    }
    public List<Token> tokenize(String sentence, boolean forSearch, boolean HMM) {
        List<Token> tokens = new ArrayList<Token>();
        int start = 0;
        if (forSearch) {
            for (String w : cut(sentence, false, HMM)) {
                int width = w.length();
                if (w.length() > 2) {
                    for (int i = 0; i < w.length() - 1; i++) {
                        String gram2 = w.substring(i, i + 2);
                        if (freq.getOrDefault(gram2, 0) > 0) {
                            tokens.add(new Token(gram2, start + i, start + i + 2));
                        }
                    }
                }
                if (w.length() > 3) {
                    for (int i = 0; i < w.length() - 2; i++) {
                        String gram3 = w.substring(i, i + 3);
                        if (freq.getOrDefault(gram3, 0) > 0) {
                            tokens.add(new Token(gram3, start + i, start + i + 3));
                        }
                    }
                }
                tokens.add(new Token(w, start, start + width));
                start += width;
            }
        } else {
            for (String w : cut(sentence, false, HMM)) {
                tokens.add(new Token(w, start, start + w.length()));
                start += w.length();
            }
        }

        return tokens;
    }

    private interface CutStrategy {
        List<String> cut(String sentence);
    }

    private class CutAllStrategy implements CutStrategy {

        public List<String> cut(String sentence) {
            List<String> frags = new ArrayList<String>();
            Map<Integer, List<Integer>> dag = Tokenizer.this.getDAG(sentence);
            int old_j = -1;
            // iterate positions in ascending order; HashMap key order is not guaranteed
            for (int k = 0; k < sentence.length(); k++) {
                List<Integer> L = dag.get(k);
                if (L.size() == 1 && k > old_j) {
                    frags.add(sentence.substring(k, L.get(0) + 1));
                    old_j = L.get(0);
                } else {
                    for (int j : L) {
                        if (j > k) {
                            frags.add(sentence.substring(k, j + 1));
                            old_j = j;
                        }
                    }
                }
            }
            return frags;
        }
    }

    private class CutDAGStrategy implements CutStrategy {

        public List<String> cut(String sentence) {
            List<String> frags = new ArrayList<String>();
            Map<Integer, List<Integer>> dag = Tokenizer.this.getDAG(sentence);
            Map<Integer, Pair<Double, Integer>> route = new HashMap<Integer, Pair<Double, Integer>>();
            Tokenizer.this.calc(sentence, dag, route);
            int x = 0;
            int N = sentence.length();
            StringBuffer buf = new StringBuffer();
            while (x < N) {
                int y = route.get(x).getSecond() + 1;
                String lWord = sentence.substring(x, y);
                if (y - x == 1) {
                    // accumulate consecutive single characters; they may form an
                    // out-of-vocabulary word recognizable by the HMM
                    buf.append(lWord);
                } else {
                    if (buf.length() > 0) {
                        if (buf.length() == 1) {
                            frags.add(buf.toString());
                            buf.setLength(0);
                        } else {
                            if (freq.get(buf.toString()) == null || freq.get(buf.toString()) == 0) {
                                List<String> recognized = finalSeg.cut(buf.toString());
                                frags.addAll(recognized);
                            } else {
                                for (Character elem : buf.toString().toCharArray()) {
                                    frags.add(String.valueOf(elem));
                                }
                            }
                            buf.setLength(0);
                        }
                    }
                    frags.add(lWord);
                }
                x = y;
            }

            if (buf.length() > 0) {
                if (buf.length() == 1) {
                    frags.add(buf.toString());
                } else if (freq.get(buf.toString()) == null || freq.get(buf.toString()) == 0) {
                    List<String> recognized = finalSeg.cut(buf.toString());
                    frags.addAll(recognized);
                } else {
                    for (Character elem : buf.toString().toCharArray()) {
                        frags.add(String.valueOf(elem));
                    }
                }
            }

            return frags;
        }
    }
    private class CutDAGNoHMMStrategy implements CutStrategy {

        public List<String> cut(String sentence) {
            List<String> frags = new ArrayList<String>();
            Map<Integer, List<Integer>> dag = Tokenizer.this.getDAG(sentence);
            Map<Integer, Pair<Double, Integer>> route = new HashMap<Integer, Pair<Double, Integer>>();
            Tokenizer.this.calc(sentence, dag, route);
            int x = 0;
            int N = sentence.length();
            StringBuffer buf = new StringBuffer();
            while (x < N) {
                int y = route.get(x).getSecond() + 1;
                String lWord = sentence.substring(x, y);
                if (RE_ENG.matcher(lWord).matches() && lWord.length() == 1) {
                    // merge consecutive single Latin letters/digits into one token
                    buf.append(lWord);
                    x = y;
                } else {
                    if (buf.length() > 0) {
                        frags.add(buf.toString());
                        buf.setLength(0);
                    }
                    frags.add(lWord);
                    x = y;
                }
            }

            if (buf.length() > 0) {
                frags.add(buf.toString());
            }

            return frags;
        }
    }

    private void loadUserDict(Stream<String> stream) throws IOException {
        try {
            stream.forEach(line -> {
                Matcher matcher = RE_USERDICT.matcher(line.trim());
                if (matcher.find()) {
                    String word = matcher.group(1).trim();
                    String freqStr = matcher.group(2);
                    int freq = 1;
                    if (freqStr != null) {
                        freq = Integer.parseInt(freqStr.trim());
                    } else {
                        // no explicit frequency given: suggest one from the current model,
                        // as jieba's suggest_freq does
                        double df = 1.;
                        for (String seg : this.cut(word, false, false)) {
                            // floating-point division; integer division would always truncate to 0
                            df *= this.freq.getOrDefault(seg, 1) / (double) total;
                        }
                        freq = Math.max((int) (df * total) + 1, this.freq.getOrDefault(word, 1));
                    }

                    this.freq.put(word, freq);
                    this.total += freq;

                    for (int i = 1; i <= word.length(); i++) {
                        String wfrag = word.substring(0, i);
                        if (!this.freq.containsKey(wfrag)) {
                            this.freq.put(wfrag, 0);
                        }
                    }
                }
            });
        } finally {
            stream.close();
        }
    }

    public void loadUserDict(InputStream in) throws IOException {
        this.loadUserDict(new BufferedReader(new InputStreamReader(in, Charset.forName("UTF-8"))).lines());
    }

    private void loadUserDict(Path path) throws IOException {
        this.loadUserDict(Files.lines(path, Charset.forName("UTF-8")));
    }

    public Map<String, Integer> getFreq() {
        return freq;
    }

    public long getTotal() {
        return total;
    }
}
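
To make the three strategies above concrete, a small sketch comparing full mode (`CutAllStrategy`), dictionary-only mode (`CutDAGNoHMMStrategy`), and the default DAG + HMM mode (`CutDAGStrategy`), plus the offset-carrying `tokenize` variant that the Lucene layer consumes. The class name is hypothetical, and the exact segmentations depend on the bundled dictionary; the authoritative expectations live in `TokenizerTest` below:

```java
import com.github.hongfuli.jieba.Token;
import com.github.hongfuli.jieba.Tokenizer;

public class CutModesSketch {
    public static void main(String[] args) {
        Tokenizer t = new Tokenizer();
        String s = "我爱北京天安门";

        System.out.println(t.cut(s, true, false));   // full mode: every dictionary word in the DAG
        System.out.println(t.cut(s, false, false));  // precise mode, dictionary only (no HMM)
        System.out.println(t.cut(s, false, true));   // precise mode + HMM for out-of-vocabulary runs

        // tokenize() additionally reports [startPos, endPos) character offsets,
        // which JiebaTokenizer turns into Lucene OffsetAttribute values
        for (Token token : t.tokenize(s, false, true)) {
            System.out.println(token);
        }
    }
}
```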
--------------------------------------------------------------------------------
/src/main/java/com/github/hongfuli/jieba/elasticsearch/JiebaAnalysisPlugin.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba.elasticsearch;

import org.apache.lucene.analysis.Analyzer;
import org.elasticsearch.index.analysis.AnalyzerProvider;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

import java.util.HashMap;
import java.util.Map;

/**
 * Created by lihongfu on 17/6/23.
 */
public class JiebaAnalysisPlugin extends Plugin implements AnalysisPlugin {

    @Override
    public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
        Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> map = new HashMap<>();
        // registers the analyzer under the name used in mappings: "jieba"
        map.put("jieba", JiebaAnalyzerProvider::new);
        return map;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/hongfuli/jieba/elasticsearch/JiebaAnalyzerProvider.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba.elasticsearch;

import com.github.hongfuli.jieba.lucene.JiebaAnalyzer;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.common.io.PathUtils;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Properties;

/**
 * Created by lihongfu on 17/6/23.
 */
public class JiebaAnalyzerProvider extends AbstractIndexAnalyzerProvider<JiebaAnalyzer> {
    private final JiebaAnalyzer analyzer;
    private static final Logger logger = LogManager.getLogger(JiebaAnalyzerProvider.class);

    public JiebaAnalyzerProvider(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
        analyzer = new JiebaAnalyzer();

        logger.info("load jieba_config.properties");
        // resolve jieba_config.properties relative to the directory the plugin jar lives in
        Path configPath = PathUtils.get(new File(JiebaAnalyzer.class.getProtectionDomain().getCodeSource().getLocation().getPath()).getParent()).toAbsolutePath().resolve("jieba_config.properties");
        Properties props = new Properties();
        try {
            props.load(Files.newInputStream(configPath));
        } catch (IOException e) {
            throw new RuntimeException("cannot load jieba_config.properties", e);
        }

        String userDictPath = props.getProperty("user_dict");
        if (userDictPath != null && !userDictPath.trim().isEmpty()) {
            try {
                logger.info("load user dict from file: " + userDictPath);
                analyzer.setUserDictIn(new FileInputStream(userDictPath));
            } catch (FileNotFoundException e) {
                throw new IllegalArgumentException("user_dict file cannot be loaded: " + userDictPath, e);
            }
        }
    }

    @Override
    public JiebaAnalyzer get() {
        return this.analyzer;
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/github/hongfuli/jieba/lucene/JiebaAnalyzer.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba.lucene;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;

import java.io.IOException;
import java.io.InputStream;

/**
 * Created by lihongfu on 17/6/19.
 */
public final class JiebaAnalyzer extends Analyzer {

    private InputStream userDictIn;

    public JiebaAnalyzer() {
    }

    public JiebaAnalyzer(InputStream userDictIn) {
        setUserDictIn(userDictIn);
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        JiebaTokenizer tokenizer = new JiebaTokenizer();
        if (userDictIn != null) {
            try {
                // note: the stream is consumed by the first TokenStreamComponents created
                tokenizer.loadUserDict(userDictIn);
            } catch (IOException e) {
                throw new RuntimeException("load user dict error", e);
            }
        }
        TokenFilter stopFilter = new JiebaStopTokenFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, stopFilter);
    }

    public void setUserDictIn(InputStream userDictIn) {
        if (userDictIn == null) {
            throw new IllegalArgumentException("userDictIn is null");
        }
        this.userDictIn = userDictIn;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/hongfuli/jieba/lucene/JiebaStopTokenFilter.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba.lucene;

import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.util.regex.Pattern;

/**
 * Created by lihongfu on 17/6/23.
 */
public class JiebaStopTokenFilter extends FilteringTokenFilter {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    // matches a single Unicode "word" character (letters, digits, CJK ideographs, ...)
    private static final Pattern WORD_CHAR = Pattern.compile("\\w", Pattern.UNICODE_CHARACTER_CLASS);

    public JiebaStopTokenFilter(TokenStream in) {
        super(in);
    }

    @Override
    protected boolean accept() throws IOException {
        String term = termAtt.toString();
        if (term.length() > 1) {
            return true;
        } else {
            // keep single characters only if they are word characters, dropping
            // the punctuation and whitespace tokens emitted by the tokenizer
            return WORD_CHAR.matcher(term).matches();
        }
    }
}
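
The predicate above keeps every multi-character token and, among single characters, only Unicode word characters — with `UNICODE_CHARACTER_CLASS`, `\w` covers CJK ideographs as well as Latin letters and digits. A standalone sketch of the same rule (the class name is hypothetical):

```java
import java.util.regex.Pattern;

public class StopPredicateSketch {
    private static final Pattern WORD_CHAR = Pattern.compile("\\w", Pattern.UNICODE_CHARACTER_CLASS);

    // same rule as JiebaStopTokenFilter#accept
    static boolean accept(String term) {
        return term.length() > 1 || WORD_CHAR.matcher(term).matches();
    }

    public static void main(String[] args) {
        System.out.println(accept("北京"));  // true: multi-character token
        System.out.println(accept("我"));    // true: a single CJK char is a word character
        System.out.println(accept("a"));     // true
        System.out.println(accept("。"));    // false: punctuation is filtered out
        System.out.println(accept(" "));     // false: whitespace is filtered out
    }
}
```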
--------------------------------------------------------------------------------
/src/main/java/com/github/hongfuli/jieba/lucene/JiebaTokenizer.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba.lucene;

import com.github.hongfuli.jieba.Token;
import com.github.hongfuli.jieba.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;

/**
 * Created by lihongfu on 17/6/19.
 */
public class JiebaTokenizer extends org.apache.lucene.analysis.Tokenizer {
    private com.github.hongfuli.jieba.Tokenizer scanner;
    private BufferedReader bufferReader;
    private int tokenIndex;
    private List<Token> tokenBuffer;
    private int finalOffset;
    // offset of the current line within the whole input, so that token offsets
    // stay absolute when the input spans several lines
    private int lineBaseOffset;
    private int lastLineLength;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

    public JiebaTokenizer() {
        this.scanner = new Tokenizer();
    }

    @Override
    public final boolean incrementToken() throws IOException {
        if (bufferReader == null) {
            throw new IllegalStateException("must call reset() before incrementToken()");
        }
        clearAttributes();
        // loop, not if: a blank line yields an empty token buffer, which previously
        // caused an IndexOutOfBoundsException on tokenBuffer.get(0)
        while (tokenBuffer == null || tokenIndex >= tokenBuffer.size()) {
            String line = bufferReader.readLine();
            if (line == null) {
                return false;
            }
            if (tokenBuffer != null) {
                // skip past the previous line plus the newline consumed by readLine()
                lineBaseOffset += lastLineLength + 1;
            }
            lastLineLength = line.length();
            tokenBuffer = scanner.tokenize(line, true, true);
            tokenIndex = 0;
        }
        Token token = tokenBuffer.get(tokenIndex);
        termAtt.append(token.value);
        offsetAtt.setOffset(correctOffset(lineBaseOffset + token.startPos),
                correctOffset(lineBaseOffset + token.endPos));
        posIncrAtt.setPositionIncrement(1);
        tokenIndex += 1;
        finalOffset = correctOffset(lineBaseOffset + token.endPos);
        return true;
    }

    @Override
    public void end() throws IOException {
        super.end();
        offsetAtt.setOffset(finalOffset, finalOffset);
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        if (BufferedReader.class.isAssignableFrom(input.getClass())) {
            bufferReader = (BufferedReader) input;
        } else {
            bufferReader = new BufferedReader(this.input);
        }
        tokenIndex = 0;
        tokenBuffer = null;
        finalOffset = 0;
        lineBaseOffset = 0;
        lastLineLength = 0;
    }

    @Override
    public void close() throws IOException {
        super.close();
        if (bufferReader != null) {
            bufferReader.close();
            bufferReader = null;
        }
    }

    public void loadUserDict(InputStream in) throws IOException {
        if (this.scanner == null) {
            throw new IllegalStateException("tokenizer was not initialized correctly");
        }
        this.scanner.loadUserDict(in);
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/hongfuli/utils/MtyStringUtils.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.utils;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by lihongfu on 17/6/3.
 */
public class MtyStringUtils {
    /**
     * Works like {@link Pattern#split(CharSequence)}, splitting str wherever the
     * pattern matches, but the matched delimiter substrings are returned as well.
     * For example, splitting "hello123word" on (\d+) returns ["hello", "123", "word"].
     *
     * @param pattern the delimiter pattern
     * @param str     the string to split
     * @return the fragments of str, with the matched delimiters kept in order
     */
    public static List<String> splitAndReturnDelimiters(Pattern pattern, String str) {
        Matcher matcher = pattern.matcher(str);
        List<String> result = new ArrayList<String>();
        int strLen = str.length();
        int lastMatchIdx = 0;
        while (matcher.find()) {
            int start = matcher.start();
            int end = matcher.end();
            String ds = matcher.group();

            if (lastMatchIdx != start) {
                String leftS = str.substring(lastMatchIdx, start);
                result.add(leftS);
            }

            result.add(ds);

            lastMatchIdx = end;
        }

        if (lastMatchIdx < strLen) {
            result.add(str.substring(lastMatchIdx, strLen));
        }
        return result;
    }
}
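
A short demonstration of the helper above. Unlike `Pattern.split`, the matched delimiters are kept in the result, which is what lets `Tokenizer` and `FinalSeg` route Han and non-Han runs to different code paths without losing any characters (the class name is hypothetical):

```java
import com.github.hongfuli.utils.MtyStringUtils;

import java.util.regex.Pattern;

public class SplitSketch {
    public static void main(String[] args) {
        Pattern digits = Pattern.compile("(\\d+)");
        // Pattern.split would give [hello, word]; this keeps the delimiter:
        System.out.println(MtyStringUtils.splitAndReturnDelimiters(digits, "hello123word"));
        // -> [hello, 123, word]

        Pattern han = Pattern.compile("([\\u4E00-\\u9FD5]+)");
        System.out.println(MtyStringUtils.splitAndReturnDelimiters(han, "abc我是中国人bc"));
        // -> [abc, 我是中国人, bc]
    }
}
```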
--------------------------------------------------------------------------------
/src/test/java/com/github/hongfuli/jieba/FinalSegTest.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba;

import junit.framework.TestCase;

import java.io.IOException;

/**
 * Created by lihongfu on 17/6/9.
 */
public class FinalSegTest extends TestCase {

    public void testLoadEmitP() throws IOException {
        FinalSeg seg = new FinalSeg("/emit_test.txt");
        System.out.println(seg.getEmitP());
    }

    public void testLoadEmitPDefault() throws IOException {
        FinalSeg seg = new FinalSeg();
        System.out.println(seg.getEmitP().get('B').size() + seg.getEmitP().get('E').size()
                + seg.getEmitP().get('M').size() + seg.getEmitP().get('S').size());
    }

    public void testCut() throws IOException {
        FinalSeg seg = new FinalSeg();
        System.out.println(seg.cut("我最喜欢青白玉"));
        System.out.println(seg.cut("你是喜欢Python还是Java呢,我也不知道吧"));
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/github/hongfuli/jieba/TokenizerTest.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba;

import junit.framework.TestCase;

import java.io.IOException;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by lihongfu on 17/6/2.
 */
public class TokenizerTest extends TestCase {

    public void testHanPattern() {
        Pattern hanP = Pattern.compile("([\\u4E00-\\u9FD5]+)");
        String sentence = "abc我是中国人bc你好 workd";
        Matcher matcher = hanP.matcher(sentence);
        System.out.println(matcher.matches());
        for (String s : hanP.split(sentence)) {
            System.out.print(s + " / ");
        }
    }

    public void testSplit() {
        for (String x : "hek fd 133 4.def".split("((?!^))")) {
            System.out.println(x + "====");
        }
    }

    private void printResult(List<String> tokens) {
        for (String t : tokens) {
            System.out.print(t + "|");
        }
        System.out.println();
    }

    private void printTokens(List<Token> tokens) {
        for (Token token : tokens) {
            System.out.println(token);
        }
    }

    public void testCutAll() {
        Tokenizer t = new Tokenizer();
        printResult(t.cut("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", true, false));
        printResult(t.cut("我不喜欢日本和服。", true, false));
        printResult(t.cut("雷猴回归人间。", true, false));
        printResult(t.cut("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", true, false));
        printResult(t.cut("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成", true, false));
    }

    public void testCutNoHMM() {
        Tokenizer t = new Tokenizer();
        printResult(t.cut("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", false, false));
        printResult(t.cut("我不喜欢日本和服。", false, false));
        printResult(t.cut("雷猴回归人间。", false, false));
        printResult(t.cut("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", false, false));
        printResult(t.cut("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成", false, false));
    }

    public void testCutHMM() {
        Tokenizer t = new Tokenizer();
        printResult(t.cut("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", false, true));
        printResult(t.cut("我不喜欢日本和服。", false, true));
        printResult(t.cut("雷猴回归人间。", false, true));
        printResult(t.cut("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", false, true));
        printResult(t.cut("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成", false, true));
        printResult(t.cut("这个洒金皮的和田玉你喜欢吗", false, true));
    }

    public void testCutForSearch() {
        Tokenizer t = new Tokenizer();
        printResult(t.cutForSearch("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。"));
        printResult(t.cutForSearch("我不喜欢日本和服。"));
        printResult(t.cutForSearch("雷猴回归人间。"));
        printResult(t.cutForSearch("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"));
    }

    public void testTokenizer() {
        Tokenizer t = new Tokenizer();
        printTokens(t.tokenize("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", true, true));
        System.out.println("====================");
        printTokens(t.tokenize("hello world this is my first program", false, true));
        System.out.println("====================");
        printTokens(t.tokenize("hello,,world,this,is,my,first,program", false, true));
    }

    public void testLoadUserDict() throws IOException {
        Tokenizer t = new Tokenizer();
        printResult(t.cut("这个洒金皮的和田玉我很喜欢呢", false, false));
        System.out.println("====================");
        t.loadUserDict(this.getClass().getResourceAsStream("/userdict.txt"));
        printResult(t.cut("这个洒金皮的和田玉我很喜欢呢", false, false));
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/github/hongfuli/jieba/lucene/JiebaAnalyzerTest.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba.lucene;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;

import java.io.IOException;
import java.util.Random;

/**
 * Created by lihongfu on 17/6/19.
 */
public class JiebaAnalyzerTest extends BaseTokenStreamTestCase {

    public void testStandardAnalyzer() throws IOException {
        Analyzer analyzer = new JiebaAnalyzer();

        checkRandomData(new Random(0), analyzer, 1);

        System.out.println(BaseTokenStreamTestCase.toString(analyzer, "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"));
        System.out.println("==============");
        System.out.println(BaseTokenStreamTestCase.toString(analyzer, "hello world,this is my first program"));
        System.out.println("==============");
        System.out.println(BaseTokenStreamTestCase.toString(analyzer, "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。"));
    }
}

--------------------------------------------------------------------------------
/src/test/java/com/github/hongfuli/utils/MtyStringUtilsTest.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.utils;

import junit.framework.TestCase;

import java.util.List;
import java.util.regex.Pattern;

/**
 * Created by lihongfu on 17/6/5.
 */
public class MtyStringUtilsTest extends TestCase {

    public void testSplitAndReturnDelimiters() {
        Pattern hanP = Pattern.compile("([\\u4E00-\\u9FD5]+)");
        String sentence = "abc我是中国人bc你好 workd";
        List<String> strings = MtyStringUtils.splitAndReturnDelimiters(hanP, sentence);
        System.out.println(strings);

        sentence = "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。";
        strings = MtyStringUtils.splitAndReturnDelimiters(hanP, sentence);
        System.out.println(strings);

        hanP = Pattern.compile("[^a-zA-Z0-9+#\\n]");
        sentence = "C++。";
        strings = MtyStringUtils.splitAndReturnDelimiters(hanP, sentence);
        System.out.println(strings);
    }

    public void testSplitAndReturnDelimiters4continue() {
        Pattern hanP = Pattern.compile("(ab)");
        String sentence = "ababab";
        List<String> strings = MtyStringUtils.splitAndReturnDelimiters(hanP, sentence);
        System.out.println(strings);
    }
}

--------------------------------------------------------------------------------
/src/test/resources/emit_test.txt:
--------------------------------------------------------------------------------
B
'\u4e00': -3.6544978750449433,
'\u4e01': -8.125041941842026,
E
'\u4e00': -6.044987536255073,
'\u4e01': -9.075800412310807,
M
'\u4e00': -4.428158526435913,
'\u4e01': -7.932945687598502,
S
'\u2236': -15.828865681131282,
'\u4e00': -4.92368982120877,

--------------------------------------------------------------------------------
/src/test/resources/log4j2.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<!-- minimal console configuration for tests -->
<Configuration status="WARN">
    <Appenders>
        <Console name="Console" target="SYSTEM_OUT">
            <PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
        </Console>
    </Appenders>
    <Loggers>
        <Root level="error">
            <AppenderRef ref="Console"/>
        </Root>
    </Loggers>
</Configuration>

--------------------------------------------------------------------------------
/src/test/resources/userdict.txt:
--------------------------------------------------------------------------------
洒金皮
桥北中学 10

--------------------------------------------------------------------------------