├── .gitignore
├── LICENSE
├── README.md
├── config
│   ├── jieba_config.properties
│   └── plugin-descriptor.properties
├── pom.xml
└── src
    ├── main
    │   ├── assemblies
    │   │   └── plugin.xml
    │   ├── java
    │   │   └── com
    │   │       └── github
    │   │           └── hongfuli
    │   │               ├── jieba
    │   │               │   ├── FinalSeg.java
    │   │               │   ├── Token.java
    │   │               │   ├── Tokenizer.java
    │   │               │   ├── elasticsearch
    │   │               │   │   ├── JiebaAnalysisPlugin.java
    │   │               │   │   └── JiebaAnalyzerProvider.java
    │   │               │   └── lucene
    │   │               │       ├── JiebaAnalyzer.java
    │   │               │       ├── JiebaStopTokenFilter.java
    │   │               │       └── JiebaTokenizer.java
    │   │               └── utils
    │   │                   └── MtyStringUtils.java
    │   └── resources
    │       ├── dict.txt
    │       └── finalseg_prob_emit.txt
    └── test
        ├── java
        │   └── com
        │       └── github
        │           └── hongfuli
        │               ├── jieba
        │               │   ├── FinalSegTest.java
        │               │   ├── TokenizerTest.java
        │               │   └── lucene
        │               │       └── JiebaAnalyzerTest.java
        │               └── utils
        │                   └── MtyStringUtilsTest.java
        └── resources
            ├── emit_test.txt
            ├── log4j2.xml
            └── userdict.txt

/.gitignore:
--------------------------------------------------------------------------------
# Compiled class file
*.class

# Log file
*.log

# BlueJ files
*.ctxt

# Mobile Tools for Java (J2ME)
.mtj.tmp/

# Package Files #
*.jar
*.war
*.ear
*.zip
*.tar.gz
*.rar

# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*

.idea/
*.iml
target/
.DS_Store

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "{}"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright {yyyy} {name of copyright owner}

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
A Chinese analysis plugin for [Elasticsearch](https://www.elastic.co/products/elasticsearch) based on [jieba](https://github.com/fxsjy/jieba).

Integrating with Elasticsearch
=======

```bash
git clone git@github.com:hongfuli/elasticsearch-analysis-jieba.git
cd elasticsearch-analysis-jieba
mvn package
```

Unzip target/release/elasticsearch-analysis-jieba-{version}.zip into the plugins directory of your Elasticsearch installation, then restart Elasticsearch.

Create a mapping that uses the analyzer:
```bash
curl -XPOST http://localhost:9200/index/type/_mapping -d'
{
    "properties": {
        "content": {
            "type": "text",
            "analyzer": "jieba",
            "search_analyzer": "jieba"
        }
    }
}'
```


Using the Tokenizer directly
=======
You can use `com.github.hongfuli.jieba.Tokenizer` to segment text directly; its method parameters are identical to [jieba python](https://github.com/fxsjy/jieba).

```java
import com.github.hongfuli.jieba.Tokenizer;

Tokenizer t = new Tokenizer();
t.cut("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", false, true);
```

Integrating with Lucene
=======

```java
import com.github.hongfuli.jieba.lucene.JiebaAnalyzer;

Analyzer analyzer = new JiebaAnalyzer();
try (TokenStream ts = analyzer.tokenStream("field", "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")) {
    StringBuilder b = new StringBuilder();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
    assertNotNull(offsetAtt);
    ts.reset();
    int pos = -1;
    while (ts.incrementToken()) {
        pos += posIncAtt.getPositionIncrement();
        b.append(termAtt);
        b.append(" at pos=");
        b.append(pos);
        if (posLengthAtt != null) {
            b.append(" to pos=");
            b.append(pos + posLengthAtt.getPositionLength());
        }
        b.append(" offsets=");
        b.append(offsetAtt.startOffset());
        b.append('-');
        b.append(offsetAtt.endOffset());
        b.append('\n');
    }
    ts.end();
    return b.toString();
}
```
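
A custom dictionary can also be loaded through the same Java API. A minimal sketch — the word `洒金皮` and the sample sentence come from this repository's test data (`userdict.txt`, `TokenizerTest`), while the dictionary path and class name are placeholders; the file uses jieba's one-entry-per-line `word [freq] [tag]` format:

```java
import com.github.hongfuli.jieba.Tokenizer;

import java.io.FileInputStream;
import java.io.IOException;

public class UserDictDemo {
    public static void main(String[] args) throws IOException {
        Tokenizer t = new Tokenizer();
        // before loading: the built-in dictionary does not know "洒金皮",
        // so the word falls apart into single characters
        System.out.println(t.cut("这个洒金皮的和田玉我很喜欢呢", false, false));

        // "/path/to/userdict.txt" is a placeholder for your own dictionary file
        t.loadUserDict(new FileInputStream("/path/to/userdict.txt"));

        // after loading: "洒金皮" survives as a single token
        System.out.println(t.cut("这个洒金皮的和田玉我很喜欢呢", false, false));
    }
}
```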
--------------------------------------------------------------------------------
/config/jieba_config.properties:
--------------------------------------------------------------------------------
# path to a custom user dictionary file
#user_dict=/home/user_dict.txt

--------------------------------------------------------------------------------
/config/plugin-descriptor.properties:
--------------------------------------------------------------------------------
name=${elasticsearch.plugin.name}
description=${project.description}
version=${project.version}
jvm=${elasticsearch.plugin.jvm}
classname=${elasticsearch.plugin.classname}
java.version=${maven.compile.target}
elasticsearch.version=${elasticsearch.version}

--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.github.hongfuli</groupId>
    <artifactId>elasticsearch-analysis-jieba</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>
    <description>Jieba analyzer for ElasticSearch</description>

    <name>elasticsearch-analysis-jieba</name>
    <url>http://maven.apache.org</url>

    <properties>
        <elasticsearch.version>5.4.1</elasticsearch.version>
        <maven.compile.target>1.8</maven.compile.target>
        <elasticsearch.plugin.name>analysis-jieba</elasticsearch.plugin.name>
        <elasticsearch.plugin.classname>com.github.hongfuli.jieba.elasticsearch.JiebaAnalysisPlugin</elasticsearch.plugin.classname>
        <elasticsearch.plugin.jvm>true</elasticsearch.plugin.jvm>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <skipTests>true</skipTests>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch</artifactId>
            <version>5.4.1</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.10</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-test-framework</artifactId>
            <version>6.5.1</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>2.8.2</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5.1</version>
                <configuration>
                    <source>${maven.compile.target}</source>
                    <target>${maven.compile.target}</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.11</version>
                <configuration>
                    <skipTests>true</skipTests>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <appendAssemblyId>false</appendAssemblyId>
                    <outputDirectory>${project.build.directory}/release</outputDirectory>
                    <descriptors>
                        <descriptor>${basedir}/src/main/assemblies/plugin.xml</descriptor>
                    </descriptors>
                    <archive>
                        <manifest>
                            <mainClass>fully.qualified.MainClass</mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

--------------------------------------------------------------------------------
/src/main/assemblies/plugin.xml:
--------------------------------------------------------------------------------
<?xml version="1.0"?>
<assembly>
    <id>analysis-jieba-release</id>
    <formats>
        <format>zip</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <fileSets>
        <fileSet>
            <directory>${project.basedir}/config</directory>
            <outputDirectory>/</outputDirectory>
            <filtered>true</filtered>
        </fileSet>
    </fileSets>
    <dependencySets>
        <dependencySet>
            <outputDirectory>/</outputDirectory>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <excludes>
                <exclude>org.elasticsearch:elasticsearch</exclude>
                <exclude>org.apache.logging.log4j:log4j-core</exclude>
            </excludes>
        </dependencySet>
    </dependencySets>
</assembly>
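
Before the sources, a sketch of using the analyzer against a plain Lucene index, outside Elasticsearch. This is not part of the repository: it assumes only the public `JiebaAnalyzer` class plus the Lucene 6.x core API that the `org.elasticsearch:elasticsearch` dependency pulls in transitively; the field name, sample text, and class name are illustrative.

```java
import com.github.hongfuli.jieba.lucene.JiebaAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.RAMDirectory;

public class LuceneIntegrationSketch {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new JiebaAnalyzer();
        RAMDirectory dir = new RAMDirectory(); // in-memory index, demo only
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
            Document doc = new Document();
            doc.add(new Field("content", "我爱北京天安门", TextField.TYPE_STORED));
            writer.addDocument(doc);
        }
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // should match if the analyzer emits "北京" as a term (it is in dict.txt)
            TopDocs hits = searcher.search(new TermQuery(new Term("content", "北京")), 10);
            System.out.println("hits: " + hits.totalHits);
        }
    }
}
```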
--------------------------------------------------------------------------------
/src/main/java/com/github/hongfuli/jieba/FinalSeg.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba;

import com.github.hongfuli.utils.MtyStringUtils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by lihongfu on 17/6/8.
 */
public class FinalSeg {
    private static final Double MIN_FLOAT = -3.14e100;
    private Map<Character, Map<Character, Double>> emitP;

    private static final String DEFAULT_EMIT_FILE = "/finalseg_prob_emit.txt";

    private static final Set<Character> STATES = new HashSet<Character>();
    private static final Map<Character, Double> PROB_START = new HashMap<Character, Double>();
    private static final Map<Character, Map<Character, Double>> PROB_TRANS = new HashMap<Character, Map<Character, Double>>();
    private static final Map<Character, Character[]> PREV_STATES = new HashMap<Character, Character[]>();

    private static final Pattern RE_HAN = Pattern.compile("([\\u4E00-\\u9FD5]+)");
    private static final Pattern RE_SKIP = Pattern.compile("(\\d+\\.\\d+|[a-zA-Z0-9]+)");

    static {
        STATES.add('B');
        STATES.add('E');
        STATES.add('M');
        STATES.add('S');

        PROB_START.put('B', -0.26268660809250016);
        PROB_START.put('E', -3.14e+100);
        PROB_START.put('M', -3.14e+100);
        PROB_START.put('S', -1.4652633398537678);

        PROB_TRANS.put('B', new HashMap<Character, Double>() {{
            put('E', -0.510825623765990);
            put('M', -0.916290731874155);
        }});
        PROB_TRANS.put('E', new HashMap<Character, Double>() {{
            put('B', -0.5897149736854513);
            put('S', -0.8085250474669937);
        }});
        PROB_TRANS.put('M', new HashMap<Character, Double>() {{
            put('E', -0.33344856811948514);
            put('M', -1.2603623820268226);
        }});
        PROB_TRANS.put('S', new HashMap<Character, Double>() {{
            put('B', -0.7211965654669841);
            put('S', -0.6658631448798212);
        }});

        PREV_STATES.put('B', new Character[]{'E', 'S'});
        PREV_STATES.put('M', new Character[]{'M', 'B'});
        PREV_STATES.put('S', new Character[]{'E', 'S'});
        PREV_STATES.put('E', new Character[]{'B', 'M'});
    }

    public FinalSeg() throws IOException {
        this(DEFAULT_EMIT_FILE);
    }

    public FinalSeg(String emitFileName) throws IOException {
        this.loadEmitP(emitFileName);
    }

    protected void loadEmitP(String emitPFileName) throws IOException {
        emitP = new HashMap<Character, Map<Character, Double>>();
        for (Character s : STATES) {
            emitP.put(s, new HashMap<Character, Double>(10000));
        }

        Pattern wordPattern = Pattern.compile("'\\\\u(.*?)': (.*?),");

        BufferedReader reader = new BufferedReader(new InputStreamReader(Tokenizer.class.getResourceAsStream(emitPFileName)));
        String line;
        Character currentType = null;
        while ((line = reader.readLine()) != null) {
            line = line.trim();
            if (line.isEmpty())
                continue;

            if (line.length() == 1 && STATES.contains(line.charAt(0))) {
                currentType = line.charAt(0);
                continue;
            } else {
                if (currentType == null) {
                    throw new IllegalStateException("emit probability data must follow one of the BEMS state characters");
                }
            }

            Map<Character, Double> stateP = emitP.get(currentType);
            Matcher matcher = wordPattern.matcher(line);
            if (matcher.find()) {
                String word = matcher.group(1);
                Double p = Double.valueOf(matcher.group(2));
                stateP.put((char) Integer.parseInt(word, 16), p);
            }
        }
    }
    private String viterbi(String obs, Set<Character> states, Map<Character, Double> startP,
                           Map<Character, Map<Character, Double>> transP,
                           Map<Character, Map<Character, Double>> emitP) {
        List<Map<Character, Double>> V = new ArrayList<Map<Character, Double>>(obs.length());
        Map<Character, Double> first = new HashMap<Character, Double>();
        V.add(first);
        Map<Character, String> path = new HashMap<Character, String>();
        for (Character y : states) {
            first.put(y, startP.get(y) + emitP.get(y).getOrDefault(obs.charAt(0), MIN_FLOAT));
            path.put(y, String.valueOf(y));
        }

        for (int i = 1; i < obs.length(); i++) {
            Map<Character, Double> v = new HashMap<Character, Double>();
            V.add(v);

            Map<Character, String> newPath = new HashMap<Character, String>();
            for (Character y : states) {
                double emP = emitP.get(y).getOrDefault(obs.charAt(i), MIN_FLOAT);
                double maxProb = Double.NEGATIVE_INFINITY;
                Character bestY = null;
                for (Character y0 : PREV_STATES.get(y)) {
                    double emP0 = V.get(i - 1).get(y0) + transP.get(y0).getOrDefault(y, MIN_FLOAT) + emP;
                    if (emP0 > maxProb) {
                        maxProb = emP0;
                        bestY = y0;
                    }
                }
                V.get(i).put(y, maxProb);
                newPath.put(y, path.get(bestY) + String.valueOf(y));
            }
            path = newPath;
        }

        Double maxD = null;
        Character finalState = null;
        for (Character y : new Character[]{'E', 'S'}) {
            Double d = V.get(obs.length() - 1).get(y);
            if (maxD == null || d > maxD) {
                maxD = d;
                finalState = y;
            }
        }

        return path.get(finalState);
    }

    private List<String> innerCut(String sentence) {
        List<String> result = new ArrayList<String>();
        String posList = viterbi(sentence, STATES, PROB_START, PROB_TRANS, emitP);
        int begin = 0, nextI = 0;
        for (int i = 0; i < sentence.length(); i++) {
            char ch = sentence.charAt(i);
            char pos = posList.charAt(i);
            if (pos == 'B') {
                begin = i;
            } else if (pos == 'E') {
                result.add(sentence.substring(begin, i + 1));
                nextI = i + 1;
            } else if (pos == 'S') {
                result.add(String.valueOf(ch));
                nextI = i + 1;
            }
        }

        if (nextI < sentence.length()) {
            result.add(sentence.substring(nextI));
        }
        return result;
    }

    public List<String> cut(String sentence) {
        List<String> blocks = MtyStringUtils.splitAndReturnDelimiters(RE_HAN, sentence);
        List<String> result = new ArrayList<String>();
        for (String blk : blocks) {
            if (RE_HAN.matcher(blk).matches()) {
                result.addAll(innerCut(blk));
            } else {
                List<String> tmp = MtyStringUtils.splitAndReturnDelimiters(RE_SKIP, blk);
                for (String x : tmp) {
                    if (!x.isEmpty()) {
                        result.add(x);
                    }
                }
            }
        }
        return result;
    }

    public Map<Character, Map<Character, Double>> getEmitP() {
        return emitP;
    }
}
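
A usage sketch for the class above: `FinalSeg` is the HMM fallback that `Tokenizer` applies to runs of characters not found in the dictionary. Each character receives a hidden state — B(egin), M(iddle), E(nd), S(ingle) — via the Viterbi routine over the bundled start/transition/emission log-probabilities, and the string is cut after every E or S. The class name below is hypothetical and the printed segmentation depends on the bundled model, so it is indicative only:

```java
import com.github.hongfuli.jieba.FinalSeg;

import java.io.IOException;
import java.util.List;

public class FinalSegSketch {
    public static void main(String[] args) throws IOException {
        // loads the bundled /finalseg_prob_emit.txt from the classpath
        FinalSeg seg = new FinalSeg();

        // Han runs go through Viterbi; digit and Latin runs are passed
        // through via RE_SKIP without HMM labelling
        List<String> words = seg.cut("你是喜欢Python还是Java呢");
        System.out.println(words); // e.g. [你, 是, 喜欢, Python, 还是, Java, 呢]
    }
}
```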
--------------------------------------------------------------------------------
/src/main/java/com/github/hongfuli/jieba/Token.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba;

/**
 * Created by lihongfu on 17/6/13.
 */
public class Token {
    public String value;
    public int startPos;
    public int endPos;

    public Token(String value, int startPos, int endPos) {
        this.value = value;
        this.startPos = startPos;
        this.endPos = endPos;
    }

    @Override
    public String toString() {
        return "token: value = " + value + "; startPos = " + startPos + "; endPos = " + endPos;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/hongfuli/jieba/Tokenizer.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba;

import com.github.hongfuli.utils.MtyStringUtils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;

/**
 * Created by lihongfu on 17/5/31.
 */
public class Tokenizer {

    private Map<String, Integer> freq = new HashMap<String, Integer>(349050);
    private FinalSeg finalSeg;
    private long total;

    private static final String DEFAULT_DICT_FILE_NAME = "/dict.txt";

    private static final Pattern RE_HAN_DEFAULT = Pattern.compile("([\\u4E00-\\u9FD5a-zA-Z0-9+#&\\._]+)");
    private static final Pattern RE_SKIP_DEFAULT = Pattern.compile("(\\r\\n|\\s)");
    private static final Pattern RE_HAN_CUT_ALL = Pattern.compile("([\\u4E00-\\u9FD5]+)");
    private static final Pattern RE_SKIP_HAN_CUT_ALL = Pattern.compile("[^a-zA-Z0-9+#\\n]");
    private static final Pattern RE_ENG = Pattern.compile("[a-zA-Z0-9]");

    private static final Pattern RE_USERDICT = Pattern.compile("^(.+?)( [0-9]+)?( [a-z]+)?$");

    public Tokenizer() {
        try {
            initialize();
        } catch (IOException e) {
            // fail fast: a Tokenizer without its dictionary is unusable
            throw new RuntimeException("failed to load the built-in dictionary", e);
        }
    }

    private void initialize() throws IOException {
        this.genPfDict();
        this.finalSeg = new FinalSeg();
    }

    private void genPfDict() throws IOException {
        // dict.txt contains CJK text, so read it explicitly as UTF-8 instead of
        // relying on the platform default charset
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                Tokenizer.class.getResourceAsStream(DEFAULT_DICT_FILE_NAME), Charset.forName("UTF-8")));
        String line;
        while ((line = reader.readLine()) != null) {
            line = line.trim();
            if (line.isEmpty())
                continue;
            String[] wordFreqs = line.split(" ");
            String word = wordFreqs[0];
            int freq = Integer.parseInt(wordFreqs[1]);
            this.freq.put(word, freq);
            this.total += freq;
            // register every prefix of the word with frequency 0 so that the DAG
            // construction can stop scanning as soon as a prefix is unknown
            for (int i = 1; i <= word.length(); i++) {
                String wfrag = word.substring(0, i);
                if (!this.freq.containsKey(wfrag)) {
                    this.freq.put(wfrag, 0);
                }
            }
        }
    }
    private Map<Integer, List<Integer>> getDAG(String sentence) {
        Map<Integer, List<Integer>> DAG = new HashMap<Integer, List<Integer>>();
        int N = sentence.length();
        for (int k = 0; k < N; k++) {
            List<Integer> tmpList = new ArrayList<Integer>();
            int i = k;
            String frag = sentence.substring(k, k + 1);
            while (i < N && this.freq.containsKey(frag)) {
                if (this.freq.get(frag) > 0) {
                    tmpList.add(i);
                }
                i += 1;
                if (i < N) {
                    frag = sentence.substring(k, i + 1);
                }
            }
            if (tmpList.isEmpty()) {
                tmpList.add(k);
            }
            DAG.put(k, tmpList);
        }
        return DAG;
    }

    private static class Pair<E, F> {
        private E first;
        private F second;

        private Pair(E first, F second) {
            this.first = first;
            this.second = second;
        }

        public static <E, F> Pair<E, F> newPair(E first, F second) {
            return new Pair<E, F>(first, second);
        }

        public E getFirst() {
            return first;
        }

        public F getSecond() {
            return second;
        }

        @Override
        public String toString() {
            return "pair value: " + first + " , " + second;
        }
    }

    private void calc(String sentence, Map<Integer, List<Integer>> DAG, Map<Integer, Pair<Double, Integer>> route) {
        int N = sentence.length();
        route.put(N, Pair.newPair(0.0, 0));
        double logTotal = Math.log(this.total);
        for (int idx = N - 1; idx >= 0; idx--) {
            double maxFreq = -Double.MAX_VALUE;
            int maxIdx = idx;
            for (int x : DAG.get(idx)) {
                Integer freq = this.freq.get(sentence.substring(idx, x + 1));
                double logFreq = Math.log(freq == null || freq == 0 ? 1 : freq) - logTotal + route.get(x + 1).getFirst();
                if (logFreq > maxFreq) {
                    maxIdx = x;
                    maxFreq = logFreq;
                }
            }
            route.put(idx, Pair.newPair(maxFreq, maxIdx));
        }
    }

    public List<String> cut(String sentence, boolean cut_all, boolean HMM) {
        Pattern reHan, reSkip;
        if (cut_all) {
            reHan = RE_HAN_CUT_ALL;
            reSkip = RE_SKIP_HAN_CUT_ALL;
        } else {
            reHan = RE_HAN_DEFAULT;
            reSkip = RE_SKIP_DEFAULT;
        }
        CutStrategy cs;
        if (cut_all) {
            cs = new CutAllStrategy();
        } else if (HMM) {
            cs = new CutDAGStrategy();
        } else {
            cs = new CutDAGNoHMMStrategy();
        }
        List<String> blocks = MtyStringUtils.splitAndReturnDelimiters(reHan, sentence);
        List<String> tokens = new ArrayList<String>();
        for (String blk : blocks) {
            if (blk.isEmpty()) {
                continue;
            }
            if (reHan.matcher(blk).matches()) {
                for (String word : cs.cut(blk)) {
                    tokens.add(word);
                }
            } else {
                // In the default mode the skip pattern has a capturing group, so keep
                // the delimiters (mirrors Python jieba's re.split); Pattern.split would
                // silently drop whitespace tokens and shift the offsets computed in
                // tokenize(). In cut_all mode the delimiters are discarded, as in jieba.
                List<String> pieces = cut_all
                        ? Arrays.asList(reSkip.split(blk))
                        : MtyStringUtils.splitAndReturnDelimiters(reSkip, blk);
                for (String x : pieces) {
                    if (x.isEmpty()) {
                        continue;
                    }
                    if (reSkip.matcher(x).matches()) {
                        tokens.add(x);
                    } else if (!cut_all) {
                        // split a non-Han, non-whitespace run into single characters
                        for (String c : x.split("(?!^)")) {
                            tokens.add(c);
                        }
                    } else {
                        tokens.add(x);
                    }
                }
            }
        }
        return tokens;
    }

    public List<String> cutForSearch(String sentence, boolean HMM) {
        List<String> frags = this.cut(sentence, false, HMM);
        List<String> result = new ArrayList<String>();
        for (String w : frags) {
            if (w.length() > 2) {
                for (int i = 0; i < w.length() - 1; i++) {
                    String gram2 = w.substring(i, i + 2);
                    if (freq.getOrDefault(gram2, 0) > 0) {
                        result.add(gram2);
                    }
                }
            }
            if (w.length() > 3) {
                for (int i = 0; i < w.length() - 2; i++) {
                    String gram3 = w.substring(i, i + 3);
                    if (freq.getOrDefault(gram3, 0) > 0) {
                        result.add(gram3);
                    }
                }
            }
            result.add(w);
        }
        return result;
    }

    public List<String> cutForSearch(String sentence) {
        return cutForSearch(sentence, true);
    }
    public List<Token> tokenize(String sentence, boolean forSearch, boolean HMM) {
        List<Token> tokens = new ArrayList<Token>();
        int start = 0;
        if (forSearch) {
            for (String w : cut(sentence, false, HMM)) {
                int width = w.length();
                if (w.length() > 2) {
                    for (int i = 0; i < w.length() - 1; i++) {
                        String gram2 = w.substring(i, i + 2);
                        if (freq.getOrDefault(gram2, 0) > 0) {
                            tokens.add(new Token(gram2, start + i, start + i + 2));
                        }
                    }
                }
                if (w.length() > 3) {
                    for (int i = 0; i < w.length() - 2; i++) {
                        String gram3 = w.substring(i, i + 3);
                        if (freq.getOrDefault(gram3, 0) > 0) {
                            tokens.add(new Token(gram3, start + i, start + i + 3));
                        }
                    }
                }
                tokens.add(new Token(w, start, start + width));
                start += width;
            }
        } else {
            for (String w : cut(sentence, false, HMM)) {
                tokens.add(new Token(w, start, start + w.length()));
                start += w.length();
            }
        }

        return tokens;
    }

    private interface CutStrategy {
        List<String> cut(String sentence);
    }

    private class CutAllStrategy implements CutStrategy {

        public List<String> cut(String sentence) {
            List<String> frags = new ArrayList<String>();
            Map<Integer, List<Integer>> dag = Tokenizer.this.getDAG(sentence);
            int old_j = -1;
            // iterate positions in ascending order; HashMap key order is not guaranteed
            for (int k = 0; k < sentence.length(); k++) {
                List<Integer> L = dag.get(k);
                if (L.size() == 1 && k > old_j) {
                    frags.add(sentence.substring(k, L.get(0) + 1));
                    old_j = L.get(0);
                } else {
                    for (int j : L) {
                        if (j > k) {
                            frags.add(sentence.substring(k, j + 1));
                            old_j = j;
                        }
                    }
                }
            }
            return frags;
        }
    }

    private class CutDAGStrategy implements CutStrategy {

        public List<String> cut(String sentence) {
            List<String> frags = new ArrayList<String>();
            Map<Integer, List<Integer>> dag = Tokenizer.this.getDAG(sentence);
            Map<Integer, Pair<Double, Integer>> route = new HashMap<Integer, Pair<Double, Integer>>();
            Tokenizer.this.calc(sentence, dag, route);
            int x = 0;
            int N = sentence.length();
            StringBuffer buf = new StringBuffer();
            while (x < N) {
                int y = route.get(x).getSecond() + 1;
                String lWord = sentence.substring(x, y);
                if (y - x == 1) {
                    // accumulate consecutive single characters; they may form an
                    // out-of-vocabulary word recognizable by the HMM
                    buf.append(lWord);
                } else {
                    if (buf.length() > 0) {
                        if (buf.length() == 1) {
                            frags.add(buf.toString());
                            buf.setLength(0);
                        } else {
                            if (freq.get(buf.toString()) == null || freq.get(buf.toString()) == 0) {
                                List<String> recognized = finalSeg.cut(buf.toString());
                                frags.addAll(recognized);
                            } else {
                                for (Character elem : buf.toString().toCharArray()) {
                                    frags.add(String.valueOf(elem));
                                }
                            }
                            buf.setLength(0);
                        }
                    }
                    frags.add(lWord);
                }
                x = y;
            }

            if (buf.length() > 0) {
                if (buf.length() == 1) {
                    frags.add(buf.toString());
                } else if (freq.get(buf.toString()) == null || freq.get(buf.toString()) == 0) {
                    List<String> recognized = finalSeg.cut(buf.toString());
                    frags.addAll(recognized);
                } else {
                    for (Character elem : buf.toString().toCharArray()) {
                        frags.add(String.valueOf(elem));
                    }
                }
            }

            return frags;
        }
    }
    private class CutDAGNoHMMStrategy implements CutStrategy {

        public List<String> cut(String sentence) {
            List<String> frags = new ArrayList<String>();
            Map<Integer, List<Integer>> dag = Tokenizer.this.getDAG(sentence);
            Map<Integer, Pair<Double, Integer>> route = new HashMap<Integer, Pair<Double, Integer>>();
            Tokenizer.this.calc(sentence, dag, route);
            int x = 0;
            int N = sentence.length();
            StringBuffer buf = new StringBuffer();
            while (x < N) {
                int y = route.get(x).getSecond() + 1;
                String lWord = sentence.substring(x, y);
                if (RE_ENG.matcher(lWord).matches() && lWord.length() == 1) {
                    // merge consecutive single Latin letters/digits into one token
                    buf.append(lWord);
                    x = y;
                } else {
                    if (buf.length() > 0) {
                        frags.add(buf.toString());
                        buf.setLength(0);
                    }
                    frags.add(lWord);
                    x = y;
                }
            }

            if (buf.length() > 0) {
                frags.add(buf.toString());
            }

            return frags;
        }
    }

    private void loadUserDict(Stream<String> stream) throws IOException {
        try {
            stream.forEach(line -> {
                Matcher matcher = RE_USERDICT.matcher(line.trim());
                if (matcher.find()) {
                    String word = matcher.group(1).trim();
                    String freqStr = matcher.group(2);
                    int freq = 1;
                    if (freqStr != null) {
                        freq = Integer.parseInt(freqStr.trim());
                    } else {
                        // no explicit frequency given: suggest one from the current model,
                        // as jieba's suggest_freq does
                        double df = 1.;
                        for (String seg : this.cut(word, false, false)) {
                            // floating-point division; integer division would always truncate to 0
                            df *= this.freq.getOrDefault(seg, 1) / (double) total;
                        }
                        freq = Math.max((int) (df * total) + 1, this.freq.getOrDefault(word, 1));
                    }

                    this.freq.put(word, freq);
                    this.total += freq;

                    for (int i = 1; i <= word.length(); i++) {
                        String wfrag = word.substring(0, i);
                        if (!this.freq.containsKey(wfrag)) {
                            this.freq.put(wfrag, 0);
                        }
                    }
                }
            });
        } finally {
            stream.close();
        }
    }

    public void loadUserDict(InputStream in) throws IOException {
        this.loadUserDict(new BufferedReader(new InputStreamReader(in, Charset.forName("UTF-8"))).lines());
    }

    private void loadUserDict(Path path) throws IOException {
        this.loadUserDict(Files.lines(path, Charset.forName("UTF-8")));
    }

    public Map<String, Integer> getFreq() {
        return freq;
    }

    public long getTotal() {
        return total;
    }
}
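
To make the three strategies above concrete, a small sketch comparing full mode (`CutAllStrategy`), dictionary-only mode (`CutDAGNoHMMStrategy`), and the default DAG + HMM mode (`CutDAGStrategy`), plus the offset-carrying `tokenize` variant that the Lucene layer consumes. The class name is hypothetical, and the exact segmentations depend on the bundled dictionary; the authoritative expectations live in `TokenizerTest` below:

```java
import com.github.hongfuli.jieba.Token;
import com.github.hongfuli.jieba.Tokenizer;

public class CutModesSketch {
    public static void main(String[] args) {
        Tokenizer t = new Tokenizer();
        String s = "我爱北京天安门";

        System.out.println(t.cut(s, true, false));   // full mode: every dictionary word in the DAG
        System.out.println(t.cut(s, false, false));  // precise mode, dictionary only (no HMM)
        System.out.println(t.cut(s, false, true));   // precise mode + HMM for out-of-vocabulary runs

        // tokenize() additionally reports [startPos, endPos) character offsets,
        // which JiebaTokenizer turns into Lucene OffsetAttribute values
        for (Token token : t.tokenize(s, false, true)) {
            System.out.println(token);
        }
    }
}
```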
--------------------------------------------------------------------------------
/src/main/java/com/github/hongfuli/jieba/elasticsearch/JiebaAnalysisPlugin.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba.elasticsearch;

import org.apache.lucene.analysis.Analyzer;
import org.elasticsearch.index.analysis.AnalyzerProvider;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

import java.util.HashMap;
import java.util.Map;

/**
 * Created by lihongfu on 17/6/23.
 */
public class JiebaAnalysisPlugin extends Plugin implements AnalysisPlugin {

    @Override
    public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
        Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> map = new HashMap<>();
        // registers the analyzer under the name used in mappings: "jieba"
        map.put("jieba", JiebaAnalyzerProvider::new);
        return map;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/hongfuli/jieba/elasticsearch/JiebaAnalyzerProvider.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba.elasticsearch;

import com.github.hongfuli.jieba.lucene.JiebaAnalyzer;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.common.io.PathUtils;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Properties;

/**
 * Created by lihongfu on 17/6/23.
 */
public class JiebaAnalyzerProvider extends AbstractIndexAnalyzerProvider<JiebaAnalyzer> {
    private final JiebaAnalyzer analyzer;
    private static final Logger logger = LogManager.getLogger(JiebaAnalyzerProvider.class);

    public JiebaAnalyzerProvider(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
        analyzer = new JiebaAnalyzer();

        logger.info("load jieba_config.properties");
        // resolve jieba_config.properties relative to the directory the plugin jar lives in
        Path configPath = PathUtils.get(new File(JiebaAnalyzer.class.getProtectionDomain().getCodeSource().getLocation().getPath()).getParent()).toAbsolutePath().resolve("jieba_config.properties");
        Properties props = new Properties();
        try {
            props.load(Files.newInputStream(configPath));
        } catch (IOException e) {
            throw new RuntimeException("cannot load jieba_config.properties", e);
        }

        String userDictPath = props.getProperty("user_dict");
        if (userDictPath != null && !userDictPath.trim().isEmpty()) {
            try {
                logger.info("load user dict from file: " + userDictPath);
                analyzer.setUserDictIn(new FileInputStream(userDictPath));
            } catch (FileNotFoundException e) {
                throw new IllegalArgumentException("user_dict file cannot be loaded: " + userDictPath, e);
            }
        }
    }

    @Override
    public JiebaAnalyzer get() {
        return this.analyzer;
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/github/hongfuli/jieba/lucene/JiebaAnalyzer.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba.lucene;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;

import java.io.IOException;
import java.io.InputStream;

/**
 * Created by lihongfu on 17/6/19.
 */
public final class JiebaAnalyzer extends Analyzer {

    private InputStream userDictIn;

    public JiebaAnalyzer() {
    }

    public JiebaAnalyzer(InputStream userDictIn) {
        setUserDictIn(userDictIn);
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        JiebaTokenizer tokenizer = new JiebaTokenizer();
        if (userDictIn != null) {
            try {
                // note: the stream is consumed by the first TokenStreamComponents created
                tokenizer.loadUserDict(userDictIn);
            } catch (IOException e) {
                throw new RuntimeException("load user dict error", e);
            }
        }
        TokenFilter stopFilter = new JiebaStopTokenFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, stopFilter);
    }

    public void setUserDictIn(InputStream userDictIn) {
        if (userDictIn == null) {
            throw new IllegalArgumentException("userDictIn is null");
        }
        this.userDictIn = userDictIn;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/hongfuli/jieba/lucene/JiebaStopTokenFilter.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba.lucene;

import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.util.regex.Pattern;

/**
 * Created by lihongfu on 17/6/23.
 */
public class JiebaStopTokenFilter extends FilteringTokenFilter {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    // matches a single Unicode "word" character (letters, digits, CJK ideographs, ...)
    private static final Pattern WORD_CHAR = Pattern.compile("\\w", Pattern.UNICODE_CHARACTER_CLASS);

    public JiebaStopTokenFilter(TokenStream in) {
        super(in);
    }

    @Override
    protected boolean accept() throws IOException {
        String term = termAtt.toString();
        if (term.length() > 1) {
            return true;
        } else {
            // keep single characters only if they are word characters, dropping
            // the punctuation and whitespace tokens emitted by the tokenizer
            return WORD_CHAR.matcher(term).matches();
        }
    }
}
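
The predicate above keeps every multi-character token and, among single characters, only Unicode word characters — with `UNICODE_CHARACTER_CLASS`, `\w` covers CJK ideographs as well as Latin letters and digits. A standalone sketch of the same rule (the class name is hypothetical):

```java
import java.util.regex.Pattern;

public class StopPredicateSketch {
    private static final Pattern WORD_CHAR = Pattern.compile("\\w", Pattern.UNICODE_CHARACTER_CLASS);

    // same rule as JiebaStopTokenFilter#accept
    static boolean accept(String term) {
        return term.length() > 1 || WORD_CHAR.matcher(term).matches();
    }

    public static void main(String[] args) {
        System.out.println(accept("北京"));  // true: multi-character token
        System.out.println(accept("我"));    // true: a single CJK char is a word character
        System.out.println(accept("a"));     // true
        System.out.println(accept("。"));    // false: punctuation is filtered out
        System.out.println(accept(" "));     // false: whitespace is filtered out
    }
}
```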
--------------------------------------------------------------------------------
/src/main/java/com/github/hongfuli/jieba/lucene/JiebaTokenizer.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba.lucene;

import com.github.hongfuli.jieba.Token;
import com.github.hongfuli.jieba.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;

/**
 * Created by lihongfu on 17/6/19.
 */
public class JiebaTokenizer extends org.apache.lucene.analysis.Tokenizer {
    private com.github.hongfuli.jieba.Tokenizer scanner;
    private BufferedReader bufferReader;
    private int tokenIndex;
    private List<Token> tokenBuffer;
    private int finalOffset;
    // offset of the current line within the whole input, so that token offsets
    // stay absolute when the input spans several lines
    private int lineBaseOffset;
    private int lastLineLength;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

    public JiebaTokenizer() {
        this.scanner = new Tokenizer();
    }

    @Override
    public final boolean incrementToken() throws IOException {
        if (bufferReader == null) {
            throw new IllegalStateException("must call reset() before incrementToken()");
        }
        clearAttributes();
        // loop, not if: a blank line yields an empty token buffer, which previously
        // caused an IndexOutOfBoundsException on tokenBuffer.get(0)
        while (tokenBuffer == null || tokenIndex >= tokenBuffer.size()) {
            String line = bufferReader.readLine();
            if (line == null) {
                return false;
            }
            if (tokenBuffer != null) {
                // skip past the previous line plus the newline consumed by readLine()
                lineBaseOffset += lastLineLength + 1;
            }
            lastLineLength = line.length();
            tokenBuffer = scanner.tokenize(line, true, true);
            tokenIndex = 0;
        }
        Token token = tokenBuffer.get(tokenIndex);
        termAtt.append(token.value);
        offsetAtt.setOffset(correctOffset(lineBaseOffset + token.startPos),
                correctOffset(lineBaseOffset + token.endPos));
        posIncrAtt.setPositionIncrement(1);
        tokenIndex += 1;
        finalOffset = correctOffset(lineBaseOffset + token.endPos);
        return true;
    }

    @Override
    public void end() throws IOException {
        super.end();
        offsetAtt.setOffset(finalOffset, finalOffset);
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        if (BufferedReader.class.isAssignableFrom(input.getClass())) {
            bufferReader = (BufferedReader) input;
        } else {
            bufferReader = new BufferedReader(this.input);
        }
        tokenIndex = 0;
        tokenBuffer = null;
        finalOffset = 0;
        lineBaseOffset = 0;
        lastLineLength = 0;
    }

    @Override
    public void close() throws IOException {
        super.close();
        if (bufferReader != null) {
            bufferReader.close();
            bufferReader = null;
        }
    }

    public void loadUserDict(InputStream in) throws IOException {
        if (this.scanner == null) {
            throw new IllegalStateException("tokenizer was not initialized correctly");
        }
        this.scanner.loadUserDict(in);
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/hongfuli/utils/MtyStringUtils.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.utils;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by lihongfu on 17/6/3.
 */
public class MtyStringUtils {
    /**
     * Works like {@link Pattern#split(CharSequence)}, splitting str wherever the
     * pattern matches, but the matched delimiter substrings are returned as well.
     * For example, splitting "hello123word" on (\d+) returns ["hello", "123", "word"].
     *
     * @param pattern the delimiter pattern
     * @param str     the string to split
     * @return the fragments of str, with the matched delimiters kept in order
     */
    public static List<String> splitAndReturnDelimiters(Pattern pattern, String str) {
        Matcher matcher = pattern.matcher(str);
        List<String> result = new ArrayList<String>();
        int strLen = str.length();
        int lastMatchIdx = 0;
        while (matcher.find()) {
            int start = matcher.start();
            int end = matcher.end();
            String ds = matcher.group();

            if (lastMatchIdx != start) {
                String leftS = str.substring(lastMatchIdx, start);
                result.add(leftS);
            }

            result.add(ds);

            lastMatchIdx = end;
        }

        if (lastMatchIdx < strLen) {
            result.add(str.substring(lastMatchIdx, strLen));
        }
        return result;
    }
}
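
A short demonstration of the helper above. Unlike `Pattern.split`, the matched delimiters are kept in the result, which is what lets `Tokenizer` and `FinalSeg` route Han and non-Han runs to different code paths without losing any characters (the class name is hypothetical):

```java
import com.github.hongfuli.utils.MtyStringUtils;

import java.util.regex.Pattern;

public class SplitSketch {
    public static void main(String[] args) {
        Pattern digits = Pattern.compile("(\\d+)");
        // Pattern.split would give [hello, word]; this keeps the delimiter:
        System.out.println(MtyStringUtils.splitAndReturnDelimiters(digits, "hello123word"));
        // -> [hello, 123, word]

        Pattern han = Pattern.compile("([\\u4E00-\\u9FD5]+)");
        System.out.println(MtyStringUtils.splitAndReturnDelimiters(han, "abc我是中国人bc"));
        // -> [abc, 我是中国人, bc]
    }
}
```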
--------------------------------------------------------------------------------
/src/test/java/com/github/hongfuli/jieba/FinalSegTest.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba;

import junit.framework.TestCase;

import java.io.IOException;

/**
 * Created by lihongfu on 17/6/9.
 */
public class FinalSegTest extends TestCase {

    public void testLoadEmitP() throws IOException {
        FinalSeg seg = new FinalSeg("/emit_test.txt");
        System.out.println(seg.getEmitP());
    }

    public void testLoadEmitPDefault() throws IOException {
        FinalSeg seg = new FinalSeg();
        System.out.println(seg.getEmitP().get('B').size() + seg.getEmitP().get('E').size()
                + seg.getEmitP().get('M').size() + seg.getEmitP().get('S').size());
    }

    public void testCut() throws IOException {
        FinalSeg seg = new FinalSeg();
        System.out.println(seg.cut("我最喜欢青白玉"));
        System.out.println(seg.cut("你是喜欢Python还是Java呢,我也不知道吧"));
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/github/hongfuli/jieba/TokenizerTest.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba;

import junit.framework.TestCase;

import java.io.IOException;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by lihongfu on 17/6/2.
 */
public class TokenizerTest extends TestCase {

    public void testHanPattern() {
        Pattern hanP = Pattern.compile("([\\u4E00-\\u9FD5]+)");
        String sentence = "abc我是中国人bc你好 workd";
        Matcher matcher = hanP.matcher(sentence);
        System.out.println(matcher.matches());
        for (String s : hanP.split(sentence)) {
            System.out.print(s + " / ");
        }
    }

    public void testSplit() {
        for (String x : "hek fd 133 4.def".split("((?!^))")) {
            System.out.println(x + "====");
        }
    }

    private void printResult(List<String> tokens) {
        for (String t : tokens) {
            System.out.print(t + "|");
        }
        System.out.println();
    }

    private void printTokens(List<Token> tokens) {
        for (Token token : tokens) {
            System.out.println(token);
        }
    }

    public void testCutAll() {
        Tokenizer t = new Tokenizer();
        printResult(t.cut("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", true, false));
        printResult(t.cut("我不喜欢日本和服。", true, false));
        printResult(t.cut("雷猴回归人间。", true, false));
        printResult(t.cut("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", true, false));
        printResult(t.cut("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成", true, false));
    }

    public void testCutNoHMM() {
        Tokenizer t = new Tokenizer();
        printResult(t.cut("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", false, false));
        printResult(t.cut("我不喜欢日本和服。", false, false));
        printResult(t.cut("雷猴回归人间。", false, false));
        printResult(t.cut("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", false, false));
        printResult(t.cut("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成", false, false));
    }

    public void testCutHMM() {
        Tokenizer t = new Tokenizer();
        printResult(t.cut("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", false, true));
        printResult(t.cut("我不喜欢日本和服。", false, true));
        printResult(t.cut("雷猴回归人间。", false, true));
        printResult(t.cut("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", false, true));
        printResult(t.cut("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成", false, true));
        printResult(t.cut("这个洒金皮的和田玉你喜欢吗", false, true));
    }

    public void testCutForSearch() {
        Tokenizer t = new Tokenizer();
        printResult(t.cutForSearch("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。"));
        printResult(t.cutForSearch("我不喜欢日本和服。"));
        printResult(t.cutForSearch("雷猴回归人间。"));
        printResult(t.cutForSearch("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"));
    }

    public void testTokenizer() {
        Tokenizer t = new Tokenizer();
        printTokens(t.tokenize("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", true, true));
        System.out.println("====================");
        printTokens(t.tokenize("hello world this is my first program", false, true));
        System.out.println("====================");
        printTokens(t.tokenize("hello,,world,this,is,my,first,program", false, true));
    }

    public void testLoadUserDict() throws IOException {
        Tokenizer t = new Tokenizer();
        printResult(t.cut("这个洒金皮的和田玉我很喜欢呢", false, false));
        System.out.println("====================");
        t.loadUserDict(this.getClass().getResourceAsStream("/userdict.txt"));
        printResult(t.cut("这个洒金皮的和田玉我很喜欢呢", false, false));
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/github/hongfuli/jieba/lucene/JiebaAnalyzerTest.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.jieba.lucene;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;

import java.io.IOException;
import java.util.Random;

/**
 * Created by lihongfu on 17/6/19.
 */
public class JiebaAnalyzerTest extends BaseTokenStreamTestCase {

    public void testStandardAnalyzer() throws IOException {
        Analyzer analyzer = new JiebaAnalyzer();

        checkRandomData(new Random(0), analyzer, 1);

        System.out.println(BaseTokenStreamTestCase.toString(analyzer, "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"));
        System.out.println("==============");
        System.out.println(BaseTokenStreamTestCase.toString(analyzer, "hello world,this is my first program"));
        System.out.println("==============");
        System.out.println(BaseTokenStreamTestCase.toString(analyzer, "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。"));
    }
}

--------------------------------------------------------------------------------
/src/test/java/com/github/hongfuli/utils/MtyStringUtilsTest.java:
--------------------------------------------------------------------------------
package com.github.hongfuli.utils;

import junit.framework.TestCase;

import java.util.List;
import java.util.regex.Pattern;

/**
 * Created by lihongfu on 17/6/5.
 */
public class MtyStringUtilsTest extends TestCase {

    public void testSplitAndReturnDelimiters() {
        Pattern hanP = Pattern.compile("([\\u4E00-\\u9FD5]+)");
        String sentence = "abc我是中国人bc你好 workd";
        List<String> strings = MtyStringUtils.splitAndReturnDelimiters(hanP, sentence);
        System.out.println(strings);

        sentence = "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。";
        strings = MtyStringUtils.splitAndReturnDelimiters(hanP, sentence);
        System.out.println(strings);

        hanP = Pattern.compile("[^a-zA-Z0-9+#\\n]");
        sentence = "C++。";
        strings = MtyStringUtils.splitAndReturnDelimiters(hanP, sentence);
        System.out.println(strings);
    }

    public void testSplitAndReturnDelimiters4continue() {
        Pattern hanP = Pattern.compile("(ab)");
        String sentence = "ababab";
        List<String> strings = MtyStringUtils.splitAndReturnDelimiters(hanP, sentence);
        System.out.println(strings);
    }
}

--------------------------------------------------------------------------------
/src/test/resources/emit_test.txt:
--------------------------------------------------------------------------------
B
'\u4e00': -3.6544978750449433,
'\u4e01': -8.125041941842026,
E
'\u4e00': -6.044987536255073,
'\u4e01': -9.075800412310807,
M
'\u4e00': -4.428158526435913,
'\u4e01': -7.932945687598502,
S
'\u2236': -15.828865681131282,
'\u4e00': -4.92368982120877,

--------------------------------------------------------------------------------
/src/test/resources/log4j2.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<!-- minimal console configuration for tests -->
<Configuration status="WARN">
    <Appenders>
        <Console name="Console" target="SYSTEM_OUT">
            <PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
        </Console>
    </Appenders>
    <Loggers>
        <Root level="error">
            <AppenderRef ref="Console"/>
        </Root>
    </Loggers>
</Configuration>

--------------------------------------------------------------------------------
/src/test/resources/userdict.txt:
--------------------------------------------------------------------------------
洒金皮
桥北中学 10

--------------------------------------------------------------------------------