├── .gitignore ├── LICENSE ├── README.md ├── pom.xml └── src ├── main ├── java │ └── com │ │ └── chenlb │ │ └── mmseg4j │ │ ├── CharNode.java │ │ ├── Chunk.java │ │ ├── ComplexSeg.java │ │ ├── Dictionary.java │ │ ├── MMSeg.java │ │ ├── MaxWordSeg.java │ │ ├── Seg.java │ │ ├── Sentence.java │ │ ├── SimpleSeg.java │ │ ├── Word.java │ │ ├── example │ │ ├── Complex.java │ │ ├── MaxWord.java │ │ └── Simple.java │ │ └── rule │ │ ├── LargestAvgLenRule.java │ │ ├── LargestSumDegreeFreedomRule.java │ │ ├── MaxMatchRule.java │ │ ├── Rule.java │ │ └── SmallestVarianceRule.java └── resources │ └── data │ ├── chars.dic │ ├── units.dic │ └── words.dic └── test ├── java └── com │ └── chenlb │ └── mmseg4j │ ├── ComplexSegTest.java │ ├── DictionaryTest.java │ ├── KeyTreeTest.java │ ├── MMSegTest.java │ ├── MaxWordSegTest.java │ ├── MyTest.java │ └── SimpleSegTest.java └── resources └── data └── words-test-my.dic /.gitignore: -------------------------------------------------------------------------------- 1 | # Eclipse 2 | .classpath 3 | .project 4 | .settings/ 5 | 6 | # Intellij 7 | .idea/ 8 | *.iml 9 | *.iws 10 | 11 | # Mac 12 | .DS_Store 13 | 14 | # Maven 15 | log/ 16 | target/ 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | mmseg4j core 使用 Chih-Hao Tsai 的 MMSeg 算法(http://technology.chtsai.org/mmseg/ )实现的中文分词器。 2 | 3 | MMSeg 算法有两种分词方法:Simple和Complex,都是基于正向最大匹配。Complex 加了四个规则过虑。官方说:词语的正确识别率达到了 98.41%。mmseg4j 已经实现了这两种分词算法。 4 | 5 | ```xml 6 | 7 | com.chenlb.mmseg4j 8 | mmseg4j-core 9 | 1.10.0 10 | 11 | ``` 12 | 13 | ## example 14 | 15 | ``` 16 | git clone https://github.com/chenlb/mmseg4j-core mmseg4j-core 17 | cd mmseg4j-core 18 | mvn compile 19 | 20 | #运行 21 | #Complex 分词模式 22 | java -cp .:target/classes com.chenlb.mmseg4j.example.Complex 23 | 24 | #Simple 分词模式 25 | java -cp .:target/classes com.chenlb.mmseg4j.example.Simple 26 | 27 | #MaxWord 分词模式 28 | java -cp .:target/classes com.chenlb.mmseg4j.example.MaxWord 29 | 30 | #或编译打包 31 | mvn package 32 | 33 | java -cp .:target/mmseg4j-core-1.10.1-SNAPSHOT.jar com.chenlb.mmseg4j.example.Complex 34 | ``` 35 | 36 | ## 其它 37 | 38 | * [早期的介绍](https://github.com/chenlb/mmseg4j-from-googlecode) -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | org.sonatype.oss 5 | oss-parent 6 | 7 7 | 8 | com.chenlb.mmseg4j 9 | mmseg4j-core 10 | mmseg4j-core 11 | https://github.com/chenlb/mmseg4j-core 12 | MMSEG cor for java chinese analyzer 13 | 14 | 15 | The Apache Software License, Version 2.0 16 | http://www.apache.org/licenses/LICENSE-2.0.txt 17 | repo 18 | 19 | 20 | 21 | http://blog.chenlb.com 22 | chenlb open source 23 | 24 | 25 | git@github.com:chenlb/mmseg4j-core.git 26 | scm:git:git@github.com:chenlb/mmseg4j-core.git 27 | scm:git:git@github.com:chenlb/mmseg4j-core.git 28 | 29 | 30 | 31 | chenlb 32 | LinBin Chen 33 | chenlb2008@gmail.com 34 | 35 | 36 | 37 | https://github.com/chenlb/mmseg4j-core/issues 38 | github.com 39 | 40 | 41 | 42 | junit 43 | junit 44 | 4.8 45 | test 46 | 47 | 48 | 49 | 50 | 51 | 52 | org.apache.maven.plugins 53 | maven-compiler-plugin 54 | 2.3.1 55 | 56 | 1.6 57 | 1.6 58 | UTF-8 59 | 60 | 61 | 62 | org.apache.maven.plugins 63 | maven-gpg-plugin 64 | 65 | 66 | sign-artifacts 67 | verify 68 | 69 | sign 70 | 71 | 72 | 73 | 74 | 75 | 76 | 1.10.1-SNAPSHOT 77 | -------------------------------------------------------------------------------- /src/main/java/com/chenlb/mmseg4j/CharNode.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j; 2 | 3 | import java.util.ArrayList; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | 7 | /** 8 | * 所有词都记录在第一个字的结点下. 9 | * 10 | * @author chenlb 2009-2-20 下午11:30:14 11 | */ 12 | public class CharNode { 13 | 14 | private int freq = -1; //Degree of Morphemic Freedom of One-Character, 单字才需要 15 | private int maxLen = 0; //wordTail的最长 16 | 17 | private KeyTree ktWordTails = new KeyTree(); 18 | private int wordNum = 0; 19 | 20 | public CharNode() { 21 | 22 | } 23 | 24 | public void addWordTail(char[] wordTail) { 25 | ktWordTails.add(wordTail); 26 | wordNum++; 27 | if(wordTail.length > maxLen) { 28 | maxLen = wordTail.length; 29 | } 30 | } 31 | public int getFreq() { 32 | return freq; 33 | } 34 | 35 | public void setFreq(int freq) { 36 | this.freq = freq; 37 | } 38 | 39 | public int wordNum() { 40 | return wordNum; 41 | } 42 | 43 | /** 44 | * @param sen 句子, 一串文本. 45 | * @param offset 词在句子中的位置 46 | * @param tailLen 词尾的长度, 实际是去掉词的长度. 
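CharNode above keeps every dictionary word under the node of the word's first character and stores only the remaining characters (the word tail) in the KeyTree defined further down. A minimal usage sketch, assuming the sample words below rather than anything taken from words.dic:

```java
import com.chenlb.mmseg4j.CharNode;

public class CharNodeSketch {
    public static void main(String[] args) {
        // Node for the head character '中'; only the tails of "中文" and "中文分词" are stored.
        CharNode zhong = new CharNode();
        zhong.addWordTail("文".toCharArray());
        zhong.addWordTail("文分词".toCharArray());

        char[] sen = "中文分词测试".toCharArray();
        // indexOf(sen, offset, tailLen): is there a word whose head is sen[offset] and whose tail is tailLen chars long?
        System.out.println(zhong.indexOf(sen, 0, 1));   // 1  -> "中文" was added
        System.out.println(zhong.indexOf(sen, 0, 2));   // -1 -> "中文分" was never added
        // maxMatch(sen, wordTailOffset): length of the longest stored tail starting at sen[1]
        System.out.println(zhong.maxMatch(sen, 1));     // 3  -> tail "文分词"
        System.out.println(zhong.getMaxLen());          // 3  -> longest tail seen so far
    }
}
```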
47 | * @author chenlb 2009-4-8 下午11:10:30 48 | */ 49 | public int indexOf(char[] sen, int offset, int tailLen) { 50 | //return binarySearch(wordTails, sen, offset+1, tailLen, casc); 51 | return ktWordTails.match(sen, offset+1, tailLen) ? 1 : -1; 52 | } 53 | 54 | /** 55 | * @param sen 句子, 一串文本. 56 | * @param wordTailOffset 词在句子中的位置, 实际是 offset 后面的开始找. 57 | * @return 返回词尾长, 没有就是 0 58 | * @author chenlb 2009-4-10 下午10:45:51 59 | */ 60 | public int maxMatch(char[] sen, int wordTailOffset) { 61 | return ktWordTails.maxMatch(sen, wordTailOffset); 62 | } 63 | 64 | /** 65 | * 66 | * @return 至少返回一个包括 0的int 67 | * @author chenlb 2009-4-12 上午10:01:35 68 | */ 69 | public ArrayList maxMatch(ArrayList tailLens, char[] sen, int wordTailOffset) { 70 | return ktWordTails.maxMatch(tailLens, sen, wordTailOffset); 71 | } 72 | 73 | public int getMaxLen() { 74 | return maxLen; 75 | } 76 | public void setMaxLen(int maxLen) { 77 | this.maxLen = maxLen; 78 | } 79 | 80 | public static class KeyTree { 81 | TreeNode head = new TreeNode(' '); 82 | 83 | public void add(char[] w) { 84 | if(w.length < 1) { 85 | return; 86 | } 87 | TreeNode p = head; 88 | for(int i=0; i maxMatch(ArrayList tailLens, char[] sen, int offset) { 119 | TreeNode node = head; 120 | for(int i=offset; i subNodes; 148 | boolean alsoLeaf; 149 | public TreeNode(char key) { 150 | this.key = key; 151 | subNodes = new HashMap(); 152 | } 153 | 154 | public void born(char k, TreeNode sub) { 155 | subNodes.put(k, sub); 156 | } 157 | 158 | public TreeNode subNode(char k) { 159 | return subNodes.get(k); 160 | } 161 | public boolean isAlsoLeaf() { 162 | return alsoLeaf; 163 | } 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /src/main/java/com/chenlb/mmseg4j/Chunk.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j; 2 | 3 | 4 | /** 5 | * 它是MMSeg分词算法中一个关键的概念。Chunk中包含依据上下文分出的一组词和相关的属性,包括长度(Length)、平均长度(Average Length)、标准差的平方(Variance)和自由语素度(Degree Of Morphemic Freedom)。 6 | * 7 | * @author chenlb 2009-3-16 上午11:39:42 8 | */ 9 | public class Chunk { 10 | 11 | Word[] words = new Word[3]; 12 | 13 | int count = -1; 14 | 15 | /** Word Length */ 16 | private int len = -1; 17 | /** Largest Average Word Length */ 18 | private double avgLen = -1; 19 | /** Variance of Word Lengths 就是 标准差的平方 */ 20 | private double variance = -1; 21 | /** Sum of Degree of Morphemic Freedom of One-Character */ 22 | private int sumDegree = -1; 23 | 24 | /** Word Length */ 25 | public int getLen() { 26 | if(len < 0) { 27 | len = 0; 28 | count = 0; 29 | for(Word word : words) { 30 | if(word != null) { 31 | len += word.getLength(); 32 | count++; 33 | } 34 | } 35 | } 36 | return len; 37 | } 38 | 39 | /** 有多少个词,最多3个。*/ 40 | public int getCount() { 41 | if(count < 0) { 42 | count = 0; 43 | for(Word word : words) { 44 | if(word != null) { 45 | count++; 46 | } 47 | } 48 | } 49 | return count; 50 | } 51 | 52 | /** Largest Average Word Length */ 53 | public double getAvgLen() { 54 | if(avgLen < 0) { 55 | avgLen = (double)getLen()/getCount(); 56 | } 57 | return avgLen; 58 | } 59 | 60 | /** Variance of Word Lengths 就是 标准差的平方 */ 61 | public double getVariance() { 62 | if(variance < 0) { 63 | double sum = 0; 64 | for(Word word : words) { 65 | if(word != null) { 66 | sum += Math.pow(word.getLength()-getAvgLen(), 2); 67 | } 68 | } 69 | variance = sum/getCount(); 70 | } 71 | return variance; 72 | } 73 | 74 | /** Sum of Degree of Morphemic Freedom of One-Character */ 75 | public int 
getSumDegree() { 76 | if(sumDegree < 0) { 77 | int sum = 0; 78 | for(Word word : words) { 79 | if(word != null && word.getDegree() > -1) { 80 | sum += word.getDegree(); 81 | } 82 | } 83 | sumDegree = sum; 84 | } 85 | return sumDegree; 86 | } 87 | 88 | @Override 89 | public String toString() { 90 | StringBuilder sb = new StringBuilder(); 91 | for(Word word : words) { 92 | if(word != null) { 93 | sb.append(word.getString()).append('_'); 94 | } 95 | } 96 | return sb.toString(); 97 | } 98 | 99 | public String toFactorString() { 100 | StringBuilder sb = new StringBuilder(); 101 | sb.append("["); 102 | sb.append("len=").append(getLen()).append(", "); 103 | sb.append("avgLen=").append(getAvgLen()).append(", "); 104 | sb.append("variance=").append(getVariance()).append(", "); 105 | sb.append("sum100log=").append(getSumDegree()).append("]"); 106 | return sb.toString(); 107 | } 108 | 109 | public Word[] getWords() { 110 | return words; 111 | } 112 | 113 | public void setWords(Word[] words) { 114 | this.words = words; 115 | count = words.length; 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/main/java/com/chenlb/mmseg4j/ComplexSeg.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import com.chenlb.mmseg4j.rule.LargestAvgLenRule; 7 | import com.chenlb.mmseg4j.rule.LargestSumDegreeFreedomRule; 8 | import com.chenlb.mmseg4j.rule.MaxMatchRule; 9 | import com.chenlb.mmseg4j.rule.Rule; 10 | import com.chenlb.mmseg4j.rule.SmallestVarianceRule; 11 | 12 | 13 | /** 14 | * 正向最大匹配, 加四个过虑规则的分词方式. 15 | * 16 | * @author chenlb 2009-3-16 下午09:15:26 17 | */ 18 | public class ComplexSeg extends Seg{ 19 | 20 | private MaxMatchRule mmr = new MaxMatchRule(); 21 | private List otherRules = new ArrayList(); 22 | 23 | private static boolean showChunk = false; 24 | 25 | public ComplexSeg(Dictionary dic) { 26 | super(dic); 27 | otherRules.add(new LargestAvgLenRule()); 28 | otherRules.add(new SmallestVarianceRule()); 29 | otherRules.add(new LargestSumDegreeFreedomRule()); 30 | } 31 | 32 | public Chunk seg(Sentence sen) { 33 | char[] chs = sen.getText(); 34 | int[] tailLen = new int[3]; //记录词的尾长 35 | //int[] maxTailLen = new int[3]; 36 | @SuppressWarnings("unchecked") 37 | ArrayList[] tailLens = new ArrayList[2]; //记录词尾部允许的长度 38 | for(int i=0; i<2; i++) { 39 | tailLens[i] = new ArrayList(); 40 | } 41 | CharNode[] cns = new CharNode[3]; 42 | 43 | int[] offsets = new int[3]; //每个词在sen的开始位置 44 | mmr.reset(); 45 | if(!sen.isFinish()) { //sen.getOffset() < chs.length 46 | if(showChunk) { 47 | System.out.println(); 48 | } 49 | int maxLen = 0; 50 | offsets[0] = sen.getOffset(); 51 | /* 52 | * 遍历所有不同词长,还不是从最大到0(w[0]=maxLen(chs, offsets[0]); w[0]>=0; w[0]--) 53 | * 可以减少一部分多余的查找. 
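To make the chunk factors defined in Chunk above concrete, here is a small self-contained sketch; the split of the sample text into words of length 1, 2 and 3 is chosen only for the arithmetic, not taken from any dictionary:

```java
import com.chenlb.mmseg4j.Chunk;
import com.chenlb.mmseg4j.Word;

public class ChunkFactorSketch {
    public static void main(String[] args) {
        char[] sen = "研究生命起源".toCharArray();
        Chunk ck = new Chunk();
        // Word(sen, senStartOffset, offset, len): three words of lengths 1, 2 and 3
        ck.setWords(new Word[]{
                new Word(sen, 0, 0, 1),
                new Word(sen, 0, 1, 2),
                new Word(sen, 0, 3, 3)
        });
        System.out.println(ck.getLen());      // 6   = 1 + 2 + 3
        System.out.println(ck.getAvgLen());   // 2.0 = 6 / 3 words
        System.out.println(ck.getVariance()); // 0.666... = ((1-2)^2 + (2-2)^2 + (3-2)^2) / 3
        System.out.println(ck.toFactorString());
    }
}
```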
54 | */ 55 | maxMatch(cns, 0, chs, offsets[0], tailLens, 0); 56 | for(int aIdx=tailLens[0].size()-1; aIdx>=0; aIdx--) { 57 | 58 | tailLen[0] = tailLens[0].get(aIdx); 59 | 60 | offsets[1] = offsets[0]+1+tailLen[0]; //第二个词的开始位置 61 | 62 | maxMatch(cns, 1, chs, offsets[1], tailLens, 1); 63 | for(int bIdx=tailLens[1].size()-1; bIdx>=0; bIdx--) { 64 | 65 | tailLen[1] = tailLens[1].get(bIdx); 66 | offsets[2] = offsets[1]+1+tailLen[1]; 67 | 68 | //第三个词只需要最长的 69 | tailLen[2] = maxMatch(cns, 2, chs, offsets[2]); 70 | 71 | int sumChunkLen = 0; 72 | for(int i=0; i<3; i++) { 73 | sumChunkLen += tailLen[i]+1; 74 | } 75 | Chunk ck = null; 76 | if(sumChunkLen >= maxLen) { 77 | maxLen = sumChunkLen; //下一个chunk块的开始位置增量 78 | ck = createChunk(sen, chs, tailLen, offsets, cns); 79 | mmr.addChunk(ck); 80 | 81 | } 82 | if(showChunk) { 83 | if(ck == null) { 84 | ck = createChunk(sen, chs, tailLen, offsets, cns); 85 | mmr.addChunk(ck); 86 | } 87 | System.out.println(ck); 88 | } 89 | 90 | } 91 | } 92 | sen.addOffset(maxLen); //maxLen个字符已经处理完 93 | List chunks = mmr.remainChunks(); 94 | for(Rule rule : otherRules) { //其它规则过虑 95 | if(showChunk) { 96 | System.out.println("-------filter before "+rule+"----------"); 97 | printChunk(chunks); 98 | } 99 | if(chunks.size() > 1) { 100 | rule.reset(); 101 | rule.addChunks(chunks); 102 | chunks = rule.remainChunks(); 103 | } else { 104 | break; 105 | } 106 | } 107 | if(showChunk) { 108 | System.out.println("-------remainChunks----------"); 109 | printChunk(chunks); 110 | } 111 | if(chunks.size() > 0) { 112 | return chunks.get(0); 113 | } 114 | } 115 | return null; 116 | } 117 | 118 | private Chunk createChunk(Sentence sen, char[] chs, int[] tailLen, int[] offsets, CharNode[] cns/*, char[][] cks*/) { 119 | Chunk ck = new Chunk(); 120 | 121 | for(int i=0; i<3; i++) { 122 | 123 | if(offsets[i] < chs.length) { 124 | ck.words[i] = new Word(chs, sen.getStartOffset(), offsets[i], tailLen[i]+1);//new Word(cks[i], sen.getStartOffset()+offsets[i]); 125 | if(tailLen[i] == 0) { //单字的要取得"字频计算出自由度" 126 | CharNode cn = cns[i]; //dic.head(chs[offsets[i]]); 127 | if(cn !=null) { 128 | ck.words[i].setDegree(cn.getFreq()); 129 | } 130 | } 131 | } 132 | } 133 | return ck; 134 | } 135 | 136 | public static boolean isShowChunk() { 137 | return showChunk; 138 | } 139 | 140 | public static void setShowChunk(boolean showChunk) { 141 | ComplexSeg.showChunk = showChunk; 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /src/main/java/com/chenlb/mmseg4j/Dictionary.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j; 2 | 3 | import java.io.BufferedInputStream; 4 | import java.io.BufferedReader; 5 | import java.io.File; 6 | import java.io.FileInputStream; 7 | import java.io.FilenameFilter; 8 | import java.io.IOException; 9 | import java.io.InputStream; 10 | import java.io.InputStreamReader; 11 | import java.net.URL; 12 | import java.util.ArrayList; 13 | import java.util.HashMap; 14 | import java.util.Map; 15 | import java.util.Map.Entry; 16 | import java.util.concurrent.ConcurrentHashMap; 17 | import java.util.logging.Level; 18 | import java.util.logging.Logger; 19 | 20 | /** 21 | * 词典类. 词库目录单例模式.
22 | * 保存单字与其频率,还有词库.
23 | * 有检测词典变更的接口,外部程序可以使用 {@link #wordsFileIsChange()} 和 {@link #reload()} 来完成检测与加载的工作. 24 | * 25 | * @author chenlb 2009-2-20 下午11:34:29 26 | */ 27 | public class Dictionary { 28 | 29 | private static final Logger log = Logger.getLogger(Dictionary.class.getName()); 30 | 31 | private File dicPath; //词库目录 32 | private volatile Map dict; 33 | private volatile Map unit; //单个字的单位 34 | 35 | /** 记录 word 文件的最后修改时间 */ 36 | private Map wordsLastTime = null; 37 | private long lastLoadTime = 0; 38 | 39 | /** 不要直接使用, 通过 {@link #getDefalutPath()} 使用*/ 40 | private static File defalutPath = null; 41 | private static final ConcurrentHashMap dics = new ConcurrentHashMap(); 42 | 43 | protected void finalize() throws Throwable { 44 | /* 45 | * 使 class reload 的时也可以释放词库 46 | */ 47 | destroy(); 48 | } 49 | 50 | /** 51 | * 从默认目录加载词库文件.

52 | * 查找默认目录顺序(用法示例见下):
 53 | *   1. 从系统属性 mmseg.dic.path 指定的目录中加载
 54 | *   2. 从 classpath/data 目录
 55 | *   3. 从 user.dir/data 目录
 56 | *
 57 | *
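That lookup order only applies to the no-argument getInstance(); a dictionary directory can also be passed in explicitly. A short sketch, where /opt/mmseg4j/dic is a placeholder path rather than anything shipped with the project:

```java
import java.io.File;

import com.chenlb.mmseg4j.Dictionary;

public class DicPathSketch {
    public static void main(String[] args) {
        // (1) Default lookup: the system property wins, e.g. started with
        //     java -Dmmseg.dic.path=/opt/mmseg4j/dic ..., otherwise classpath/data, otherwise user.dir/data.
        Dictionary byDefault = Dictionary.getInstance();

        // (2) Explicit directory, bypassing the default lookup entirely.
        Dictionary byPath = Dictionary.getInstance(new File("/opt/mmseg4j/dic"));

        System.out.println(byDefault.getDicPath());
        System.out.println(byPath.getDicPath());
    }
}
```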
58 | * @see #getDefalutPath() 59 | */ 60 | public static Dictionary getInstance() { 61 | File path = getDefalutPath(); 62 | return getInstance(path); 63 | } 64 | 65 | /** 66 | * @param path 词典的目录 67 | */ 68 | public static Dictionary getInstance(String path) { 69 | return getInstance(new File(path)); 70 | } 71 | 72 | /** 73 | * @param path 词典的目录 74 | */ 75 | public static Dictionary getInstance(File path) { 76 | log.info("try to load dir="+path); 77 | File normalizeDir = normalizeFile(path); 78 | Dictionary dic = dics.get(normalizeDir); 79 | if(dic == null) { 80 | dic = new Dictionary(normalizeDir); 81 | dics.put(normalizeDir, dic); 82 | } 83 | return dic; 84 | } 85 | 86 | public static File normalizeFile(File file) { 87 | if(file == defalutPath) { 88 | return defalutPath; 89 | } 90 | try { 91 | return file.getCanonicalFile(); 92 | } catch (IOException e) { 93 | throw new RuntimeException("normalize file=["+file+"] fail", e); 94 | } 95 | } 96 | 97 | /** 98 | * 销毁, 释放资源. 此后此对像不再可用. 99 | */ 100 | void destroy() { 101 | clear(dicPath); 102 | 103 | dicPath = null; 104 | dict = null; 105 | unit = null; 106 | } 107 | 108 | /** 109 | * @see Dictionary#clear(File) 110 | */ 111 | public static Dictionary clear(String path) { 112 | return clear(new File(path)); 113 | } 114 | 115 | /** 116 | * 从单例缓存中去除 117 | * @param path 118 | * @return 没有返回 null 119 | */ 120 | public static Dictionary clear(File path) { 121 | File normalizeDir = normalizeFile(path); 122 | return dics.remove(normalizeDir); 123 | } 124 | 125 | /** 126 | * 词典的目录 127 | */ 128 | private Dictionary(File path) { 129 | init(path); 130 | } 131 | 132 | private void init(File path) { 133 | dicPath = path; 134 | wordsLastTime = new HashMap(); 135 | 136 | reload(); //加载词典 137 | } 138 | 139 | private static long now() { 140 | return System.currentTimeMillis(); 141 | } 142 | 143 | /** 144 | * 只要 wordsXXX.dic的文件 145 | * @return 146 | */ 147 | protected File[] listWordsFiles() { 148 | return dicPath.listFiles(new FilenameFilter() { 149 | 150 | public boolean accept(File dir, String name) { 151 | 152 | return name.startsWith("words") && name.endsWith(".dic"); 153 | } 154 | 155 | }); 156 | } 157 | 158 | private Map loadDic(File wordsPath) throws IOException { 159 | InputStream charsIn = null; 160 | File charsFile = new File(wordsPath, "chars.dic"); 161 | if(charsFile.exists()) { 162 | charsIn = new FileInputStream(charsFile); 163 | addLastTime(charsFile); //chars.dic 也检测是否变更 164 | } else { //从 jar 里加载 165 | charsIn = this.getClass().getResourceAsStream("/data/chars.dic"); 166 | charsFile = new File(this.getClass().getResource("/data/chars.dic").getFile()); //only for log 167 | } 168 | final Map dic = new HashMap(); 169 | int lineNum = 0; 170 | long s = now(); 171 | long ss = s; 172 | lineNum = load(charsIn, new FileLoading() { //单个字的 173 | 174 | public void row(String line, int n) { 175 | if(line.length() < 1) { 176 | return; 177 | } 178 | String[] w = line.split(" "); 179 | CharNode cn = new CharNode(); 180 | switch(w.length) { 181 | case 2: 182 | try { 183 | cn.setFreq((int)(Math.log(Integer.parseInt(w[1]))*100));//字频计算出自由度 184 | } catch(NumberFormatException e) { 185 | //eat... 
186 | } 187 | case 1: 188 | 189 | dic.put(w[0].charAt(0), cn); 190 | } 191 | } 192 | }); 193 | log.info("chars loaded time="+(now()-s)+"ms, line="+lineNum+", on file="+charsFile); 194 | 195 | //try load words.dic in jar 196 | InputStream wordsDicIn = this.getClass().getResourceAsStream("/data/words.dic"); 197 | if(wordsDicIn != null) { 198 | File wordsDic = new File(this.getClass().getResource("/data/words.dic").getFile()); 199 | loadWord(wordsDicIn, dic, wordsDic); 200 | } 201 | 202 | File[] words = listWordsFiles(); //只要 wordsXXX.dic的文件 203 | if(words != null) { //扩展词库目录 204 | for(File wordsFile : words) { 205 | loadWord(new FileInputStream(wordsFile), dic, wordsFile); 206 | 207 | addLastTime(wordsFile); //用于检测是否修改 208 | } 209 | } 210 | 211 | log.info("load all dic use time="+(now()-ss)+"ms"); 212 | return dic; 213 | } 214 | 215 | /** 216 | * @param is 词库文件流 217 | * @param dic 加载的词保存在结构中 218 | * @param wordsFile 日志用 219 | * @throws IOException from {@link #load(InputStream, FileLoading)} 220 | */ 221 | private void loadWord(InputStream is, Map dic, File wordsFile) throws IOException { 222 | long s = now(); 223 | int lineNum = load(is, new WordsFileLoading(dic)); //正常的词库 224 | log.info("words loaded time="+(now()-s)+"ms, line="+lineNum+", on file="+wordsFile); 225 | } 226 | 227 | private Map loadUnit(File path) throws IOException { 228 | InputStream fin = null; 229 | File unitFile = new File(path, "units.dic"); 230 | if(unitFile.exists()) { 231 | fin = new FileInputStream(unitFile); 232 | addLastTime(unitFile); 233 | } else { //在jar包里的/data/unit.dic 234 | fin = Dictionary.class.getResourceAsStream("/data/units.dic"); 235 | unitFile = new File(Dictionary.class.getResource("/data/units.dic").getFile()); 236 | } 237 | 238 | final Map unit = new HashMap(); 239 | 240 | long s = now(); 241 | int lineNum = load(fin, new FileLoading() { 242 | 243 | public void row(String line, int n) { 244 | if(line.length() != 1) { 245 | return; 246 | } 247 | unit.put(line.charAt(0), Dictionary.class); 248 | } 249 | }); 250 | log.info("unit loaded time="+(now()-s)+"ms, line="+lineNum+", on file="+unitFile); 251 | 252 | return unit; 253 | } 254 | 255 | /** 256 | * 加载 wordsXXX.dic 文件类。 257 | * 258 | * @author chenlb 2009-10-15 下午02:12:55 259 | */ 260 | private static class WordsFileLoading implements FileLoading { 261 | final Map dic; 262 | 263 | /** 264 | * @param dic 加载的词,保存在此结构中。 265 | */ 266 | public WordsFileLoading(Map dic) { 267 | this.dic = dic; 268 | } 269 | 270 | public void row(String line, int n) { 271 | if(line.length() < 2) { 272 | return; 273 | } 274 | CharNode cn = dic.get(line.charAt(0)); 275 | if(cn == null) { 276 | cn = new CharNode(); 277 | dic.put(line.charAt(0), cn); 278 | } 279 | cn.addWordTail(tail(line)); 280 | } 281 | } 282 | 283 | /** 284 | * 加载词文件的模板 285 | * @return 文件总行数 286 | */ 287 | public static int load(InputStream fin, FileLoading loading) throws IOException { 288 | BufferedReader br = new BufferedReader( 289 | new InputStreamReader(new BufferedInputStream(fin), "UTF-8")); 290 | String line = null; 291 | int n = 0; 292 | while((line = br.readLine()) != null) { 293 | if(line == null || line.startsWith("#")) { 294 | continue; 295 | } 296 | n++; 297 | loading.row(line, n); 298 | } 299 | return n; 300 | } 301 | 302 | /** 303 | * 取得 str 除去第一个char的部分 304 | * @author chenlb 2009-3-3 下午10:05:26 305 | */ 306 | private static char[] tail(String str) { 307 | char[] cs = new char[str.length()-1]; 308 | str.getChars(1, str.length(), cs, 0); 309 | return cs; 310 | } 311 | 312 | public static 
interface FileLoading { 313 | /** 314 | * @param line 读出的一行 315 | * @param n 当前第几行 316 | * @author chenlb 2009-3-3 下午09:55:54 317 | */ 318 | void row(String line, int n); 319 | } 320 | 321 | /** 322 | * 把 wordsFile 文件的最后更新时间加记录下来. 323 | * @param wordsFile 非 null 324 | */ 325 | private synchronized void addLastTime(File wordsFile) { 326 | if(wordsFile != null) { 327 | wordsLastTime.put(wordsFile, wordsFile.lastModified()); 328 | } 329 | } 330 | 331 | /** 332 | * 词典文件是否有修改过 333 | * @return 334 | */ 335 | public synchronized boolean wordsFileIsChange() { 336 | //检查是否有修改文件,包括删除的 337 | for(Entry flt : wordsLastTime.entrySet()) { 338 | File words = flt.getKey(); 339 | if(!words.canRead()) { //可能是删除了 340 | return true; 341 | } 342 | if(words.lastModified() > flt.getValue()) { //更新了文件 343 | return true; 344 | } 345 | } 346 | //检查是否有新文件 347 | File[] words = listWordsFiles(); 348 | if(words != null) { 349 | for(File wordsFile : words) { 350 | if(!wordsLastTime.containsKey(wordsFile)) { //有新词典文件 351 | return true; 352 | } 353 | } 354 | } 355 | return false; 356 | } 357 | 358 | /** 359 | * 全新加载词库,没有成功加载会回滚。
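The class javadoc earlier points at wordsFileIsChange() and reload() as the hook for picking up dictionary edits at runtime. A minimal polling sketch; the 30-second interval and the scheduled executor are illustrative choices, not part of mmseg4j:

```java
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import com.chenlb.mmseg4j.Dictionary;

public class DicReloadWatcher {
    public static void main(String[] args) {
        final Dictionary dic = Dictionary.getInstance();
        Executors.newSingleThreadScheduledExecutor().scheduleWithFixedDelay(new Runnable() {
            public void run() {
                if (dic.wordsFileIsChange()) {    // any tracked dictionary file added, changed or removed?
                    boolean ok = dic.reload();    // reload() rolls back to the old dictionary on failure
                    System.out.println("mmseg4j dictionary reload " + (ok ? "succeeded" : "failed, rolled back"));
                }
            }
        }, 30, 30, TimeUnit.SECONDS);
    }
}
```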

360 | * 注意:重新加载时,务必有两倍的词库树结构的内存,默认词库是 50M/个 左右。否则抛出 OOM。 361 | * @return 是否成功加载 362 | */ 363 | public synchronized boolean reload() { 364 | Map oldWordsLastTime = new HashMap(wordsLastTime); 365 | Map oldDict = dict; 366 | Map oldUnit = unit; 367 | 368 | try { 369 | wordsLastTime.clear(); 370 | dict = loadDic(dicPath); 371 | unit = loadUnit(dicPath); 372 | lastLoadTime = System.currentTimeMillis(); 373 | } catch (IOException e) { 374 | //rollback 375 | wordsLastTime.putAll(oldWordsLastTime); 376 | dict = oldDict; 377 | unit = oldUnit; 378 | 379 | if(log.isLoggable(Level.WARNING)) { 380 | log.log(Level.WARNING, "reload dic error! dic="+dicPath+", and rollbacked.", e); 381 | } 382 | 383 | return false; 384 | } 385 | return true; 386 | } 387 | 388 | /** 389 | * word 能否在词库里找到 390 | * @author chenlb 2009-3-3 下午11:10:45 391 | */ 392 | public boolean match(String word) { 393 | if(word == null || word.length() < 2) { 394 | return false; 395 | } 396 | CharNode cn = dict.get(word.charAt(0)); 397 | return search(cn, word.toCharArray(), 0, word.length()-1) >= 0; 398 | } 399 | 400 | public CharNode head(char ch) { 401 | return dict.get(ch); 402 | } 403 | 404 | /** 405 | * sen[offset] 后 tailLen 长的词是否存在. 406 | * @see CharNode#indexOf(char[], int, int) 407 | * @author chenlb 2009-4-8 下午11:13:49 408 | */ 409 | public int search(CharNode node, char[] sen, int offset, int tailLen) { 410 | if(node != null) { 411 | return node.indexOf(sen, offset, tailLen); 412 | } 413 | return -1; 414 | } 415 | 416 | public int maxMatch(char[] sen, int offset) { 417 | CharNode node = dict.get(sen[offset]); 418 | return maxMatch(node, sen, offset); 419 | } 420 | 421 | public int maxMatch(CharNode node, char[] sen, int offset) { 422 | if(node != null) { 423 | return node.maxMatch(sen, offset+1); 424 | } 425 | return 0; 426 | } 427 | 428 | public ArrayList maxMatch(CharNode node, ArrayList tailLens, char[] sen, int offset) { 429 | tailLens.clear(); 430 | tailLens.add(0); 431 | if(node != null) { 432 | return node.maxMatch(tailLens, sen, offset+1); 433 | } 434 | return tailLens; 435 | } 436 | 437 | public boolean isUnit(Character ch) { 438 | return unit.containsKey(ch); 439 | } 440 | 441 | /** 442 | * 当 words.dic 是从 jar 里加载时, 可能 defalut 不存在 443 | */ 444 | public static File getDefalutPath() { 445 | if(defalutPath == null) { 446 | String defPath = System.getProperty("mmseg.dic.path"); 447 | log.info("look up in mmseg.dic.path="+defPath); 448 | if(defPath == null) { 449 | URL url = Dictionary.class.getClassLoader().getResource("data"); 450 | if(url != null) { 451 | defPath = url.getFile(); 452 | log.info("look up in classpath="+defPath); 453 | } else { 454 | defPath = System.getProperty("user.dir")+"/data"; 455 | log.info("look up in user.dir="+defPath); 456 | } 457 | 458 | } 459 | 460 | defalutPath = new File(defPath); 461 | if(!defalutPath.exists()) { 462 | log.warning("defalut dic path="+defalutPath+" not exist"); 463 | } 464 | } 465 | return defalutPath; 466 | } 467 | 468 | /** 469 | * 仅仅用来观察词库. 
470 | */ 471 | public Map getDict() { 472 | return dict; 473 | } 474 | 475 | /** 476 | * 注意:当 words.dic 是从 jar 里加载时,此时 File 可能是不存在的。 477 | */ 478 | public File getDicPath() { 479 | return dicPath; 480 | } 481 | 482 | /** 最后加载词库的时间 */ 483 | public long getLastLoadTime() { 484 | return lastLoadTime; 485 | } 486 | } 487 | -------------------------------------------------------------------------------- /src/main/java/com/chenlb/mmseg4j/MMSeg.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.PushbackReader; 6 | import java.io.Reader; 7 | import java.util.LinkedList; 8 | import java.util.Queue; 9 | 10 | /** 11 | * Reader 流的分词(有字母,数字等), 析出中文(其实是 CJK)成句子 {@link Sentence} 再对 mmseg 算法分词.
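The Reader-driven flow described in this javadoc is the library's main entry point: wrap the input in MMSeg together with a Seg implementation and pull Word tokens until null. A minimal end-to-end sketch; ComplexSeg is used here, but SimpleSeg or MaxWordSeg drop in the same way:

```java
import java.io.IOException;
import java.io.StringReader;

import com.chenlb.mmseg4j.ComplexSeg;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MMSeg;
import com.chenlb.mmseg4j.Seg;
import com.chenlb.mmseg4j.Word;

public class MMSegDemo {
    public static void main(String[] args) throws IOException {
        Dictionary dic = Dictionary.getInstance();                     // dictionary from the default path
        Seg seg = new ComplexSeg(dic);                                 // segmentation algorithm
        MMSeg mmSeg = new MMSeg(new StringReader("研究生命起源"), seg);  // MMSeg itself is not thread safe
        Word word;
        while ((word = mmSeg.next()) != null) {
            System.out.println(word.getString()
                    + " [" + word.getStartOffset() + "," + word.getEndOffset() + ") type=" + word.getType());
        }
    }
}
```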
12 | * 13 | * 非线程安全 14 | * @author chenlb 2009-9-20下午10:41:41 15 | */ 16 | public class MMSeg { 17 | 18 | private PushbackReader reader; 19 | private Seg seg; 20 | 21 | private StringBuilder bufSentence = new StringBuilder(256); 22 | private Sentence currentSentence; 23 | private Queue bufWord; // word 缓存, 因为有 chunk 分析三个以上. 24 | 25 | public MMSeg(Reader input, Seg seg) { 26 | this.seg = seg; 27 | 28 | reset(input); 29 | } 30 | 31 | private int readedIdx = 0; 32 | 33 | public void reset(Reader input) { 34 | this.reader = new PushbackReader(new BufferedReader(input), 20); 35 | currentSentence = null; 36 | bufWord = new LinkedList(); 37 | bufSentence.setLength(0); 38 | readedIdx = -1; 39 | } 40 | 41 | private int readNext() throws IOException { 42 | int d = reader.read(); 43 | if(d > -1) { 44 | readedIdx++; 45 | d = Character.toLowerCase(d); 46 | } 47 | return d; 48 | } 49 | 50 | private void pushBack(int data) throws IOException { 51 | readedIdx--; 52 | reader.unread(data); 53 | } 54 | 55 | 56 | public Word next() throws IOException { 57 | //先从缓存中取 58 | Word word = bufWord.poll();; 59 | if(word == null) { 60 | bufSentence.setLength(0); 61 | 62 | int data = -1; 63 | boolean read = true; 64 | while(read && (data=readNext()) != -1) { 65 | read = false; //默认一次可以读出同一类字符,就可以分词内容 66 | int type = Character.getType(data); 67 | String wordType = Word.TYPE_WORD; 68 | switch(type) { 69 | case Character.UPPERCASE_LETTER: 70 | case Character.LOWERCASE_LETTER: 71 | case Character.TITLECASE_LETTER: 72 | case Character.MODIFIER_LETTER: 73 | /* 74 | * 1. 0x410-0x44f -> А-я //俄文 75 | * 2. 0x391-0x3a9 -> Α-Ω //希腊大写 76 | * 3. 0x3b1-0x3c9 -> α-ω //希腊小写 77 | */ 78 | data = toAscii(data); 79 | NationLetter nl = getNation(data); 80 | if(nl == NationLetter.UNKNOW) { 81 | read = true; 82 | break; 83 | } 84 | wordType = Word.TYPE_LETTER; 85 | bufSentence.appendCodePoint(data); 86 | switch(nl) { 87 | case EN: 88 | //字母后面的数字,如: VH049PA 89 | ReadCharByAsciiOrDigit rcad = new ReadCharByAsciiOrDigit(); 90 | readChars(bufSentence, rcad); 91 | if(rcad.hasDigit()) { 92 | wordType = Word.TYPE_LETTER_OR_DIGIT; 93 | } 94 | //only english 95 | //readChars(bufSentence, new ReadCharByAscii()); 96 | break; 97 | case RA: 98 | readChars(bufSentence, new ReadCharByRussia()); 99 | break; 100 | case GE: 101 | readChars(bufSentence, new ReadCharByGreece()); 102 | break; 103 | } 104 | bufWord.add(createWord(bufSentence, wordType)); 105 | 106 | bufSentence.setLength(0); 107 | 108 | break; 109 | case Character.OTHER_LETTER: 110 | /* 111 | * 1. 0x3041-0x30f6 -> ぁ-ヶ //日文(平|片)假名 112 | * 2. 0x3105-0x3129 -> ㄅ-ㄩ //注意符号 113 | */ 114 | bufSentence.appendCodePoint(data); 115 | readChars(bufSentence, new ReadCharByType(Character.OTHER_LETTER)); 116 | 117 | currentSentence = createSentence(bufSentence); 118 | 119 | bufSentence.setLength(0); 120 | 121 | break; 122 | case Character.DECIMAL_DIGIT_NUMBER: 123 | bufSentence.appendCodePoint(toAscii(data)); 124 | readChars(bufSentence, new ReadCharDigit()); //读后面的数字, AsciiLetterOr 125 | wordType = Word.TYPE_DIGIT; 126 | int d = readNext(); 127 | if(d > -1) { 128 | if(seg.isUnit(d)) { //单位,如时间 129 | bufWord.add(createWord(bufSentence, startIdx(bufSentence)-1, Word.TYPE_DIGIT)); //先把数字添加(独立) 130 | 131 | bufSentence.setLength(0); 132 | 133 | bufSentence.appendCodePoint(d); 134 | wordType = Word.TYPE_WORD; //单位是 word 135 | } else { //后面可能是字母和数字 136 | pushBack(d); 137 | if(readChars(bufSentence, new ReadCharByAsciiOrDigit()) > 0) { //如果有字母或数字都会连在一起. 
138 | wordType = Word.TYPE_DIGIT_OR_LETTER; 139 | } 140 | } 141 | } 142 | 143 | bufWord.add(createWord(bufSentence, wordType)); 144 | 145 | 146 | bufSentence.setLength(0); //缓存的字符清除 147 | 148 | break; 149 | case Character.LETTER_NUMBER: 150 | // ⅠⅡⅢ 单分 151 | bufSentence.appendCodePoint(data); 152 | readChars(bufSentence, new ReadCharByType(Character.LETTER_NUMBER)); 153 | 154 | int startIdx = startIdx(bufSentence); 155 | for(int i=0; i=65296 && codePoint<=65305) //0-9 330 | || (codePoint>=65313 && codePoint<=65338) //A-Z 331 | || (codePoint>=65345 && codePoint<=65370) //a-z 332 | ) { 333 | codePoint -= 65248; 334 | } 335 | return codePoint; 336 | } 337 | 338 | private static boolean isAsciiLetter(int codePoint) { 339 | return (codePoint >= 'A' && codePoint <= 'Z') || (codePoint >= 'a' && codePoint <= 'z'); 340 | } 341 | 342 | private static boolean isRussiaLetter(int codePoint) { 343 | return (codePoint >= 'А' && codePoint <= 'я') || codePoint=='Ё' || codePoint=='ё'; 344 | } 345 | 346 | private static boolean isGreeceLetter(int codePoint) { 347 | return (codePoint >= 'Α' && codePoint <= 'Ω') || (codePoint >= 'α' && codePoint <= 'ω'); 348 | } 349 | /** 350 | * EN -> 英语 351 | * RA -> 俄语 352 | * GE -> 希腊 353 | * 354 | */ 355 | private static enum NationLetter {EN, RA, GE, UNKNOW}; 356 | 357 | private NationLetter getNation(int codePoint) { 358 | if(isAsciiLetter(codePoint)) { 359 | return NationLetter.EN; 360 | } 361 | if(isRussiaLetter(codePoint)) { 362 | return NationLetter.RA; 363 | } 364 | if(isGreeceLetter(codePoint)) { 365 | return NationLetter.GE; 366 | } 367 | return NationLetter.UNKNOW; 368 | } 369 | 370 | @SuppressWarnings("unused") 371 | private static boolean isCJK(int type) { 372 | return type == Character.OTHER_LETTER; 373 | } 374 | private static boolean isDigit(int type) { 375 | return type == Character.DECIMAL_DIGIT_NUMBER; 376 | } 377 | @SuppressWarnings("unused") 378 | private static boolean isLetter(int type) { 379 | return type <= Character.MODIFIER_LETTER && type >= Character.UPPERCASE_LETTER; 380 | } 381 | } 382 | -------------------------------------------------------------------------------- /src/main/java/com/chenlb/mmseg4j/MaxWordSeg.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * 最多分词. 在ComplexSeg基础上把长的词拆. 
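MaxWordSeg differs from ComplexSeg only in how the winning chunk is re-split into shorter words, so the easiest way to see the three modes side by side is to run the same text through each of them. A short sketch; the resulting tokens depend entirely on the words.dic that gets loaded, so none are hard-coded here:

```java
import java.io.IOException;
import java.io.StringReader;

import com.chenlb.mmseg4j.ComplexSeg;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MMSeg;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.Seg;
import com.chenlb.mmseg4j.SimpleSeg;
import com.chenlb.mmseg4j.Word;

public class CompareSegModes {
    public static void main(String[] args) throws IOException {
        Dictionary dic = Dictionary.getInstance();
        Seg[] segs = { new SimpleSeg(dic), new ComplexSeg(dic), new MaxWordSeg(dic) };
        for (Seg seg : segs) {
            MMSeg mmSeg = new MMSeg(new StringReader("中国人民银行发布了新的数据"), seg);
            StringBuilder sb = new StringBuilder(seg.getClass().getSimpleName()).append(": ");
            for (Word w; (w = mmSeg.next()) != null; ) {
                sb.append(w.getString()).append(" | ");
            }
            System.out.println(sb);
        }
    }
}
```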
8 | * 9 | * @author chenlb 2009-4-6 下午08:12:11 10 | */ 11 | public class MaxWordSeg extends ComplexSeg { 12 | 13 | public MaxWordSeg(Dictionary dic) { 14 | super(dic); 15 | } 16 | 17 | public Chunk seg(Sentence sen) { 18 | 19 | Chunk chunk = super.seg(sen); 20 | if(chunk != null) { 21 | List cks = new ArrayList(); 22 | for(int i=0; i -1) { 35 | cks.add(new Word(chs, senStartOffset, offset, 2)); 36 | end = offset+2; 37 | n++; 38 | } else if(offset >= end) { //有单字 39 | cks.add(new Word(chs, senStartOffset, offset, 1)); 40 | end = offset+1; 41 | 42 | } 43 | } 44 | if(end > -1 && end < wordEnd) { 45 | cks.add(new Word(chs, senStartOffset, offset, 1)); 46 | } 47 | } 48 | 49 | } 50 | chunk.words = cks.toArray(new Word[cks.size()]); 51 | chunk.count = cks.size(); 52 | } 53 | 54 | return chunk; 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/com/chenlb/mmseg4j/Seg.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * 分词抽象类. 8 | * 9 | * @author chenlb 2009-3-16 下午09:15:30 10 | */ 11 | public abstract class Seg { 12 | 13 | protected Dictionary dic; 14 | 15 | public Seg(Dictionary dic) { 16 | super(); 17 | this.dic = dic; 18 | } 19 | 20 | /** 21 | * 输出 chunks, 调试用. 22 | */ 23 | protected void printChunk(List chunks) { 24 | for(Chunk ck : chunks) { 25 | System.out.println(ck+" -> "+ck.toFactorString()); 26 | } 27 | } 28 | 29 | /** 30 | * @see Dictionary#isUnit(Character) 31 | */ 32 | protected boolean isUnit(int codePoint) { 33 | return dic.isUnit((char) codePoint); 34 | } 35 | 36 | /** 37 | * 查找chs[offset]后面的 tailLen个char是否为词. 38 | * @return 返回chs[offset]字符结点下的词尾索引号,没找到返回 -1. 39 | */ 40 | protected int search(char[] chs, int offset, int tailLen) { 41 | if(tailLen == 0) { 42 | return -1; 43 | } 44 | CharNode cn = dic.head(chs[offset]); 45 | 46 | return search(cn, chs, offset, tailLen); 47 | } 48 | 49 | /** 50 | * 没有数组的复制. 51 | * @author chenlb 2009-4-8 下午11:39:15 52 | */ 53 | protected int search(CharNode cn, char[] chs, int offset, int tailLen) { 54 | if(tailLen == 0 || cn == null) { 55 | return -1; 56 | } 57 | return dic.search(cn, chs, offset, tailLen); 58 | } 59 | 60 | /** 61 | * 最大匹配
62 | * 从 chs[offset] 开始匹配, 同时把 chs[offset] 的字符结点保存在 cns[cnIdx] 63 | * @return 最大匹配到的词尾长, > 0 找到 64 | */ 65 | protected int maxMatch(CharNode[] cns, int cnIdx, char[] chs, int offset) { 66 | CharNode cn = null; 67 | if(offset < chs.length) { 68 | cn = dic.head(chs[offset]); 69 | } 70 | cns[cnIdx] = cn; 71 | return dic.maxMatch(cn, chs, offset); 72 | } 73 | 74 | /** 75 | * 匹配,同时找出长度.
76 | * 从 chs[offset] 开始找所有匹配的词, 找到的放到 tailLens[tailLensIdx] 中.
77 | * 同时把 chs[offset] 的字符结点保存在 cns[cnIdx]. 78 | * @author chenlb 2009-4-12 上午10:37:58 79 | */ 80 | protected void maxMatch(CharNode[] cns, int cnIdx, char[] chs, int offset, ArrayList[] tailLens, int tailLensIdx) { 81 | CharNode cn = null; 82 | if(offset < chs.length) { 83 | cn = dic.head(chs[offset]); 84 | } 85 | cns[cnIdx] = cn; 86 | dic.maxMatch(cn, tailLens[tailLensIdx], chs, offset); 87 | } 88 | 89 | /** 90 | * 对句子 sen 进行分词. 91 | * @return 不返回 null. 92 | */ 93 | public abstract Chunk seg(Sentence sen); 94 | } 95 | -------------------------------------------------------------------------------- /src/main/java/com/chenlb/mmseg4j/Sentence.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j; 2 | 3 | /** 4 | * 句子, 在一大串字符中断出连续中文的文本. 5 | * 6 | * @author chenlb 2009-3-3 下午11:56:53 7 | */ 8 | public class Sentence { 9 | 10 | private char[] text; 11 | private int startOffset; 12 | 13 | private int offset; 14 | 15 | public Sentence() { 16 | text = new char[0]; 17 | } 18 | 19 | public Sentence(char[] text, int startOffset) { 20 | reinit(text, startOffset); 21 | } 22 | 23 | public void reinit(char[] text, int startOffset) { 24 | this.text = text; 25 | this.startOffset = startOffset; 26 | offset = 0; 27 | } 28 | 29 | public char[] getText() { 30 | return text; 31 | } 32 | 33 | /** 句子开始处理的偏移位置 */ 34 | public int getOffset() { 35 | return offset; 36 | } 37 | 38 | /** 句子开始处理的偏移位置 */ 39 | public void setOffset(int offset) { 40 | this.offset = offset; 41 | } 42 | 43 | public void addOffset(int inc) { 44 | offset += inc; 45 | } 46 | 47 | /** 句子处理完成 */ 48 | public boolean isFinish() { 49 | return offset >= text.length; 50 | } 51 | 52 | /** 句子在文本中的偏移位置 */ 53 | public int getStartOffset() { 54 | return startOffset; 55 | } 56 | 57 | /** 句子在文本中的偏移位置 */ 58 | public void setStartOffset(int startOffset) { 59 | this.startOffset = startOffset; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/com/chenlb/mmseg4j/SimpleSeg.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j; 2 | 3 | 4 | /** 5 | * 正向最大匹配的分词方式. 
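SimpleSeg, whose code follows, is plain forward maximum matching: at each offset ask the dictionary for the longest tail and emit head plus tail as one word. The same loop written directly against the public Dictionary.maxMatch(char[], int); the actual split depends on the loaded words.dic:

```java
import com.chenlb.mmseg4j.Dictionary;

public class ForwardMaxMatchSketch {
    public static void main(String[] args) {
        Dictionary dic = Dictionary.getInstance();
        char[] sen = "研究生命起源".toCharArray();
        int offset = 0;
        while (offset < sen.length) {
            int tailLen = dic.maxMatch(sen, offset);          // 0 means only the single character matches
            System.out.println(new String(sen, offset, tailLen + 1));
            offset += tailLen + 1;                            // jump past the emitted word
        }
    }
}
```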
6 | * 7 | * @author chenlb 2009-3-16 下午09:07:36 8 | */ 9 | public class SimpleSeg extends Seg{ 10 | 11 | public SimpleSeg(Dictionary dic) { 12 | super(dic); 13 | } 14 | 15 | public Chunk seg(Sentence sen) { 16 | Chunk chunk = new Chunk(); 17 | char[] chs = sen.getText(); 18 | for(int k=0; k<3&&!sen.isFinish(); k++) { 19 | int offset = sen.getOffset(); 20 | int maxLen = 0; 21 | 22 | //有了 key tree 的支持可以从头开始 max match 23 | maxLen = dic.maxMatch(chs, offset); 24 | 25 | chunk.words[k] = new Word(chs, sen.getStartOffset(), offset, maxLen+1); 26 | 27 | offset += maxLen + 1; 28 | sen.setOffset(offset); 29 | } 30 | 31 | return chunk; 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/com/chenlb/mmseg4j/Word.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j; 2 | 3 | /** 4 | * 类似 lucene 的 token 5 | * 6 | * @author chenlb 2009-8-15下午10:23:32 7 | */ 8 | public class Word { 9 | 10 | public static final String TYPE_WORD = "word"; 11 | public static final String TYPE_LETTER = "letter"; 12 | /** 字母开头的"字母或数字" */ 13 | public static final String TYPE_LETTER_OR_DIGIT = "letter_or_digit"; 14 | public static final String TYPE_DIGIT = "digit"; 15 | /** 数字开头的"字母或数字" */ 16 | public static final String TYPE_DIGIT_OR_LETTER = "digit_or_letter"; 17 | public static final String TYPE_LETTER_NUMBER = "letter_number"; 18 | public static final String TYPE_OTHER_NUMBER = "other_number"; 19 | 20 | private int degree = -1; 21 | private int startOffset; 22 | 23 | private char[] sen; 24 | private int offset; 25 | private int len; 26 | 27 | private String type = TYPE_WORD; //类似 lucene token 的 type 28 | 29 | /** 30 | * @param startOffset word 在整个文本中的偏移位置 31 | */ 32 | public Word(char[] word, int startOffset) { 33 | super(); 34 | this.sen = word; 35 | this.startOffset = startOffset; 36 | offset = 0; 37 | len = word.length; 38 | } 39 | 40 | /** 41 | * @param startOffset word 在整个文本中的偏移位置 42 | */ 43 | public Word(char[] word, int startOffset, String wordType) { 44 | this(word, startOffset); 45 | this.type = wordType; 46 | } 47 | 48 | /** 49 | * sen[offset] 开始的 len 个字符才是此 word 50 | * @param senStartOffset sen 在整个文本中的偏移位置 51 | * @param offset 词在 sen 的偏移位置 52 | * @param len 词长 53 | */ 54 | public Word(char[] sen, int senStartOffset, int offset, int len) { 55 | super(); 56 | this.sen = sen; 57 | this.startOffset = senStartOffset; 58 | this.offset = offset; 59 | this.len = len; 60 | } 61 | 62 | public String getString() { 63 | return new String(getSen(), getWordOffset(), getLength()); 64 | } 65 | 66 | public String toString() { 67 | return getString(); 68 | } 69 | /** 70 | * 词在 char[] sen 的偏移位置 71 | * @see #getSen() 72 | */ 73 | public int getWordOffset() { 74 | return offset; 75 | } 76 | 77 | public int getLength() { 78 | return len; 79 | } 80 | 81 | public char[] getSen() { 82 | return sen; 83 | } 84 | 85 | /**此 word 在整个文本中的偏移位置*/ 86 | public int getStartOffset() { 87 | return startOffset+offset; 88 | } 89 | public int getEndOffset() { 90 | return getStartOffset() + getLength(); 91 | } 92 | public int getDegree() { 93 | return degree; 94 | } 95 | public void setDegree(int degree) { 96 | this.degree = degree; 97 | } 98 | public String getType() { 99 | return type; 100 | } 101 | public void setType(String type) { 102 | this.type = type; 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/main/java/com/chenlb/mmseg4j/example/Complex.java: 
-------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j.example; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStreamReader; 6 | import java.io.Reader; 7 | import java.io.StringReader; 8 | 9 | import com.chenlb.mmseg4j.ComplexSeg; 10 | import com.chenlb.mmseg4j.Dictionary; 11 | import com.chenlb.mmseg4j.MMSeg; 12 | import com.chenlb.mmseg4j.Seg; 13 | import com.chenlb.mmseg4j.Word; 14 | 15 | public class Complex { 16 | 17 | protected Dictionary dic; 18 | 19 | public Complex() { 20 | dic = Dictionary.getInstance(); 21 | } 22 | 23 | protected Seg getSeg() { 24 | return new ComplexSeg(dic); 25 | } 26 | 27 | public String segWords(Reader input, String wordSpilt) throws IOException { 28 | StringBuilder sb = new StringBuilder(); 29 | Seg seg = getSeg(); //取得不同的分词具体算法 30 | MMSeg mmSeg = new MMSeg(input, seg); 31 | Word word = null; 32 | boolean first = true; 33 | while((word=mmSeg.next())!=null) { 34 | if(!first) { 35 | sb.append(wordSpilt); 36 | } 37 | String w = word.getString(); 38 | sb.append(w); 39 | first = false; 40 | 41 | } 42 | return sb.toString(); 43 | } 44 | 45 | public String segWords(String txt, String wordSpilt) throws IOException { 46 | return segWords(new StringReader(txt), wordSpilt); 47 | } 48 | 49 | private void printlnHelp() { 50 | System.out.println("\n\t-- 说明: 输入 QUIT 或 EXIT 退出"); 51 | System.out.print("\nmmseg4j-"+this.getClass().getSimpleName().toLowerCase()+">"); 52 | } 53 | 54 | protected void run(String[] args) throws IOException { 55 | String txt = "京华时报2008年1月23日报道 昨天,受一股来自中西伯利亚的强冷空气影响,本市出现大风降温天气,白天最高气温只有零下7摄氏度,同时伴有6到7级的偏北风。"; 56 | 57 | if(args.length > 0) { 58 | txt = args[0]; 59 | } 60 | 61 | System.out.println(segWords(txt, " | ")); 62 | printlnHelp(); 63 | String inputStr = null; 64 | BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); 65 | while((inputStr = br.readLine()) != null) { 66 | if(inputStr.equals("QUIT") || inputStr.equals("EXIT")) { 67 | System.exit(0); 68 | } else if("".equals(inputStr)) { 69 | printlnHelp(); 70 | } else { 71 | //System.out.println(inputStr); 72 | System.out.println(segWords(inputStr, " | ")); //分词 73 | System.out.print("\nmmseg4j-"+this.getClass().getSimpleName().toLowerCase()+">"); 74 | } 75 | } 76 | } 77 | 78 | public static void main(String[] args) throws IOException { 79 | 80 | new Complex().run(args); 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /src/main/java/com/chenlb/mmseg4j/example/MaxWord.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j.example; 2 | 3 | import java.io.IOException; 4 | 5 | import com.chenlb.mmseg4j.MaxWordSeg; 6 | import com.chenlb.mmseg4j.Seg; 7 | 8 | public class MaxWord extends Complex { 9 | 10 | protected Seg getSeg() { 11 | 12 | return new MaxWordSeg(dic); 13 | } 14 | 15 | public static void main(String[] args) throws IOException { 16 | new MaxWord().run(args); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/com/chenlb/mmseg4j/example/Simple.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j.example; 2 | 3 | import java.io.IOException; 4 | 5 | import com.chenlb.mmseg4j.Seg; 6 | import com.chenlb.mmseg4j.SimpleSeg; 7 | 8 | /** 9 | * 10 | * @author chenlb 2009-3-14 上午12:38:40 11 | */ 12 | public class Simple extends 
Complex { 13 | 14 | protected Seg getSeg() { 15 | 16 | return new SimpleSeg(dic); 17 | } 18 | 19 | public static void main(String[] args) throws IOException { 20 | new Simple().run(args); 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/com/chenlb/mmseg4j/rule/LargestAvgLenRule.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j.rule; 2 | 3 | import com.chenlb.mmseg4j.Chunk; 4 | 5 | /** 6 | * Largest Average Word Length.
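The classic case for this rule is a text tail such as 「国际化」: the chunk candidates 「国际化」 and 「国际 | 化」 have the same total length, so only the average word length separates them. A hand-built sketch using the public Chunk/Word API (no dictionary involved, so the numbers hold regardless of words.dic):

```java
import java.util.List;

import com.chenlb.mmseg4j.Chunk;
import com.chenlb.mmseg4j.Word;
import com.chenlb.mmseg4j.rule.LargestAvgLenRule;
import com.chenlb.mmseg4j.rule.Rule;

public class AvgLenRuleSketch {
    public static void main(String[] args) {
        char[] sen = "国际化".toCharArray();

        Chunk oneWord = new Chunk();          // 国际化     -> len 3, 1 word,  avgLen 3.0
        oneWord.setWords(new Word[]{ new Word(sen, 0, 0, 3) });

        Chunk twoWords = new Chunk();         // 国际 | 化  -> len 3, 2 words, avgLen 1.5
        twoWords.setWords(new Word[]{ new Word(sen, 0, 0, 2), new Word(sen, 0, 2, 1) });

        Rule rule = new LargestAvgLenRule();
        rule.reset();                         // reset() also creates the internal chunk list
        rule.addChunk(oneWord);
        rule.addChunk(twoWords);              // rejected: 1.5 is below the current largest average 3.0

        List<Chunk> remain = rule.remainChunks();
        System.out.println(remain);           // only the single-word chunk 国际化 remains
    }
}
```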
7 | * 8 | * 长度(Length)/词数 9 | * 10 | * @see http://technology.chtsai.org/mmseg/ 11 | * 12 | * @author chenlb 2009-3-16 上午11:28:21 13 | */ 14 | public class LargestAvgLenRule extends Rule { 15 | 16 | private double largestAvgLen; 17 | 18 | @Override 19 | public void addChunk(Chunk chunk) { 20 | if(chunk.getAvgLen() >= largestAvgLen) { 21 | largestAvgLen = chunk.getAvgLen(); 22 | super.addChunk(chunk); 23 | } 24 | } 25 | 26 | @Override 27 | protected boolean isRemove(Chunk chunk) { 28 | return chunk.getAvgLen() < largestAvgLen; 29 | } 30 | 31 | @Override 32 | public void reset() { 33 | largestAvgLen = 0; 34 | super.reset(); 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/com/chenlb/mmseg4j/rule/LargestSumDegreeFreedomRule.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j.rule; 2 | 3 | import com.chenlb.mmseg4j.Chunk; 4 | 5 | /** 6 | * Largest Sum of Degree of Morphemic Freedom of One-Character.
7 | * 8 | * 各单字词词频的对数之和*100 9 | * 10 | * @see http://technology.chtsai.org/mmseg/ 11 | * 12 | * @author chenlb 2009-3-16 上午11:28:30 13 | */ 14 | public class LargestSumDegreeFreedomRule extends Rule { 15 | 16 | private int largestSumDegree = Integer.MIN_VALUE; 17 | @Override 18 | public void addChunk(Chunk chunk) { 19 | if(chunk.getSumDegree() >= largestSumDegree) { 20 | largestSumDegree = chunk.getSumDegree(); 21 | super.addChunk(chunk); 22 | } 23 | } 24 | 25 | @Override 26 | public void reset() { 27 | largestSumDegree = Integer.MIN_VALUE; 28 | super.reset(); 29 | } 30 | 31 | @Override 32 | protected boolean isRemove(Chunk chunk) { 33 | 34 | return chunk.getSumDegree() < largestSumDegree; 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/com/chenlb/mmseg4j/rule/MaxMatchRule.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j.rule; 2 | 3 | import com.chenlb.mmseg4j.Chunk; 4 | 5 | /** 6 | * Maximum Matching.
7 | * 8 | * chuck中各个词的长度之和 9 | * 10 | * @see http://technology.chtsai.org/mmseg/ 11 | * 12 | * @author chenlb 2009-3-16 上午09:47:51 13 | */ 14 | public class MaxMatchRule extends Rule{ 15 | 16 | private int maxLen; 17 | 18 | public void addChunk(Chunk chunk) { 19 | if(chunk.getLen() >= maxLen) { 20 | maxLen = chunk.getLen(); 21 | super.addChunk(chunk); 22 | } 23 | } 24 | 25 | @Override 26 | protected boolean isRemove(Chunk chunk) { 27 | 28 | return chunk.getLen() < maxLen; 29 | } 30 | 31 | public void reset() { 32 | maxLen = 0; 33 | super.reset(); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/com/chenlb/mmseg4j/rule/Rule.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j.rule; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Iterator; 5 | import java.util.List; 6 | 7 | import com.chenlb.mmseg4j.Chunk; 8 | 9 | /** 10 | * 过虑规则的抽象类。 11 | * 12 | * @author chenlb 2009-3-16 上午11:35:06 13 | */ 14 | public abstract class Rule { 15 | 16 | protected List chunks; 17 | 18 | public void addChunks(List chunks) { 19 | for(Chunk chunk : chunks) { 20 | addChunk(chunk); 21 | } 22 | } 23 | 24 | /** 25 | * 添加 chunk 26 | * @throws NullPointerException, if chunk == null. 27 | * @author chenlb 2009-3-16 上午11:34:17 28 | */ 29 | public void addChunk(Chunk chunk) { 30 | chunks.add(chunk); 31 | } 32 | 33 | /** 34 | * @return 返回规则过虑后的结果。 35 | * @author chenlb 2009-3-16 上午11:33:10 36 | */ 37 | public List remainChunks() { 38 | for(Iterator it=chunks.iterator(); it.hasNext();) { 39 | Chunk chunk = it.next(); 40 | if(isRemove(chunk)) { 41 | it.remove(); 42 | } 43 | } 44 | return chunks; 45 | } 46 | 47 | /** 48 | * 判断 chunk 是否要删除。 49 | * @author chenlb 2009-3-16 上午11:33:30 50 | */ 51 | protected abstract boolean isRemove(Chunk chunk); 52 | 53 | public void reset() { 54 | chunks = new ArrayList(); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/com/chenlb/mmseg4j/rule/SmallestVarianceRule.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j.rule; 2 | 3 | import com.chenlb.mmseg4j.Chunk; 4 | 5 | /** 6 | * Smallest Variance of Word Lengths.
7 | * 8 | * 标准差的平方 9 | * 10 | * @see http://technology.chtsai.org/mmseg/ 11 | * 12 | * @author chenlb 2009-3-16 上午11:28:27 13 | */ 14 | public class SmallestVarianceRule extends Rule { 15 | 16 | private double smallestVariance = Double.MAX_VALUE; 17 | 18 | @Override 19 | public void addChunk(Chunk chunk) { 20 | if(chunk.getVariance() <= smallestVariance) { 21 | smallestVariance = chunk.getVariance(); 22 | super.addChunk(chunk); 23 | } 24 | } 25 | 26 | @Override 27 | public void reset() { 28 | smallestVariance = Double.MAX_VALUE; 29 | super.reset(); 30 | } 31 | 32 | @Override 33 | protected boolean isRemove(Chunk chunk) { 34 | 35 | return chunk.getVariance() > smallestVariance; 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/resources/data/units.dic: -------------------------------------------------------------------------------- 1 | #说明:单个字的单位, 与数字一起出现的(前面是数字). 如:2009年 2 | #注意:如果"分"加入到些文件中,"20分钟"就被分成"20|分|钟". 如果你想分成"20|分钟",那就把"分"注释掉. 3 | # 作者认为:像"分"和"分钟"都是单位的话,不把"分"加入. 4 | #时间 5 | 年 6 | 月 7 | 日 8 | 时 9 | #单位"分钟"已经是词,就不把"分"加入了 10 | #分 11 | 秒 12 | #币 13 | 元 14 | 角 15 | #长度 16 | 米 17 | 寸 18 | 尺 19 | 丈 20 | 里 21 | #容量 22 | 升 23 | 斗 24 | 石 25 | #重量 26 | 吨 27 | 克 28 | 斤 29 | 两 30 | 担 31 | #地积 32 | 亩 33 | 顷 -------------------------------------------------------------------------------- /src/test/java/com/chenlb/mmseg4j/ComplexSegTest.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j; 2 | 3 | import java.io.IOException; 4 | 5 | import org.junit.Assert; 6 | import org.junit.Before; 7 | import org.junit.Test; 8 | 9 | import com.chenlb.mmseg4j.example.Complex; 10 | 11 | public class ComplexSegTest { 12 | 13 | Complex segW; 14 | @Before 15 | public void setUp() throws Exception { 16 | segW = new Complex(); 17 | //ComplexSeg.setShowChunk(true); 18 | } 19 | 20 | /*public void testSeg() { 21 | String txt = ""; 22 | txt = "各人发表关于受一股来自中西伯利亚的强冷空气影响"; 23 | ComplexSeg.setShowChunk(true); 24 | ComplexSeg seg = new ComplexSeg(new Dictionary("dic")); //sogou 25 | Sentence sen = new Sentence(txt.toCharArray(), 0); 26 | System.out.println(); 27 | while(!sen.isFinish()) { 28 | Chunk chunk = seg.seg(sen); 29 | System.out.println(chunk+" -> "+chunk.getStartOffset()); 30 | } 31 | }*/ 32 | 33 | @Test 34 | public void testEffect() throws IOException { 35 | String words = segW.segWords("研究生命起源", "|"); 36 | Assert.assertEquals("研究|生命|起源", words); 37 | } 38 | 39 | @Test 40 | public void testEffect1() throws IOException { 41 | String words = segW.segWords("为首要考虑", "|"); 42 | Assert.assertEquals("为首|要|考虑", words); 43 | } 44 | 45 | @Test 46 | public void testEffect2() throws IOException { 47 | String words = segW.segWords("眼看就要来了", "|"); 48 | Assert.assertEquals("眼看|就要|来|了", words); 49 | } 50 | 51 | @Test 52 | public void testEffect3() throws IOException { 53 | String words = segW.segWords("中西伯利亚", "|"); 54 | Assert.assertEquals("中|西伯利亚", words); 55 | } 56 | 57 | @Test 58 | public void testEffect4() throws IOException { 59 | String words = segW.segWords("国际化", "|"); 60 | Assert.assertEquals("国际化", words); 61 | } 62 | 63 | @Test 64 | public void testEffect5() throws IOException { 65 | String words = segW.segWords("化装和服装", "|"); 66 | Assert.assertEquals("化装|和|服装", words); 67 | } 68 | 69 | @Test 70 | public void testEffect6() throws IOException { 71 | String words = segW.segWords("中国人民银行", "|"); 72 | Assert.assertEquals("中国人民银行", words); 73 | } 74 | 75 | /** 76 | * 自扩展的词库文件 77 | */ 78 | @Test 79 
| public void testEffect7() throws IOException { 80 | String words = segW.segWords("白云山", "|"); 81 | Assert.assertEquals("白云山", words); 82 | } 83 | 84 | @Test 85 | public void testEffect10() throws IOException { 86 | String words = segW.segWords("清华大学", "|"); 87 | Assert.assertEquals("清华大学", words); 88 | } 89 | 90 | @Test 91 | public void testEffect11() throws IOException { 92 | String words = segW.segWords("华南理工大学", "|"); 93 | Assert.assertEquals("华南理工大学", words); 94 | } 95 | 96 | @Test 97 | public void testEffect12() throws IOException { 98 | String words = segW.segWords("广东工业大学", "|"); 99 | Assert.assertEquals("广东工业大学", words); 100 | } 101 | 102 | @Test 103 | public void testUnitEffect() throws IOException { 104 | String words = segW.segWords("2008年底发了资金吗", "|"); 105 | Assert.assertEquals("2008|年|底|发|了|资金|吗", words); 106 | } 107 | 108 | @Test 109 | public void testUnitEffect1() throws IOException { 110 | String words = segW.segWords("20分钟能完成", "|"); 111 | Assert.assertEquals("20|分钟|能|完成", words); 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/test/java/com/chenlb/mmseg4j/DictionaryTest.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | 6 | import org.junit.Assert; 7 | import org.junit.Test; 8 | 9 | public class DictionaryTest { 10 | 11 | private void printMemory() { 12 | Runtime rt = Runtime.getRuntime(); 13 | long total = rt.totalMemory(); 14 | long free = rt.freeMemory(); 15 | long max = rt.maxMemory(); 16 | System.out.println(String.format("total=%dk, free=%dk, max=%dk, use=%dk", total/1024, free/1024, max/1024, (total-free)/1024)); 17 | } 18 | 19 | @Test 20 | public void testloadDicMemoryUse() { 21 | printMemory(); 22 | Dictionary.getInstance(); 23 | printMemory(); 24 | } 25 | 26 | @Test 27 | public void testloadDic() { 28 | Dictionary dic = Dictionary.getInstance(); 29 | Dictionary dic2 = Dictionary.getInstance(); 30 | Assert.assertTrue(dic == dic2); 31 | 32 | dic.destroy(); 33 | //reload 34 | dic2 = Dictionary.getInstance(); 35 | Assert.assertTrue(dic != dic2); 36 | dic2.destroy(); 37 | } 38 | 39 | @Test 40 | public void testloadDicByPath() { 41 | Dictionary dic = Dictionary.getInstance("src"); 42 | Dictionary dic2 = Dictionary.getInstance("./src"); 43 | Assert.assertTrue(dic == dic2); 44 | 45 | Assert.assertFalse(dic.match("自定义词")); 46 | 47 | dic.destroy(); 48 | } 49 | 50 | @Test 51 | public void testloadMultiDic() { 52 | Dictionary dic = Dictionary.getInstance(); 53 | 54 | Assert.assertTrue(dic.match("自定义词")); 55 | } 56 | 57 | @Test 58 | public void testMatch() { 59 | Dictionary dic = Dictionary.getInstance(); 60 | 61 | Assert.assertTrue(dic.match("词典")); 62 | 63 | Assert.assertFalse(dic.match("人个")); 64 | Assert.assertFalse(dic.match("三个人")); 65 | 66 | Assert.assertFalse(dic.match("")); 67 | Assert.assertFalse(dic.match("人")); 68 | 69 | } 70 | 71 | @Test 72 | public void testFileHashCode() throws IOException { 73 | File f = new File("data"); 74 | File f1 = new File("./data"); 75 | Assert.assertFalse(f.equals(f1)); 76 | 77 | f1 = f.getAbsoluteFile(); 78 | Assert.assertFalse(f.equals(f1)); 79 | 80 | Assert.assertTrue(f.getCanonicalFile().equals(f1.getCanonicalFile())); 81 | 82 | f1 = new File("data"); 83 | Assert.assertTrue(f.equals(f1)); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/test/java/com/chenlb/mmseg4j/KeyTreeTest.java: 
-------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j; 2 | 3 | import com.chenlb.mmseg4j.CharNode.KeyTree; 4 | 5 | import junit.framework.TestCase; 6 | 7 | public class KeyTreeTest extends TestCase { 8 | 9 | protected void setUp() throws Exception { 10 | super.setUp(); 11 | } 12 | 13 | public void testMatch() { 14 | char[] w = "为什么".toCharArray(); 15 | KeyTree kt = new KeyTree(); 16 | kt.add(w); 17 | assertTrue(kt.match(w, 0, w.length)); 18 | assertFalse(kt.match(w, 0, 2)); 19 | assertFalse(kt.match("怎么样".toCharArray(), 0, 3)); 20 | 21 | w = "国人民银行".toCharArray(); 22 | kt.add(w); 23 | int tailLen = kt.maxMatch("中国人民银行".toCharArray(), 1); 24 | assertEquals(tailLen, w.length); 25 | } 26 | 27 | public void testMatch2() { 28 | Dictionary dic = Dictionary.getInstance(); 29 | int tailLen = dic.maxMatch("中国人民银行".toCharArray(), 0); 30 | assertEquals(tailLen, 5); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/test/java/com/chenlb/mmseg4j/MMSegTest.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | 6 | import junit.framework.TestCase; 7 | 8 | public class MMSegTest extends TestCase { 9 | 10 | protected void setUp() throws Exception { 11 | super.setUp(); 12 | } 13 | 14 | public void testNext() throws IOException { 15 | String txt = ""; 16 | txt = "京华时报1月23日报道 昨天,受一股来自中西伯利亚的强冷空气影响,本市出现大风降温天气,白天最高气温只有零下7摄氏度,同时伴有6到7级的偏北风。"; 17 | txt = "研究生命起源"; 18 | txt = "手机电子书 abc http://www.sjshu.com"; 19 | txt = "Apple 苹果 MacBook Pro MB991CH/A 13.3m寸宽屏笔记本(Ⅱ,⑩)"; 20 | //txt = "2009年ゥスぁま是中 ABcc国абвгαβγδ首次,我的ⅠⅡⅢ在chenёlbēū全国ㄦ范围ㄚㄞㄢ内①ē②㈠㈩⒈⒑发行地方政府债券,"; 21 | Dictionary dic = Dictionary.getInstance(); 22 | Seg seg = null; 23 | //seg = new SimpleSeg(dic); 24 | seg = new ComplexSeg(dic); 25 | MMSeg mmSeg = new MMSeg(new StringReader(txt), seg); 26 | Word word = null; 27 | System.out.println(); 28 | while((word=mmSeg.next())!=null) { 29 | 30 | System.out.print(word.getString()+" -> "+word.getStartOffset()); 31 | //offset += word.length; 32 | System.out.println(", "+word.getEndOffset()+", "+word.getType()); 33 | 34 | 35 | } 36 | 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/test/java/com/chenlb/mmseg4j/MaxWordSegTest.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j; 2 | 3 | import java.io.IOException; 4 | 5 | import org.junit.Assert; 6 | import org.junit.Before; 7 | import org.junit.Ignore; 8 | import org.junit.Test; 9 | 10 | import com.chenlb.mmseg4j.example.MaxWord; 11 | 12 | public class MaxWordSegTest { 13 | 14 | MaxWord segW; 15 | @Before 16 | public void setUp() throws Exception { 17 | segW = new MaxWord(); 18 | } 19 | 20 | @Test 21 | public void testEffect() throws IOException { 22 | String words = segW.segWords("共和国", "|"); 23 | Assert.assertEquals("共和|国", words); 24 | } 25 | 26 | @Test 27 | public void testEffect1() throws IOException { 28 | String words = segW.segWords("中国人民银行", "|"); 29 | Assert.assertEquals("中国|国人|人民|银行", words); 30 | } 31 | 32 | @Test 33 | public void testEffect2() throws IOException { 34 | String words = segW.segWords("西伯利亚", "|"); 35 | Assert.assertEquals("西|伯|利|亚", words); 36 | } 37 | 38 | @Test 39 | public void testEffect3() throws IOException { 40 | String words = segW.segWords("中华人民共和国", "|"); 41 | 
Assert.assertEquals("中华|华人|人民|共和|国", words); 42 | } 43 | 44 | @Test 45 | public void testEffect4() throws IOException { 46 | String words = segW.segWords("羽毛球拍", "|"); 47 | Assert.assertEquals("羽毛|球拍", words); 48 | } 49 | 50 | @Test 51 | public void testEffect5() throws IOException { 52 | String words = segW.segWords("化装和服装", "|"); 53 | Assert.assertEquals("化装|和|服装", words); 54 | } 55 | 56 | @Test 57 | public void testEffect6() throws IOException { 58 | String words = segW.segWords("为什么", "|"); 59 | Assert.assertEquals("为|什么", words); 60 | } 61 | 62 | @Test 63 | @Ignore 64 | public void testEffect7() throws IOException { 65 | String words = segW.segWords("很好听", "|"); 66 | // Complex 分出 '很|好听' 67 | // 目前 max-word 是在 complex 之后再分词的。 68 | Assert.assertEquals("很好|好听", words); 69 | } 70 | 71 | @Test 72 | public void testEffect8() throws IOException { 73 | String words = segW.segWords("强冷空气", "|"); 74 | Assert.assertEquals("强|冷|空气", words); 75 | } 76 | 77 | /** 78 | * 自扩展的词库文件 79 | */ 80 | @Test 81 | public void testEffect9() throws IOException { 82 | String words = segW.segWords("白云山", "|"); 83 | Assert.assertEquals("白云|云山", words); 84 | } 85 | 86 | @Test 87 | public void testEffect10() throws IOException { 88 | String words = segW.segWords("清华大学", "|"); 89 | Assert.assertEquals("清华|大学", words); 90 | } 91 | 92 | @Test 93 | public void testEffect11() throws IOException { 94 | String words = segW.segWords("华南理工大学", "|"); 95 | // '工大' 在词库中没有 96 | Assert.assertEquals("华南|理工|大学", words); 97 | } 98 | 99 | @Test 100 | public void testEffect12() throws IOException { 101 | String words = segW.segWords("广东工业大学", "|"); 102 | // '业大' 在词库中有 103 | Assert.assertEquals("广东|工业|业大|大学", words); 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/test/java/com/chenlb/mmseg4j/MyTest.java: -------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.File; 5 | import java.io.FileOutputStream; 6 | import java.io.IOException; 7 | import java.io.OutputStreamWriter; 8 | import java.net.URISyntaxException; 9 | import java.net.URL; 10 | import java.util.ArrayList; 11 | import java.util.Arrays; 12 | import java.util.Collections; 13 | import java.util.Comparator; 14 | import java.util.List; 15 | import java.util.Map; 16 | import java.util.Map.Entry; 17 | 18 | import org.junit.Ignore; 19 | import org.junit.Test; 20 | 21 | public class MyTest { 22 | 23 | public void test100Log() { 24 | int freq = 1034142; 25 | print100Log(freq); 26 | 27 | freq = 847332; 28 | print100Log(freq); 29 | } 30 | 31 | private void print100Log(int freq) { 32 | int my100Log = (int) (Math.log(freq) * 100); 33 | System.out.println(freq+" -> "+my100Log+" | "+(Math.log(freq) * 100)); 34 | } 35 | 36 | public void testDicPath() throws URISyntaxException { 37 | URL url = Dictionary.class.getResource("/"); 38 | String path = ""; 39 | path = url.toURI().getRawPath(); 40 | System.out.println(path); 41 | File f = new File(path+"data"); 42 | System.out.println(f+" -> "+f.exists()); 43 | 44 | 45 | path = url.toExternalForm(); 46 | System.out.println(path); 47 | 48 | path = url.getPath(); 49 | System.out.println(path); 50 | 51 | path = System.getProperty("user.dir"); 52 | System.out.println(path); 53 | } 54 | 55 | public void testZhNumCodeP() { 56 | String num = "0123456789"; 57 | String n = "0123456789"; 58 | for(int i=0; i "+cp+", "+(char)ncp+" -> "+ncp); 62 | } 63 | } 64 | 65 | public void 
testCodePAndType() { 66 | String str = "0909☆§┍┄○一$¥≈∑①⑩㈠㈩⒈⒑⒒⒛⑴⑽⑾⒇!中文【ゥスぁまēūㄇㄎноνπⅠⅡⅢ"; 67 | 68 | str = "ぁぃぅぇぉかきくけこんさしすせそたちつってとゐなにぬねのはひふへほゑまみむめもゃゅょゎを"; 69 | str += "あいうえおがぎぐげござじずぜぞだぢづでどぱぴぷぺぽばびぶべぼらりるれろやゆよわ"; 70 | 71 | str += "ァィゥヴェォカヵキクケヶコサシスセソタチツッテトヰンナニヌネノハヒフヘホヱマミムメモャュョヮヲ"; 72 | str += "アイウエオガギグゲゴザジズゼゾダヂヅデドパピプペポバビブベボラリルレロヤユヨワ"; 73 | 74 | str = "āáǎàōóǒòêēéěèīíǐìūúǔùǖǘǚǜü"; 75 | 76 | /*str = "ㄅㄉˇˋㄓˊ˙ㄚㄞㄢㄦㄆㄊㄍㄐㄔㄗㄧㄛㄟㄣㄇㄋㄎㄑㄕㄘㄨㄜㄠㄤㄈㄌㄏㄒㄖㄙㄩㄝㄡㄥ"; 77 | 78 | str = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"; 79 | str += "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ";*/ 80 | 81 | /*str = "αβγδεζηθικλμνξοπρστυφχψω"; 82 | str += "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ";*/ 83 | 84 | int[] cps = new int[str.length()]; 85 | for(int i=0; i ").append(cp); 94 | sb.append(", type=").append(Character.getType(cp)); 95 | sb.append(", hex=").append(Integer.toHexString(cp)); 96 | System.out.println(sb); 97 | } 98 | } 99 | 100 | public void testCodePAndType2() { 101 | 102 | int start = 12435+1; 103 | int end = 12449-1; 104 | 105 | start = 0xff21; 106 | end = 0xff5a; 107 | 108 | StringBuilder sb = new StringBuilder(); 109 | for(int i=start; i<=end; i++) { 110 | sb.setLength(0); 111 | int cp = i;//str.codePointAt(i); 112 | sb.appendCodePoint(cp).append(" -> ").append(cp); 113 | sb.append(", type=").append(Character.getType(cp)); 114 | sb.append(", hex=").append(Integer.toHexString(cp)); 115 | System.out.println(sb); 116 | } 117 | } 118 | 119 | @Test 120 | @Ignore 121 | public void testShowUnicode() { 122 | int c = 0x2F81A; 123 | int mc = Character.toLowerCase(c); 124 | StringBuilder sb = new StringBuilder(); 125 | sb.appendCodePoint(c).append(" --to low--> ").appendCodePoint(mc); 126 | System.out.println("c="+c+",mc="+mc+"\n"+sb); 127 | } 128 | 129 | private static long now() { 130 | return System.currentTimeMillis(); 131 | } 132 | 133 | @Test 134 | @Ignore 135 | public void testSeeSogouDic() throws IOException { 136 | Dictionary dic = Dictionary.getInstance("sogou"); 137 | Map dict = dic.getDict(); 138 | long start = now(); 139 | List> es = new ArrayList>(dict.size()); 140 | es.addAll(dict.entrySet()); 141 | System.out.println("add use "+(now()-start)+"ms"); 142 | start = now(); 143 | Collections.sort(es, new Comparator>() { 144 | 145 | public int compare(Entry a, 146 | Entry b) { 147 | int r = -new Integer(a.getValue().getMaxLen()).compareTo(b.getValue().getMaxLen()); 148 | if(r == 0) { 149 | r = -new Integer(a.getValue().wordNum()).compareTo(b.getValue().wordNum()); 150 | } 151 | if(r == 0) { 152 | r = -new Integer(a.getValue().getFreq()).compareTo(b.getValue().getFreq()); 153 | } 154 | return r; 155 | } 156 | 157 | }); 158 | System.out.println("sort use "+(now()-start)+"ms"); 159 | start = now(); 160 | BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File("sogou/word-stat.txt")), "UTF-8")); 161 | writer.append("char").append('\t') 162 | .append("freq").append('\t') 163 | .append("maxLen").append('\t') 164 | .append("wordNum").append('\t') 165 | .append("lens").append("\r\n"); 166 | for(Map.Entry e : es) { 167 | CharNode cn = e.getValue(); 168 | writer.append(e.getKey()).append('\t') 169 | .append(cn.getFreq()+"").append('\t') 170 | .append(cn.getMaxLen()+"").append('\t') 171 | .append(cn.wordNum()+"").append('\t') 172 | .append("\r\n"); 173 | } 174 | writer.close(); 175 | System.out.println("writer use "+(now()-start)+"ms"); 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /src/test/java/com/chenlb/mmseg4j/SimpleSegTest.java: 
-------------------------------------------------------------------------------- 1 | package com.chenlb.mmseg4j; 2 | 3 | import java.io.IOException; 4 | 5 | import org.junit.Assert; 6 | import org.junit.Before; 7 | import org.junit.Test; 8 | 9 | import com.chenlb.mmseg4j.example.Simple; 10 | 11 | public class SimpleSegTest { 12 | 13 | Simple segW; 14 | 15 | @Before 16 | public void setUp() throws Exception { 17 | segW = new Simple(); 18 | } 19 | 20 | @Test 21 | public void testEffect() throws IOException { 22 | String words = segW.segWords("研究生命起源", "|"); 23 | Assert.assertEquals("研究生|命|起源", words); 24 | } 25 | 26 | @Test 27 | public void testEffect1() throws IOException { 28 | String words = segW.segWords("为首要考虑", "|"); 29 | Assert.assertEquals("为首|要|考虑", words); 30 | } 31 | 32 | @Test 33 | public void testEffect2() throws IOException { 34 | String words = segW.segWords("眼看就要来了", "|"); 35 | Assert.assertEquals("眼看|就要|来|了", words); 36 | } 37 | 38 | @Test 39 | public void testEffect3() throws IOException { 40 | String words = segW.segWords("中西伯利亚", "|"); 41 | Assert.assertEquals("中西|伯|利|亚", words); 42 | } 43 | 44 | @Test 45 | public void testEffect4() throws IOException { 46 | String words = segW.segWords("国际化", "|"); 47 | Assert.assertEquals("国际化", words); 48 | } 49 | 50 | @Test 51 | public void testEffect5() throws IOException { 52 | String words = segW.segWords("化装和服装", "|"); 53 | Assert.assertEquals("化装|和服|装", words); 54 | } 55 | 56 | @Test 57 | public void testEffect6() throws IOException { 58 | String words = segW.segWords("中国人民银行", "|"); 59 | Assert.assertEquals("中国人民银行", words); 60 | } 61 | 62 | /** 63 | * 自扩展的词库文件 64 | */ 65 | @Test 66 | public void testEffect7() throws IOException { 67 | String words = segW.segWords("白云山", "|"); 68 | Assert.assertEquals("白云山", words); 69 | } 70 | 71 | @Test 72 | public void testUnitEffect() throws IOException { 73 | String words = segW.segWords("2008年中有很多事情", "|"); 74 | Assert.assertEquals("2008|年|中有|很多|事情", words); 75 | } 76 | 77 | @Test 78 | public void testUnitEffect1() throws IOException { 79 | String words = segW.segWords("20分钟能完成", "|"); 80 | Assert.assertEquals("20|分钟|能|完成", words); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/test/resources/data/words-test-my.dic: -------------------------------------------------------------------------------- 1 | # 2 | 自定义词 --------------------------------------------------------------------------------
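
Usage note (editorial addition, not part of the repository): the example and test classes listed above already show the public API end to end; the sketch below simply restates the core loop from Complex.segWords and MMSegTest for quick reference. It uses only types that appear in this listing (Dictionary, ComplexSeg, MMSeg, Word) and assumes the bundled data/*.dic files are on the classpath; the QuickStart class name is illustrative and does not exist in the source tree.

    import java.io.IOException;
    import java.io.StringReader;

    import com.chenlb.mmseg4j.ComplexSeg;
    import com.chenlb.mmseg4j.Dictionary;
    import com.chenlb.mmseg4j.MMSeg;
    import com.chenlb.mmseg4j.Word;

    // Illustrative class; not part of the mmseg4j sources above.
    public class QuickStart {
        public static void main(String[] args) throws IOException {
            // Load the singleton dictionary (chars.dic / units.dic / words.dic from the classpath).
            Dictionary dic = Dictionary.getInstance();
            // ComplexSeg applies the mmseg filter rules; SimpleSeg and MaxWordSeg are drop-in alternatives.
            MMSeg mmSeg = new MMSeg(new StringReader("研究生命起源"), new ComplexSeg(dic));
            StringBuilder sb = new StringBuilder();
            Word word;
            while ((word = mmSeg.next()) != null) {   // iterate the segmented words
                if (sb.length() > 0) {
                    sb.append(" | ");
                }
                sb.append(word.getString());
            }
            // Expected segmentation per ComplexSegTest.testEffect: 研究 | 生命 | 起源
            System.out.println(sb);
        }
    }

For most callers the higher-level wrapper shown in the example package is enough: new Complex().segWords("研究生命起源", " | ") produces the same result, as the tests in ComplexSegTest demonstrate.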