├── .gitignore
├── LICENSE
├── README.md
├── pom.xml
└── src
    ├── main
    │   ├── java
    │   │   └── com
    │   │       └── chenlb
    │   │           └── mmseg4j
    │   │               ├── CharNode.java
    │   │               ├── Chunk.java
    │   │               ├── ComplexSeg.java
    │   │               ├── Dictionary.java
    │   │               ├── MMSeg.java
    │   │               ├── MaxWordSeg.java
    │   │               ├── Seg.java
    │   │               ├── Sentence.java
    │   │               ├── SimpleSeg.java
    │   │               ├── Word.java
    │   │               ├── example
    │   │               │   ├── Complex.java
    │   │               │   ├── MaxWord.java
    │   │               │   └── Simple.java
    │   │               └── rule
    │   │                   ├── LargestAvgLenRule.java
    │   │                   ├── LargestSumDegreeFreedomRule.java
    │   │                   ├── MaxMatchRule.java
    │   │                   ├── Rule.java
    │   │                   └── SmallestVarianceRule.java
    │   └── resources
    │       └── data
    │           ├── chars.dic
    │           ├── units.dic
    │           └── words.dic
    └── test
        ├── java
        │   └── com
        │       └── chenlb
        │           └── mmseg4j
        │               ├── ComplexSegTest.java
        │               ├── DictionaryTest.java
        │               ├── KeyTreeTest.java
        │               ├── MMSegTest.java
        │               ├── MaxWordSegTest.java
        │               ├── MyTest.java
        │               └── SimpleSegTest.java
        └── resources
            └── data
                └── words-test-my.dic
/.gitignore:
--------------------------------------------------------------------------------
1 | # Eclipse
2 | .classpath
3 | .project
4 | .settings/
5 |
6 | # Intellij
7 | .idea/
8 | *.iml
9 | *.iws
10 |
11 | # Mac
12 | .DS_Store
13 |
14 | # Maven
15 | log/
16 | target/
17 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | mmseg4j core 是使用 Chih-Hao Tsai 的 MMSeg 算法(http://technology.chtsai.org/mmseg/ )实现的中文分词器。
2 |
3 | MMSeg 算法有两种分词方法:Simple 和 Complex,都是基于正向最大匹配。Complex 加了四个规则过滤。官方说:词语的正确识别率达到了 98.41%。mmseg4j 已经实现了这两种分词算法。
4 |
5 | ```xml
6 | <dependency>
7 |     <groupId>com.chenlb.mmseg4j</groupId>
8 |     <artifactId>mmseg4j-core</artifactId>
9 |     <version>1.10.0</version>
10 | </dependency>
11 | ```
12 |
13 | ## example
14 |
15 | ```
16 | git clone https://github.com/chenlb/mmseg4j-core mmseg4j-core
17 | cd mmseg4j-core
18 | mvn compile
19 |
20 | #运行
21 | #Complex 分词模式
22 | java -cp .:target/classes com.chenlb.mmseg4j.example.Complex
23 |
24 | #Simple 分词模式
25 | java -cp .:target/classes com.chenlb.mmseg4j.example.Simple
26 |
27 | #MaxWord 分词模式
28 | java -cp .:target/classes com.chenlb.mmseg4j.example.MaxWord
29 |
30 | #或编译打包
31 | mvn package
32 |
33 | java -cp .:target/mmseg4j-core-1.10.1-SNAPSHOT.jar com.chenlb.mmseg4j.example.Complex
34 | ```
35 |
36 | ## 其它
37 |
38 | * [早期的介绍](https://github.com/chenlb/mmseg4j-from-googlecode)
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 |
4 | org.sonatype.oss
5 | oss-parent
6 | 7
7 |
8 | com.chenlb.mmseg4j
9 | mmseg4j-core
10 | mmseg4j-core
11 | https://github.com/chenlb/mmseg4j-core
12 | MMSEG core for Java Chinese analyzer
13 |
14 |
15 | The Apache Software License, Version 2.0
16 | http://www.apache.org/licenses/LICENSE-2.0.txt
17 | repo
18 |
19 |
20 |
21 | http://blog.chenlb.com
22 | chenlb open source
23 |
24 |
25 | git@github.com:chenlb/mmseg4j-core.git
26 | scm:git:git@github.com:chenlb/mmseg4j-core.git
27 | scm:git:git@github.com:chenlb/mmseg4j-core.git
28 |
29 |
30 |
31 | chenlb
32 | LinBin Chen
33 | chenlb2008@gmail.com
34 |
35 |
36 |
37 | https://github.com/chenlb/mmseg4j-core/issues
38 | github.com
39 |
40 |
41 |
42 | junit
43 | junit
44 | 4.8
45 | test
46 |
47 |
48 |
49 |
50 |
51 |
52 | org.apache.maven.plugins
53 | maven-compiler-plugin
54 | 2.3.1
55 |
56 | 1.6
57 | 1.6
58 | UTF-8
59 |
60 |
61 |
62 | org.apache.maven.plugins
63 | maven-gpg-plugin
64 |
65 |
66 | sign-artifacts
67 | verify
68 |
69 | sign
70 |
71 |
72 |
73 |
74 |
75 |
76 | 1.10.1-SNAPSHOT
77 |
--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/CharNode.java:
--------------------------------------------------------------------------------
1 | package com.chenlb.mmseg4j;
2 |
3 | import java.util.ArrayList;
4 | import java.util.HashMap;
5 | import java.util.Map;
6 |
7 | /**
8 | * Dictionary node for one leading character: every word is recorded under the node of its first character.
9 | * The characters after the first one (the "word tail") are stored in a KeyTree held by this node.
10 | * @author chenlb 2009-2-20 11:30:14 PM
11 | */
12 | public class CharNode {
13 |
14 | private int freq = -1; //Degree of Morphemic Freedom of One-Character; only needed for single characters, stays -1 until setFreq is called
15 | private int maxLen = 0; //length of the longest word tail added so far
16 |
17 | private KeyTree ktWordTails = new KeyTree(); //tails of all words registered under this node
18 | private int wordNum = 0; //number of addWordTail calls, i.e. how many tails were registered
19 |
20 | public CharNode() {
21 |
22 | }
23 | /** Registers a word tail under this node, counts it and tracks the longest tail length seen. */
24 | public void addWordTail(char[] wordTail) {
25 | ktWordTails.add(wordTail);
26 | wordNum++;
27 | if(wordTail.length > maxLen) {
28 | maxLen = wordTail.length;
29 | }
30 | }
31 | public int getFreq() {
32 | return freq;
33 | }
34 |
35 | public void setFreq(int freq) {
36 | this.freq = freq;
37 | }
38 | /** Returns the number of word tails added through addWordTail. */
39 | public int wordNum() {
40 | return wordNum;
41 | }
42 |
43 | /** Looks up a word tail in this node's key tree, starting one character after the word's first character.
44 | * @param sen the sentence, a run of text.
45 | * @param offset position of the word in the sentence; the lookup starts at offset+1.
46 | * @param tailLen length of the word tail (presumably the word length without its first character - confirm).
47 | * @return 1 when ktWordTails.match reports a match, otherwise -1.
48 | * @author chenlb 2009-4-8 11:10:30 PM */
49 | public int indexOf(char[] sen, int offset, int tailLen) {
50 | //return binarySearch(wordTails, sen, offset+1, tailLen, casc);
51 | return ktWordTails.match(sen, offset+1, tailLen) ? 1 : -1;
52 | }
53 |
54 | /**
55 | * @param sen the sentence, a run of text.
56 | * @param wordTailOffset position in the sentence where the search starts, i.e. right after the word's own offset.
57 | * @return the length of the word tail, or 0 if there is none
58 | * @author chenlb 2009-4-10 10:45:51 PM
59 | */
60 | public int maxMatch(char[] sen, int wordTailOffset) {
61 | return ktWordTails.maxMatch(sen, wordTailOffset);
62 | }
63 |
64 | /**
65 | * Delegates to the key tree's list-based maxMatch, passing tailLens, sen and wordTailOffset through unchanged.
66 | * @return per the original author's note, at least one int is always returned, 0 included
67 | * @author chenlb 2009-4-12 10:01:35 AM
68 | */
69 | public ArrayList maxMatch(ArrayList tailLens, char[] sen, int wordTailOffset) {
70 | return ktWordTails.maxMatch(tailLens, sen, wordTailOffset);
71 | }
72 |
73 | public int getMaxLen() {
74 | return maxLen;
75 | }
76 | public void setMaxLen(int maxLen) {
77 | this.maxLen = maxLen;
78 | }
79 |
80 | public static class KeyTree {
81 | TreeNode head = new TreeNode(' '); //root of the tree; it is created with the key ' '
82 |
83 | public void add(char[] w) {
84 | if(w.length < 1) { //an empty tail adds nothing to the tree
85 | return;
86 | }
87 | TreeNode p = head;
88 | for(int i=0; i maxMatch(ArrayList tailLens, char[] sen, int offset) {
119 | TreeNode node = head;
120 | for(int i=offset; i subNodes;
148 | boolean alsoLeaf; //presumably true when a word tail ends at this node - TODO confirm against KeyTree.add
149 | public TreeNode(char key) {
150 | this.key = key;
151 | subNodes = new HashMap();
152 | }
153 |
154 | public void born(char k, TreeNode sub) { //registers sub as the child stored under key k
155 | subNodes.put(k, sub);
156 | }
157 |
158 | public TreeNode subNode(char k) { //child stored under key k, as returned by the map lookup
159 | return subNodes.get(k);
160 | }
161 | public boolean isAlsoLeaf() {
162 | return alsoLeaf;
163 | }
164 | }
165 | }
166 |
--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/Chunk.java:
--------------------------------------------------------------------------------
1 | package com.chenlb.mmseg4j;
2 |
3 |
4 | /**
5 | * A key concept of the MMSeg segmentation algorithm. A Chunk holds a group of words segmented according to context together with their attributes: length (Length), average length (Average Length), squared standard deviation (Variance) and degree of morphemic freedom (Degree Of Morphemic Freedom).
6 | * Each attribute is computed on first access and cached in a field; a negative field value means "not computed yet".
7 | * @author chenlb 2009-3-16 11:39:42 AM
8 | */
9 | public class Chunk {
10 |
11 | Word[] words = new Word[3]; //word slots; every reader below skips slots that are null
12 |
13 | int count = -1; //number of non-null words, -1 until computed
14 |
15 | /** Word Length: cached sum of the lengths of all words, -1 until computed. */
16 | private int len = -1;
17 | /** Largest Average Word Length: cached average word length, -1 until computed. */
18 | private double avgLen = -1;
19 | /** Variance of Word Lengths, i.e. the squared standard deviation; -1 until computed. */
20 | private double variance = -1;
21 | /** Sum of Degree of Morphemic Freedom of One-Character; -1 until computed. */
22 | private int sumDegree = -1;
23 |
24 | /** Word Length: returns the summed length of all non-null words; the first call also recounts the words into count. */
25 | public int getLen() {
26 | if(len < 0) {
27 | len = 0;
28 | count = 0;
29 | for(Word word : words) {
30 | if(word != null) {
31 | len += word.getLength();
32 | count++;
33 | }
34 | }
35 | }
36 | return len;
37 | }
38 |
39 | /** Returns how many words the chunk holds (non-null slots), at most 3 with the default array. */
40 | public int getCount() {
41 | if(count < 0) {
42 | count = 0;
43 | for(Word word : words) {
44 | if(word != null) {
45 | count++;
46 | }
47 | }
48 | }
49 | return count;
50 | }
51 |
52 | /** Largest Average Word Length: returns getLen() divided by getCount() as a double. */
53 | public double getAvgLen() {
54 | if(avgLen < 0) {
55 | avgLen = (double)getLen()/getCount();
56 | }
57 | return avgLen;
58 | }
59 |
60 | /** Variance of Word Lengths, i.e. the squared standard deviation: mean of the squared deviations from getAvgLen(). */
61 | public double getVariance() {
62 | if(variance < 0) {
63 | double sum = 0;
64 | for(Word word : words) {
65 | if(word != null) {
66 | sum += Math.pow(word.getLength()-getAvgLen(), 2);
67 | }
68 | }
69 | variance = sum/getCount();
70 | }
71 | return variance;
72 | }
73 |
74 | /** Sum of Degree of Morphemic Freedom of One-Character: adds up getDegree() of every word whose degree is above -1. */
75 | public int getSumDegree() {
76 | if(sumDegree < 0) {
77 | int sum = 0;
78 | for(Word word : words) {
79 | if(word != null && word.getDegree() > -1) {
80 | sum += word.getDegree();
81 | }
82 | }
83 | sumDegree = sum;
84 | }
85 | return sumDegree;
86 | }
87 |
88 | @Override
89 | public String toString() { //each word's string followed by '_', so the result ends with a trailing '_'
90 | StringBuilder sb = new StringBuilder();
91 | for(Word word : words) {
92 | if(word != null) {
93 | sb.append(word.getString()).append('_');
94 | }
95 | }
96 | return sb.toString();
97 | }
98 |
99 | public String toFactorString() { //bracketed dump of the four factors; the degree sum is printed under the label "sum100log"
100 | StringBuilder sb = new StringBuilder();
101 | sb.append("[");
102 | sb.append("len=").append(getLen()).append(", ");
103 | sb.append("avgLen=").append(getAvgLen()).append(", ");
104 | sb.append("variance=").append(getVariance()).append(", ");
105 | sb.append("sum100log=").append(getSumDegree()).append("]");
106 | return sb.toString();
107 | }
108 |
109 | public Word[] getWords() {
110 | return words;
111 | }
112 |
113 | public void setWords(Word[] words) {
114 | this.words = words;
115 | count = words.length; //NOTE(review): count becomes the array length even if some entries are null, and the cached len/avgLen/variance/sumDegree are not reset here - confirm no caller sets words after reading a factor
116 | }
117 | }
118 |
--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/ComplexSeg.java:
--------------------------------------------------------------------------------
1 | package com.chenlb.mmseg4j;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | import com.chenlb.mmseg4j.rule.LargestAvgLenRule;
7 | import com.chenlb.mmseg4j.rule.LargestSumDegreeFreedomRule;
8 | import com.chenlb.mmseg4j.rule.MaxMatchRule;
9 | import com.chenlb.mmseg4j.rule.Rule;
10 | import com.chenlb.mmseg4j.rule.SmallestVarianceRule;
11 |
12 |
13 | /**
14 | * Forward maximum matching segmentation with four filtering rules applied on top.
15 | * Candidates go to a MaxMatchRule first; LargestAvgLenRule, SmallestVarianceRule and LargestSumDegreeFreedomRule are then applied in that order.
16 | * @author chenlb 2009-3-16 09:15:26 PM
17 | */
18 | public class ComplexSeg extends Seg{
19 |
20 | private MaxMatchRule mmr = new MaxMatchRule();
21 | private List otherRules = new ArrayList(); //rules applied after mmr, in insertion order
22 |
23 | private static boolean showChunk = false; //debug switch: when true, seg() prints the candidate chunks to System.out
24 |
25 | public ComplexSeg(Dictionary dic) {
26 | super(dic);
27 | otherRules.add(new LargestAvgLenRule());
28 | otherRules.add(new SmallestVarianceRule());
29 | otherRules.add(new LargestSumDegreeFreedomRule());
30 | }
31 | /** Builds candidate chunks of up to three words from the sentence's current offset, advances the offset by the largest total chunk length, filters the candidates through the rules and returns the first remaining chunk; returns null when the sentence is finished or no chunk remains. */
32 | public Chunk seg(Sentence sen) {
33 | char[] chs = sen.getText();
34 | int[] tailLen = new int[3]; //tail length of each of the three words
35 | //int[] maxTailLen = new int[3];
36 | @SuppressWarnings("unchecked")
37 | ArrayList[] tailLens = new ArrayList[2]; //candidate tail lengths allowed for the first and the second word
38 | for(int i=0; i<2; i++) {
39 | tailLens[i] = new ArrayList();
40 | }
41 | CharNode[] cns = new CharNode[3];
42 |
43 | int[] offsets = new int[3]; //start position of each word within sen
44 | mmr.reset();
45 | if(!sen.isFinish()) { //sen.getOffset() < chs.length
46 | if(showChunk) {
47 | System.out.println();
48 | }
49 | int maxLen = 0;
50 | offsets[0] = sen.getOffset();
51 | /*
52 | * Iterate over all the distinct word lengths, rather than counting down from the maximum to 0 (w[0]=maxLen(chs, offsets[0]); w[0]>=0; w[0]--);
53 | * this avoids some redundant lookups.
54 | */
55 | maxMatch(cns, 0, chs, offsets[0], tailLens, 0); //maxMatch is not defined in this class (presumably inherited from Seg); assumed to fill tailLens[0] and cns[0] - TODO confirm
56 | for(int aIdx=tailLens[0].size()-1; aIdx>=0; aIdx--) {
57 |
58 | tailLen[0] = tailLens[0].get(aIdx);
59 |
60 | offsets[1] = offsets[0]+1+tailLen[0]; //start position of the second word
61 |
62 | maxMatch(cns, 1, chs, offsets[1], tailLens, 1);
63 | for(int bIdx=tailLens[1].size()-1; bIdx>=0; bIdx--) {
64 |
65 | tailLen[1] = tailLens[1].get(bIdx);
66 | offsets[2] = offsets[1]+1+tailLen[1];
67 |
68 | //the third word only needs its longest match
69 | tailLen[2] = maxMatch(cns, 2, chs, offsets[2]);
70 |
71 | int sumChunkLen = 0;
72 | for(int i=0; i<3; i++) {
73 | sumChunkLen += tailLen[i]+1;
74 | }
75 | Chunk ck = null;
76 | if(sumChunkLen >= maxLen) {
77 | maxLen = sumChunkLen; //offset increment to the start of the next chunk
78 | ck = createChunk(sen, chs, tailLen, offsets, cns);
79 | mmr.addChunk(ck);
80 |
81 | }
82 | if(showChunk) {
83 | if(ck == null) { //debug only: chunks shorter than the current maximum are also created for printing, and they are added to mmr as well
84 | ck = createChunk(sen, chs, tailLen, offsets, cns);
85 | mmr.addChunk(ck);
86 | }
87 | System.out.println(ck);
88 | }
89 |
90 | }
91 | }
92 | sen.addOffset(maxLen); //maxLen characters have now been consumed
93 | List chunks = mmr.remainChunks();
94 | for(Rule rule : otherRules) { //filter with the other rules
95 | if(showChunk) {
96 | System.out.println("-------filter before "+rule+"----------");
97 | printChunk(chunks);
98 | }
99 | if(chunks.size() > 1) {
100 | rule.reset();
101 | rule.addChunks(chunks);
102 | chunks = rule.remainChunks();
103 | } else {
104 | break; //at most one candidate is left, so the remaining rules are skipped
105 | }
106 | }
107 | if(showChunk) {
108 | System.out.println("-------remainChunks----------");
109 | printChunk(chunks);
110 | }
111 | if(chunks.size() > 0) {
112 | return chunks.get(0);
113 | }
114 | }
115 | return null;
116 | }
117 | /** Builds a Chunk of up to three words: slot i is filled only when offsets[i] lies inside chs, with a word of length tailLen[i]+1. */
118 | private Chunk createChunk(Sentence sen, char[] chs, int[] tailLen, int[] offsets, CharNode[] cns/*, char[][] cks*/) {
119 | Chunk ck = new Chunk();
120 |
121 | for(int i=0; i<3; i++) {
122 |
123 | if(offsets[i] < chs.length) {
124 | ck.words[i] = new Word(chs, sen.getStartOffset(), offsets[i], tailLen[i]+1);//new Word(cks[i], sen.getStartOffset()+offsets[i]);
125 | if(tailLen[i] == 0) { //a single-character word takes its degree of freedom from the character frequency
126 | CharNode cn = cns[i]; //dic.head(chs[offsets[i]]);
127 | if(cn !=null) {
128 | ck.words[i].setDegree(cn.getFreq());
129 | }
130 | }
131 | }
132 | }
133 | return ck;
134 | }
135 |
136 | public static boolean isShowChunk() {
137 | return showChunk;
138 | }
139 |
140 | public static void setShowChunk(boolean showChunk) {
141 | ComplexSeg.showChunk = showChunk;
142 | }
143 | }
144 |
--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/Dictionary.java:
--------------------------------------------------------------------------------
1 | package com.chenlb.mmseg4j;
2 |
3 | import java.io.BufferedInputStream;
4 | import java.io.BufferedReader;
5 | import java.io.File;
6 | import java.io.FileInputStream;
7 | import java.io.FilenameFilter;
8 | import java.io.IOException;
9 | import java.io.InputStream;
10 | import java.io.InputStreamReader;
11 | import java.net.URL;
12 | import java.util.ArrayList;
13 | import java.util.HashMap;
14 | import java.util.Map;
15 | import java.util.Map.Entry;
16 | import java.util.concurrent.ConcurrentHashMap;
17 | import java.util.logging.Level;
18 | import java.util.logging.Logger;
19 |
20 | /**
21 | * 词典类. 词库目录单例模式.
22 | * 保存单字与其频率,还有词库.
23 | * 有检测词典变更的接口,外部程序可以使用 {@link #wordsFileIsChange()} 和 {@link #reload()} 来完成检测与加载的工作.
24 | *
25 | * @author chenlb 2009-2-20 下午11:34:29
26 | */
27 | public class Dictionary {
28 |
29 | private static final Logger log = Logger.getLogger(Dictionary.class.getName());
30 |
31 | private File dicPath; //词库目录
32 | private volatile Map dict;
33 | private volatile Map unit; //单个字的单位
34 |
35 | /** 记录 word 文件的最后修改时间 */
36 | private Map wordsLastTime = null;
37 | private long lastLoadTime = 0;
38 |
39 | /** 不要直接使用, 通过 {@link #getDefalutPath()} 使用*/
40 | private static File defalutPath = null;
41 | private static final ConcurrentHashMap dics = new ConcurrentHashMap();
42 |
43 | protected void finalize() throws Throwable {
44 | /*
45 | * 使 class reload 的时也可以释放词库
46 | */
47 | destroy();
48 | }
49 |
50 | /**
51 | * 从默认目录加载词库文件.
52 | * 查找默认目录顺序:
53 | *
54 | *