├── .gitignore
├── LICENSE
├── README.md
├── pom.xml
└── src
    ├── main
        ├── java
        │   └── com
        │   │   └── chenlb
        │   │       └── mmseg4j
        │   │           ├── CharNode.java
        │   │           ├── Chunk.java
        │   │           ├── ComplexSeg.java
        │   │           ├── Dictionary.java
        │   │           ├── MMSeg.java
        │   │           ├── MaxWordSeg.java
        │   │           ├── Seg.java
        │   │           ├── Sentence.java
        │   │           ├── SimpleSeg.java
        │   │           ├── Word.java
        │   │           ├── example
        │   │               ├── Complex.java
        │   │               ├── MaxWord.java
        │   │               └── Simple.java
        │   │           └── rule
        │   │               ├── LargestAvgLenRule.java
        │   │               ├── LargestSumDegreeFreedomRule.java
        │   │               ├── MaxMatchRule.java
        │   │               ├── Rule.java
        │   │               └── SmallestVarianceRule.java
        └── resources
        │   └── data
        │       ├── chars.dic
        │       ├── units.dic
        │       └── words.dic
    └── test
        ├── java
            └── com
            │   └── chenlb
            │       └── mmseg4j
            │           ├── ComplexSegTest.java
            │           ├── DictionaryTest.java
            │           ├── KeyTreeTest.java
            │           ├── MMSegTest.java
            │           ├── MaxWordSegTest.java
            │           ├── MyTest.java
            │           └── SimpleSegTest.java
        └── resources
            └── data
                └── words-test-my.dic


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Eclipse
 2 | .classpath
 3 | .project
 4 | .settings/
 5 | 
 6 | # Intellij
 7 | .idea/
 8 | *.iml
 9 | *.iws
10 | 
11 | # Mac
12 | .DS_Store
13 | 
14 | # Maven
15 | log/
16 | target/
17 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | mmseg4j core 使用 Chih-Hao Tsai 的 MMSeg 算法(http://technology.chtsai.org/mmseg/ )实现的中文分词器。
 2 | 
 3 | MMSeg 算法有两种分词方法：Simple和Complex，都是基于正向最大匹配。Complex 加了四个规则过虑。官方说：词语的正确识别率达到了 98.41%。mmseg4j 已经实现了这两种分词算法。
 4 | 
 5 | ```xml
 6 | <dependency>
 7 |     <groupId>com.chenlb.mmseg4j</groupId>
 8 |     <artifactId>mmseg4j-core</artifactId>
 9 |     <version>1.10.0</version>
10 | </dependency>
11 | ```
12 | 
13 | ## example
14 | 
15 | ```
16 | git clone https://github.com/chenlb/mmseg4j-core mmseg4j-core
17 | cd mmseg4j-core
18 | mvn compile
19 | 
20 | #运行
21 | #Complex 分词模式
22 | java -cp .:target/classes com.chenlb.mmseg4j.example.Complex
23 | 
24 | #Simple 分词模式
25 | java -cp .:target/classes com.chenlb.mmseg4j.example.Simple
26 | 
27 | #MaxWord 分词模式
28 | java -cp .:target/classes com.chenlb.mmseg4j.example.MaxWord
29 | 
30 | #或编译打包
31 | mvn package
32 | 
33 | java -cp .:target/mmseg4j-core-1.10.1-SNAPSHOT.jar com.chenlb.mmseg4j.example.Complex
34 | ```
35 | 
36 | ## 其它
37 | 
38 | * [早期的介绍](https://github.com/chenlb/mmseg4j-from-googlecode)


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 2 |   <modelVersion>4.0.0</modelVersion>
 3 |   <parent>
 4 |     <groupId>org.sonatype.oss</groupId>
 5 |     <artifactId>oss-parent</artifactId>
 6 |     <version>7</version>
 7 |   </parent>
 8 |   <groupId>com.chenlb.mmseg4j</groupId>
 9 |   <artifactId>mmseg4j-core</artifactId>
10 |   <name>mmseg4j-core</name>
11 |   <url>https://github.com/chenlb/mmseg4j-core</url>
12 |   <description>MMSEG cor for java chinese analyzer</description>
13 |   <licenses>
14 |     <license>
15 |       <name>The Apache Software License, Version 2.0</name>
16 |       <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
17 |       <distribution>repo</distribution>
18 |     </license>
19 |   </licenses>
20 |   <organization>
21 |   	<url>http://blog.chenlb.com</url>
22 |   	<name>chenlb open source</name>
23 |   </organization>
24 |   <scm>
25 |   	<url>git@github.com:chenlb/mmseg4j-core.git</url>
26 |   	<developerConnection>scm:git:git@github.com:chenlb/mmseg4j-core.git</developerConnection>
27 |   	<connection>scm:git:git@github.com:chenlb/mmseg4j-core.git</connection>
28 |   </scm>
29 |   <developers>
30 |   	<developer>
31 |   		<id>chenlb</id>
32 |   		<name>LinBin Chen</name>
33 |   		<email>chenlb2008@gmail.com</email>
34 |   	</developer>
35 |   </developers>
36 |   <issueManagement>
37 |   	<url>https://github.com/chenlb/mmseg4j-core/issues</url>
38 |   	<system>github.com</system>
39 |   </issueManagement>
40 |   <dependencies>
41 | 	<dependency>
42 | 		<groupId>junit</groupId>
43 | 		<artifactId>junit</artifactId>
44 | 		<version>4.8</version>
45 | 		<scope>test</scope>
46 | 	</dependency>
47 |   </dependencies>
48 | 
49 |   <build>
50 | 	<plugins>
51 | 		<plugin>
52 | 		  <groupId>org.apache.maven.plugins</groupId>
53 | 		  <artifactId>maven-compiler-plugin</artifactId>
54 | 		  <version>2.3.1</version>
55 | 		  <configuration>
56 | 		    <source>1.6</source>
57 | 		    <target>1.6</target>
58 | 		    <encoding>UTF-8</encoding>
59 | 		  </configuration>
60 | 		</plugin>
61 | 		<plugin>
62 | 			<groupId>org.apache.maven.plugins</groupId>
63 | 			<artifactId>maven-gpg-plugin</artifactId>
64 | 			<executions>
65 | 				<execution>
66 | 					<id>sign-artifacts</id>
67 | 					<phase>verify</phase>
68 | 					<goals>
69 | 						<goal>sign</goal>
70 | 					</goals>
71 | 				</execution>
72 | 			</executions>
73 | 		</plugin>
74 | 	</plugins>
75 |   </build>
76 |   <version>1.10.1-SNAPSHOT</version>
77 | </project>


--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/CharNode.java:
--------------------------------------------------------------------------------
  1 | package com.chenlb.mmseg4j;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.HashMap;
  5 | import java.util.Map;
  6 | 
  7 | /**
  8 |  * 所有词都记录在第一个字的结点下.
  9 |  * 
 10 |  * @author chenlb 2009-2-20 下午11:30:14
 11 |  */
 12 | public class CharNode {
 13 | 
 14 | 	private int freq = -1;	//Degree of Morphemic Freedom of One-Character, 单字才需要
 15 | 	private int maxLen = 0;	//wordTail的最长
 16 | 
 17 | 	private KeyTree ktWordTails = new KeyTree();
 18 | 	private int wordNum = 0;
 19 | 	
 20 | 	public CharNode() {
 21 | 		
 22 | 	}
 23 | 	
 24 | 	public void addWordTail(char[] wordTail) {
 25 | 		ktWordTails.add(wordTail);
 26 | 		wordNum++;
 27 | 		if(wordTail.length > maxLen) {
 28 | 			maxLen = wordTail.length;
 29 | 		}
 30 | 	}
 31 | 	public int getFreq() {
 32 | 		return freq;
 33 | 	}
 34 | 	
 35 | 	public void setFreq(int freq) {
 36 | 		this.freq = freq;
 37 | 	}
 38 | 	
 39 | 	public int wordNum() {
 40 | 		return wordNum;
 41 | 	}
 42 | 	
 43 | 	/**
 44 | 	 * @param sen 句子, 一串文本.
 45 | 	 * @param offset 词在句子中的位置
 46 | 	 * @param tailLen 词尾的长度, 实际是去掉词的长度.
 47 | 	 * @author chenlb 2009-4-8 下午11:10:30
 48 | 	 */
 49 | 	public int indexOf(char[] sen, int offset, int tailLen) {
 50 | 		//return binarySearch(wordTails, sen, offset+1, tailLen, casc);
 51 | 		return ktWordTails.match(sen, offset+1, tailLen) ? 1 : -1;
 52 | 	}
 53 | 	
 54 | 	/**
 55 | 	 * @param sen 句子, 一串文本.
 56 | 	 * @param wordTailOffset 词在句子中的位置, 实际是 offset 后面的开始找.
 57 | 	 * @return 返回词尾长, 没有就是 0
 58 | 	 * @author chenlb 2009-4-10 下午10:45:51
 59 | 	 */
 60 | 	public int maxMatch(char[] sen, int wordTailOffset) {
 61 | 		return ktWordTails.maxMatch(sen, wordTailOffset);
 62 | 	}
 63 | 	
 64 | 	/**
 65 | 	 * 
 66 | 	 * @return 至少返回一个包括 0的int
 67 | 	 * @author chenlb 2009-4-12 上午10:01:35
 68 | 	 */
 69 | 	public ArrayList<Integer> maxMatch(ArrayList<Integer> tailLens, char[] sen, int wordTailOffset) {
 70 | 		return ktWordTails.maxMatch(tailLens, sen, wordTailOffset);
 71 | 	}
 72 | 	
 73 | 	public int getMaxLen() {
 74 | 		return maxLen;
 75 | 	}
 76 | 	public void setMaxLen(int maxLen) {
 77 | 		this.maxLen = maxLen;
 78 | 	}
 79 | 	
 80 | 	public static class KeyTree {
 81 | 		TreeNode head = new TreeNode(' ');
 82 | 		
 83 | 		public void add(char[] w) {
 84 | 			if(w.length < 1) {
 85 | 				return;
 86 | 			}
 87 | 			TreeNode p = head;
 88 | 			for(int i=0; i<w.length; i++) {
 89 | 				TreeNode n = p.subNode(w[i]);
 90 | 				if(n == null) {
 91 | 					n = new TreeNode(w[i]);
 92 | 					p.born(w[i], n);
 93 | 				}
 94 | 				p = n;
 95 | 			}
 96 | 			p.alsoLeaf = true;
 97 | 		}
 98 | 		
 99 | 		/**
100 | 		 * @return 返回匹配最长词的长度, 没有找到返回 0.
101 | 		 */
102 | 		public int maxMatch(char[] sen, int offset) {
103 | 			int idx = offset - 1;
104 | 			TreeNode node = head;
105 | 			for(int i=offset; i<sen.length; i++) {
106 | 				node = node.subNode(sen[i]);
107 | 				if(node != null) {
108 | 					if(node.isAlsoLeaf()) {
109 | 						idx = i; 
110 | 					}
111 | 				} else {
112 | 					break;
113 | 				}
114 | 			}
115 | 			return idx - offset + 1;
116 | 		}
117 | 		
118 | 		public ArrayList<Integer> maxMatch(ArrayList<Integer> tailLens, char[] sen, int offset) {
119 | 			TreeNode node = head;
120 | 			for(int i=offset; i<sen.length; i++) {
121 | 				node = node.subNode(sen[i]);
122 | 				if(node != null) {
123 | 					if(node.isAlsoLeaf()) {
124 | 						tailLens.add(i-offset+1); 
125 | 					}
126 | 				} else {
127 | 					break;
128 | 				}
129 | 			}
130 | 			return tailLens;
131 | 		}
132 | 		
133 | 		public boolean match(char[] sen, int offset, int len) {
134 | 			TreeNode node = head;
135 | 			for(int i=0; i<len; i++) {
136 | 				node = node.subNode(sen[offset+i]);
137 | 				if(node == null) {
138 | 					return false;
139 | 				}
140 | 			}
141 | 			return node.isAlsoLeaf();
142 | 		}
143 | 	}
144 | 	
145 | 	private static class TreeNode {
146 | 		char key;
147 | 		Map<Character, TreeNode> subNodes;
148 | 		boolean alsoLeaf;
149 | 		public TreeNode(char key) {
150 | 			this.key = key;
151 | 			subNodes = new HashMap<Character, TreeNode>();
152 | 		}
153 | 		
154 | 		public void born(char k, TreeNode sub) {
155 | 			subNodes.put(k, sub);
156 | 		}
157 | 		
158 | 		public TreeNode subNode(char k) {
159 | 			return subNodes.get(k);
160 | 		}
161 | 		public boolean isAlsoLeaf() {
162 | 			return alsoLeaf;
163 | 		}
164 | 	}
165 | }
166 | 


--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/Chunk.java:
--------------------------------------------------------------------------------
  1 | package com.chenlb.mmseg4j;
  2 | 
  3 | 
  4 | /**
  5 |  * 它是MMSeg分词算法中一个关键的概念。Chunk中包含依据上下文分出的一组词和相关的属性，包括长度(Length)、平均长度(Average Length)、标准差的平方(Variance)和自由语素度(Degree Of Morphemic Freedom)。
  6 |  * 
  7 |  * @author chenlb 2009-3-16 上午11:39:42
  8 |  */
  9 | public class Chunk {
 10 | 
 11 | 	Word[] words = new Word[3];
 12 | 	
 13 | 	int count = -1;
 14 | 	
 15 | 	/** Word Length */
 16 | 	private int len = -1;
 17 | 	/** Largest Average Word Length */
 18 | 	private double avgLen = -1;
 19 | 	/** Variance of Word Lengths 就是 标准差的平方 */
 20 | 	private double variance = -1;
 21 | 	/** Sum of Degree of Morphemic Freedom of One-Character */
 22 | 	private int sumDegree = -1;
 23 | 	
 24 | 	/** Word Length */
 25 | 	public int getLen() {
 26 | 		if(len < 0) {
 27 | 			len = 0;
 28 | 			count = 0;
 29 | 			for(Word word : words) {
 30 | 				if(word != null) {
 31 | 					len += word.getLength();
 32 | 					count++;
 33 | 				}
 34 | 			}
 35 | 		}
 36 | 		return len;
 37 | 	}
 38 | 	
 39 | 	/** 有多少个词，最多3个。*/
 40 | 	public int getCount() {
 41 | 		if(count < 0) {
 42 | 			count = 0;
 43 | 			for(Word word : words) {
 44 | 				if(word != null) {
 45 | 					count++;
 46 | 				}
 47 | 			}
 48 | 		}
 49 | 		return count;
 50 | 	}
 51 | 	
 52 | 	/** Largest Average Word Length */
 53 | 	public double getAvgLen() {
 54 | 		if(avgLen < 0) {
 55 | 			avgLen = (double)getLen()/getCount();
 56 | 		}
 57 | 		return avgLen;
 58 | 	}
 59 | 	
 60 | 	/** Variance of Word Lengths 就是 标准差的平方 */
 61 | 	public double getVariance() {
 62 | 		if(variance < 0) {
 63 | 			double sum = 0;
 64 | 			for(Word word : words) {
 65 | 				if(word != null) {
 66 | 					sum += Math.pow(word.getLength()-getAvgLen(), 2);
 67 | 				}
 68 | 			}
 69 | 			variance = sum/getCount();
 70 | 		}
 71 | 		return variance;
 72 | 	}
 73 | 	
 74 | 	/** Sum of Degree of Morphemic Freedom of One-Character */
 75 | 	public int getSumDegree() {
 76 | 		if(sumDegree < 0) {
 77 | 			int sum = 0;
 78 | 			for(Word word : words) {
 79 | 				if(word != null && word.getDegree() > -1) {
 80 | 					sum += word.getDegree();
 81 | 				}
 82 | 			}
 83 | 			sumDegree = sum;
 84 | 		}
 85 | 		return sumDegree;
 86 | 	}
 87 | 	
 88 | 	@Override
 89 | 	public String toString() {
 90 | 		StringBuilder sb = new StringBuilder();
 91 | 		for(Word word : words) {
 92 | 			if(word != null) {
 93 | 				sb.append(word.getString()).append('_');
 94 | 			}
 95 | 		}
 96 | 		return sb.toString();
 97 | 	}
 98 | 	
 99 | 	public String toFactorString() {
100 | 		StringBuilder sb = new StringBuilder();
101 | 		sb.append("[");
102 | 		sb.append("len=").append(getLen()).append(", ");
103 | 		sb.append("avgLen=").append(getAvgLen()).append(", ");
104 | 		sb.append("variance=").append(getVariance()).append(", ");
105 | 		sb.append("sum100log=").append(getSumDegree()).append("]");
106 | 		return sb.toString();
107 | 	}
108 | 
109 | 	public Word[] getWords() {
110 | 		return words;
111 | 	}
112 | 	
113 | 	public void setWords(Word[] words) {
114 | 		this.words = words;
115 | 		count = words.length;
116 | 	}
117 | }
118 | 


--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/ComplexSeg.java:
--------------------------------------------------------------------------------
  1 | package com.chenlb.mmseg4j;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.List;
  5 | 
  6 | import com.chenlb.mmseg4j.rule.LargestAvgLenRule;
  7 | import com.chenlb.mmseg4j.rule.LargestSumDegreeFreedomRule;
  8 | import com.chenlb.mmseg4j.rule.MaxMatchRule;
  9 | import com.chenlb.mmseg4j.rule.Rule;
 10 | import com.chenlb.mmseg4j.rule.SmallestVarianceRule;
 11 | 
 12 | 
 13 | /**
 14 |  * 正向最大匹配, 加四个过虑规则的分词方式.
 15 |  * 
 16 |  * @author chenlb 2009-3-16 下午09:15:26
 17 |  */
 18 | public class ComplexSeg extends Seg{
 19 | 
 20 | 	private MaxMatchRule mmr = new MaxMatchRule();
 21 | 	private List<Rule> otherRules = new ArrayList<Rule>();
 22 | 	
 23 | 	private static boolean showChunk = false;
 24 | 	
 25 | 	public ComplexSeg(Dictionary dic) {
 26 | 		super(dic);
 27 | 		otherRules.add(new LargestAvgLenRule());
 28 | 		otherRules.add(new SmallestVarianceRule());
 29 | 		otherRules.add(new LargestSumDegreeFreedomRule());
 30 | 	}
 31 | 	
 32 | 	public Chunk seg(Sentence sen) {
 33 | 		char[] chs = sen.getText();
 34 | 		int[] tailLen = new int[3];	//记录词的尾长
 35 | 		//int[] maxTailLen = new int[3];	
 36 | 		@SuppressWarnings("unchecked")
 37 | 		ArrayList<Integer>[] tailLens = new ArrayList[2];	//记录词尾部允许的长度
 38 | 		for(int i=0; i<2; i++) {
 39 | 			tailLens[i] = new ArrayList<Integer>();
 40 | 		}
 41 | 		CharNode[] cns = new CharNode[3];
 42 | 		
 43 | 		int[] offsets = new int[3];	//每个词在sen的开始位置
 44 | 		mmr.reset();
 45 | 		if(!sen.isFinish()) {	//sen.getOffset() < chs.length
 46 | 			if(showChunk) {
 47 | 				System.out.println();
 48 | 			}
 49 | 			int maxLen = 0;
 50 | 			offsets[0] = sen.getOffset();
 51 | 			/*
 52 | 			 * 遍历所有不同词长,还不是从最大到0(w[0]=maxLen(chs, offsets[0]); w[0]>=0; w[0]--)
 53 | 			 * 可以减少一部分多余的查找.
 54 | 			 */
 55 | 			maxMatch(cns, 0, chs, offsets[0], tailLens, 0);
 56 | 			for(int aIdx=tailLens[0].size()-1; aIdx>=0; aIdx--) {
 57 | 
 58 | 				tailLen[0] = tailLens[0].get(aIdx);
 59 | 
 60 | 				offsets[1] = offsets[0]+1+tailLen[0];	//第二个词的开始位置
 61 | 
 62 | 				maxMatch(cns, 1, chs, offsets[1], tailLens, 1);
 63 | 				for(int bIdx=tailLens[1].size()-1; bIdx>=0; bIdx--) {
 64 | 
 65 | 					tailLen[1] = tailLens[1].get(bIdx);
 66 | 					offsets[2] = offsets[1]+1+tailLen[1];
 67 | 
 68 | 					//第三个词只需要最长的
 69 | 					tailLen[2] = maxMatch(cns, 2, chs, offsets[2]);
 70 | 
 71 | 					int sumChunkLen = 0;
 72 | 					for(int i=0; i<3; i++) {
 73 | 						sumChunkLen += tailLen[i]+1;
 74 | 					}
 75 | 					Chunk ck = null;
 76 | 					if(sumChunkLen >= maxLen) {
 77 | 						maxLen = sumChunkLen;	//下一个chunk块的开始位置增量
 78 | 						ck = createChunk(sen, chs, tailLen, offsets, cns);
 79 | 						mmr.addChunk(ck);
 80 | 
 81 | 					}
 82 | 					if(showChunk) {
 83 | 						if(ck == null) {
 84 | 							ck = createChunk(sen, chs, tailLen, offsets, cns);
 85 | 							mmr.addChunk(ck);
 86 | 						}
 87 | 						System.out.println(ck);
 88 | 					}
 89 | 
 90 | 				}
 91 | 			}
 92 | 			sen.addOffset(maxLen);	//maxLen个字符已经处理完
 93 | 			List<Chunk> chunks = mmr.remainChunks();
 94 | 			for(Rule rule : otherRules) {	//其它规则过虑
 95 | 				if(showChunk) {
 96 | 					System.out.println("-------filter before "+rule+"----------");
 97 | 					printChunk(chunks);
 98 | 				}
 99 | 				if(chunks.size() > 1) {
100 | 					rule.reset();
101 | 					rule.addChunks(chunks);
102 | 					chunks = rule.remainChunks();
103 | 				} else {
104 | 					break;
105 | 				}
106 | 			}
107 | 			if(showChunk) {
108 | 				System.out.println("-------remainChunks----------");
109 | 				printChunk(chunks);
110 | 			}
111 | 			if(chunks.size() > 0) {
112 | 				return chunks.get(0);
113 | 			}
114 | 		}
115 | 		return null;
116 | 	}
117 | 
118 | 	private Chunk createChunk(Sentence sen, char[] chs, int[] tailLen, int[] offsets, CharNode[] cns/*, char[][] cks*/) {
119 | 		Chunk ck = new Chunk();
120 | 		
121 | 		for(int i=0; i<3; i++) {
122 | 
123 | 			if(offsets[i] < chs.length) {
124 | 				ck.words[i] = new Word(chs, sen.getStartOffset(), offsets[i], tailLen[i]+1);//new Word(cks[i], sen.getStartOffset()+offsets[i]);
125 | 				if(tailLen[i] == 0) {	//单字的要取得"字频计算出自由度"
126 | 					CharNode cn = cns[i];	//dic.head(chs[offsets[i]]);
127 | 					if(cn !=null) {
128 | 						ck.words[i].setDegree(cn.getFreq());
129 | 					}
130 | 				}
131 | 			}
132 | 		}
133 | 		return ck;
134 | 	}
135 | 	
136 | 	public static boolean isShowChunk() {
137 | 		return showChunk;
138 | 	}
139 | 
140 | 	public static void setShowChunk(boolean showChunk) {
141 | 		ComplexSeg.showChunk = showChunk;
142 | 	}
143 | }
144 | 


--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/Dictionary.java:
--------------------------------------------------------------------------------
  1 | package com.chenlb.mmseg4j;
  2 | 
  3 | import java.io.BufferedInputStream;
  4 | import java.io.BufferedReader;
  5 | import java.io.File;
  6 | import java.io.FileInputStream;
  7 | import java.io.FilenameFilter;
  8 | import java.io.IOException;
  9 | import java.io.InputStream;
 10 | import java.io.InputStreamReader;
 11 | import java.net.URL;
 12 | import java.util.ArrayList;
 13 | import java.util.HashMap;
 14 | import java.util.Map;
 15 | import java.util.Map.Entry;
 16 | import java.util.concurrent.ConcurrentHashMap;
 17 | import java.util.logging.Level;
 18 | import java.util.logging.Logger;
 19 | 
 20 | /**
 21 |  * 词典类. 词库目录单例模式.<br/>
 22 |  * 保存单字与其频率,还有词库.<br/>
 23 |  * 有检测词典变更的接口，外部程序可以使用 {@link #wordsFileIsChange()} 和 {@link #reload()} 来完成检测与加载的工作.
 24 |  *
 25 |  * @author chenlb 2009-2-20 下午11:34:29
 26 |  */
 27 | public class Dictionary {
 28 | 
 29 | 	private static final Logger log = Logger.getLogger(Dictionary.class.getName());
 30 | 
 31 | 	private File dicPath;	//词库目录
 32 | 	private volatile Map<Character, CharNode> dict;
 33 | 	private volatile Map<Character, Object> unit;	//单个字的单位
 34 | 
 35 | 	/** 记录 word 文件的最后修改时间 */
 36 | 	private Map<File, Long> wordsLastTime = null;
 37 | 	private long lastLoadTime = 0;
 38 | 
 39 | 	/** 不要直接使用, 通过 {@link #getDefalutPath()} 使用*/
 40 | 	private static File defalutPath = null;
 41 | 	private static final ConcurrentHashMap<File, Dictionary> dics = new ConcurrentHashMap<File, Dictionary>();
 42 | 
 43 | 	protected void finalize() throws Throwable {
 44 | 		/*
 45 | 		 * 使 class reload 的时也可以释放词库
 46 | 		 */
 47 | 		destroy();
 48 | 	}
 49 | 
 50 | 	/**
 51 | 	 * 从默认目录加载词库文件.<p/>
 52 | 	 * 查找默认目录顺序:
 53 | 	 * <ol>
 54 | 	 * <li>从系统属性mmseg.dic.path指定的目录中加载</li>
 55 | 	 * <li>从classpath/data目录</li>
 56 | 	 * <li>从user.dir/data目录</li>
 57 | 	 * </ol>
 58 | 	 * @see #getDefalutPath()
 59 | 	 */
 60 | 	public static Dictionary getInstance() {
 61 | 		File path = getDefalutPath();
 62 | 		return getInstance(path);
 63 | 	}
 64 | 
 65 | 	/**
 66 | 	 * @param path 词典的目录
 67 | 	 */
 68 | 	public static Dictionary getInstance(String path) {
 69 | 		return getInstance(new File(path));
 70 | 	}
 71 | 
 72 | 	/**
 73 | 	 * @param path 词典的目录
 74 | 	 */
 75 | 	public static Dictionary getInstance(File path) {
 76 | 		log.info("try to load dir="+path);
 77 | 		File normalizeDir = normalizeFile(path);
 78 | 		Dictionary dic = dics.get(normalizeDir);
 79 | 		if(dic == null) {
 80 | 			dic = new Dictionary(normalizeDir);
 81 | 			dics.put(normalizeDir, dic);
 82 | 		}
 83 | 		return dic;
 84 | 	}
 85 | 
 86 | 	public static File normalizeFile(File file) {
 87 | 		if(file == defalutPath) {
 88 | 			return defalutPath;
 89 | 		}
 90 | 		try {
 91 | 			return file.getCanonicalFile();
 92 | 		} catch (IOException e) {
 93 | 			throw new RuntimeException("normalize file=["+file+"] fail", e);
 94 | 		}
 95 | 	}
 96 | 
 97 | 	/**
 98 | 	 * 销毁, 释放资源. 此后此对像不再可用.
 99 | 	 */
100 | 	void destroy() {
101 | 		clear(dicPath);
102 | 
103 | 		dicPath = null;
104 | 		dict = null;
105 | 		unit = null;
106 | 	}
107 | 
108 | 	/**
109 | 	 * @see Dictionary#clear(File)
110 | 	 */
111 | 	public static Dictionary clear(String path) {
112 | 		return clear(new File(path));
113 | 	}
114 | 
115 | 	/**
116 | 	 * 从单例缓存中去除
117 | 	 * @param path
118 | 	 * @return 没有返回 null
119 | 	 */
120 | 	public static Dictionary clear(File path) {
121 | 		File normalizeDir = normalizeFile(path);
122 | 		return dics.remove(normalizeDir);
123 | 	}
124 | 
125 | 	/**
126 | 	 * 词典的目录
127 | 	 */
128 | 	private Dictionary(File path) {
129 | 		init(path);
130 | 	}
131 | 
132 | 	private void init(File path) {
133 | 		dicPath = path;
134 | 		wordsLastTime = new HashMap<File, Long>();
135 | 
136 | 		reload();	//加载词典
137 | 	}
138 | 
139 | 	private static long now() {
140 | 		return System.currentTimeMillis();
141 | 	}
142 | 
143 | 	/**
144 | 	 * 只要 wordsXXX.dic的文件
145 | 	 * @return
146 | 	 */
147 | 	protected File[] listWordsFiles() {
148 | 		return dicPath.listFiles(new FilenameFilter() {
149 | 
150 | 			public boolean accept(File dir, String name) {
151 | 
152 | 				return name.startsWith("words") && name.endsWith(".dic");
153 | 			}
154 | 
155 | 		});
156 | 	}
157 | 
158 | 	private Map<Character, CharNode> loadDic(File wordsPath) throws IOException {
159 | 		InputStream charsIn = null;
160 | 		File charsFile = new File(wordsPath, "chars.dic");
161 | 		if(charsFile.exists()) {
162 | 			charsIn = new FileInputStream(charsFile);
163 | 			addLastTime(charsFile);	//chars.dic 也检测是否变更
164 | 		} else {	//从 jar 里加载
165 | 			charsIn = this.getClass().getResourceAsStream("/data/chars.dic");
166 | 			charsFile = new File(this.getClass().getResource("/data/chars.dic").getFile());	//only for log
167 | 		}
168 | 		final Map<Character, CharNode> dic = new HashMap<Character, CharNode>();
169 | 		int lineNum = 0;
170 | 		long s = now();
171 | 		long ss = s;
172 | 		lineNum = load(charsIn, new FileLoading() {	//单个字的
173 | 
174 | 			public void row(String line, int n) {
175 | 				if(line.length() < 1) {
176 | 					return;
177 | 				}
178 | 				String[] w = line.split(" ");
179 | 				CharNode cn = new CharNode();
180 | 				switch(w.length) {
181 | 				case 2:
182 | 					try {
183 | 						cn.setFreq((int)(Math.log(Integer.parseInt(w[1]))*100));//字频计算出自由度
184 | 					} catch(NumberFormatException e) {
185 | 						//eat...
186 | 					}
187 | 				case 1:
188 | 
189 | 					dic.put(w[0].charAt(0), cn);
190 | 				}
191 | 			}
192 | 		});
193 | 		log.info("chars loaded time="+(now()-s)+"ms, line="+lineNum+", on file="+charsFile);
194 | 
195 | 		//try load words.dic in jar
196 | 		InputStream wordsDicIn = this.getClass().getResourceAsStream("/data/words.dic");
197 | 		if(wordsDicIn != null) {
198 | 			File wordsDic = new File(this.getClass().getResource("/data/words.dic").getFile());
199 | 			loadWord(wordsDicIn, dic, wordsDic);
200 | 		}
201 | 
202 | 		File[] words = listWordsFiles();	//只要 wordsXXX.dic的文件
203 | 		if(words != null) {	//扩展词库目录
204 | 			for(File wordsFile : words) {
205 | 				loadWord(new FileInputStream(wordsFile), dic, wordsFile);
206 | 
207 | 				addLastTime(wordsFile);	//用于检测是否修改
208 | 			}
209 | 		}
210 | 
211 | 		log.info("load all dic use time="+(now()-ss)+"ms");
212 | 		return dic;
213 | 	}
214 | 
215 | 	/**
216 | 	 * @param is 词库文件流
217 | 	 * @param dic 加载的词保存在结构中
218 | 	 * @param wordsFile	日志用
219 | 	 * @throws IOException from {@link #load(InputStream, FileLoading)}
220 | 	 */
221 | 	private void loadWord(InputStream is, Map<Character, CharNode> dic, File wordsFile) throws IOException {
222 | 		long s = now();
223 | 		int lineNum = load(is, new WordsFileLoading(dic)); //正常的词库
224 | 		log.info("words loaded time="+(now()-s)+"ms, line="+lineNum+", on file="+wordsFile);
225 | 	}
226 | 
227 | 	private Map<Character, Object> loadUnit(File path) throws IOException {
228 | 		InputStream fin = null;
229 | 		File unitFile = new File(path, "units.dic");
230 | 		if(unitFile.exists()) {
231 | 			fin = new FileInputStream(unitFile);
232 | 			addLastTime(unitFile);
233 | 		} else {	//在jar包里的/data/unit.dic
234 | 			fin = Dictionary.class.getResourceAsStream("/data/units.dic");
235 | 			unitFile = new File(Dictionary.class.getResource("/data/units.dic").getFile());
236 | 		}
237 | 
238 | 		final Map<Character, Object> unit = new HashMap<Character, Object>();
239 | 
240 | 		long s = now();
241 | 		int lineNum = load(fin, new FileLoading() {
242 | 
243 | 			public void row(String line, int n) {
244 | 				if(line.length() != 1) {
245 | 					return;
246 | 				}
247 | 				unit.put(line.charAt(0), Dictionary.class);
248 | 			}
249 | 		});
250 | 		log.info("unit loaded time="+(now()-s)+"ms, line="+lineNum+", on file="+unitFile);
251 | 
252 | 		return unit;
253 | 	}
254 | 
255 | 	/**
256 | 	 * 加载 wordsXXX.dic 文件类。
257 | 	 *
258 | 	 * @author chenlb 2009-10-15 下午02:12:55
259 | 	 */
260 | 	private static class WordsFileLoading implements FileLoading {
261 | 		final Map<Character, CharNode> dic;
262 | 
263 | 		/**
264 | 		 * @param dic 加载的词，保存在此结构中。
265 | 		 */
266 | 		public WordsFileLoading(Map<Character, CharNode> dic) {
267 | 			this.dic = dic;
268 | 		}
269 | 
270 | 		public void row(String line, int n) {
271 | 			if(line.length() < 2) {
272 | 				return;
273 | 			}
274 | 			CharNode cn = dic.get(line.charAt(0));
275 | 			if(cn == null) {
276 | 				cn = new CharNode();
277 | 				dic.put(line.charAt(0), cn);
278 | 			}
279 | 			cn.addWordTail(tail(line));
280 | 		}
281 | 	}
282 | 
283 | 	/**
284 | 	 * 加载词文件的模板
285 | 	 * @return 文件总行数
286 | 	 */
287 | 	public static int load(InputStream fin, FileLoading loading) throws IOException {
288 | 		BufferedReader br = new BufferedReader(
289 | 				new InputStreamReader(new BufferedInputStream(fin), "UTF-8"));
290 | 		String line = null;
291 | 		int n = 0;
292 | 		while((line = br.readLine()) != null) {
293 | 			if(line == null || line.startsWith("#")) {
294 | 				continue;
295 | 			}
296 | 			n++;
297 | 			loading.row(line, n);
298 | 		}
299 | 		return n;
300 | 	}
301 | 
302 | 	/**
303 | 	 * 取得 str 除去第一个char的部分
304 | 	 * @author chenlb 2009-3-3 下午10:05:26
305 | 	 */
306 | 	private static char[] tail(String str) {
307 | 		char[] cs = new char[str.length()-1];
308 | 		str.getChars(1, str.length(), cs, 0);
309 | 		return cs;
310 | 	}
311 | 
312 | 	public static interface FileLoading {
313 | 		/**
314 | 		 * @param line 读出的一行
315 | 		 * @param n 当前第几行
316 | 		 * @author chenlb 2009-3-3 下午09:55:54
317 | 		 */
318 | 		void row(String line, int n);
319 | 	}
320 | 
321 | 	/**
322 | 	 * 把 wordsFile 文件的最后更新时间加记录下来.
323 | 	 * @param wordsFile 非 null
324 | 	 */
325 | 	private synchronized void addLastTime(File wordsFile) {
326 | 		if(wordsFile != null) {
327 | 			wordsLastTime.put(wordsFile, wordsFile.lastModified());
328 | 		}
329 | 	}
330 | 
331 | 	/**
332 | 	 * 词典文件是否有修改过
333 | 	 * @return
334 | 	 */
335 | 	public synchronized boolean wordsFileIsChange() {
336 | 		//检查是否有修改文件,包括删除的
337 | 		for(Entry<File, Long> flt : wordsLastTime.entrySet()) {
338 | 			File words = flt.getKey();
339 | 			if(!words.canRead()) {	//可能是删除了
340 | 				return true;
341 | 			}
342 | 			if(words.lastModified() > flt.getValue()) {	//更新了文件
343 | 				return true;
344 | 			}
345 | 		}
346 | 		//检查是否有新文件
347 | 		File[] words = listWordsFiles();
348 | 		if(words != null) {
349 | 			for(File wordsFile : words) {
350 | 				if(!wordsLastTime.containsKey(wordsFile)) {	//有新词典文件
351 | 					return true;
352 | 				}
353 | 			}
354 | 		}
355 | 		return false;
356 | 	}
357 | 
358 | 	/**
359 | 	 * 全新加载词库，没有成功加载会回滚。<P/>
360 | 	 * 注意：重新加载时，务必有两倍的词库树结构的内存，默认词库是 50M/个 左右。否则抛出 OOM。
361 | 	 * @return 是否成功加载
362 | 	 */
363 | 	public synchronized boolean reload() {
364 | 		Map<File, Long> oldWordsLastTime = new HashMap<File, Long>(wordsLastTime);
365 | 		Map<Character, CharNode> oldDict = dict;
366 | 		Map<Character, Object> oldUnit = unit;
367 | 
368 | 		try {
369 | 			wordsLastTime.clear();
370 | 			dict = loadDic(dicPath);
371 | 			unit = loadUnit(dicPath);
372 | 			lastLoadTime = System.currentTimeMillis();
373 | 		} catch (IOException e) {
374 | 			//rollback
375 | 			wordsLastTime.putAll(oldWordsLastTime);
376 | 			dict = oldDict;
377 | 			unit = oldUnit;
378 | 
379 | 			if(log.isLoggable(Level.WARNING)) {
380 | 				log.log(Level.WARNING, "reload dic error! dic="+dicPath+", and rollbacked.", e);
381 | 			}
382 | 
383 | 			return false;
384 | 		}
385 | 		return true;
386 | 	}
387 | 
388 | 	/**
389 | 	 * word 能否在词库里找到
390 | 	 * @author chenlb 2009-3-3 下午11:10:45
391 | 	 */
392 | 	public boolean match(String word) {
393 | 		if(word == null || word.length() < 2) {
394 | 			return false;
395 | 		}
396 | 		CharNode cn = dict.get(word.charAt(0));
397 | 		return search(cn, word.toCharArray(), 0, word.length()-1) >= 0;
398 | 	}
399 | 
400 | 	public CharNode head(char ch) {
401 | 		return dict.get(ch);
402 | 	}
403 | 
404 | 	/**
405 | 	 * sen[offset] 后 tailLen 长的词是否存在.
406 | 	 * @see CharNode#indexOf(char[], int, int)
407 | 	 * @author chenlb 2009-4-8 下午11:13:49
408 | 	 */
409 | 	public int search(CharNode node, char[] sen, int offset, int tailLen) {
410 | 		if(node != null) {
411 | 			return node.indexOf(sen, offset, tailLen);
412 | 		}
413 | 		return -1;
414 | 	}
415 | 
416 | 	public int maxMatch(char[] sen, int offset) {
417 | 		CharNode node = dict.get(sen[offset]);
418 | 		return maxMatch(node, sen, offset);
419 | 	}
420 | 
421 | 	public int maxMatch(CharNode node, char[] sen, int offset) {
422 | 		if(node != null) {
423 | 			return node.maxMatch(sen, offset+1);
424 | 		}
425 | 		return 0;
426 | 	}
427 | 
428 | 	public ArrayList<Integer> maxMatch(CharNode node, ArrayList<Integer> tailLens, char[] sen, int offset) {
429 | 		tailLens.clear();
430 | 		tailLens.add(0);
431 | 		if(node != null) {
432 | 			return node.maxMatch(tailLens, sen, offset+1);
433 | 		}
434 | 		return tailLens;
435 | 	}
436 | 
437 | 	public boolean isUnit(Character ch) {
438 | 		return unit.containsKey(ch);
439 | 	}
440 | 
441 | 	/**
442 | 	 * 当 words.dic 是从 jar 里加载时, 可能 defalut 不存在
443 | 	 */
444 | 	public static File getDefalutPath() {
445 | 		if(defalutPath == null) {
446 | 			String defPath = System.getProperty("mmseg.dic.path");
447 | 			log.info("look up in mmseg.dic.path="+defPath);
448 | 			if(defPath == null) {
449 | 				URL url = Dictionary.class.getClassLoader().getResource("data");
450 | 				if(url != null) {
451 | 					defPath = url.getFile();
452 | 					log.info("look up in classpath="+defPath);
453 | 				} else {
454 | 					defPath = System.getProperty("user.dir")+"/data";
455 | 					log.info("look up in user.dir="+defPath);
456 | 				}
457 | 
458 | 			}
459 | 
460 | 			defalutPath = new File(defPath);
461 | 			if(!defalutPath.exists()) {
462 | 				log.warning("defalut dic path="+defalutPath+" not exist");
463 | 			}
464 | 		}
465 | 		return defalutPath;
466 | 	}
467 | 
468 | 	/**
469 | 	 * 仅仅用来观察词库.
470 | 	 */
471 | 	public Map<Character, CharNode> getDict() {
472 | 		return dict;
473 | 	}
474 | 
475 | 	/**
476 | 	 * 注意：当 words.dic 是从 jar 里加载时，此时 File 可能是不存在的。
477 | 	 */
478 | 	public File getDicPath() {
479 | 		return dicPath;
480 | 	}
481 | 
482 | 	/** 最后加载词库的时间 */
483 | 	public long getLastLoadTime() {
484 | 		return lastLoadTime;
485 | 	}
486 | }
487 | 


--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/MMSeg.java:
--------------------------------------------------------------------------------
  1 | package com.chenlb.mmseg4j;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.IOException;
  5 | import java.io.PushbackReader;
  6 | import java.io.Reader;
  7 | import java.util.LinkedList;
  8 | import java.util.Queue;
  9 | 
 10 | /**
 11 |  * Reader 流的分词(有字母,数字等), 析出中文(其实是 CJK)成句子 {@link Sentence} 再对 mmseg 算法分词.<br/>
 12 |  * 
 13 |  * 非线程安全
 14 |  * @author chenlb 2009-9-20下午10:41:41
 15 |  */
 16 | public class MMSeg {
 17 | 	
 18 | 	private PushbackReader reader;
 19 | 	private Seg seg;
 20 | 	
 21 | 	private StringBuilder bufSentence = new StringBuilder(256);
 22 | 	private Sentence currentSentence;
 23 | 	private Queue<Word> bufWord;	// word 缓存, 因为有 chunk 分析三个以上.
 24 | 	
 25 | 	public MMSeg(Reader input, Seg seg) {
 26 | 		this.seg = seg;
 27 | 		
 28 | 		reset(input);
 29 | 	}
 30 | 
 31 | 	private int readedIdx = 0;
 32 | 	
 33 | 	public void reset(Reader input) {
 34 | 		this.reader = new PushbackReader(new BufferedReader(input), 20);
 35 | 		currentSentence = null;
 36 | 		bufWord = new LinkedList<Word>();
 37 | 		bufSentence.setLength(0);
 38 | 		readedIdx = -1;
 39 | 	}
 40 | 	
 41 | 	private int readNext() throws IOException {
 42 | 		int d = reader.read();
 43 | 		if(d > -1) {
 44 | 			readedIdx++;
 45 | 			d = Character.toLowerCase(d);
 46 | 		}
 47 | 		return d;
 48 | 	}
 49 | 	
 50 | 	private void pushBack(int data) throws IOException {
 51 | 		readedIdx--;
 52 | 		reader.unread(data);
 53 | 	}
 54 | 
 55 | 	
 56 | 	public Word next() throws IOException {
 57 | 		//先从缓存中取
 58 | 		Word word = bufWord.poll();;
 59 | 		if(word == null) {
 60 | 			bufSentence.setLength(0);
 61 | 
 62 | 			int data = -1;
 63 | 			boolean read = true;
 64 | 			while(read && (data=readNext()) != -1) {
 65 | 				read = false;	//默认一次可以读出同一类字符,就可以分词内容
 66 | 				int type = Character.getType(data);
 67 | 				String wordType = Word.TYPE_WORD;
 68 | 				switch(type) {
 69 | 				case Character.UPPERCASE_LETTER:
 70 | 				case Character.LOWERCASE_LETTER:
 71 | 				case Character.TITLECASE_LETTER:
 72 | 				case Character.MODIFIER_LETTER:
 73 | 					/*
 74 | 					 * 1. 0x410-0x44f -> А-я	//俄文
 75 | 					 * 2. 0x391-0x3a9 -> Α-Ω	//希腊大写
 76 | 					 * 3. 0x3b1-0x3c9 -> α-ω	//希腊小写
 77 | 					 */
 78 | 					data = toAscii(data);
 79 | 					NationLetter nl = getNation(data);
 80 | 					if(nl == NationLetter.UNKNOW) {
 81 | 						read = true;
 82 | 						break;
 83 | 					}
 84 | 					wordType = Word.TYPE_LETTER;
 85 | 					bufSentence.appendCodePoint(data);
 86 | 					switch(nl) {
 87 | 					case EN:
 88 | 						//字母后面的数字,如: VH049PA
 89 | 						ReadCharByAsciiOrDigit rcad = new ReadCharByAsciiOrDigit();
 90 | 						readChars(bufSentence, rcad);
 91 | 						if(rcad.hasDigit()) {
 92 | 							wordType = Word.TYPE_LETTER_OR_DIGIT;
 93 | 						}
 94 | 						//only english
 95 | 						//readChars(bufSentence, new ReadCharByAscii());
 96 | 						break;
 97 | 					case RA:
 98 | 						readChars(bufSentence, new ReadCharByRussia());
 99 | 						break;
100 | 					case GE:
101 | 						readChars(bufSentence, new ReadCharByGreece());
102 | 						break;
103 | 					}
104 | 					bufWord.add(createWord(bufSentence, wordType));
105 | 
106 | 					bufSentence.setLength(0);
107 | 
108 | 					break;
109 | 				case Character.OTHER_LETTER:
110 | 					/*
111 | 					 * 1. 0x3041-0x30f6 -> ぁ-ヶ	//日文(平|片)假名
112 | 					 * 2. 0x3105-0x3129 -> ㄅ-ㄩ	//注意符号
113 | 					 */
114 | 					bufSentence.appendCodePoint(data);
115 | 					readChars(bufSentence, new ReadCharByType(Character.OTHER_LETTER));
116 | 
117 | 					currentSentence = createSentence(bufSentence);
118 | 
119 | 					bufSentence.setLength(0);
120 | 
121 | 					break;
122 | 				case Character.DECIMAL_DIGIT_NUMBER:
123 | 					bufSentence.appendCodePoint(toAscii(data));
124 | 					readChars(bufSentence, new ReadCharDigit());	//读后面的数字, AsciiLetterOr
125 | 					wordType = Word.TYPE_DIGIT;
126 | 					int d = readNext();
127 | 					if(d > -1) {
128 | 						if(seg.isUnit(d)) {	//单位,如时间
129 | 							bufWord.add(createWord(bufSentence, startIdx(bufSentence)-1, Word.TYPE_DIGIT));	//先把数字添加(独立)
130 | 
131 | 							bufSentence.setLength(0);
132 | 
133 | 							bufSentence.appendCodePoint(d);
134 | 							wordType = Word.TYPE_WORD;	//单位是 word
135 | 						} else {	//后面可能是字母和数字
136 | 							pushBack(d);
137 | 							if(readChars(bufSentence, new ReadCharByAsciiOrDigit()) > 0) {	//如果有字母或数字都会连在一起.
138 | 								wordType = Word.TYPE_DIGIT_OR_LETTER;
139 | 							}
140 | 						}
141 | 					}
142 | 
143 | 					bufWord.add(createWord(bufSentence, wordType));
144 | 
145 | 
146 | 					bufSentence.setLength(0);	//缓存的字符清除
147 | 
148 | 					break;
149 | 				case Character.LETTER_NUMBER:
150 | 					// ⅠⅡⅢ 单分
151 | 					bufSentence.appendCodePoint(data);
152 | 					readChars(bufSentence, new ReadCharByType(Character.LETTER_NUMBER));
153 | 
154 | 					int startIdx = startIdx(bufSentence);
155 | 					for(int i=0; i<bufSentence.length(); i++) {
156 | 						bufWord.add(new Word(new char[] {bufSentence.charAt(i)}, startIdx++, Word.TYPE_LETTER_NUMBER));
157 | 					}
158 | 
159 | 					bufSentence.setLength(0);	//缓存的字符清除
160 | 
161 | 					break;
162 | 				case Character.OTHER_NUMBER:
163 | 					//①⑩㈠㈩⒈⒑⒒⒛⑴⑽⑾⒇ 连着用
164 | 					bufSentence.appendCodePoint(data);
165 | 					readChars(bufSentence, new ReadCharByType(Character.OTHER_NUMBER));
166 | 
167 | 					bufWord.add(createWord(bufSentence, Word.TYPE_OTHER_NUMBER));
168 | 					bufSentence.setLength(0);
169 | 					break;
170 | 				default :
171 | 					//其它认为无效字符
172 | 					read = true;
173 | 				}//switch
174 | 			}
175 | 				
176 | 			// 中文分词
177 | 			if(currentSentence != null) {
178 | 				do {
179 | 					Chunk chunk = seg.seg(currentSentence);
180 | 					for(int i=0; i<chunk.getCount(); i++) {
181 | 						bufWord.add(chunk.getWords()[i]);
182 | 					}
183 | 				} while (!currentSentence.isFinish());
184 | 				
185 | 				currentSentence = null;
186 | 			}
187 | 			
188 | 			word = bufWord.poll();
189 | 		}
190 | 		
191 | 		return word;
192 | 	}
193 | 	
194 | 	
195 | 	/**
196 | 	 * 读取下一串指定类型字符.
197 | 	 * 
198 | 	 * @author chenlb 2009-8-15下午09:09:50
199 | 	 */
200 | 	private static abstract class ReadChar {
201 | 		/**
202 | 		 * 这个字符是否读取, 不读取也不会读下一个字符.
203 | 		 */
204 | 		abstract boolean isRead(int codePoint);
205 | 		int transform(int codePoint) {
206 | 			return codePoint;
207 | 		}
208 | 	}
209 | 	
210 | 	/**
211 | 	 * 读取下一串指定类型的字符放到 bufSentence 中.
212 | 	 * @param bufSentence
213 | 	 * @param readChar 判断字符的细节.
214 | 	 * @return 返回读取的个数
215 | 	 * @throws IOException {@link #readNext()} 或 {@link #pushBack()} 抛出的.
216 | 	 */
217 | 	private int readChars(StringBuilder bufSentence, ReadChar readChar) throws IOException {
218 | 		int num = 0;
219 | 		int data = -1;
220 | 		while((data = readNext()) != -1) {
221 | 			int d = readChar.transform(data);
222 | 			if(readChar.isRead(d)) {
223 | 				bufSentence.appendCodePoint(d);
224 | 				num++;
225 | 			} else {	//不是数字回压,要下一步操作
226 | 				pushBack(data);
227 | 				break;
228 | 			}
229 | 		}
230 | 		return num;
231 | 	}
232 | 	
233 | 	/**读取数字*/
234 | 	private static class ReadCharDigit extends ReadChar {
235 | 
236 | 		boolean isRead(int codePoint) {
237 | 			int type = Character.getType(codePoint);
238 | 			return isDigit(type);
239 | 		}
240 | 		
241 | 		int transform(int codePoint) {
242 | 			return toAscii(codePoint);
243 | 		}
244 | 		
245 | 	}
246 | 	
247 | 	/**读取字母或数字*/
248 | 	private static class ReadCharByAsciiOrDigit extends ReadCharDigit {
249 | 
250 | 		private boolean hasDigit = false;
251 | 		boolean isRead(int codePoint) {
252 | 			boolean isRead = super.isRead(codePoint);
253 | 			hasDigit |= isRead;
254 | 			return isAsciiLetter(codePoint) || isRead;
255 | 		}
256 | 		boolean hasDigit() {
257 | 			return hasDigit;
258 | 		}
259 | 	}
260 | 	
261 | 	/**读取字母*/
262 | 	@SuppressWarnings("unused")
263 | 	private static class ReadCharByAscii extends ReadCharDigit {
264 | 		boolean isRead(int codePoint) {
265 | 			return isAsciiLetter(codePoint);
266 | 		}
267 | 	}
268 | 	
269 | 	/**读取俄语*/
270 | 	private static class ReadCharByRussia extends ReadCharDigit {
271 | 
272 | 		boolean isRead(int codePoint) {
273 | 			return isRussiaLetter(codePoint);
274 | 		}
275 | 		
276 | 	}
277 | 	
278 | 	/**读取希腊 */
279 | 	private static class ReadCharByGreece extends ReadCharDigit {
280 | 
281 | 		boolean isRead(int codePoint) {
282 | 			return isGreeceLetter(codePoint);
283 | 		}
284 | 		
285 | 	}
286 | 	
287 | 	/**读取指定类型的字符*/
288 | 	private static class ReadCharByType extends ReadChar {
289 | 		int charType;
290 | 		public ReadCharByType(int charType) {
291 | 			this.charType = charType;
292 | 		}
293 | 
294 | 		boolean isRead(int codePoint) {
295 | 			int type = Character.getType(codePoint);
296 | 			return type == charType;
297 | 		}
298 | 		
299 | 	}
300 | 	
301 | 	private Word createWord(StringBuilder bufSentence, String type) {
302 | 		return new Word(toChars(bufSentence), startIdx(bufSentence), type);
303 | 	}
304 | 	
305 | 	private Word createWord(StringBuilder bufSentence, int startIdx, String type) {
306 | 		return new Word(toChars(bufSentence), startIdx, type);
307 | 	}
308 | 	
309 | 	private Sentence createSentence(StringBuilder bufSentence) {
310 | 		return new Sentence(toChars(bufSentence), startIdx(bufSentence));
311 | 	}
312 | 	
313 | 	/**取得 bufSentence 的第一个字符在整个文本中的位置*/
314 | 	private int startIdx(StringBuilder bufSentence) {
315 | 		return readedIdx - bufSentence.length() + 1;
316 | 	}
317 | 	
318 | 	/**从 StringBuilder 里复制出 char[] */
319 | 	private static char[] toChars(StringBuilder bufSentence) {
320 | 		char[] chs = new char[bufSentence.length()];
321 | 		bufSentence.getChars(0, bufSentence.length(), chs, 0);
322 | 		return chs;
323 | 	}
324 | 	
325 | 	/**
326 | 	 * 双角转单角
327 | 	 */
328 | 	private static int toAscii(int codePoint) {
329 | 		if((codePoint>=65296 && codePoint<=65305)	//０-９
330 | 				|| (codePoint>=65313 && codePoint<=65338)	//Ａ-Ｚ
331 | 				|| (codePoint>=65345 && codePoint<=65370)	//ａ-ｚ
332 | 				) {	
333 | 			codePoint -= 65248;
334 | 		}
335 | 		return codePoint;
336 | 	}
337 | 	
338 | 	private static boolean isAsciiLetter(int codePoint) {
339 | 		return (codePoint >= 'A' && codePoint <= 'Z') || (codePoint >= 'a' && codePoint <= 'z');
340 | 	}
341 | 	
342 | 	private static boolean isRussiaLetter(int codePoint) {
343 | 		return (codePoint >= 'А' && codePoint <= 'я') || codePoint=='Ё' || codePoint=='ё';
344 | 	}
345 | 	
346 | 	private static boolean isGreeceLetter(int codePoint) {
347 | 		return (codePoint >= 'Α' && codePoint <= 'Ω') || (codePoint >= 'α' && codePoint <= 'ω');
348 | 	}
349 | 	/**
350 | 	 * EN -> 英语
351 | 	 * RA -> 俄语
352 | 	 * GE -> 希腊
353 | 	 * 
354 | 	 */
355 | 	private static enum NationLetter {EN, RA, GE, UNKNOW};
356 | 	
357 | 	private NationLetter getNation(int codePoint) {
358 | 		if(isAsciiLetter(codePoint)) {
359 | 			return NationLetter.EN;
360 | 		}
361 | 		if(isRussiaLetter(codePoint)) {
362 | 			return NationLetter.RA;
363 | 		}
364 | 		if(isGreeceLetter(codePoint)) {
365 | 			return NationLetter.GE;
366 | 		}
367 | 		return NationLetter.UNKNOW;
368 | 	}
369 | 	
370 | 	@SuppressWarnings("unused")
371 | 	private static boolean isCJK(int type) {
372 | 		return type == Character.OTHER_LETTER;
373 | 	}
374 | 	private static boolean isDigit(int type) {
375 | 		return type == Character.DECIMAL_DIGIT_NUMBER;
376 | 	}
377 | 	@SuppressWarnings("unused")
378 | 	private static boolean isLetter(int type) {
379 | 		return type <= Character.MODIFIER_LETTER && type >= Character.UPPERCASE_LETTER;
380 | 	}
381 | }
382 | 


--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/MaxWordSeg.java:
--------------------------------------------------------------------------------
 1 | package com.chenlb.mmseg4j;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | 
 6 | /**
 7 |  * 最多分词. 在ComplexSeg基础上把长的词拆.
 8 |  *
 9 |  * @author chenlb 2009-4-6 下午08:12:11
10 |  */
11 | public class MaxWordSeg extends ComplexSeg {
12 | 
13 | 	public MaxWordSeg(Dictionary dic) {
14 | 		super(dic);
15 | 	}
16 | 
17 | 	public Chunk seg(Sentence sen) {
18 | 
19 | 		Chunk chunk = super.seg(sen);
20 | 		if(chunk != null) {
21 | 			List<Word> cks = new ArrayList<Word>();
22 | 			for(int i=0; i<chunk.getCount(); i++) {
23 | 				Word word = chunk.words[i];
24 | 
25 | 				if(word.getLength() < 3) {
26 | 					cks.add(word);
27 | 				} else {
28 | 					char[] chs = word.getSen();
29 | 					int offset = word.getWordOffset(), n = 0, wordEnd = word.getWordOffset()+word.getLength();
30 | 					int senStartOffset = word.getStartOffset() - offset;	//sen 在文件中的位置
31 | 					int end = -1;	//上一次找到的位置
32 | 					for(; offset<wordEnd-1; offset++) {
33 | 						int idx = search(chs, offset, 1);
34 | 						if(idx > -1) {
35 | 							cks.add(new Word(chs, senStartOffset, offset, 2));
36 | 							end = offset+2;
37 | 							n++;
38 | 						} else if(offset >= end) {	//有单字
39 | 							cks.add(new Word(chs, senStartOffset, offset, 1));
40 | 							end = offset+1;
41 | 
42 | 						}
43 | 					}
44 | 					if(end > -1 && end < wordEnd) {
45 | 						cks.add(new Word(chs, senStartOffset, offset, 1));
46 | 					}
47 | 				}
48 | 
49 | 			}
50 | 			chunk.words = cks.toArray(new Word[cks.size()]);
51 | 			chunk.count = cks.size();
52 | 		}
53 | 
54 | 		return chunk;
55 | 	}
56 | 
57 | }
58 | 


--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/Seg.java:
--------------------------------------------------------------------------------
 1 | package com.chenlb.mmseg4j;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | 
 6 | /**
 7 |  * 分词抽象类.
 8 |  * 
 9 |  * @author chenlb 2009-3-16 下午09:15:30
10 |  */
11 | public abstract class Seg {
12 | 
13 | 	protected Dictionary dic;
14 | 	
15 | 	public Seg(Dictionary dic) {
16 | 		super();
17 | 		this.dic = dic;
18 | 	}
19 | 
20 | 	/**
21 | 	 * 输出 chunks, 调试用.
22 | 	 */
23 | 	protected void printChunk(List<Chunk> chunks) {
24 | 		for(Chunk ck : chunks) {
25 | 			System.out.println(ck+" -> "+ck.toFactorString());
26 | 		}
27 | 	}
28 | 	
29 | 	/**
30 | 	 * @see Dictionary#isUnit(Character)
31 | 	 */
32 | 	protected boolean isUnit(int codePoint) {
33 | 		return dic.isUnit((char) codePoint);
34 | 	}
35 | 	
36 | 	/**
37 | 	 * 查找chs[offset]后面的 tailLen个char是否为词.
38 | 	 * @return 返回chs[offset]字符结点下的词尾索引号,没找到返回 -1.
39 | 	 */
40 | 	protected int search(char[] chs, int offset, int tailLen) {
41 | 		if(tailLen == 0) {
42 | 			return -1;
43 | 		}
44 | 		CharNode cn = dic.head(chs[offset]);
45 | 		
46 | 		return search(cn, chs, offset, tailLen);
47 | 	}
48 | 	
49 | 	/**
50 | 	 * 没有数组的复制.
51 | 	 * @author chenlb 2009-4-8 下午11:39:15
52 | 	 */
53 | 	protected int search(CharNode cn, char[] chs, int offset, int tailLen) {
54 | 		if(tailLen == 0 || cn == null) {
55 | 			return -1;
56 | 		}
57 | 		return dic.search(cn, chs, offset, tailLen);
58 | 	}
59 | 	
60 | 	/**
61 | 	 * 最大匹配<br/>
62 | 	 * 从 chs[offset] 开始匹配, 同时把 chs[offset] 的字符结点保存在 cns[cnIdx]
63 | 	 * @return 最大匹配到的词尾长, > 0 找到
64 | 	 */
65 | 	protected int maxMatch(CharNode[] cns, int cnIdx, char[] chs, int offset) {
66 | 		CharNode cn = null;
67 | 		if(offset < chs.length) {
68 | 			cn = dic.head(chs[offset]);
69 | 		}
70 | 		cns[cnIdx] = cn;
71 | 		return dic.maxMatch(cn, chs, offset);
72 | 	}
73 | 	
74 | 	/**
75 | 	 * 匹配,同时找出长度. <br/>
76 | 	 * 从 chs[offset] 开始找所有匹配的词, 找到的放到 tailLens[tailLensIdx] 中. <br/>
77 | 	 * 同时把 chs[offset] 的字符结点保存在 cns[cnIdx].
78 | 	 * @author chenlb 2009-4-12 上午10:37:58
79 | 	 */
80 | 	protected void maxMatch(CharNode[] cns, int cnIdx, char[] chs, int offset, ArrayList<Integer>[] tailLens, int tailLensIdx) {
81 | 		CharNode cn = null;
82 | 		if(offset < chs.length) {
83 | 			cn = dic.head(chs[offset]);
84 | 		}
85 | 		cns[cnIdx] = cn;
86 | 		dic.maxMatch(cn, tailLens[tailLensIdx], chs, offset);
87 | 	}
88 | 	
89 | 	/**
90 | 	 * 对句子 sen 进行分词.
91 | 	 * @return 不返回 null.
92 | 	 */
93 | 	public abstract Chunk seg(Sentence sen);
94 | }
95 | 


--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/Sentence.java:
--------------------------------------------------------------------------------
 1 | package com.chenlb.mmseg4j;
 2 | 
 3 | /**
 4 |  * 句子, 在一大串字符中断出连续中文的文本.
 5 |  * 
 6 |  * @author chenlb 2009-3-3 下午11:56:53
 7 |  */
 8 | public class Sentence {
 9 | 
10 | 	private char[] text;
11 | 	private int startOffset;
12 | 	
13 | 	private int offset;
14 | 
15 | 	public Sentence() {
16 | 		text = new char[0];
17 | 	}
18 | 	
19 | 	public Sentence(char[] text, int startOffset) {
20 | 		reinit(text, startOffset);
21 | 	}
22 | 
23 | 	public void reinit(char[] text, int startOffset) {
24 | 		this.text = text;
25 | 		this.startOffset = startOffset;
26 | 		offset = 0;
27 | 	}
28 | 	
29 | 	public char[] getText() {
30 | 		return text;
31 | 	}
32 | 
33 | 	/** 句子开始处理的偏移位置 */
34 | 	public int getOffset() {
35 | 		return offset;
36 | 	}
37 | 
38 | 	/** 句子开始处理的偏移位置 */
39 | 	public void setOffset(int offset) {
40 | 		this.offset = offset;
41 | 	}
42 | 
43 | 	public void addOffset(int inc) {
44 | 		offset += inc;
45 | 	}
46 | 
47 | 	/** 句子处理完成 */
48 | 	public boolean isFinish() {
49 | 		return offset >= text.length;
50 | 	}
51 | 
52 | 	/** 句子在文本中的偏移位置 */
53 | 	public int getStartOffset() {
54 | 		return startOffset;
55 | 	}
56 | 
57 | 	/** 句子在文本中的偏移位置 */
58 | 	public void setStartOffset(int startOffset) {
59 | 		this.startOffset = startOffset;
60 | 	}
61 | }
62 | 


--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/SimpleSeg.java:
--------------------------------------------------------------------------------
 1 | package com.chenlb.mmseg4j;
 2 | 
 3 | 
 4 | /**
 5 |  * 正向最大匹配的分词方式.
 6 |  * 
 7 |  * @author chenlb 2009-3-16 下午09:07:36
 8 |  */
 9 | public class SimpleSeg extends Seg{
10 | 	
11 | 	public SimpleSeg(Dictionary dic) {
12 | 		super(dic);
13 | 	}
14 | 
15 | 	public Chunk seg(Sentence sen) {
16 | 		Chunk chunk = new Chunk();
17 | 		char[] chs = sen.getText();
18 | 		for(int k=0; k<3&&!sen.isFinish(); k++) {
19 | 			int offset = sen.getOffset();
20 | 			int maxLen = 0;
21 | 
22 | 			//有了 key tree 的支持可以从头开始 max match
23 | 			maxLen = dic.maxMatch(chs, offset);
24 | 			
25 | 			chunk.words[k] = new Word(chs, sen.getStartOffset(), offset, maxLen+1);
26 | 
27 | 			offset += maxLen + 1;
28 | 			sen.setOffset(offset);
29 | 		}
30 | 		
31 | 		return chunk;
32 | 	}
33 | }
34 | 


--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/Word.java:
--------------------------------------------------------------------------------
  1 | package com.chenlb.mmseg4j;
  2 | 
  3 | /**
  4 |  * 类似 lucene 的 token
  5 |  * 
  6 |  * @author chenlb 2009-8-15下午10:23:32
  7 |  */
  8 | public class Word {
  9 | 
 10 | 	public static final String TYPE_WORD = "word";
 11 | 	public static final String TYPE_LETTER = "letter";
 12 | 	/** 字母开头的"字母或数字" */
 13 | 	public static final String TYPE_LETTER_OR_DIGIT = "letter_or_digit";
 14 | 	public static final String TYPE_DIGIT = "digit";
 15 | 	/** 数字开头的"字母或数字" */
 16 | 	public static final String TYPE_DIGIT_OR_LETTER = "digit_or_letter";
 17 | 	public static final String TYPE_LETTER_NUMBER = "letter_number";
 18 | 	public static final String TYPE_OTHER_NUMBER = "other_number";
 19 | 	
 20 | 	private int degree = -1;
 21 | 	private int startOffset;
 22 | 	
 23 | 	private char[] sen;
 24 | 	private int offset;
 25 | 	private int len;
 26 | 	
 27 | 	private String type = TYPE_WORD;	//类似 lucene token  的 type
 28 | 
 29 | 	/**
 30 | 	 * @param startOffset word 在整个文本中的偏移位置
 31 | 	 */
 32 | 	public Word(char[] word, int startOffset) {
 33 | 		super();
 34 | 		this.sen = word;
 35 | 		this.startOffset = startOffset;
 36 | 		offset = 0;
 37 | 		len = word.length;
 38 | 	}
 39 | 	
 40 | 	/**
 41 | 	 * @param startOffset word 在整个文本中的偏移位置
 42 | 	 */
 43 | 	public Word(char[] word, int startOffset, String wordType) {
 44 | 		this(word, startOffset);
 45 | 		this.type = wordType;
 46 | 	}
 47 | 	
 48 | 	/**
 49 | 	 * sen[offset] 开始的 len 个字符才是此 word
 50 | 	 * @param senStartOffset sen 在整个文本中的偏移位置
 51 | 	 * @param offset 词在 sen 的偏移位置
 52 | 	 * @param len 词长
 53 | 	 */
 54 | 	public Word(char[] sen, int senStartOffset, int offset, int len) {
 55 | 		super();
 56 | 		this.sen = sen;
 57 | 		this.startOffset = senStartOffset;
 58 | 		this.offset = offset;
 59 | 		this.len = len;
 60 | 	}
 61 | 
 62 | 	public String getString() {
 63 | 		return new String(getSen(), getWordOffset(), getLength());
 64 | 	}
 65 | 	
 66 | 	public String toString() {
 67 | 		return getString();
 68 | 	}
 69 | 	/**
 70 | 	 * 词在 char[] sen 的偏移位置
 71 | 	 * @see #getSen()
 72 | 	 */
 73 | 	public int getWordOffset() {
 74 | 		return offset;
 75 | 	}
 76 | 	
 77 | 	public int getLength() {
 78 | 		return len;
 79 | 	}
 80 | 
 81 | 	public char[] getSen() {
 82 | 		return sen;
 83 | 	}
 84 | 	
 85 | 	/**此 word 在整个文本中的偏移位置*/
 86 | 	public int getStartOffset() {
 87 | 		return startOffset+offset;
 88 | 	}
 89 | 	public int getEndOffset() {
 90 | 		return getStartOffset() + getLength();
 91 | 	}
 92 | 	public int getDegree() {
 93 | 		return degree;
 94 | 	}
 95 | 	public void setDegree(int degree) {
 96 | 		this.degree = degree;
 97 | 	}
 98 | 	public String getType() {
 99 | 		return type;
100 | 	}
101 | 	public void setType(String type) {
102 | 		this.type = type;
103 | 	}
104 | }
105 | 


--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/example/Complex.java:
--------------------------------------------------------------------------------
 1 | package com.chenlb.mmseg4j.example;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.IOException;
 5 | import java.io.InputStreamReader;
 6 | import java.io.Reader;
 7 | import java.io.StringReader;
 8 | 
 9 | import com.chenlb.mmseg4j.ComplexSeg;
10 | import com.chenlb.mmseg4j.Dictionary;
11 | import com.chenlb.mmseg4j.MMSeg;
12 | import com.chenlb.mmseg4j.Seg;
13 | import com.chenlb.mmseg4j.Word;
14 | 
15 | public class Complex {
16 | 
17 | 	protected Dictionary dic;
18 | 	
19 | 	public Complex() {
20 | 		dic = Dictionary.getInstance();
21 | 	}
22 | 
23 | 	protected Seg getSeg() {
24 | 		return new ComplexSeg(dic);
25 | 	}
26 | 	
27 | 	public String segWords(Reader input, String wordSpilt) throws IOException {
28 | 		StringBuilder sb = new StringBuilder();
29 | 		Seg seg = getSeg();	//取得不同的分词具体算法
30 | 		MMSeg mmSeg = new MMSeg(input, seg);
31 | 		Word word = null;
32 | 		boolean first = true;
33 | 		while((word=mmSeg.next())!=null) {
34 | 			if(!first) {
35 | 				sb.append(wordSpilt);
36 | 			}
37 | 			String w = word.getString();
38 | 			sb.append(w);
39 | 			first = false;
40 | 			
41 | 		}
42 | 		return sb.toString();
43 | 	}
44 | 	
45 | 	public String segWords(String txt, String wordSpilt) throws IOException {
46 | 		return segWords(new StringReader(txt), wordSpilt);
47 | 	}
48 | 	
49 | 	private void printlnHelp() {
50 | 		System.out.println("\n\t-- 说明: 输入 QUIT 或 EXIT 退出");
51 |         System.out.print("\nmmseg4j-"+this.getClass().getSimpleName().toLowerCase()+">");
52 | 	}
53 | 	
54 | 	protected void run(String[] args) throws IOException {
55 | 		String txt = "京华时报２００８年1月23日报道 昨天，受一股来自中西伯利亚的强冷空气影响，本市出现大风降温天气，白天最高气温只有零下7摄氏度，同时伴有6到7级的偏北风。";
56 | 		
57 | 		if(args.length > 0) {
58 | 			txt = args[0];
59 | 		}
60 | 		
61 | 		System.out.println(segWords(txt, " | "));
62 | 		printlnHelp();
63 | 		String inputStr = null;
64 |         BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
65 |         while((inputStr = br.readLine()) != null) {
66 |             if(inputStr.equals("QUIT") || inputStr.equals("EXIT")) {
67 |                 System.exit(0);
68 |             } else if("".equals(inputStr)) {
69 |             	printlnHelp();
70 |             } else {
71 |             	//System.out.println(inputStr);
72 |             	System.out.println(segWords(inputStr, " | "));    //分词
73 |             	System.out.print("\nmmseg4j-"+this.getClass().getSimpleName().toLowerCase()+">");
74 |             }
75 |         }
76 | 	}
77 | 	
78 | 	public static void main(String[] args) throws IOException {
79 | 		
80 | 		new Complex().run(args);
81 | 	}
82 | 
83 | }
84 | 


--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/example/MaxWord.java:
--------------------------------------------------------------------------------
 1 | package com.chenlb.mmseg4j.example;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | import com.chenlb.mmseg4j.MaxWordSeg;
 6 | import com.chenlb.mmseg4j.Seg;
 7 | 
 8 | public class MaxWord extends Complex {
 9 | 
10 | 	protected Seg getSeg() {
11 | 
12 | 		return new MaxWordSeg(dic);
13 | 	}
14 | 
15 | 	public static void main(String[] args) throws IOException {
16 | 		new MaxWord().run(args);
17 | 	}
18 | }
19 | 


--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/example/Simple.java:
--------------------------------------------------------------------------------
 1 | package com.chenlb.mmseg4j.example;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | import com.chenlb.mmseg4j.Seg;
 6 | import com.chenlb.mmseg4j.SimpleSeg;
 7 | 
 8 | /**
 9 |  * 
10 |  * @author chenlb 2009-3-14 上午12:38:40
11 |  */
12 | public class Simple extends Complex {
13 | 	
14 | 	protected Seg getSeg() {
15 | 
16 | 		return new SimpleSeg(dic);
17 | 	}
18 | 
19 | 	public static void main(String[] args) throws IOException {
20 | 		new Simple().run(args);
21 | 	}
22 | 
23 | }
24 | 


--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/rule/LargestAvgLenRule.java:
--------------------------------------------------------------------------------
 1 | package com.chenlb.mmseg4j.rule;
 2 | 
 3 | import com.chenlb.mmseg4j.Chunk;
 4 | 
 5 | /**
 6 |  * Largest Average Word Length.<p/>
 7 |  * 
 8 |  * 长度(Length)/词数
 9 |  * 
10 |  * @see http://technology.chtsai.org/mmseg/
11 |  * 
12 |  * @author chenlb 2009-3-16 上午11:28:21
13 |  */
14 | public class LargestAvgLenRule extends Rule {
15 | 
16 | 	private double largestAvgLen;
17 | 	
18 | 	@Override
19 | 	public void addChunk(Chunk chunk) {
20 | 		if(chunk.getAvgLen() >= largestAvgLen) {
21 | 			largestAvgLen = chunk.getAvgLen();
22 | 			super.addChunk(chunk);
23 | 		}
24 | 	}
25 | 
26 | 	@Override
27 | 	protected boolean isRemove(Chunk chunk) {
28 | 		return chunk.getAvgLen() < largestAvgLen;
29 | 	}
30 | 
31 | 	@Override
32 | 	public void reset() {
33 | 		largestAvgLen = 0;
34 | 		super.reset();
35 | 	}
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/rule/LargestSumDegreeFreedomRule.java:
--------------------------------------------------------------------------------
 1 | package com.chenlb.mmseg4j.rule;
 2 | 
 3 | import com.chenlb.mmseg4j.Chunk;
 4 | 
 5 | /**
 6 |  * Largest Sum of Degree of Morphemic Freedom of One-Character. <p/>
 7 |  * 
 8 |  * 各单字词词频的对数之和*100
 9 |  * 
10 |  * @see http://technology.chtsai.org/mmseg/
11 |  * 
12 |  * @author chenlb 2009-3-16 上午11:28:30
13 |  */
14 | public class LargestSumDegreeFreedomRule extends Rule {
15 | 
16 | 	private int largestSumDegree = Integer.MIN_VALUE;
17 | 	@Override
18 | 	public void addChunk(Chunk chunk) {
19 | 		if(chunk.getSumDegree() >= largestSumDegree) {
20 | 			largestSumDegree = chunk.getSumDegree();
21 | 			super.addChunk(chunk);
22 | 		}
23 | 	}
24 | 
25 | 	@Override
26 | 	public void reset() {
27 | 		largestSumDegree = Integer.MIN_VALUE;
28 | 		super.reset();
29 | 	}
30 | 
31 | 	@Override
32 | 	protected boolean isRemove(Chunk chunk) {
33 | 		
34 | 		return chunk.getSumDegree() < largestSumDegree;
35 | 	}
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/rule/MaxMatchRule.java:
--------------------------------------------------------------------------------
 1 | package com.chenlb.mmseg4j.rule;
 2 | 
 3 | import com.chenlb.mmseg4j.Chunk;
 4 | 
 5 | /**
 6 |  * Maximum Matching.<p/>
 7 |  * 
 8 |  * chuck中各个词的长度之和
 9 |  * 
10 |  * @see http://technology.chtsai.org/mmseg/
11 |  * 
12 |  * @author chenlb 2009-3-16 上午09:47:51
13 |  */
14 | public class MaxMatchRule extends Rule{
15 | 
16 | 	private int maxLen;
17 | 	
18 | 	public void addChunk(Chunk chunk) {
19 | 		if(chunk.getLen() >= maxLen) {
20 | 			maxLen = chunk.getLen();
21 | 			super.addChunk(chunk);
22 | 		}
23 | 	}
24 | 	
25 | 	@Override
26 | 	protected boolean isRemove(Chunk chunk) {
27 | 		
28 | 		return chunk.getLen() < maxLen;
29 | 	}
30 | 
31 | 	public void reset() {
32 | 		maxLen = 0;
33 | 		super.reset();
34 | 	}
35 | }
36 | 


--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/rule/Rule.java:
--------------------------------------------------------------------------------
 1 | package com.chenlb.mmseg4j.rule;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.Iterator;
 5 | import java.util.List;
 6 | 
 7 | import com.chenlb.mmseg4j.Chunk;
 8 | 
 9 | /**
10 |  * 过虑规则的抽象类。
11 |  * 
12 |  * @author chenlb 2009-3-16 上午11:35:06
13 |  */
14 | public abstract class Rule {
15 | 
16 | 	protected List<Chunk> chunks;
17 | 	
18 | 	public void addChunks(List<Chunk> chunks) {
19 | 		for(Chunk chunk : chunks) {
20 | 			addChunk(chunk);
21 | 		}
22 | 	}
23 | 	
24 | 	/**
25 | 	 * 添加 chunk
26 | 	 * @throws NullPointerException, if chunk == null.
27 | 	 * @author chenlb 2009-3-16 上午11:34:17
28 | 	 */
29 | 	public void addChunk(Chunk chunk) {
30 | 		chunks.add(chunk);
31 | 	}
32 | 	
33 | 	/**
34 | 	 * @return 返回规则过虑后的结果。
35 | 	 * @author chenlb 2009-3-16 上午11:33:10
36 | 	 */
37 | 	public List<Chunk> remainChunks() {
38 | 		for(Iterator<Chunk> it=chunks.iterator(); it.hasNext();) {
39 | 			Chunk chunk = it.next();
40 | 			if(isRemove(chunk)) {
41 | 				it.remove();
42 | 			}
43 | 		}
44 | 		return chunks;
45 | 	}
46 | 	
47 | 	/**
48 | 	 * 判断 chunk 是否要删除。
49 | 	 * @author chenlb 2009-3-16 上午11:33:30
50 | 	 */
51 | 	protected abstract boolean isRemove(Chunk chunk);
52 | 	
53 | 	public void reset() {
54 | 		chunks = new ArrayList<Chunk>();
55 | 	}
56 | }
57 | 


--------------------------------------------------------------------------------
/src/main/java/com/chenlb/mmseg4j/rule/SmallestVarianceRule.java:
--------------------------------------------------------------------------------
 1 | package com.chenlb.mmseg4j.rule;
 2 | 
 3 | import com.chenlb.mmseg4j.Chunk;
 4 | 
 5 | /**
 6 |  * Smallest Variance of Word Lengths.<p/>
 7 |  * 
 8 |  * 标准差的平方
 9 |  * 
10 |  * @see http://technology.chtsai.org/mmseg/
11 |  * 
12 |  * @author chenlb 2009-3-16 上午11:28:27
13 |  */
14 | public class SmallestVarianceRule extends Rule {
15 | 
16 | 	private double smallestVariance = Double.MAX_VALUE;
17 | 	
18 | 	@Override
19 | 	public void addChunk(Chunk chunk) {
20 | 		if(chunk.getVariance() <= smallestVariance) {
21 | 			smallestVariance = chunk.getVariance();
22 | 			super.addChunk(chunk);
23 | 		}
24 | 	}
25 | 
26 | 	@Override
27 | 	public void reset() {
28 | 		smallestVariance = Double.MAX_VALUE;
29 | 		super.reset();
30 | 	}
31 | 
32 | 	@Override
33 | 	protected boolean isRemove(Chunk chunk) {
34 | 		
35 | 		return chunk.getVariance() > smallestVariance;
36 | 	}
37 | 
38 | }
39 | 


--------------------------------------------------------------------------------
/src/main/resources/data/units.dic:
--------------------------------------------------------------------------------
 1 | #说明:单个字的单位, 与数字一起出现的(前面是数字). 如:2009年
 2 | #注意:如果"分"加入到些文件中,"20分钟"就被分成"20|分|钟". 如果你想分成"20|分钟",那就把"分"注释掉.
 3 | #    作者认为:像"分"和"分钟"都是单位的话,不把"分"加入.
 4 | #时间
 5 | 年
 6 | 月
 7 | 日
 8 | 时
 9 | #单位"分钟"已经是词,就不把"分"加入了
10 | #分
11 | 秒
12 | #币
13 | 元
14 | 角
15 | #长度
16 | 米
17 | 寸
18 | 尺
19 | 丈
20 | 里
21 | #容量
22 | 升
23 | 斗
24 | 石
25 | #重量
26 | 吨
27 | 克
28 | 斤
29 | 两
30 | 担
31 | #地积
32 | 亩
33 | 顷


--------------------------------------------------------------------------------
/src/test/java/com/chenlb/mmseg4j/ComplexSegTest.java:
--------------------------------------------------------------------------------
  1 | package com.chenlb.mmseg4j;
  2 | 
  3 | import java.io.IOException;
  4 | 
  5 | import org.junit.Assert;
  6 | import org.junit.Before;
  7 | import org.junit.Test;
  8 | 
  9 | import com.chenlb.mmseg4j.example.Complex;
 10 | 
 11 | public class ComplexSegTest {
 12 | 
 13 | 	Complex segW;
 14 | 	@Before
 15 | 	public void setUp() throws Exception {
 16 | 		segW = new Complex();
 17 | 		//ComplexSeg.setShowChunk(true);
 18 | 	}
 19 | 
 20 | 	/*public void testSeg() {
 21 | 		String txt = "";
 22 | 		txt = "各人发表关于受一股来自中西伯利亚的强冷空气影响";
 23 | 		ComplexSeg.setShowChunk(true);
 24 | 		ComplexSeg seg = new ComplexSeg(new Dictionary("dic"));	//sogou
 25 | 		Sentence sen = new Sentence(txt.toCharArray(), 0);
 26 | 		System.out.println();
 27 | 		while(!sen.isFinish()) {
 28 | 			Chunk chunk = seg.seg(sen);
 29 | 			System.out.println(chunk+" -> "+chunk.getStartOffset());
 30 | 		}
 31 | 	}*/
 32 | 
 33 | 	@Test
 34 | 	public void testEffect() throws IOException {
 35 | 		String words = segW.segWords("研究生命起源", "|");
 36 | 		Assert.assertEquals("研究|生命|起源", words);
 37 | 	}
 38 | 
 39 | 	@Test
 40 | 	public void testEffect1() throws IOException {
 41 | 		String words = segW.segWords("为首要考虑", "|");
 42 | 		Assert.assertEquals("为首|要|考虑", words);
 43 | 	}
 44 | 
 45 | 	@Test
 46 | 	public void testEffect2() throws IOException {
 47 | 		String words = segW.segWords("眼看就要来了", "|");
 48 | 		Assert.assertEquals("眼看|就要|来|了", words);
 49 | 	}
 50 | 
 51 | 	@Test
 52 | 	public void testEffect3() throws IOException {
 53 | 		String words = segW.segWords("中西伯利亚", "|");
 54 | 		Assert.assertEquals("中|西伯利亚", words);
 55 | 	}
 56 | 
 57 | 	@Test
 58 | 	public void testEffect4() throws IOException {
 59 | 		String words = segW.segWords("国际化", "|");
 60 | 		Assert.assertEquals("国际化", words);
 61 | 	}
 62 | 
 63 | 	@Test
 64 | 	public void testEffect5() throws IOException {
 65 | 		String words = segW.segWords("化装和服装", "|");
 66 | 		Assert.assertEquals("化装|和|服装", words);
 67 | 	}
 68 | 
 69 | 	@Test
 70 | 	public void testEffect6() throws IOException {
 71 | 		String words = segW.segWords("中国人民银行", "|");
 72 | 		Assert.assertEquals("中国人民银行", words);
 73 | 	}
 74 | 
 75 | 	/**
 76 | 	 * 自扩展的词库文件
 77 | 	 */
 78 | 	@Test
 79 | 	public void testEffect7() throws IOException {
 80 | 		String words = segW.segWords("白云山", "|");
 81 | 		Assert.assertEquals("白云山", words);
 82 | 	}
 83 | 
 84 | 	@Test
 85 | 	public void testEffect10() throws IOException {
 86 | 		String words = segW.segWords("清华大学", "|");
 87 | 		Assert.assertEquals("清华大学", words);
 88 | 	}
 89 | 
 90 | 	@Test
 91 | 	public void testEffect11() throws IOException {
 92 | 		String words = segW.segWords("华南理工大学", "|");
 93 | 		Assert.assertEquals("华南理工大学", words);
 94 | 	}
 95 | 
 96 | 	@Test
 97 | 	public void testEffect12() throws IOException {
 98 | 		String words = segW.segWords("广东工业大学", "|");
 99 | 		Assert.assertEquals("广东工业大学", words);
100 | 	}
101 | 
102 | 	@Test
103 | 	public void testUnitEffect() throws IOException {
104 | 		String words = segW.segWords("2008年底发了资金吗", "|");
105 | 		Assert.assertEquals("2008|年|底|发|了|资金|吗", words);
106 | 	}
107 | 
108 | 	@Test
109 | 	public void testUnitEffect1() throws IOException {
110 | 		String words = segW.segWords("20分钟能完成", "|");
111 | 		Assert.assertEquals("20|分钟|能|完成", words);
112 | 	}
113 | }
114 | 


--------------------------------------------------------------------------------
/src/test/java/com/chenlb/mmseg4j/DictionaryTest.java:
--------------------------------------------------------------------------------
 1 | package com.chenlb.mmseg4j;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | 
 6 | import org.junit.Assert;
 7 | import org.junit.Test;
 8 | 
 9 | public class DictionaryTest {
10 | 
11 | 	private void printMemory() {
12 | 		Runtime rt = Runtime.getRuntime();
13 | 		long total = rt.totalMemory();
14 | 		long free = rt.freeMemory();
15 | 		long max = rt.maxMemory();
16 | 		System.out.println(String.format("total=%dk, free=%dk, max=%dk, use=%dk", total/1024, free/1024, max/1024, (total-free)/1024));
17 | 	}
18 | 
19 | 	@Test
20 | 	public void testloadDicMemoryUse() {
21 | 		printMemory();
22 | 		Dictionary.getInstance();
23 | 		printMemory();
24 | 	}
25 | 
26 | 	@Test
27 | 	public void testloadDic() {
28 | 		Dictionary dic = Dictionary.getInstance();
29 | 		Dictionary dic2 = Dictionary.getInstance();
30 | 		Assert.assertTrue(dic == dic2);
31 | 
32 | 		dic.destroy();
33 | 		//reload
34 | 		dic2 = Dictionary.getInstance();
35 | 		Assert.assertTrue(dic != dic2);
36 | 		dic2.destroy();
37 | 	}
38 | 
39 | 	@Test
40 | 	public void testloadDicByPath() {
41 | 		Dictionary dic = Dictionary.getInstance("src");
42 | 		Dictionary dic2 = Dictionary.getInstance("./src");
43 | 		Assert.assertTrue(dic == dic2);
44 | 
45 | 		Assert.assertFalse(dic.match("自定义词"));
46 | 
47 | 		dic.destroy();
48 | 	}
49 | 
50 | 	@Test
51 | 	public void testloadMultiDic() {
52 | 		Dictionary dic = Dictionary.getInstance();
53 | 
54 | 		Assert.assertTrue(dic.match("自定义词"));
55 | 	}
56 | 
57 | 	@Test
58 | 	public void testMatch() {
59 | 		Dictionary dic = Dictionary.getInstance();
60 | 
61 | 		Assert.assertTrue(dic.match("词典"));
62 | 
63 | 		Assert.assertFalse(dic.match("人个"));
64 | 		Assert.assertFalse(dic.match("三个人"));
65 | 
66 | 		Assert.assertFalse(dic.match(""));
67 | 		Assert.assertFalse(dic.match("人"));
68 | 
69 | 	}
70 | 
71 | 	@Test
72 | 	public void testFileHashCode() throws IOException {
73 | 		File f = new File("data");
74 | 		File f1 = new File("./data");
75 | 		Assert.assertFalse(f.equals(f1));
76 | 
77 | 		f1 = f.getAbsoluteFile();
78 | 		Assert.assertFalse(f.equals(f1));
79 | 
80 | 		Assert.assertTrue(f.getCanonicalFile().equals(f1.getCanonicalFile()));
81 | 
82 | 		f1 = new File("data");
83 | 		Assert.assertTrue(f.equals(f1));
84 | 	}
85 | }
86 | 


--------------------------------------------------------------------------------
/src/test/java/com/chenlb/mmseg4j/KeyTreeTest.java:
--------------------------------------------------------------------------------
 1 | package com.chenlb.mmseg4j;
 2 | 
 3 | import com.chenlb.mmseg4j.CharNode.KeyTree;
 4 | 
 5 | import junit.framework.TestCase;
 6 | 
 7 | public class KeyTreeTest extends TestCase {
 8 | 
 9 | 	protected void setUp() throws Exception {
10 | 		super.setUp();
11 | 	}
12 | 
13 | 	public void testMatch() {
14 | 		char[] w = "为什么".toCharArray();
15 | 		KeyTree kt = new KeyTree();
16 | 		kt.add(w);
17 | 		assertTrue(kt.match(w, 0, w.length));
18 | 		assertFalse(kt.match(w, 0, 2));
19 | 		assertFalse(kt.match("怎么样".toCharArray(), 0, 3));
20 | 		
21 | 		w = "国人民银行".toCharArray();
22 | 		kt.add(w);
23 | 		int tailLen = kt.maxMatch("中国人民银行".toCharArray(), 1);
24 | 		assertEquals(tailLen, w.length);
25 | 	}
26 | 	
27 | 	public void testMatch2() {
28 | 		Dictionary dic = Dictionary.getInstance();
29 | 		int tailLen = dic.maxMatch("中国人民银行".toCharArray(), 0);
30 | 		assertEquals(tailLen, 5);
31 | 	}
32 | }
33 | 


--------------------------------------------------------------------------------
/src/test/java/com/chenlb/mmseg4j/MMSegTest.java:
--------------------------------------------------------------------------------
 1 | package com.chenlb.mmseg4j;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.StringReader;
 5 | 
 6 | import junit.framework.TestCase;
 7 | 
 8 | public class MMSegTest extends TestCase {
 9 | 
10 | 	protected void setUp() throws Exception {
11 | 		super.setUp();
12 | 	}
13 | 
14 | 	public void testNext() throws IOException {
15 | 		String txt = "";
16 | 		txt = "京华时报1月23日报道 昨天，受一股来自中西伯利亚的强冷空气影响，本市出现大风降温天气，白天最高气温只有零下7摄氏度，同时伴有6到7级的偏北风。";
17 | 		txt = "研究生命起源";
18 | 		txt = "手机电子书    abc   http://www.sjshu.com";
19 | 		txt = "Apple 苹果 MacBook Pro MB991CH/A 13.3m寸宽屏笔记本(Ⅱ,⑩)";
20 | 		//txt = "２００９年ゥスぁま是中 ＡＢｃｃ国абвгαβγδ首次,我的ⅠⅡⅢ在chenёlbēū全国ㄦ范围ㄚㄞㄢ内①ē②㈠㈩⒈⒑发行地方政府债券，";
21 | 		Dictionary dic = Dictionary.getInstance();
22 | 		Seg seg = null;
23 | 		//seg = new SimpleSeg(dic);
24 | 		seg = new ComplexSeg(dic); 
25 | 		MMSeg mmSeg = new MMSeg(new StringReader(txt), seg);
26 | 		Word word = null;
27 | 		System.out.println();
28 | 		while((word=mmSeg.next())!=null) {
29 | 
30 | 			System.out.print(word.getString()+" -> "+word.getStartOffset());
31 | 			//offset += word.length;
32 | 			System.out.println(", "+word.getEndOffset()+", "+word.getType());
33 | 				
34 | 			
35 | 		}
36 | 		
37 | 	}
38 | 
39 | }
40 | 


--------------------------------------------------------------------------------
/src/test/java/com/chenlb/mmseg4j/MaxWordSegTest.java:
--------------------------------------------------------------------------------
  1 | package com.chenlb.mmseg4j;
  2 | 
  3 | import java.io.IOException;
  4 | 
  5 | import org.junit.Assert;
  6 | import org.junit.Before;
  7 | import org.junit.Ignore;
  8 | import org.junit.Test;
  9 | 
 10 | import com.chenlb.mmseg4j.example.MaxWord;
 11 | 
 12 | public class MaxWordSegTest {
 13 | 
 14 | 	MaxWord segW;
 15 | 	@Before
 16 | 	public void setUp() throws Exception {
 17 | 		segW = new MaxWord();
 18 | 	}
 19 | 
 20 | 	@Test
 21 | 	public void testEffect() throws IOException {
 22 | 		String words = segW.segWords("共和国", "|");
 23 | 		Assert.assertEquals("共和|国", words);
 24 | 	}
 25 | 
 26 | 	@Test
 27 | 	public void testEffect1() throws IOException {
 28 | 		String words = segW.segWords("中国人民银行", "|");
 29 | 		Assert.assertEquals("中国|国人|人民|银行", words);
 30 | 	}
 31 | 
 32 | 	@Test
 33 | 	public void testEffect2() throws IOException {
 34 | 		String words = segW.segWords("西伯利亚", "|");
 35 | 		Assert.assertEquals("西|伯|利|亚", words);
 36 | 	}
 37 | 
 38 | 	@Test
 39 | 	public void testEffect3() throws IOException {
 40 | 		String words = segW.segWords("中华人民共和国", "|");
 41 | 		Assert.assertEquals("中华|华人|人民|共和|国", words);
 42 | 	}
 43 | 
 44 | 	@Test
 45 | 	public void testEffect4() throws IOException {
 46 | 		String words = segW.segWords("羽毛球拍", "|");
 47 | 		Assert.assertEquals("羽毛|球拍", words);
 48 | 	}
 49 | 
 50 | 	@Test
 51 | 	public void testEffect5() throws IOException {
 52 | 		String words = segW.segWords("化装和服装", "|");
 53 | 		Assert.assertEquals("化装|和|服装", words);
 54 | 	}
 55 | 
 56 | 	@Test
 57 | 	public void testEffect6() throws IOException {
 58 | 		String words = segW.segWords("为什么", "|");
 59 | 		Assert.assertEquals("为|什么", words);
 60 | 	}
 61 | 
 62 | 	@Test
 63 |   @Ignore
 64 | 	public void testEffect7() throws IOException {
 65 | 		String words = segW.segWords("很好听", "|");
 66 |     // Complex 分出 '很|好听'
 67 |     // 目前 max-word 是在 complex 之后再分词的。
 68 | 		Assert.assertEquals("很好|好听", words);
 69 | 	}
 70 | 
 71 | 	@Test
 72 | 	public void testEffect8() throws IOException {
 73 | 		String words = segW.segWords("强冷空气", "|");
 74 | 		Assert.assertEquals("强|冷|空气", words);
 75 | 	}
 76 | 
 77 | 	/**
 78 | 	 * 自扩展的词库文件
 79 | 	 */
 80 | 	@Test
 81 | 	public void testEffect9() throws IOException {
 82 | 		String words = segW.segWords("白云山", "|");
 83 | 		Assert.assertEquals("白云|云山", words);
 84 | 	}
 85 | 
 86 | 	@Test
 87 | 	public void testEffect10() throws IOException {
 88 | 		String words = segW.segWords("清华大学", "|");
 89 | 		Assert.assertEquals("清华|大学", words);
 90 | 	}
 91 | 
 92 | 	@Test
 93 | 	public void testEffect11() throws IOException {
 94 | 		String words = segW.segWords("华南理工大学", "|");
 95 |     // '工大' 在词库中没有
 96 |     Assert.assertEquals("华南|理工|大学", words);
 97 | 	}
 98 | 
 99 | 	@Test
100 | 	public void testEffect12() throws IOException {
101 | 		String words = segW.segWords("广东工业大学", "|");
102 |     // '业大' 在词库中有
103 |     Assert.assertEquals("广东|工业|业大|大学", words);
104 | 	}
105 | }
106 | 


--------------------------------------------------------------------------------
/src/test/java/com/chenlb/mmseg4j/MyTest.java:
--------------------------------------------------------------------------------
  1 | package com.chenlb.mmseg4j;
  2 | 
  3 | import java.io.BufferedWriter;
  4 | import java.io.File;
  5 | import java.io.FileOutputStream;
  6 | import java.io.IOException;
  7 | import java.io.OutputStreamWriter;
  8 | import java.net.URISyntaxException;
  9 | import java.net.URL;
 10 | import java.util.ArrayList;
 11 | import java.util.Arrays;
 12 | import java.util.Collections;
 13 | import java.util.Comparator;
 14 | import java.util.List;
 15 | import java.util.Map;
 16 | import java.util.Map.Entry;
 17 | 
 18 | import org.junit.Ignore;
 19 | import org.junit.Test;
 20 | 
 21 | public class MyTest {
 22 | 
 23 | 	public void test100Log() {
 24 | 		int freq = 1034142;
 25 | 		print100Log(freq);
 26 | 
 27 | 		freq = 847332;
 28 | 		print100Log(freq);
 29 | 	}
 30 | 
 31 | 	private void print100Log(int freq) {
 32 | 		int my100Log = (int) (Math.log(freq) * 100);
 33 | 		System.out.println(freq+" -> "+my100Log+" | "+(Math.log(freq) * 100));
 34 | 	}
 35 | 
 36 | 	public void testDicPath() throws URISyntaxException {
 37 | 		URL url = Dictionary.class.getResource("/");
 38 | 		String path = "";
 39 | 		path = url.toURI().getRawPath();
 40 | 		System.out.println(path);
 41 | 		File f = new File(path+"data");
 42 | 		System.out.println(f+" -> "+f.exists());
 43 | 
 44 | 
 45 | 		path = url.toExternalForm();
 46 | 		System.out.println(path);
 47 | 
 48 | 		path = url.getPath();
 49 | 		System.out.println(path);
 50 | 
 51 | 		path = System.getProperty("user.dir");
 52 | 		System.out.println(path);
 53 | 	}
 54 | 
 55 | 	public void testZhNumCodeP() {
 56 | 		String num = "０１２３４５６７８９";
 57 | 		String n = "0123456789";
 58 | 		for(int i=0; i<num.length(); i++) {
 59 | 			int cp = num.codePointAt(i);
 60 | 			int ncp = n.codePointAt(i);
 61 | 			System.out.println((char)cp+" -> "+cp+", "+(char)ncp+" -> "+ncp);
 62 | 		}
 63 | 	}
 64 | 
 65 | 	public void testCodePAndType() {
 66 | 		String str = "09０９☆§┍┄○一＄￥≈∑①⑩㈠㈩⒈⒑⒒⒛⑴⑽⑾⒇！中文【ゥスぁまēūㄇㄎноνπⅠⅡⅢ";
 67 | 
 68 | 		str = "ぁぃぅぇぉかきくけこんさしすせそたちつってとゐなにぬねのはひふへほゑまみむめもゃゅょゎを";
 69 | 		str += "あいうえおがぎぐげござじずぜぞだぢづでどぱぴぷぺぽばびぶべぼらりるれろやゆよわ";
 70 | 
 71 | 		str += "ァィゥヴェォカヵキクケヶコサシスセソタチツッテトヰンナニヌネノハヒフヘホヱマミムメモャュョヮヲ";
 72 | 		str += "アイウエオガギグゲゴザジズゼゾダヂヅデドパピプペポバビブベボラリルレロヤユヨワ";
 73 | 
 74 | 		str = "āáǎàōóǒòêēéěèīíǐìūúǔùǖǘǚǜü";
 75 | 
 76 | 		/*str = "ㄅㄉˇˋㄓˊ˙ㄚㄞㄢㄦㄆㄊㄍㄐㄔㄗㄧㄛㄟㄣㄇㄋㄎㄑㄕㄘㄨㄜㄠㄤㄈㄌㄏㄒㄖㄙㄩㄝㄡㄥ";
 77 | 
 78 | 		str = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя";
 79 | 		str += "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ";*/
 80 | 
 81 | 		/*str = "αβγδεζηθικλμνξοπρστυφχψω";
 82 | 		str += "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ";*/
 83 | 
 84 | 		int[] cps = new int[str.length()];
 85 | 		for(int i=0; i<str.length(); i++) {
 86 | 			cps[i] = str.codePointAt(i);
 87 | 		}
 88 | 		Arrays.sort(cps);
 89 | 		StringBuilder sb = new StringBuilder();
 90 | 		for(int i=0; i<str.length(); i++) {
 91 | 			sb.setLength(0);
 92 | 			int cp = cps[i];//str.codePointAt(i);
 93 | 			sb.appendCodePoint(cp).append(" -> ").append(cp);
 94 | 			sb.append(", type=").append(Character.getType(cp));
 95 | 			sb.append(", hex=").append(Integer.toHexString(cp));
 96 | 			System.out.println(sb);
 97 | 		}
 98 | 	}
 99 | 
100 | 	public void testCodePAndType2() {
101 | 
102 | 		int start = 12435+1;
103 | 		int end = 12449-1;
104 | 
105 | 		start = 0xff21;
106 | 		end = 0xff5a;
107 | 
108 | 		StringBuilder sb = new StringBuilder();
109 | 		for(int i=start; i<=end; i++) {
110 | 			sb.setLength(0);
111 | 			int cp = i;//str.codePointAt(i);
112 | 			sb.appendCodePoint(cp).append(" -> ").append(cp);
113 | 			sb.append(", type=").append(Character.getType(cp));
114 | 			sb.append(", hex=").append(Integer.toHexString(cp));
115 | 			System.out.println(sb);
116 | 		}
117 | 	}
118 | 
119 | 	@Test
120 | 	@Ignore
121 | 	public void testShowUnicode() {
122 | 		int c = 0x2F81A;
123 | 		int mc = Character.toLowerCase(c);
124 | 		StringBuilder sb = new StringBuilder();
125 | 		sb.appendCodePoint(c).append(" --to low--> ").appendCodePoint(mc);
126 | 		System.out.println("c="+c+",mc="+mc+"\n"+sb);
127 | 	}
128 | 
129 | 	private static long now() {
130 | 		return System.currentTimeMillis();
131 | 	}
132 | 
133 | 	@Test
134 | 	@Ignore
135 | 	public void testSeeSogouDic() throws IOException {
136 | 		Dictionary dic = Dictionary.getInstance("sogou");
137 | 		Map<Character, CharNode> dict = dic.getDict();
138 | 		long start = now();
139 | 		List<Map.Entry<Character, CharNode>> es = new ArrayList<Map.Entry<Character,CharNode>>(dict.size());
140 | 		es.addAll(dict.entrySet());
141 | 		System.out.println("add use "+(now()-start)+"ms");
142 | 		start = now();
143 | 		Collections.sort(es, new Comparator<Map.Entry<Character, CharNode>>() {
144 | 
145 | 			public int compare(Entry<Character, CharNode> a,
146 | 					Entry<Character, CharNode> b) {
147 | 				int r = -new Integer(a.getValue().getMaxLen()).compareTo(b.getValue().getMaxLen());
148 | 				if(r == 0) {
149 | 					r = -new Integer(a.getValue().wordNum()).compareTo(b.getValue().wordNum());
150 | 				}
151 | 				if(r == 0) {
152 | 					r = -new Integer(a.getValue().getFreq()).compareTo(b.getValue().getFreq());
153 | 				}
154 | 				return r;
155 | 			}
156 | 
157 | 		});
158 | 		System.out.println("sort use "+(now()-start)+"ms");
159 | 		start = now();
160 | 		BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File("sogou/word-stat.txt")), "UTF-8"));
161 | 		writer.append("char").append('\t')
162 | 		.append("freq").append('\t')
163 | 		.append("maxLen").append('\t')
164 | 		.append("wordNum").append('\t')
165 | 		.append("lens").append("\r\n");
166 | 		for(Map.Entry<Character, CharNode> e : es) {
167 | 			CharNode cn = e.getValue();
168 | 			writer.append(e.getKey()).append('\t')
169 | 				.append(cn.getFreq()+"").append('\t')
170 | 				.append(cn.getMaxLen()+"").append('\t')
171 | 				.append(cn.wordNum()+"").append('\t')
172 | 				.append("\r\n");
173 | 		}
174 | 		writer.close();
175 | 		System.out.println("writer use "+(now()-start)+"ms");
176 | 	}
177 | }
178 | 


--------------------------------------------------------------------------------
/src/test/java/com/chenlb/mmseg4j/SimpleSegTest.java:
--------------------------------------------------------------------------------
 1 | package com.chenlb.mmseg4j;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | import org.junit.Assert;
 6 | import org.junit.Before;
 7 | import org.junit.Test;
 8 | 
 9 | import com.chenlb.mmseg4j.example.Simple;
10 | 
11 | public class SimpleSegTest {
12 | 
13 | 	Simple segW;
14 | 
15 | 	@Before
16 | 	public void setUp() throws Exception {
17 | 		segW = new Simple();
18 | 	}
19 | 
20 | 	@Test
21 | 	public void testEffect() throws IOException {
22 | 		String words = segW.segWords("研究生命起源", "|");
23 | 		Assert.assertEquals("研究生|命|起源", words);
24 | 	}
25 | 
26 | 	@Test
27 | 	public void testEffect1() throws IOException {
28 | 		String words = segW.segWords("为首要考虑", "|");
29 | 		Assert.assertEquals("为首|要|考虑", words);
30 | 	}
31 | 
32 | 	@Test
33 | 	public void testEffect2() throws IOException {
34 | 		String words = segW.segWords("眼看就要来了", "|");
35 | 		Assert.assertEquals("眼看|就要|来|了", words);
36 | 	}
37 | 
38 | 	@Test
39 | 	public void testEffect3() throws IOException {
40 | 		String words = segW.segWords("中西伯利亚", "|");
41 | 		Assert.assertEquals("中西|伯|利|亚", words);
42 | 	}
43 | 
44 | 	@Test
45 | 	public void testEffect4() throws IOException {
46 | 		String words = segW.segWords("国际化", "|");
47 | 		Assert.assertEquals("国际化", words);
48 | 	}
49 | 
50 | 	@Test
51 | 	public void testEffect5() throws IOException {
52 | 		String words = segW.segWords("化装和服装", "|");
53 | 		Assert.assertEquals("化装|和服|装", words);
54 | 	}
55 | 
56 | 	@Test
57 | 	public void testEffect6() throws IOException {
58 | 		String words = segW.segWords("中国人民银行", "|");
59 | 		Assert.assertEquals("中国人民银行", words);
60 | 	}
61 | 
62 | 	/**
63 | 	 * 自扩展的词库文件
64 | 	 */
65 | 	@Test
66 | 	public void testEffect7() throws IOException {
67 | 		String words = segW.segWords("白云山", "|");
68 | 		Assert.assertEquals("白云山", words);
69 | 	}
70 | 
71 | 	@Test
72 | 	public void testUnitEffect() throws IOException {
73 | 		String words = segW.segWords("2008年中有很多事情", "|");
74 | 		Assert.assertEquals("2008|年|中有|很多|事情", words);
75 | 	}
76 | 
77 | 	@Test
78 | 	public void testUnitEffect1() throws IOException {
79 | 		String words = segW.segWords("20分钟能完成", "|");
80 | 		Assert.assertEquals("20|分钟|能|完成", words);
81 | 	}
82 | }
83 | 


--------------------------------------------------------------------------------
/src/test/resources/data/words-test-my.dic:
--------------------------------------------------------------------------------
1 | #
2 | 自定义词


--------------------------------------------------------------------------------