>(sibling.getValue().getSuccess().entrySet().size() + 1);
890 |
891 | if (fetch(sibling.getValue(), new_siblings) == 0) // 一个词的终止且不为其他词的前缀,其实就是叶子节点
892 | {
893 | base[begin + sibling.getKey()] = (-sibling.getValue().getLargestValueId() - 1);
894 | progress++;
895 | }
896 | else
897 | {
898 | int h = insert(new_siblings); // dfs
899 | base[begin + sibling.getKey()] = h;
900 | }
901 | sibling.getValue().setIndex(begin + sibling.getKey());
902 | }
903 | return begin;
904 | }
905 |
906 | /**
907 | * 释放空闲的内存
908 | */
909 | private void loseWeight()
910 | {
911 | int nbase[] = new int[size + 65535];
912 | System.arraycopy(base, 0, nbase, 0, size);
913 | base = nbase;
914 |
915 | int ncheck[] = new int[size + 65535];
916 | System.arraycopy(check, 0, ncheck, 0, size);
917 | check = ncheck;
918 | }
919 | }
920 | }
921 |
--------------------------------------------------------------------------------
/src/main/java/trie/AhoCorasick/State.java:
--------------------------------------------------------------------------------
1 | package trie.AhoCorasick;
2 |
3 | import java.util.*;
4 |
5 | /**
6 | *
7 | * 一个状态有如下几个功能
8 | *
9 | *
10 | *
11 | * - success; 成功转移到另一个状态
12 | * - failure; 不可顺着字符串跳转的话,则跳转到一个浅一点的节点
13 | * - emits; 命中一个模式串
14 | *
15 | *
16 | *
17 | * 根节点稍有不同,根节点没有 failure 功能,它的“failure”指的是按照字符串路径转移到下一个状态。其他节点则都有failure状态。
18 | *
19 | *
20 | * @author Robert Bor
21 | */
22 | public class State
23 | {
24 |
25 | /**
26 | * 模式串的长度,也是这个状态的深度
27 | */
28 | protected final int depth;
29 |
30 | /**
31 | * fail 函数,如果没有匹配到,则跳转到此状态。
32 | */
33 | private State failure = null;
34 |
35 | /**
36 | * 只要这个状态可达,则记录模式串
37 | */
38 | private Set emits = null;
39 | /**
40 | * goto 表,也称转移函数。根据字符串的下一个字符转移到下一个状态
41 | */
42 | private Map success = new TreeMap();
43 |
44 | /**
45 | * 在双数组中的对应下标
46 | */
47 | private int index;
48 |
49 | /**
50 | * 构造深度为0的节点
51 | */
52 | public State()
53 | {
54 | this(0);
55 | }
56 |
57 | /**
58 | * 构造深度为depth的节点
59 | * @param depth
60 | */
61 | public State(int depth)
62 | {
63 | this.depth = depth;
64 | }
65 |
66 | /**
67 | * 获取节点深度
68 | * @return
69 | */
70 | public int getDepth()
71 | {
72 | return this.depth;
73 | }
74 |
75 | /**
76 | * 添加一个匹配到的模式串(这个状态对应着这个模式串)
77 | * @param keyword
78 | */
79 | public void addEmit(int keyword)
80 | {
81 | if (this.emits == null)
82 | {
83 | this.emits = new TreeSet(Collections.reverseOrder());
84 | }
85 | this.emits.add(keyword);
86 | }
87 |
88 | /**
89 | * 获取最大的值
90 | * @return
91 | */
92 | public Integer getLargestValueId()
93 | {
94 | if (emits == null || emits.size() == 0) return null;
95 |
96 | return emits.iterator().next();
97 | }
98 |
99 | /**
100 | * 添加一些匹配到的模式串
101 | * @param emits
102 | */
103 | public void addEmit(Collection emits)
104 | {
105 | for (int emit : emits)
106 | {
107 | addEmit(emit);
108 | }
109 | }
110 |
111 | /**
112 | * 获取这个节点代表的模式串(们)
113 | * @return
114 | */
115 | public Collection emit()
116 | {
117 | return this.emits == null ? Collections.emptyList() : this.emits;
118 | }
119 |
120 | /**
121 | * 是否是终止状态
122 | * @return
123 | */
124 | public boolean isAcceptable()
125 | {
126 | return this.depth > 0 && this.emits != null;
127 | }
128 |
129 | /**
130 | * 获取failure状态
131 | * @return
132 | */
133 | public State failure()
134 | {
135 | return this.failure;
136 | }
137 |
138 | /**
139 | * 设置failure状态
140 | * @param failState
141 | */
142 | public void setFailure(State failState, int fail[])
143 | {
144 | this.failure = failState;
145 | fail[index] = failState.index;
146 | }
147 |
148 | /**
149 | * 转移到下一个状态
150 | * @param character 希望按此字符转移
151 | * @param ignoreRootState 是否忽略根节点,如果是根节点自己调用则应该是true,否则为false
152 | * @return 转移结果
153 | */
154 | private State nextState(Character character, boolean ignoreRootState)
155 | {
156 | State nextState = this.success.get(character);
157 | if (!ignoreRootState && nextState == null && this.depth == 0)
158 | {
159 | nextState = this;
160 | }
161 | return nextState;
162 | }
163 |
164 | /**
165 | * 按照character转移,根节点转移失败会返回自己(永远不会返回null)
166 | * @param character
167 | * @return
168 | */
169 | public State nextState(Character character)
170 | {
171 | return nextState(character, false);
172 | }
173 |
174 | /**
175 | * 按照character转移,任何节点转移失败会返回null
176 | * @param character
177 | * @return
178 | */
179 | public State nextStateIgnoreRootState(Character character)
180 | {
181 | return nextState(character, true);
182 | }
183 |
184 | public State addState(Character character)
185 | {
186 | State nextState = nextStateIgnoreRootState(character);
187 | if (nextState == null)
188 | {
189 | nextState = new State(this.depth + 1);
190 | this.success.put(character, nextState);
191 | }
192 | return nextState;
193 | }
194 |
195 | public Collection getStates()
196 | {
197 | return this.success.values();
198 | }
199 |
200 | public Collection getTransitions()
201 | {
202 | return this.success.keySet();
203 | }
204 |
205 | @Override
206 | public String toString()
207 | {
208 | final StringBuilder sb = new StringBuilder("State{");
209 | sb.append("depth=").append(depth);
210 | sb.append(", ID=").append(index);
211 | sb.append(", emits=").append(emits);
212 | sb.append(", success=").append(success.keySet());
213 | sb.append(", failureID=").append(failure == null ? "-1" : failure.index);
214 | sb.append(", failure=").append(failure);
215 | sb.append('}');
216 | return sb.toString();
217 | }
218 |
219 | /**
220 | * 获取goto表
221 | * @return
222 | */
223 | public Map getSuccess()
224 | {
225 | return success;
226 | }
227 |
228 | public int getIndex()
229 | {
230 | return index;
231 | }
232 |
233 | public void setIndex(int index)
234 | {
235 | this.index = index;
236 | }
237 | }
238 |
--------------------------------------------------------------------------------
/src/main/java/trie/ITrie.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * He Han
4 | * hankcs.cn@gmail.com
5 | * 2015/4/23 0:23
6 | *
7 | *
8 | * Copyright (c) 2003-2015, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
10 | *
11 | */
12 | package trie;
13 |
14 |
15 | import io.ByteArray;
16 | import java.io.DataOutputStream;
17 | import java.util.TreeMap;
18 |
19 | /**
20 | * trie树接口
21 | * @author hankcs
22 | */
23 | public interface ITrie
24 | {
25 | int build(TreeMap keyValueMap);
26 | boolean save(DataOutputStream out);
27 | boolean load(ByteArray byteArray, V[] value);
28 | V get(char[] key);
29 | V get(String key);
30 | V[] getValueArray(V[] a);
31 | boolean containsKey(String key);
32 | int size();
33 | }
34 |
--------------------------------------------------------------------------------
/src/main/java/trie/Trie.java:
--------------------------------------------------------------------------------
1 | package trie;
2 |
/**
 * Created by bruce_shan on 2018/12/4 20:33.
 * Corporation CSU Software. This is a demo trie; it is not used elsewhere in the project.
 *
 * <p>Only the lowercase letters {@code 'a'}..{@code 'z'} are supported. Each node keeps a
 * {@code count} of the stored words that pass through it, which lets {@link #delete(String)}
 * prune branches that are no longer needed.
 */
public class Trie {
    /** Number of children per node: one slot per letter 'a'..'z'. */
    private static final int ALPHABET_SIZE = 26;

    private TrieNode root;

    public Trie() {
        root = new TrieNode();
    }

    /**
     * Maps a character to its child slot.
     *
     * @return the slot in {@code [0, 26)}, or -1 when {@code c} is not in 'a'..'z'
     */
    private static int slotOf(char c) {
        int position = c - 'a';
        return (position >= 0 && position < ALPHABET_SIZE) ? position : -1;
    }

    /**
     * Inserts a word into the trie. Inserting a word that is already stored changes nothing.
     *
     * @param word word made of the letters 'a'..'z'
     * @throws IllegalArgumentException if the word contains any other character; the trie is left untouched
     */
    public void insert(String word) {
        int length = word.length();
        // Validate first so that an invalid word cannot leave a half-inserted path behind.
        for (int i = 0; i < length; i++) {
            if (slotOf(word.charAt(i)) < 0) {
                throw new IllegalArgumentException(
                        "unsupported character '" + word.charAt(i) + "' at index " + i + " in \"" + word + "\"");
            }
        }
        // Counting a duplicate again would make delete() leave stale branches behind.
        if (search(word)) {
            return;
        }
        TrieNode node = root;
        for (int i = 0; i < length; i++) {
            int position = slotOf(word.charAt(i));
            if (node.trieNodes[position] == null) {
                node.trieNodes[position] = new TrieNode();
            }
            node = node.trieNodes[position];
            node.setCount(node.getCount() + 1);
        }
        node.setExist(true);
    }

    /**
     * Walks the trie along {@code path}.
     *
     * @return the node reached, or {@code null} if the path does not exist or contains an unsupported character
     */
    private TrieNode walk(String path) {
        TrieNode node = root;
        for (int i = 0; i < path.length() && node != null; i++) {
            int position = slotOf(path.charAt(i));
            node = position < 0 ? null : node.trieNodes[position];
        }
        return node;
    }

    /**
     * Returns whether the word is in the trie.
     *
     * @param word word to look up
     * @return {@code true} if exactly this word has been inserted (and not deleted)
     */
    public boolean search(String word) {
        TrieNode node = walk(word);
        return node != null && Boolean.TRUE.equals(node.getExist());
    }

    /**
     * Returns whether any word in the trie starts with the given prefix.
     * The empty prefix always matches.
     *
     * @param prefix prefix to look up
     * @return {@code true} if a path for the prefix exists
     */
    public boolean startsWith(String prefix) {
        return walk(prefix) != null;
    }

    /**
     * Deletes {@code word} from the sub-trie rooted at {@code node}.
     * Only complete words are deleted; a string that is merely a prefix of stored words is left alone.
     *
     * @param word remaining characters of the word, relative to {@code node}
     * @param node root of the sub-trie to delete from
     * @return {@code true} if the word was stored and has been removed
     */
    public boolean doDelete(String word, TrieNode node) {
        // The trie path is shorter than the word, or nothing is left to match.
        if (node == null || word.isEmpty()) {
            return false;
        }
        int position = slotOf(word.charAt(0));
        if (position < 0) {
            return false;
        }
        TrieNode childNode = node.trieNodes[position];
        if (childNode == null) {
            return false;
        }

        boolean removed;
        if (word.length() == 1) {
            // Last character: only a node that ends a word may be deleted.
            removed = Boolean.TRUE.equals(childNode.getExist());
            if (removed) {
                childNode.setExist(false);
            }
        } else {
            removed = doDelete(word.substring(1), childNode);
        }

        if (removed) {
            // One word fewer passes through the child; drop the branch once nothing uses it.
            childNode.setCount(childNode.getCount() - 1);
            if (childNode.getCount() <= 0) {
                node.trieNodes[position] = null;
            }
        }
        return removed;
    }

    /**
     * Deletes the word if it is in the trie.
     *
     * @param word word to delete
     * @return {@code true} if the word was stored and has been removed
     */
    public boolean delete(String word) {
        if (word.isEmpty()) {
            // insert("") marks the root itself, so deleting "" has to clear that mark.
            boolean present = Boolean.TRUE.equals(root.getExist());
            root.setExist(false);
            return present;
        }
        return this.doDelete(word, root);
    }

    /** A trie node: per-letter children, a pass-through counter and an end-of-word flag. */
    class TrieNode {
        /** Number of stored words whose path goes through this node. */
        int count = 0;
        /** Children, indexed by {@code letter - 'a'}. */
        TrieNode[] trieNodes = new TrieNode[ALPHABET_SIZE];
        /** Whether a stored word ends at this node. */
        Boolean exist = false;

        public TrieNode() {
        }

        public TrieNode(int count, Boolean exist) {
            this.count = count;
            this.exist = exist;
        }

        public int getCount() {
            return count;
        }

        public void setCount(int count) {
            this.count = count;
        }

        public TrieNode[] getTrieNodes() {
            return trieNodes;
        }

        public void setTrieNodes(TrieNode[] trieNodes) {
            this.trieNodes = trieNodes;
        }

        public Boolean getExist() {
            return exist;
        }

        public void setExist(Boolean exist) {
            this.exist = exist;
        }
    }

    public static void main(String[] args) {
        Trie trie = new Trie();
        trie.search("lintcode");
        trie.startsWith("lint");
        trie.insert("lint");
        trie.startsWith("lint");

        // "lin" is only a prefix, so this delete is expected to fail; the next one succeeds.
        boolean lint = trie.delete("lin");
        lint = trie.delete("lint");
    }
}
161 |
--------------------------------------------------------------------------------
/src/main/java/trie/bintrie/BaseNode.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * He Han
4 | * hankcs.cn@gmail.com
5 | * 2014/5/2 20:22
6 | *
7 | *
8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
10 | *
11 | */
12 | package trie.bintrie;
13 |
14 |
15 | import io.ByteArray;
16 |
17 | import java.io.DataOutputStream;
18 | import java.io.IOException;
19 | import java.io.ObjectInput;
20 | import java.io.ObjectOutput;
21 | import java.util.AbstractMap;
22 | import java.util.Map;
23 | import java.util.Set;
24 |
25 | /**
26 | * 节点,统一Trie树根和其他节点的基类
27 | *
28 | * @param 值
29 | * @author He Han
30 | */
31 | public abstract class BaseNode implements Comparable
32 | {
33 | /**
34 | * 状态数组,方便读取的时候用
35 | */
36 | static final Status[] ARRAY_STATUS = Status.values();
37 | /**
38 | * 子节点
39 | */
40 | protected BaseNode[] child;
41 | /**
42 | * 节点状态
43 | */
44 | protected Status status;
45 | /**
46 | * 节点代表的字符
47 | */
48 | protected char c;
49 | /**
50 | * 节点代表的值
51 | */
52 | protected V value;
53 |
54 | public BaseNode transition(String path, int begin)
55 | {
56 | BaseNode cur = this;
57 | for (int i = begin; i < path.length(); ++i)
58 | {
59 | cur = cur.getChild(path.charAt(i));
60 | if (cur == null || cur.status == Status.UNDEFINED_0) return null;
61 | }
62 | return cur;
63 | }
64 |
65 | public BaseNode transition(char[] path, int begin)
66 | {
67 | BaseNode cur = this;
68 | for (int i = begin; i < path.length; ++i)
69 | {
70 | cur = cur.getChild(path[i]);
71 | if (cur == null || cur.status == Status.UNDEFINED_0) return null;
72 | }
73 | return cur;
74 | }
75 |
76 | /**
77 | * 转移状态
78 | * @param path
79 | * @return
80 | */
81 | public BaseNode transition(char path)
82 | {
83 | BaseNode cur = this;
84 | cur = cur.getChild(path);
85 | if (cur == null || cur.status == Status.UNDEFINED_0) return null;
86 | return cur;
87 | }
88 |
89 | /**
90 | * 添加子节点
91 | *
92 | * @return true-新增了节点 false-修改了现有节点
93 | */
94 | protected abstract boolean addChild(BaseNode node);
95 |
96 | /**
97 | * 是否含有子节点
98 | *
99 | * @param c 子节点的char
100 | * @return 是否含有
101 | */
102 | protected boolean hasChild(char c)
103 | {
104 | return getChild(c) != null;
105 | }
106 |
107 | protected char getChar()
108 | {
109 | return c;
110 | }
111 |
112 | /**
113 | * 获取子节点
114 | *
115 | * @param c 子节点的char
116 | * @return 子节点
117 | */
118 | public abstract BaseNode getChild(char c);
119 |
120 | /**
121 | * 获取节点对应的值
122 | *
123 | * @return 值
124 | */
125 | public final V getValue()
126 | {
127 | return value;
128 | }
129 |
130 | /**
131 | * 设置节点对应的值
132 | *
133 | * @param value 值
134 | */
135 | public final void setValue(V value)
136 | {
137 | this.value = value;
138 | }
139 |
140 | @Override
141 | public int compareTo(BaseNode other)
142 | {
143 | return compareTo(other.getChar());
144 | }
145 |
146 | /**
147 | * 重载,与字符的比较
148 | * @param other
149 | * @return
150 | */
151 | public int compareTo(char other)
152 | {
153 | if (this.c > other)
154 | {
155 | return 1;
156 | }
157 | if (this.c < other)
158 | {
159 | return -1;
160 | }
161 | return 0;
162 | }
163 |
164 | /**
165 | * 获取节点的成词状态
166 | * @return
167 | */
168 | public Status getStatus()
169 | {
170 | return status;
171 | }
172 |
173 | protected void walk(StringBuilder sb, Set> entrySet)
174 | {
175 | sb.append(c);
176 | if (status == Status.WORD_MIDDLE_2 || status == Status.WORD_END_3)
177 | {
178 | entrySet.add(new TrieEntry(sb.toString(), value));
179 | }
180 | if (child == null) return;
181 | for (BaseNode node : child)
182 | {
183 | if (node == null) continue;
184 | node.walk(new StringBuilder(sb.toString()), entrySet);
185 | }
186 | }
187 |
188 | protected void walkToSave(DataOutputStream out) throws IOException
189 | {
190 | out.writeChar(c);
191 | out.writeInt(status.ordinal());
192 | int childSize = 0;
193 | if (child != null) childSize = child.length;
194 | out.writeInt(childSize);
195 | if (child == null) return;
196 | for (BaseNode node : child)
197 | {
198 | node.walkToSave(out);
199 | }
200 | }
201 |
202 | protected void walkToSave(ObjectOutput out) throws IOException
203 | {
204 | out.writeChar(c);
205 | out.writeInt(status.ordinal());
206 | if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2)
207 | {
208 | out.writeObject(value);
209 | }
210 | int childSize = 0;
211 | if (child != null) childSize = child.length;
212 | out.writeInt(childSize);
213 | if (child == null) return;
214 | for (BaseNode node : child)
215 | {
216 | node.walkToSave(out);
217 | }
218 | }
219 |
220 | protected void walkToLoad(ByteArray byteArray, _ValueArray valueArray)
221 | {
222 | c = byteArray.nextChar();
223 | status = ARRAY_STATUS[byteArray.nextInt()];
224 | if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2)
225 | {
226 | value = valueArray.nextValue();
227 | }
228 | int childSize = byteArray.nextInt();
229 | child = new BaseNode[childSize];
230 | for (int i = 0; i < childSize; ++i)
231 | {
232 | child[i] = new Node();
233 | child[i].walkToLoad(byteArray, valueArray);
234 | }
235 | }
236 |
237 | protected void walkToLoad(ObjectInput byteArray) throws IOException, ClassNotFoundException
238 | {
239 | c = byteArray.readChar();
240 | status = ARRAY_STATUS[byteArray.readInt()];
241 | if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2)
242 | {
243 | value = (V) byteArray.readObject();
244 | }
245 | int childSize = byteArray.readInt();
246 | child = new BaseNode[childSize];
247 | for (int i = 0; i < childSize; ++i)
248 | {
249 | child[i] = new Node();
250 | child[i].walkToLoad(byteArray);
251 | }
252 | }
253 |
254 | public enum Status
255 | {
256 | /**
257 | * 未指定,用于删除词条
258 | */
259 | UNDEFINED_0,
260 | /**
261 | * 不是词语的结尾
262 | */
263 | NOT_WORD_1,
264 | /**
265 | * 是个词语的结尾,并且还可以继续
266 | */
267 | WORD_MIDDLE_2,
268 | /**
269 | * 是个词语的结尾,并且没有继续
270 | */
271 | WORD_END_3,
272 | }
273 |
274 | public class TrieEntry extends AbstractMap.SimpleEntry implements Comparable
275 | {
276 | public TrieEntry(String key, V value)
277 | {
278 | super(key, value);
279 | }
280 | @Override
281 | public int compareTo(TrieEntry o)
282 | {
283 | return getKey().compareTo(o.getKey());
284 | }
285 | }
286 |
287 | @Override
288 | public String toString()
289 | {
290 | if (child == null)
291 | {
292 | return "BaseNode{" +
293 | "status=" + status +
294 | ", c=" + c +
295 | ", value=" + value +
296 | '}';
297 | }
298 | return "BaseNode{" +
299 | "child=" + child.length +
300 | ", status=" + status +
301 | ", c=" + c +
302 | ", value=" + value +
303 | '}';
304 | }
305 | }
306 |
--------------------------------------------------------------------------------
/src/main/java/trie/bintrie/BinTrie.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * He Han
4 | * hankcs.cn@gmail.com
5 | * 2014/5/3 11:34
6 | *
7 | *
8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
10 | *
11 | */
12 | package trie.bintrie;
13 |
14 |
15 |
16 | import io.ByteArray;
17 | import io.IOUtil;
18 | import trie.AhoCorasick.AhoCorasickDoubleArrayTrie;
19 | import trie.ITrie;
20 | import util.TextUtility;
21 |
22 | import java.io.*;
23 | import java.util.*;
24 |
25 | import static config.Logger.logger;
26 |
27 | /**
28 | * 首字直接分配内存,之后二分动态数组的Trie树,能够平衡时间和空间
29 | *
30 | * @author hankcs
31 | */
32 | public class BinTrie extends BaseNode implements ITrie, Externalizable
33 | {
34 | private int size;
35 |
36 | public BinTrie()
37 | {
38 | child = new BaseNode[65535 + 1]; // (int)Character.MAX_VALUE
39 | size = 0;
40 | status = Status.NOT_WORD_1;
41 | }
42 |
43 | public BinTrie(Map map)
44 | {
45 | this();
46 | for (Map.Entry entry : map.entrySet())
47 | {
48 | put(entry.getKey(), entry.getValue());
49 | }
50 | }
51 |
52 | /**
53 | * 插入一个词
54 | *
55 | * @param key
56 | * @param value
57 | */
58 | public void put(String key, V value)
59 | {
60 | if (key.length() == 0) return; // 安全起见
61 | BaseNode branch = this;
62 | char[] chars = key.toCharArray();
63 | for (int i = 0; i < chars.length - 1; ++i)
64 | {
65 | // 除了最后一个字外,都是继续
66 | branch.addChild(new Node(chars[i], Status.NOT_WORD_1, null));
67 | branch = branch.getChild(chars[i]);
68 | }
69 | // 最后一个字加入时属性为end
70 | if (branch.addChild(new Node(chars[chars.length - 1], Status.WORD_END_3, value)))
71 | {
72 | ++size; // 维护size
73 | }
74 | }
75 |
76 | public void put(char[] key, V value)
77 | {
78 | BaseNode branch = this;
79 | for (int i = 0; i < key.length - 1; ++i)
80 | {
81 | // 除了最后一个字外,都是继续
82 | branch.addChild(new Node(key[i], Status.NOT_WORD_1, null));
83 | branch = branch.getChild(key[i]);
84 | }
85 | // 最后一个字加入时属性为end
86 | if (branch.addChild(new Node(key[key.length - 1], Status.WORD_END_3, value)))
87 | {
88 | ++size; // 维护size
89 | }
90 | }
91 |
92 | /**
93 | * 设置键值对,当键不存在的时候会自动插入
94 | * @param key
95 | * @param value
96 | */
97 | public void set(String key, V value)
98 | {
99 | put(key.toCharArray(), value);
100 | }
101 |
102 | /**
103 | * 删除一个词
104 | *
105 | * @param key
106 | */
107 | public void remove(String key)
108 | {
109 | BaseNode branch = this;
110 | char[] chars = key.toCharArray();
111 | for (int i = 0; i < chars.length - 1; ++i)
112 | {
113 | if (branch == null) return;
114 | branch = branch.getChild(chars[i]);
115 | }
116 | if (branch == null) return;
117 | // 最后一个字设为undefined
118 | if (branch.addChild(new Node(chars[chars.length - 1], Status.UNDEFINED_0, value)))
119 | {
120 | --size;
121 | }
122 | }
123 |
124 | public boolean containsKey(String key)
125 | {
126 | BaseNode branch = this;
127 | char[] chars = key.toCharArray();
128 | for (char aChar : chars)
129 | {
130 | if (branch == null) return false;
131 | branch = branch.getChild(aChar);
132 | }
133 |
134 | return branch != null && (branch.status == Status.WORD_END_3 || branch.status == Status.WORD_MIDDLE_2);
135 | }
136 |
137 | public V get(String key)
138 | {
139 | BaseNode branch = this;
140 | char[] chars = key.toCharArray();
141 | for (char aChar : chars)
142 | {
143 | if (branch == null) return null;
144 | branch = branch.getChild(aChar);
145 | }
146 |
147 | if (branch == null) return null;
148 | // 下面这句可以保证只有成词的节点被返回
149 | if (!(branch.status == Status.WORD_END_3 || branch.status == Status.WORD_MIDDLE_2)) return null;
150 | return (V) branch.getValue();
151 | }
152 |
153 | public V get(char[] key)
154 | {
155 | BaseNode branch = this;
156 | for (char aChar : key)
157 | {
158 | if (branch == null) return null;
159 | branch = branch.getChild(aChar);
160 | }
161 |
162 | if (branch == null) return null;
163 | // 下面这句可以保证只有成词的节点被返回
164 | if (!(branch.status == Status.WORD_END_3 || branch.status == Status.WORD_MIDDLE_2)) return null;
165 | return (V) branch.getValue();
166 | }
167 |
168 | @Override
169 | public V[] getValueArray(V[] a)
170 | {
171 | if (a.length < size)
172 | a = (V[]) java.lang.reflect.Array.newInstance(
173 | a.getClass().getComponentType(), size);
174 | int i = 0;
175 | for (Map.Entry entry : entrySet())
176 | {
177 | a[i++] = entry.getValue();
178 | }
179 | return a;
180 | }
181 |
182 | /**
183 | * 获取键值对集合
184 | *
185 | * @return
186 | */
187 | public Set> entrySet()
188 | {
189 | Set> entrySet = new TreeSet>();
190 | StringBuilder sb = new StringBuilder();
191 | for (BaseNode node : child)
192 | {
193 | if (node == null) continue;
194 | node.walk(new StringBuilder(sb.toString()), entrySet);
195 | }
196 | return entrySet;
197 | }
198 |
199 | /**
200 | * 键集合
201 | * @return
202 | */
203 | public Set keySet()
204 | {
205 | TreeSet keySet = new TreeSet();
206 | for (Map.Entry entry : entrySet())
207 | {
208 | keySet.add(entry.getKey());
209 | }
210 |
211 | return keySet;
212 | }
213 |
214 | /**
215 | * 前缀查询
216 | *
217 | * @param key 查询串
218 | * @return 键值对
219 | */
220 | public Set> prefixSearch(String key)
221 | {
222 | Set> entrySet = new TreeSet>();
223 | StringBuilder sb = new StringBuilder(key.substring(0, key.length() - 1));
224 | BaseNode branch = this;
225 | char[] chars = key.toCharArray();
226 | for (char aChar : chars)
227 | {
228 | if (branch == null) return entrySet;
229 | branch = branch.getChild(aChar);
230 | }
231 |
232 | if (branch == null) return entrySet;
233 | branch.walk(sb, entrySet);
234 | return entrySet;
235 | }
236 |
237 | /**
238 | * 前缀查询,包含值
239 | *
240 | * @param key 键
241 | * @return 键值对列表
242 | */
243 | public LinkedList> commonPrefixSearchWithValue(String key)
244 | {
245 | char[] chars = key.toCharArray();
246 | return commonPrefixSearchWithValue(chars, 0);
247 | }
248 |
249 | /**
250 | * 前缀查询,通过字符数组来表示字符串可以优化运行速度
251 | *
252 | * @param chars 字符串的字符数组
253 | * @param begin 开始的下标
254 | * @return
255 | */
256 | public LinkedList> commonPrefixSearchWithValue(char[] chars, int begin)
257 | {
258 | LinkedList> result = new LinkedList>();
259 | StringBuilder sb = new StringBuilder();
260 | BaseNode branch = this;
261 | for (int i = begin; i < chars.length; ++i)
262 | {
263 | char aChar = chars[i];
264 | branch = branch.getChild(aChar);
265 | if (branch == null || branch.status == Status.UNDEFINED_0) return result;
266 | sb.append(aChar);
267 | if (branch.status == Status.WORD_MIDDLE_2 || branch.status == Status.WORD_END_3)
268 | {
269 | result.add(new AbstractMap.SimpleEntry(sb.toString(), (V) branch.value));
270 | }
271 | }
272 |
273 | return result;
274 | }
275 |
276 | @Override
277 | protected boolean addChild(BaseNode node)
278 | {
279 | boolean add = false;
280 | char c = node.getChar();
281 | BaseNode target = getChild(c);
282 | if (target == null)
283 | {
284 | child[c] = node;
285 | add = true;
286 | }
287 | else
288 | {
289 | switch (node.status)
290 | {
291 | case UNDEFINED_0:
292 | if (target.status != Status.NOT_WORD_1)
293 | {
294 | target.status = Status.NOT_WORD_1;
295 | add = true;
296 | }
297 | break;
298 | case NOT_WORD_1:
299 | if (target.status == Status.WORD_END_3)
300 | {
301 | target.status = Status.WORD_MIDDLE_2;
302 | }
303 | break;
304 | case WORD_END_3:
305 | if (target.status == Status.NOT_WORD_1)
306 | {
307 | target.status = Status.WORD_MIDDLE_2;
308 | }
309 | if (target.getValue() == null)
310 | {
311 | add = true;
312 | }
313 | target.setValue(node.getValue());
314 | break;
315 | }
316 | }
317 | return add;
318 | }
319 |
320 | public int size()
321 | {
322 | return size;
323 | }
324 |
325 | @Override
326 | protected char getChar()
327 | {
328 | return 0; // 根节点没有char
329 | }
330 |
331 | @Override
332 | public BaseNode getChild(char c)
333 | {
334 | return child[c];
335 | }
336 |
337 | public boolean save(String path)
338 | {
339 | try
340 | {
341 | DataOutputStream out = new DataOutputStream(IOUtil.newOutputStream(path));
342 | for (BaseNode node : child)
343 | {
344 | if (node == null)
345 | {
346 | out.writeInt(0);
347 | }
348 | else
349 | {
350 | out.writeInt(1);
351 | node.walkToSave(out);
352 | }
353 | }
354 | out.close();
355 | }
356 | catch (Exception e)
357 | {
358 | logger.warning("保存到" + path + "失败" + TextUtility.exceptionToString(e));
359 | return false;
360 | }
361 |
362 | return true;
363 | }
364 |
365 | @Override
366 | public int build(TreeMap keyValueMap)
367 | {
368 | for (Map.Entry entry : keyValueMap.entrySet())
369 | {
370 | put(entry.getKey(), entry.getValue());
371 | }
372 | return 0;
373 | }
374 |
375 | /**
376 | * 保存到二进制输出流
377 | *
378 | * @param out
379 | * @return
380 | */
381 | public boolean save(DataOutputStream out)
382 | {
383 | try
384 | {
385 | for (BaseNode node : child)
386 | {
387 | if (node == null)
388 | {
389 | out.writeInt(0);
390 | }
391 | else
392 | {
393 | out.writeInt(1);
394 | node.walkToSave(out);
395 | }
396 | }
397 | }
398 | catch (Exception e)
399 | {
400 | logger.warning("保存到" + out + "失败" + TextUtility.exceptionToString(e));
401 | return false;
402 | }
403 |
404 | return true;
405 | }
406 |
407 | @Override
408 | public boolean load(ByteArray byteArray, V[] value) {
409 | return false;
410 | }
411 |
412 | /**
413 | * 从磁盘加载二分数组树
414 | *
415 | * @param path 路径
416 | * @param value 额外提供的值数组,按照值的字典序。(之所以要求提供它,是因为泛型的保存不归树管理)
417 | * @return 是否成功
418 | */
419 | public boolean load(String path, V[] value)
420 | {
421 | byte[] bytes = IOUtil.readBytes(path);
422 | if (bytes == null) return false;
423 | _ValueArray valueArray = new _ValueArray(value);
424 | ByteArray byteArray = new ByteArray(bytes);
425 | for (int i = 0; i < child.length; ++i)
426 | {
427 | int flag = byteArray.nextInt();
428 | if (flag == 1)
429 | {
430 | child[i] = new Node();
431 | child[i].walkToLoad(byteArray, valueArray);
432 | }
433 | }
434 | size = value.length;
435 |
436 | return true;
437 | }
438 | public _ValueArray newValueArray()
439 | {
440 | return new _ValueArray();
441 | }
442 |
443 | @Override
444 | public void writeExternal(ObjectOutput out) throws IOException
445 | {
446 | out.writeInt(size);
447 | for (BaseNode node : child)
448 | {
449 | if (node == null)
450 | {
451 | out.writeInt(0);
452 | }
453 | else
454 | {
455 | out.writeInt(1);
456 | node.walkToSave(out);
457 | }
458 | }
459 | }
460 |
461 | @Override
462 | public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException
463 | {
464 | size = in.readInt();
465 | for (int i = 0; i < child.length; ++i)
466 | {
467 | int flag = in.readInt();
468 | if (flag == 1)
469 | {
470 | child[i] = new Node();
471 | child[i].walkToLoad(in);
472 | }
473 | }
474 | }
475 |
476 | /**
477 | * 最长匹配
478 | *
479 | * @param text 文本
480 | * @param processor 处理器
481 | */
482 | public void parseLongestText(String text, AhoCorasickDoubleArrayTrie.IHit processor)
483 | {
484 | int length = text.length();
485 | for (int i = 0; i < length; ++i)
486 | {
487 | BaseNode state = transition(text.charAt(i));
488 | if (state != null)
489 | {
490 | int to = i + 1;
491 | int end = to;
492 | V value = state.getValue();
493 | for (; to < length; ++to)
494 | {
495 | state = state.transition(text.charAt(to));
496 | if (state == null) break;
497 | if (state.getValue() != null)
498 | {
499 | value = state.getValue();
500 | end = to + 1;
501 | }
502 | }
503 | if (value != null)
504 | {
505 | processor.hit(i, end, value);
506 | i = end - 1;
507 | }
508 | }
509 | }
510 | }
511 |
512 | /**
513 | * 最长匹配
514 | *
515 | * @param text 文本
516 | * @param processor 处理器
517 | */
518 | public void parseLongestText(char[] text, AhoCorasickDoubleArrayTrie.IHit processor)
519 | {
520 | int length = text.length;
521 | for (int i = 0; i < length; ++i)
522 | {
523 | BaseNode state = transition(text[i]);
524 | if (state != null)
525 | {
526 | int to = i + 1;
527 | int end = to;
528 | V value = state.getValue();
529 | for (; to < length; ++to)
530 | {
531 | state = state.transition(text[to]);
532 | if (state == null) break;
533 | if (state.getValue() != null)
534 | {
535 | value = state.getValue();
536 | end = to + 1;
537 | }
538 | }
539 | if (value != null)
540 | {
541 | processor.hit(i, end, value);
542 | i = end - 1;
543 | }
544 | }
545 | }
546 | }
547 |
548 | /**
549 | * 匹配文本
550 | *
551 | * @param text 文本
552 | * @param processor 处理器
553 | */
554 | public void parseText(String text, AhoCorasickDoubleArrayTrie.IHit processor)
555 | {
556 | int length = text.length();
557 | int begin = 0;
558 | BaseNode state = this;
559 |
560 | for (int i = begin; i < length; ++i)
561 | {
562 | state = state.transition(text.charAt(i));
563 | if (state != null)
564 | {
565 | V value = state.getValue();
566 | if (value != null)
567 | {
568 | processor.hit(begin, i + 1, value);
569 | }
570 | }
571 | else
572 | {
573 | i = begin;
574 | ++begin;
575 | state = this;
576 | }
577 | }
578 | }
579 |
580 | /**
581 | * 匹配文本
582 | *
583 | * @param text 文本
584 | * @param processor 处理器
585 | */
586 | public void parseText(char[] text, AhoCorasickDoubleArrayTrie.IHit processor)
587 | {
588 | int length = text.length;
589 | int begin = 0;
590 | BaseNode state = this;
591 |
592 | for (int i = begin; i < length; ++i)
593 | {
594 | state = state.transition(text[i]);
595 | if (state != null)
596 | {
597 | V value = state.getValue();
598 | if (value != null)
599 | {
600 | processor.hit(begin, i + 1, value);
601 | }
602 | }
603 | else
604 | {
605 | i = begin;
606 | ++begin;
607 | state = this;
608 | }
609 | }
610 | }
611 | }
612 |
--------------------------------------------------------------------------------
/src/main/java/trie/bintrie/Node.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * He Han
4 | * hankcs.cn@gmail.com
5 | * 2014/5/3 12:27
6 | *
7 | *
8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
10 | *
11 | */
12 | package trie.bintrie;
13 |
14 |
15 | import trie.bintrie.util.ArrayTool;
16 |
17 | /**
18 | * 深度大于等于2的子节点
19 | *
20 | * @author He Han
21 | */
22 | public class Node extends BaseNode
23 | {
24 | @Override
25 | protected boolean addChild(BaseNode node)
26 | {
27 | boolean add = false;
28 | if (child == null)
29 | {
30 | child = new BaseNode[0];
31 | }
32 | int index = ArrayTool.binarySearch(child, node);
33 | if (index >= 0)
34 | {
35 | BaseNode target = child[index];
36 | switch (node.status)
37 | {
38 | case UNDEFINED_0:
39 | if (target.status != Status.NOT_WORD_1)
40 | {
41 | target.status = Status.NOT_WORD_1;
42 | target.value = null;
43 | add = true;
44 | }
45 | break;
46 | case NOT_WORD_1:
47 | if (target.status == Status.WORD_END_3)
48 | {
49 | target.status = Status.WORD_MIDDLE_2;
50 | }
51 | break;
52 | case WORD_END_3:
53 | if (target.status != Status.WORD_END_3)
54 | {
55 | target.status = Status.WORD_MIDDLE_2;
56 | }
57 | if (target.getValue() == null)
58 | {
59 | add = true;
60 | }
61 | target.setValue(node.getValue());
62 | break;
63 | }
64 | }
65 | else
66 | {
67 | BaseNode newChild[] = new BaseNode[child.length + 1];
68 | int insert = -(index + 1);
69 | System.arraycopy(child, 0, newChild, 0, insert);
70 | System.arraycopy(child, insert, newChild, insert + 1, child.length - insert);
71 | newChild[insert] = node;
72 | child = newChild;
73 | add = true;
74 | }
75 | return add;
76 | }
77 |
78 | /**
79 | * @param c 节点的字符
80 | * @param status 节点状态
81 | * @param value 值
82 | */
83 | public Node(char c, Status status, V value)
84 | {
85 | this.c = c;
86 | this.status = status;
87 | this.value = value;
88 | }
89 |
90 | public Node()
91 | {
92 | }
93 |
94 | @Override
95 | public BaseNode getChild(char c)
96 | {
97 | if (child == null) return null;
98 | int index = ArrayTool.binarySearch(child, c);
99 | if (index < 0) return null;
100 |
101 | return child[index];
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
/src/main/java/trie/bintrie/_ValueArray.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * hankcs
4 | * me@hankcs.com
5 | * 2015/5/15 10:23
6 | *
7 | *
8 | * Copyright (c) 2003-2015, hankcs. All Right Reserved, http://www.hankcs.com/
9 | *
10 | */
11 | package trie.bintrie;
12 |
/**
 * Wrapper around a value array that hands out its elements one after another.
 *
 * @param <V> element type
 * @author hankcs
 */
public class _ValueArray<V>
{
    /** Backing array; replaced by {@link #setValue(Object[])}. */
    V[] value;
    /** Index of the element the next {@link #nextValue()} call returns. */
    int offset;

    /**
     * @param value the array to read from
     */
    public _ValueArray(V[] value)
    {
        this.value = value;
    }

    /**
     * Returns the element at the current offset and advances the offset by one.
     * No bounds check is made here, so reading past the end fails with the
     * array's own index exception.
     *
     * @return the next element
     */
    public V nextValue()
    {
        return value[offset++];
    }

    /**
     * For subclasses only; leaves the backing array unset.
     */
    protected _ValueArray()
    {
    }

    /**
     * Replaces the backing array. The read offset is left unchanged.
     *
     * @param value the new backing array
     * @return this instance
     */
    public _ValueArray setValue(V[] value)
    {
        this.value = value;
        return this;
    }
}
45 |
--------------------------------------------------------------------------------
/src/main/java/trie/bintrie/util/ArrayTool.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * He Han
4 | * hankcs.cn@gmail.com
5 | * 2014/5/3 12:32
6 | *
7 | *
8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
10 | *
11 | */
12 | package trie.bintrie.util;
13 |
14 |
15 | import trie.bintrie.BaseNode;
16 |
17 | /**
18 | * @author He Han
19 | */
20 | public class ArrayTool
21 | {
22 | /**
23 | * 二分查找
24 | * @param branches 数组
25 | * @param node 要查找的node
26 | * @return 数组下标,小于0表示没找到
27 | */
28 | public static int binarySearch(BaseNode[] branches, BaseNode node)
29 | {
30 | int high = branches.length - 1;
31 | if (branches.length < 1)
32 | {
33 | return high;
34 | }
35 | int low = 0;
36 | while (low <= high)
37 | {
38 | int mid = (low + high) >>> 1;
39 | int cmp = branches[mid].compareTo(node);
40 |
41 | if (cmp < 0)
42 | low = mid + 1;
43 | else if (cmp > 0)
44 | high = mid - 1;
45 | else
46 | return mid;
47 | }
48 | return -(low + 1);
49 | }
50 |
51 | public static int binarySearch(BaseNode[] branches, char node)
52 | {
53 | int high = branches.length - 1;
54 | if (branches.length < 1)
55 | {
56 | return high;
57 | }
58 | int low = 0;
59 | while (low <= high)
60 | {
61 | int mid = (low + high) >>> 1;
62 | int cmp = branches[mid].compareTo(node);
63 |
64 | if (cmp < 0)
65 | low = mid + 1;
66 | else if (cmp > 0)
67 | high = mid - 1;
68 | else
69 | return mid;
70 | }
71 | return -(low + 1);
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/src/main/java/util/FileUtils.java:
--------------------------------------------------------------------------------
1 | package util;
2 |
3 |
4 | import org.apache.commons.lang.StringUtils;
5 |
6 | import java.io.*;
7 | import java.util.HashSet;
8 | import java.util.LinkedHashSet;
9 | import java.util.List;
10 | import java.util.Map;
11 |
12 | public class FileUtils {
13 |
14 | public static void writeMapResultToFile(String outputPath, List> list) {
15 | try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputPath), "utf-8"))) {
16 | for (Map.Entry mapping : list) {
17 | if(StringUtils.isBlank(mapping.getValue())) continue;
18 | writer.write(mapping.getValue() + "\n");
19 | }
20 | } catch (Exception e) {
21 | e.printStackTrace();
22 | }
23 | }
24 | // 文件读写
25 | public static String readFileToString(String fileName) {
26 | String encoding = "UTF-8";
27 | File file = new File(fileName);
28 |
29 | if (!file.exists())
30 |
31 | try {
32 | file.createNewFile();
33 | } catch (IOException e) {
34 | e.printStackTrace();
35 | }
36 |
37 | Long filelength = file.length();
38 | byte[] filecontent = new byte[filelength.intValue()];
39 | try {
40 | if (!file.exists()) file.createNewFile();
41 |
42 |
43 | FileInputStream in = new FileInputStream(file);
44 | in.read(filecontent);
45 | in.close();
46 | } catch (FileNotFoundException e) {
47 | e.printStackTrace();
48 | } catch (IOException e) {
49 | e.printStackTrace();
50 | }
51 | try {
52 | return new String(filecontent, encoding);
53 | } catch (UnsupportedEncodingException e) {
54 | System.err.println("The OS does not support " + encoding);
55 | e.printStackTrace();
56 | return null;
57 | }
58 | }
59 |
60 |
61 | public static void writeFileToPath(String outPutPath, LinkedHashSet stringSet) {
62 | try {
63 | FileOutputStream writer = new FileOutputStream(outPutPath);
64 | OutputStreamWriter bw = new OutputStreamWriter(writer, "UTF-8"); // 以utf-8写结果
65 | stringSet.forEach(it -> {
66 | if (StringUtils.isNotBlank(it)) {
67 | String result = it + " ";
68 | try {
69 | // System.out.println(result);
70 | bw.write(result);
71 | } catch (IOException e) {
72 | e.printStackTrace();
73 | }
74 | }
75 | });
76 |
77 | bw.close();
78 | writer.close();
79 | } catch (IOException ex) {
80 | ex.printStackTrace();
81 | }
82 | }
83 |
84 |
85 | public static void writeFileToPath(String outPutPath, List stringSet) {
86 | try {
87 | FileOutputStream writer = new FileOutputStream(outPutPath);
88 | OutputStreamWriter bw = new OutputStreamWriter(writer, "UTF-8"); // 以utf-8写结果
89 | stringSet.forEach(it -> {
90 | String result = it + "\n";
91 | try {
92 | bw.write(result);
93 | } catch (IOException e) {
94 | e.printStackTrace();
95 | }
96 | });
97 |
98 | bw.close();
99 | writer.close();
100 | } catch (IOException ex) {
101 | ex.printStackTrace();
102 | }
103 | }
104 |
105 |
106 | public static void writeFileToPath(String outPutPath, List list, Map wcMap) {
107 | try {
108 | FileOutputStream writer = new FileOutputStream(outPutPath);
109 |
110 | OutputStreamWriter bw = new OutputStreamWriter(writer, "UTF-8"); // 以utf-8写结果
111 | list.forEach(it -> {
112 | if (StringUtils.isNotBlank(it) && wcMap.get(it) >= 4) {
113 | try {
114 | bw.write(it + " -> " + wcMap.get(it) + "\n");
115 | } catch (IOException e) {
116 | e.printStackTrace();
117 | }
118 | }
119 | });
120 |
121 | bw.close();
122 | writer.close();
123 | } catch (IOException ex) {
124 | ex.printStackTrace();
125 | }
126 | }
127 |
128 | // 按行读取进集合
129 | public static HashSet readFileByLineToHashSet(String inputFilePath) {
130 | HashSet set = new HashSet();
131 | try {
132 | // 以utf-8读取文件
133 | FileInputStream fis = new FileInputStream(inputFilePath);
134 | InputStreamReader reader = new InputStreamReader(fis, "UTF-8");
135 | BufferedReader br = new BufferedReader(reader);
136 | String str = null;
137 | while ((str = br.readLine()) != null) {
138 | set.add(str);
139 | }
140 | br.close();
141 | reader.close();
142 | } catch (FileNotFoundException e) {
143 | e.printStackTrace();
144 | } catch (IOException e) {
145 | e.printStackTrace();
146 | }
147 | return set;
148 | }
149 |
150 | // 按行读取进集合
151 | public static void writeStringToFile(String outPutPath, String text) {
152 | try {
153 | FileOutputStream writer = new FileOutputStream(outPutPath);
154 | OutputStreamWriter bw = new OutputStreamWriter(writer, "UTF-8"); // 以utf-8写结果
155 | bw.write(text);
156 | bw.close();
157 | writer.close();
158 | } catch (IOException ex) {
159 | ex.printStackTrace();
160 | }
161 | }
162 |
163 | public static void writeResultToFile(String outputPath, List list) {
164 | try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputPath), "utf-8"))) {
165 | for (String text : list) {
166 | if (StringUtils.isNotBlank(text)) {
167 | writer.write(text + "\n");
168 | }
169 | }
170 | } catch (Exception e) {
171 | e.printStackTrace();
172 | }
173 | }
174 | }
175 |
--------------------------------------------------------------------------------
/src/main/java/util/HanUtils.java:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/main/java/util/HanUtils.java
--------------------------------------------------------------------------------
/src/main/java/util/Predefine.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * He Han
4 | * hankcs.cn@gmail.com
5 | * 2014/5/14 21:36
6 | *
7 | *
8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
10 | *
11 | */
12 | package util;
13 |
14 | import java.util.logging.Level;
15 | import java.util.logging.Logger;
16 | import java.util.regex.Pattern;
17 |
/**
 * Predefined static global values.
 */
public class Predefine
{
    public static final String CHINESE_NUMBERS = "零○〇一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟";
    /**
     * Path of hanlp.properties, normally found on the classpath.
     * In unusual environments it may be set to an absolute path instead.
     */
    public static String HANLP_PROPERTIES_PATH;
    public static final double MIN_PROBABILITY = 1e-10;
    /**
     * Regular expression for a floating-point number.
     */
    public static final Pattern PATTERN_FLOAT_NUMBER = Pattern.compile("^(-?\\d+)(\\.\\d+)?$");

    public static String POSTFIX_SINGLE =
            "坝邦堡城池村单岛道堤店洞渡队峰府冈港阁宫沟国海号河湖环集江礁角街井郡坑口矿里岭楼路门盟庙弄牌派坡铺旗桥区渠泉山省市水寺塔台滩坛堂厅亭屯湾屋溪峡县线乡巷洋窑营屿园苑院闸寨站镇州庄族陂庵町";

    public static final String[] POSTFIX_MUTIPLE = {
            "半岛", "草原", "城区", "大堤", "大公国", "大桥", "地区",
            "帝国", "渡槽", "港口", "高速公路", "高原", "公路", "公园", "共和国", "谷地", "广场",
            "国道", "海峡", "胡同", "机场", "集镇", "教区", "街道", "口岸", "码头", "煤矿",
            "牧场", "农场", "盆地", "平原", "丘陵", "群岛", "沙漠", "沙洲", "山脉", "山丘",
            "水库", "隧道", "特区", "铁路", "新村", "雪峰", "盐场", "盐湖", "渔场", "直辖市",
            "自治区", "自治县", "自治州"
    };

    // Separator types
    public static String SEPERATOR_C_SENTENCE = "。!?:;…";
    public static String SEPERATOR_C_SUB_SENTENCE = "、,()“”‘’";
    public static String SEPERATOR_E_SENTENCE = "!?:;";
    public static String SEPERATOR_E_SUB_SENTENCE = ",()*'";
    // Note: the original program had ",()\042'"; "\042" was read there as the
    // ASCII character with decimal code 42, i.e. '*'.
    public static String SEPERATOR_LINK = "\n\r ";

    // Separator between two words
    public static String WORD_SEGMENTER = "@";

    public static int MAX_SEGMENT_NUM = 10;

    /** Total word frequency (currently 25146057). */
    public static final int MAX_FREQUENCY = 25146057;
    /**
     * Smoothing factor.
     */
    public static final double dTemp = 1.0 / MAX_FREQUENCY + 0.00001;
    /**
     * Smoothing parameter.
     */
    public static final double dSmoothingPara = 0.1;
    /**
     * Place name, ns.
     */
    public static final String TAG_PLACE = "未##地";
    /**
     * Sentence begin.
     */
    public static final String TAG_BIGIN = "始##始";
    /**
     * Other.
     */
    public static final String TAG_OTHER = "未##它";
    /**
     * Organisation noun, nt.
     */
    public static final String TAG_GROUP = "未##团";
    /**
     * Numeral, m.
     */
    public static final String TAG_NUMBER = "未##数";
    /**
     * Quantifier, mq (arguably should be handled the same way as numerals).
     */
    public static final String TAG_QUANTIFIER = "未##量";
    /**
     * Proper noun, nx.
     */
    public static final String TAG_PROPER = "未##专";
    /**
     * Time, t.
     */
    public static final String TAG_TIME = "未##时";
    /**
     * Character string, x.
     */
    public static final String TAG_CLUSTER = "未##串";
    /**
     * Sentence end.
     */
    public static final String TAG_END = "末##末";
    /**
     * Person name, nr.
     */
    public static final String TAG_PEOPLE = "未##人";

    /**
     * Logging component: the "HanLP" logger with its level set to WARNING.
     */
    public static Logger logger = createLogger();

    /**
     * File suffix of a trie file.
     */
    public static final String TRIE_EXT = ".trie.dat";
    /**
     * File suffix of a value file.
     */
    public static final String VALUE_EXT = ".value.dat";

    /**
     * Suffix for reversed data.
     */
    public static final String REVERSE_EXT = ".reverse";

    /**
     * Suffix of a binary file.
     */
    public static final String BIN_EXT = ".bin";

    /** Obtains the "HanLP" logger and limits it to WARNING and above. */
    private static Logger createLogger()
    {
        Logger hanLpLogger = Logger.getLogger("HanLP");
        hanLpLogger.setLevel(Level.WARNING);
        return hanLpLogger;
    }
}
140 |
--------------------------------------------------------------------------------
/src/main/java/util/TextUtility.java:
--------------------------------------------------------------------------------
1 | package util;
2 |
3 |
4 | import pojo.IWord;
5 | import pojo.Sentence;
6 | import pojo.Word;
7 |
8 | import java.io.*;
9 | import java.util.Collection;
10 | import java.util.Iterator;
11 | import java.util.List;
12 |
13 | import static pojo.CharType.*;
14 |
15 |
16 | /**
17 | * 文本工具类
18 | */
19 | public class TextUtility
20 | {
21 |
22 | public static int charType(char c)
23 | {
24 | return charType(String.valueOf(c));
25 | }
26 |
27 | /**
28 | * 判断字符类型
29 | * @param str
30 | * @return
31 | */
32 | public static int charType(String str)
33 | {
34 | if (str != null && str.length() > 0)
35 | {
36 | if (Predefine.CHINESE_NUMBERS.contains(str)) return CT_CNUM;
37 | byte[] b;
38 | try
39 | {
40 | b = str.getBytes("GBK");
41 | }
42 | catch (UnsupportedEncodingException e)
43 | {
44 | b = str.getBytes();
45 | e.printStackTrace();
46 | }
47 | byte b1 = b[0];
48 | byte b2 = b.length > 1 ? b[1] : 0;
49 | int ub1 = getUnsigned(b1);
50 | int ub2 = getUnsigned(b2);
51 | if (ub1 < 128)
52 | {
53 | if (ub1 < 32) return CT_DELIMITER; // NON PRINTABLE CHARACTERS
54 | if (' ' == b1) return CT_OTHER;
55 | if ('\n' == b1) return CT_DELIMITER;
56 | if ("*\"!,.?()[]{}+=/\\;:|".indexOf((char) b1) != -1)
57 | return CT_DELIMITER;
58 | if ("0123456789".indexOf((char)b1) != -1)
59 | return CT_NUM;
60 | return CT_SINGLE;
61 | }
62 | else if (ub1 == 162)
63 | return CT_INDEX;
64 | else if (ub1 == 163 && ub2 > 175 && ub2 < 186)
65 | return CT_NUM;
66 | else if (ub1 == 163
67 | && (ub2 >= 193 && ub2 <= 218 || ub2 >= 225
68 | && ub2 <= 250))
69 | return CT_LETTER;
70 | else if (ub1 == 161 || ub1 == 163)
71 | return CT_DELIMITER;
72 | else if (ub1 >= 176 && ub1 <= 247)
73 | return CT_CHINESE;
74 |
75 | }
76 | return CT_OTHER;
77 | }
78 |
79 | /**
80 | * 是否全是中文
81 | * @param str
82 | * @return
83 | */
84 | public static boolean isAllChinese(String str)
85 | {
86 | return str.matches("[\\u4E00-\\u9FA5]+");
87 | }
88 | /**
89 | * 是否全部不是中文
90 | * @param sString
91 | * @return
92 | */
93 | public static boolean isAllNonChinese(byte[] sString)
94 | {
95 | int nLen = sString.length;
96 | int i = 0;
97 |
98 | while (i < nLen)
99 | {
100 | if (getUnsigned(sString[i]) < 248 && getUnsigned(sString[i]) > 175)
101 | return false;
102 | if (sString[i] < 0)
103 | i += 2;
104 | else
105 | i += 1;
106 | }
107 | return true;
108 | }
109 |
110 | /**
111 | * 是否全是单字节
112 | * @param str
113 | * @return
114 | */
115 | public static boolean isAllSingleByte(String str)
116 | {
117 | assert str != null;
118 | for (int i = 0; i < str.length(); i++)
119 | {
120 | if (str.charAt(i) >128)
121 | {
122 | return false;
123 | }
124 | }
125 | return true;
126 | }
127 |
128 | /**
129 | * 把表示数字含义的字符串转成整形
130 | *
131 | * @param str 要转换的字符串
132 | * @return 如果是有意义的整数,则返回此整数值。否则,返回-1。
133 | */
134 | public static int cint(String str)
135 | {
136 | if (str != null)
137 | try
138 | {
139 | int i = new Integer(str).intValue();
140 | return i;
141 | }
142 | catch (NumberFormatException e)
143 | {
144 |
145 | }
146 |
147 | return -1;
148 | }
149 | /**
150 | * 是否全是数字
151 | * @param str
152 | * @return
153 | */
154 | public static boolean isAllNum(String str)
155 | {
156 | if (str == null)
157 | return false;
158 |
159 | int i = 0;
160 | /** 判断开头是否是+-之类的符号 */
161 | if ("±+-+-—".indexOf(str.charAt(0)) != -1)
162 | i++;
163 | /** 如果是全角的0123456789 字符* */
164 | while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1)
165 | i++;
166 | // Get middle delimiter such as .
167 | if (i > 0 && i < str.length())
168 | {
169 | char ch = str.charAt(i);
170 | if ("·∶:,,..//".indexOf(ch) != -1)
171 | {// 98.1%
172 | i++;
173 | while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1)
174 | i++;
175 | }
176 | }
177 | if (i >= str.length())
178 | return true;
179 |
180 | /** 如果是半角的0123456789字符* */
181 | while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1)
182 | i++;
183 | // Get middle delimiter such as .
184 | if (i > 0 && i < str.length())
185 | {
186 | char ch = str.charAt(i);
187 | if (',' == ch || '.' == ch || '/' == ch || ':' == ch || "∶·,./".indexOf(ch) != -1)
188 | {// 98.1%
189 | i++;
190 | while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1)
191 | i++;
192 | }
193 | }
194 |
195 | if (i < str.length())
196 | {
197 | if ("百千万亿佰仟%%‰".indexOf(str.charAt(i)) != -1)
198 | i++;
199 | }
200 | if (i >= str.length())
201 | return true;
202 |
203 | return false;
204 | }
205 |
206 | /**
207 | * 是否全是序号
208 | * @param sString
209 | * @return
210 | */
211 | public static boolean isAllIndex(byte[] sString)
212 | {
213 | int nLen = sString.length;
214 | int i = 0;
215 |
216 | while (i < nLen - 1 && getUnsigned(sString[i]) == 162)
217 | {
218 | i += 2;
219 | }
220 | if (i >= nLen)
221 | return true;
222 | while (i < nLen && (sString[i] > 'A' - 1 && sString[i] < 'Z' + 1)
223 | || (sString[i] > 'a' - 1 && sString[i] < 'z' + 1))
224 | {// single
225 | // byte
226 | // number
227 | // char
228 | i += 1;
229 | }
230 |
231 | if (i < nLen)
232 | return false;
233 | return true;
234 |
235 | }
236 |
237 | /**
238 | * 是否全为英文
239 | *
240 | * @param text
241 | * @return
242 | */
243 | public static boolean isAllLetter(String text)
244 | {
245 | for (int i = 0; i < text.length(); ++i)
246 | {
247 | char c = text.charAt(i);
248 | if ((((c < 'a' || c > 'z')) && ((c < 'A' || c > 'Z'))))
249 | {
250 | return false;
251 | }
252 | }
253 |
254 | return true;
255 | }
256 |
257 | /**
258 | * 是否全为英文或字母
259 | *
260 | * @param text
261 | * @return
262 | */
263 | public static boolean isAllLetterOrNum(String text)
264 | {
265 | for (int i = 0; i < text.length(); ++i)
266 | {
267 | char c = text.charAt(i);
268 | if ((((c < 'a' || c > 'z')) && ((c < 'A' || c > 'Z')) && ((c < '0' || c > '9'))))
269 | {
270 | return false;
271 | }
272 | }
273 |
274 | return true;
275 | }
276 |
277 | /**
278 | * 是否全是分隔符
279 | * @param sString
280 | * @return
281 | */
282 | public static boolean isAllDelimiter(byte[] sString)
283 | {
284 | int nLen = sString.length;
285 | int i = 0;
286 |
287 | while (i < nLen - 1 && (getUnsigned(sString[i]) == 161 || getUnsigned(sString[i]) == 163))
288 | {
289 | i += 2;
290 | }
291 | if (i < nLen)
292 | return false;
293 | return true;
294 | }
295 |
296 | /**
297 | * 是否全是中国数字
298 | * @param word
299 | * @return
300 | */
301 | public static boolean isAllChineseNum(String word)
302 | {// 百分之五点六的人早上八点十八分起床
303 |
304 | String chineseNum = "零○一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟∶·./点";//
305 | String prefix = "几数上第";
306 | String surfix = "几多余来成倍";
307 | boolean round = false;
308 |
309 | if (word == null)
310 | return false;
311 |
312 | char[] temp = word.toCharArray();
313 | for (int i = 0; i < temp.length; i++)
314 | {
315 | if (word.startsWith("分之", i))// 百分之五
316 | {
317 | i += 1;
318 | continue;
319 | }
320 | char tchar = temp[i];
321 | if (i == 0 && prefix.indexOf(tchar) != -1)
322 | {
323 | round = true;
324 | }
325 | else if (i == temp.length-1 && !round && surfix.indexOf(tchar) != -1)
326 | {
327 | round = true;
328 | }
329 | else if (chineseNum.indexOf(tchar) == -1)
330 | return false;
331 | }
332 | return true;
333 | }
334 |
335 |
336 | /**
337 | * 得到字符集的字符在字符串中出现的次数
338 | *
339 | * @param charSet
340 | * @param word
341 | * @return
342 | */
343 | public static int getCharCount(String charSet, String word)
344 | {
345 | int nCount = 0;
346 |
347 | if (word != null)
348 | {
349 | String temp = word + " ";
350 | for (int i = 0; i < word.length(); i++)
351 | {
352 | String s = temp.substring(i, i + 1);
353 | if (charSet.indexOf(s) != -1)
354 | nCount++;
355 | }
356 | }
357 |
358 | return nCount;
359 | }
360 |
361 |
362 | /**
363 | * 获取字节对应的无符号整型数
364 | *
365 | * @param b
366 | * @return
367 | */
368 | public static int getUnsigned(byte b)
369 | {
370 | if (b > 0)
371 | return (int) b;
372 | else
373 | return (b & 0x7F + 128);
374 | }
375 |
376 | /**
377 | * 判断字符串是否是年份
378 | *
379 | * @param snum
380 | * @return
381 | */
382 | public static boolean isYearTime(String snum)
383 | {
384 | if (snum != null)
385 | {
386 | int len = snum.length();
387 | String first = snum.substring(0, 1);
388 |
389 | // 1992年, 98年,06年
390 | if (isAllSingleByte(snum)
391 | && (len == 4 || len == 2 && (cint(first) > 4 || cint(first) == 0)))
392 | return true;
393 | if (isAllNum(snum) && (len >= 3 || len == 2 && "056789".indexOf(first) != -1))
394 | return true;
395 | if (getCharCount("零○一二三四五六七八九壹贰叁肆伍陆柒捌玖", snum) == len && len >= 2)
396 | return true;
397 | if (len == 4 && getCharCount("千仟零○", snum) == 2)// 二仟零二年
398 | return true;
399 | if (len == 1 && getCharCount("千仟", snum) == 1)
400 | return true;
401 | if (len == 2 && getCharCount("甲乙丙丁戊己庚辛壬癸", snum) == 1
402 | && getCharCount("子丑寅卯辰巳午未申酉戌亥", snum.substring(1)) == 1)
403 | return true;
404 | }
405 | return false;
406 | }
407 |
408 | /**
409 | * 判断一个字符串的所有字符是否在另一个字符串集合中
410 | *
411 | * @param aggr 字符串集合
412 | * @param str 需要判断的字符串
413 | * @return
414 | */
415 | public static boolean isInAggregate(String aggr, String str)
416 | {
417 | if (aggr != null && str != null)
418 | {
419 | str += "1";
420 | for (int i = 0; i < str.length(); i++)
421 | {
422 | String s = str.substring(i, i + 1);
423 | if (aggr.indexOf(s) == -1)
424 | return false;
425 | }
426 | return true;
427 | }
428 |
429 | return false;
430 | }
431 |
432 | /**
433 | * 判断该字符串是否是半角字符
434 | *
435 | * @param str
436 | * @return
437 | */
438 | public static boolean isDBCCase(String str)
439 | {
440 | if (str != null)
441 | {
442 | str += " ";
443 | for (int i = 0; i < str.length(); i++)
444 | {
445 | String s = str.substring(i, i + 1);
446 | int length = 0;
447 | try
448 | {
449 | length = s.getBytes("GBK").length;
450 | }
451 | catch (UnsupportedEncodingException e)
452 | {
453 | e.printStackTrace();
454 | length = s.getBytes().length;
455 | }
456 | if (length != 1)
457 | return false;
458 | }
459 |
460 | return true;
461 | }
462 |
463 | return false;
464 | }
465 |
466 | /**
467 | * 判断该字符串是否是全角字符
468 | *
469 | * @param str
470 | * @return
471 | */
472 | public static boolean isSBCCase(String str)
473 | {
474 | if (str != null)
475 | {
476 | str += " ";
477 | for (int i = 0; i < str.length(); i++)
478 | {
479 | String s = str.substring(i, i + 1);
480 | int length = 0;
481 | try
482 | {
483 | length = s.getBytes("GBK").length;
484 | }
485 | catch (UnsupportedEncodingException e)
486 | {
487 | e.printStackTrace();
488 | length = s.getBytes().length;
489 | }
490 | if (length != 2)
491 | return false;
492 | }
493 |
494 | return true;
495 | }
496 |
497 | return false;
498 | }
499 |
500 | /**
501 | * 判断是否是一个连字符(分隔符)
502 | *
503 | * @param str
504 | * @return
505 | */
506 | public static boolean isDelimiter(String str)
507 | {
508 | if (str != null && ("-".equals(str) || "-".equals(str)))
509 | return true;
510 | else
511 | return false;
512 | }
513 |
514 | public static boolean isUnknownWord(String word)
515 | {
516 | if (word != null && word.indexOf("未##") == 0)
517 | return true;
518 | else
519 | return false;
520 | }
521 |
522 | /**
523 | * 防止频率为0发生除零错误
524 | *
525 | * @param frequency
526 | * @return
527 | */
528 | public static double nonZero(double frequency)
529 | {
530 | if (frequency == 0) return 1e-3;
531 |
532 | return frequency;
533 | }
534 |
535 | /**
536 | * 转换long型为char数组
537 | *
538 | * @param x
539 | */
540 | public static char[] long2char(long x)
541 | {
542 | char[] c = new char[4];
543 | c[0] = (char) (x >> 48);
544 | c[1] = (char) (x >> 32);
545 | c[2] = (char) (x >> 16);
546 | c[3] = (char) (x);
547 | return c;
548 | }
549 |
550 | /**
551 | * 转换long类型为string
552 | *
553 | * @param x
554 | * @return
555 | */
556 | public static String long2String(long x)
557 | {
558 | char[] cArray = long2char(x);
559 | StringBuilder sbResult = new StringBuilder(cArray.length);
560 | for (char c : cArray)
561 | {
562 | sbResult.append(c);
563 | }
564 | return sbResult.toString();
565 | }
566 |
567 | /**
568 | * 将异常转为字符串
569 | *
570 | * @param e
571 | * @return
572 | */
573 | public static String exceptionToString(Exception e)
574 | {
575 | StringWriter sw = new StringWriter();
576 | PrintWriter pw = new PrintWriter(sw);
577 | e.printStackTrace(pw);
578 | return sw.toString();
579 | }
580 |
581 | /**
582 | * 判断某个字符是否为汉字
583 | *
584 | * @param c 需要判断的字符
585 | * @return 是汉字返回true,否则返回false
586 | */
587 | public static boolean isChinese(char c)
588 | {
589 | String regex = "[\\u4e00-\\u9fa5]";
590 | return String.valueOf(c).matches(regex);
591 | }
592 |
593 | /**
594 | * 统计 keyword 在 srcText 中的出现次数
595 | *
596 | * @param keyword
597 | * @param srcText
598 | * @return
599 | */
600 | public static int count(String keyword, String srcText)
601 | {
602 | int count = 0;
603 | int leng = srcText.length();
604 | int j = 0;
605 | for (int i = 0; i < leng; i++)
606 | {
607 | if (srcText.charAt(i) == keyword.charAt(j))
608 | {
609 | j++;
610 | if (j == keyword.length())
611 | {
612 | count++;
613 | j = 0;
614 | }
615 | }
616 | else
617 | {
618 | i = i - j;// should rollback when not match
619 | j = 0;
620 | }
621 | }
622 |
623 | return count;
624 | }
625 |
626 | /**
627 | * 简单好用的写String方式
628 | *
629 | * @param s
630 | * @param out
631 | * @throws IOException
632 | */
633 | public static void writeString(String s, DataOutputStream out) throws IOException
634 | {
635 | out.writeInt(s.length());
636 | for (char c : s.toCharArray())
637 | {
638 | out.writeChar(c);
639 | }
640 | }
641 |
642 | /**
643 | * 判断字符串是否为空(null和空格)
644 | *
645 | * @param cs
646 | * @return
647 | */
648 | public static boolean isBlank(CharSequence cs)
649 | {
650 | int strLen;
651 | if (cs == null || (strLen = cs.length()) == 0)
652 | {
653 | return true;
654 | }
655 | for (int i = 0; i < strLen; i++)
656 | {
657 | if (!Character.isWhitespace(cs.charAt(i)))
658 | {
659 | return false;
660 | }
661 | }
662 | return true;
663 | }
664 |
665 | public static String join(String delimiter, Collection stringCollection)
666 | {
667 | StringBuilder sb = new StringBuilder(stringCollection.size() * (16 + delimiter.length()));
668 | for (String str : stringCollection)
669 | {
670 | sb.append(str).append(delimiter);
671 | }
672 |
673 | return sb.toString();
674 | }
675 |
676 | public static String combine(String... termArray)
677 | {
678 | StringBuilder sbSentence = new StringBuilder();
679 | for (String word : termArray)
680 | {
681 | sbSentence.append(word);
682 | }
683 | return sbSentence.toString();
684 | }
685 |
686 | public static String join(Iterable extends CharSequence> s, String delimiter)
687 | {
688 | Iterator extends CharSequence> iter = s.iterator();
689 | if (!iter.hasNext()) return "";
690 | StringBuilder buffer = new StringBuilder(iter.next());
691 | while (iter.hasNext()) buffer.append(delimiter).append(iter.next());
692 | return buffer.toString();
693 | }
694 |
695 | public static String combine(Sentence sentence)
696 | {
697 | StringBuilder sb = new StringBuilder(sentence.wordList.size() * 3);
698 | for (IWord word : sentence.wordList)
699 | {
700 | sb.append(word.getValue());
701 | }
702 |
703 | return sb.toString();
704 | }
705 |
706 | public static String combine(List wordList)
707 | {
708 | StringBuilder sb = new StringBuilder(wordList.size() * 3);
709 | for (IWord word : wordList)
710 | {
711 | sb.append(word.getValue());
712 | }
713 |
714 | return sb.toString();
715 | }
716 | }
717 |
--------------------------------------------------------------------------------
/src/test/java/SegmentTest/SegTest.java:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/test/java/SegmentTest/SegTest.java
--------------------------------------------------------------------------------
/src/test/java/SegmentTest/WordCountTest.java:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/test/java/SegmentTest/WordCountTest.java
--------------------------------------------------------------------------------
/src/test/java/concurrent/SegCountProcess.java:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/test/java/concurrent/SegCountProcess.java
--------------------------------------------------------------------------------