()
53 | {
54 | @Override
55 | public void hit(int begin, int end, String value)
56 | {
57 | System.out.printf("[%d:%d]=%s\n", begin, end, value);
58 | }
59 | });
60 | ```
61 |
62 | or a lambda function
63 | ```
64 | acdat.parseText(text, (begin, end, value) -> {
65 | System.out.printf("[%d:%d]=%s\n", begin, end, value);
66 | });
67 | ```
68 |
69 | Comparison
70 | -----
71 | I compared my AhoCorasickDoubleArrayTrie with robert-bor's aho-corasick. In the results below, ACDAT stands for AhoCorasickDoubleArrayTrie and Naive stands for aho-corasick:
72 | ```
73 | Parsing English document which contains 3409283 characters, with a dictionary of 127142 words.
74 | Naive ACDAT
75 | time 554 290
76 | char/s 6153940.43 11756148.28
77 | rate 1.00 1.91
78 | ===========================================================================
79 | Parsing Chinese document which contains 1290573 characters, with a dictionary of 146047 words.
80 | Naive ACDAT
81 | time 269 56
82 | char/s 4797669.14 23045946.43
83 | rate 1.00 4.80
84 | ===========================================================================
85 | ```
86 |
87 | In the English test, AhoCorasickDoubleArrayTrie is 1.91 times as fast as aho-corasick; in the Chinese test, it is 4.80 times as fast.
88 | Feel free to re-run this benchmark in TestAhoCorasickDoubleArrayTrie; the test data is included.
89 |
90 | Thanks
91 | -----
92 | This project is inspired by [aho-corasick](https://github.com/robert-bor/aho-corasick) and [darts-clone-java](https://github.com/hiroshi-manabe/darts-clone-java).
93 | Many thanks!
94 |
95 | License
96 | -------
97 | Licensed under the Apache License, Version 2.0 (the "License");
98 | you may not use this file except in compliance with the License.
99 | You may obtain a copy of the License at
100 |
101 | http://www.apache.org/licenses/LICENSE-2.0
102 |
103 | Unless required by applicable law or agreed to in writing, software
104 | distributed under the License is distributed on an "AS IS" BASIS,
105 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
106 | See the License for the specific language governing permissions and
107 | limitations under the License.
108 |
--------------------------------------------------------------------------------
/src/main/java/com/hankcs/algorithm/State.java:
--------------------------------------------------------------------------------
1 | package com.hankcs.algorithm;
2 |
3 | import java.util.*;
4 |
5 | /**
6 | * A node (state) of the trie from which the Aho-Corasick automaton is built.
7 | * A state has the following functions:
8 | *
9 | *
10 | *
11 | * - success; transition successfully to another state
12 | * - failure; when the string cannot be followed any further, jump to a shallower node
13 | * - emits; a pattern string is hit
14 | *
15 | *
16 | *
17 | * The root node is slightly different: it has no failure function; its "failure" means moving to the next state along the string path. All other nodes have a failure state.
18 | *
19 | *
20 | * @author Robert Bor
21 | */
22 | public class State
23 | {
24 |
25 | /**
26 | * Length of the pattern string, which is also the depth of this state
27 | */
28 | protected final int depth;
29 |
30 | /**
31 | * The fail function: the state to jump to when no match is found.
32 | */
33 | private State failure = null;
34 |
35 | /**
36 | * Pattern strings recorded for this state, i.e. reported whenever this state is reached
37 | */
38 | private Set emits = null;
39 | /**
40 | * The goto table, also called the transition function: maps the next character of the string to the next state
41 | */
42 | private Map success = new TreeMap();
43 |
44 | /**
45 | * The corresponding index in the double array
46 | */
47 | private int index;
48 |
49 | /**
50 | * Constructs a node of depth 0
51 | */
52 | public State()
53 | {
54 | this(0);
55 | }
56 |
57 | /**
58 | * Constructs a node with the given depth
59 | * @param depth the depth of the node
60 | */
61 | public State(int depth)
62 | {
63 | this.depth = depth;
64 | }
65 |
66 | /**
67 | * Gets the depth of this node
68 | * @return the depth
69 | */
70 | public int getDepth()
71 | {
72 | return this.depth;
73 | }
74 |
75 | /**
76 | * Adds a matched pattern string (this state corresponds to that pattern)
77 | * @param keyword the integer identifying the pattern; it is kept in a set sorted in reverse (descending) order
78 | */
79 | public void addEmit(int keyword)
80 | {
81 | if (this.emits == null)
82 | {
83 | this.emits = new TreeSet(Collections.reverseOrder());
84 | }
85 | this.emits.add(keyword);
86 | }
87 |
88 | /**
89 | * Gets the largest value
90 | * @return the first element of the reverse-ordered emit set, or null if nothing has been emitted
91 | */
92 | public Integer getLargestValueId()
93 | {
94 | if (emits == null || emits.size() == 0) return null;
95 |
96 | return emits.iterator().next();
97 | }
98 |
99 | /**
100 | * Adds several matched pattern strings
101 | * @param emits the pattern identifiers to add, each passed to addEmit(int)
102 | */
103 | public void addEmit(Collection emits)
104 | {
105 | for (int emit : emits)
106 | {
107 | addEmit(emit);
108 | }
109 | }
110 |
111 | /**
112 | * Gets the pattern string(s) represented by this node
113 | * @return the emit set itself, or an empty list if nothing has been emitted
114 | */
115 | public Collection emit()
116 | {
117 | return this.emits == null ? Collections.emptyList() : this.emits;
118 | }
119 |
120 | /**
121 | * Whether this is a terminal (accepting) state
122 | * @return true if the depth is greater than 0 and the emit set has been created
123 | */
124 | public boolean isAcceptable()
125 | {
126 | return this.depth > 0 && this.emits != null;
127 | }
128 |
129 | /**
130 | * Gets the failure state
131 | * @return the failure state, which stays null until setFailure is called
132 | */
133 | public State failure()
134 | {
135 | return this.failure;
136 | }
137 |
138 | /**
139 | * Sets the failure state and also writes failState's index into fail at this state's own index
140 | * @param failState the failure state
141 | */
142 | public void setFailure(State failState, int fail[])
143 | {
144 | this.failure = failState;
145 | fail[index] = failState.index;
146 | }
147 |
148 | /**
149 | * Transitions to the next state
150 | * @param character the character to transition on
151 | * @param ignoreRootState whether to ignore the root rule (per the original note: true when the root itself is the caller, false otherwise); when false, a depth-0 state with no matching transition returns itself
152 | * @return the result of the transition, possibly null
153 | */
154 | private State nextState(Character character, boolean ignoreRootState)
155 | {
156 | State nextState = this.success.get(character);
157 | if (!ignoreRootState && nextState == null && this.depth == 0)
158 | {
159 | nextState = this;
160 | }
161 | return nextState;
162 | }
163 |
164 | /**
165 | * Transitions on character; a failed transition from the root returns the root itself (so the root never returns null)
166 | * @param character the character to transition on
167 | * @return the next state; null only for a non-root state without a matching transition
168 | */
169 | public State nextState(Character character)
170 | {
171 | return nextState(character, false);
172 | }
173 |
174 | /**
175 | * Transitions on character; a failed transition from any node returns null
176 | * @param character the character to transition on
177 | * @return the next state, or null if there is no matching transition
178 | */
179 | public State nextStateIgnoreRootState(Character character)
180 | {
181 | return nextState(character, true);
182 | }
183 |
184 | public State addState(Character character) // returns the existing child for character, creating one at depth + 1 if absent
185 | {
186 | State nextState = nextStateIgnoreRootState(character);
187 | if (nextState == null)
188 | {
189 | nextState = new State(this.depth + 1);
190 | this.success.put(character, nextState);
191 | }
192 | return nextState;
193 | }
194 |
195 | public Collection getStates() // the child states (values of the goto table)
196 | {
197 | return this.success.values();
198 | }
199 |
200 | public Collection getTransitions() // the characters that have an outgoing transition (keys of the goto table)
201 | {
202 | return this.success.keySet();
203 | }
204 |
205 | @Override
206 | public String toString()
207 | {
208 | final StringBuilder sb = new StringBuilder("State{");
209 | sb.append("depth=").append(depth);
210 | sb.append(", ID=").append(index);
211 | sb.append(", emits=").append(emits);
212 | sb.append(", success=").append(success.keySet());
213 | sb.append(", failureID=").append(failure == null ? "-1" : failure.index);
214 | sb.append(", failure=").append(failure); // appends the failure state's own toString(), which in turn prints its failure state
215 | sb.append('}');
216 | return sb.toString();
217 | }
218 |
219 | /**
220 | * Gets the goto table
221 | * @return the goto table (the internal map itself, not a copy)
222 | */
223 | public Map getSuccess()
224 | {
225 | return success;
226 | }
227 |
228 | public int getIndex()
229 | {
230 | return index;
231 | }
232 |
233 | public void setIndex(int index)
234 | {
235 | this.index = index;
236 | }
237 | }
238 |
--------------------------------------------------------------------------------
/src/test/java/TestAhoCorasickDoubleArrayTrie.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * He Han
4 | * hankcs.cn@gmail.com
5 | * 2015/4/6 12:42
6 | *
7 | *
8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
10 | *
11 | */
12 |
13 | import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie;
14 | import junit.framework.TestCase;
15 | import org.ahocorasick.trie.Trie;
16 |
17 | import java.io.BufferedReader;
18 | import java.io.IOException;
19 | import java.io.InputStreamReader;
20 | import java.util.*;
21 |
22 | /** Tests and a simple speed comparison for AhoCorasickDoubleArrayTrie.
23 | * @author hankcs
24 | */
25 | public class TestAhoCorasickDoubleArrayTrie extends TestCase
26 | {
27 | public void testBuildAndParseSimply() throws Exception // builds a four-keyword trie and checks every hit against the text
28 | {
29 | // Collect test data set
30 | TreeMap map = new TreeMap();
31 | String[] keyArray = new String[]
32 | {
33 | "hers",
34 | "his",
35 | "she",
36 | "he"
37 | };
38 | for (String key : keyArray)
39 | {
40 | map.put(key, key);
41 | }
42 | // Build an AhoCorasickDoubleArrayTrie
43 | AhoCorasickDoubleArrayTrie acdat = new AhoCorasickDoubleArrayTrie();
44 | acdat.build(map);
45 | // Test it
46 | final String text = "uhers";
47 | acdat.parseText(text, new AhoCorasickDoubleArrayTrie.IHit()
48 | {
49 | @Override
50 | public void hit(int begin, int end, String value)
51 | {
52 | System.out.printf("[%d:%d]=%s\n", begin, end, value);
53 | assertEquals(text.substring(begin, end), value); // each value equals its key here, so it must equal the matched substring
54 | }
55 | });
56 | List.Hit> wordList = acdat.parseText(text);
57 | System.out.println(wordList);
58 | }
59 |
60 | public void testBuildAndParseWithBigFile() throws Exception // same check as above, using the cn dictionary and text resources
61 | {
62 | // Load test data from disk
63 | Set dictionary = loadDictionary("cn/dictionary.txt");
64 | final String text = loadText("cn/text.txt");
65 | // You can use any type of Map to hold data
66 | Map map = new TreeMap();
67 | // Map map = new HashMap();
68 | // Map map = new LinkedHashMap();
69 | for (String key : dictionary)
70 | {
71 | map.put(key, key);
72 | }
73 | // Build an AhoCorasickDoubleArrayTrie
74 | AhoCorasickDoubleArrayTrie acdat = new AhoCorasickDoubleArrayTrie();
75 | acdat.build(map);
76 | // Test it
77 | acdat.parseText(text, new AhoCorasickDoubleArrayTrie.IHit()
78 | {
79 | @Override
80 | public void hit(int begin, int end, String value)
81 | {
82 | assertEquals(text.substring(begin, end), value);
83 | }
84 | });
85 | }
86 |
87 | private String loadText(String path) throws IOException // reads the classpath resource at path as UTF-8, ending every line with "\n"
88 | {
89 | StringBuilder sbText = new StringBuilder();
90 | BufferedReader br = new BufferedReader(new InputStreamReader(Thread.currentThread().getContextClassLoader().getResourceAsStream(path), "UTF-8"));
91 | String line;
92 | while ((line = br.readLine()) != null)
93 | {
94 | sbText.append(line).append("\n");
95 | }
96 | br.close(); // NOTE(review): not reached if readLine() throws, in which case the reader is left unclosed
97 |
98 | return sbText.toString();
99 | }
100 |
101 | private Set loadDictionary(String path) throws IOException // reads the classpath resource at path as UTF-8, one entry per line, into a sorted set
102 | {
103 | Set dictionary = new TreeSet();
104 | BufferedReader br = new BufferedReader(new InputStreamReader(Thread.currentThread().getContextClassLoader().getResourceAsStream(path), "UTF-8"));
105 | String line;
106 | while ((line = br.readLine()) != null)
107 | {
108 | dictionary.add(line);
109 | }
110 | br.close(); // NOTE(review): not reached if readLine() throws, in which case the reader is left unclosed
111 |
112 | return dictionary;
113 | }
114 |
115 | private void runTest(String dictionaryPath, String textPath) throws IOException // times both implementations on the same text and prints a comparison table
116 | {
117 | Set dictionary = loadDictionary(dictionaryPath);
118 | String text = loadText(textPath);
119 | // Build a ahoCorasickNaive implemented by robert-bor
120 | Trie ahoCorasickNaive = new Trie();
121 | for (String word : dictionary)
122 | {
123 | ahoCorasickNaive.addKeyword(word);
124 | }
125 | ahoCorasickNaive.parseText(""); // For a fair comparison: robert-bor's implementation needs this call to build its AC automaton, so do it before timing starts.
126 | // Build a AhoCorasickDoubleArrayTrie implemented by hankcs
127 | AhoCorasickDoubleArrayTrie ahoCorasickDoubleArrayTrie = new AhoCorasickDoubleArrayTrie();
128 | TreeMap dictionaryMap = new TreeMap();
129 | for (String word : dictionary)
130 | {
131 | dictionaryMap.put(word, word); // we use the same text as the property of a word
132 | }
133 | ahoCorasickDoubleArrayTrie.build(dictionaryMap);
134 | // Let's test the speed of the two Aho-Corasick automata
135 | System.out.printf("Parsing document which contains %d characters, with a dictionary of %d words.\n", text.length(), dictionary.size());
136 | long start = System.currentTimeMillis();
137 | ahoCorasickNaive.parseText(text);
138 | long costTimeNaive = System.currentTimeMillis() - start;
139 | start = System.currentTimeMillis();
140 | ahoCorasickDoubleArrayTrie.parseText(text, new AhoCorasickDoubleArrayTrie.IHit()
141 | {
142 | @Override
143 | public void hit(int begin, int end, String value)
144 | {
145 |
146 | }
147 | });
148 | long costTimeACDAT = System.currentTimeMillis() - start;
149 | System.out.printf("%-15s\t%-15s\t%-15s\n", "", "Naive", "ACDAT");
150 | System.out.printf("%-15s\t%-15d\t%-15d\n", "time", costTimeNaive, costTimeACDAT);
151 | System.out.printf("%-15s\t%-15.2f\t%-15.2f\n", "char/s", (text.length() / (costTimeNaive / 1000.0)), (text.length() / (costTimeACDAT / 1000.0)));
152 | System.out.printf("%-15s\t%-15.2f\t%-15.2f\n", "rate", 1.0, costTimeNaive / (double) costTimeACDAT);
153 | System.out.println("===========================================================================");
154 | }
155 |
156 | /**
157 | * Compare my AhoCorasickDoubleArrayTrie with robert-bor's aho-corasick, notice that robert-bor's aho-corasick is
158 | * compiled under jdk1.8, so you will need jdk1.8 to run this test
159 | * To avoid JVM wasting time on allocating memory, please use -Xms512m -Xmx512m -Xmn256m .
160 | * @throws Exception
161 | */
162 | public void testBenchmark() throws Exception
163 | {
164 | runTest("en/dictionary.txt", "en/text.txt");
165 | runTest("cn/dictionary.txt", "cn/text.txt");
166 | }
167 | }
168 |
--------------------------------------------------------------------------------
/src/main/java/com/hankcs/algorithm/AhoCorasickDoubleArrayTrie.java:
--------------------------------------------------------------------------------
1 | package com.hankcs.algorithm;
2 |
3 | import java.io.IOException;
4 | import java.io.ObjectInputStream;
5 | import java.io.ObjectOutputStream;
6 | import java.util.*;
7 | import java.util.concurrent.LinkedBlockingDeque;
8 |
9 | /**
10 | * An implementation of the Aho-Corasick algorithm based on a Double Array Trie
11 | *
12 | * @author hankcs
13 | */
14 | public class AhoCorasickDoubleArrayTrie
15 | {
16 | /**
17 | * check array of the Double Array Trie structure
18 | */
19 | protected int check[];
20 | /**
21 | * base array of the Double Array Trie structure
22 | */
23 | protected int base[];
24 | /**
25 | * fail table of the Aho Corasick automata
26 | */
27 | int fail[];
28 | /**
29 | * out table of the Aho Corasick automata
30 | */
31 | int[][] output;
32 | /**
33 | * outer value array
34 | */
35 | protected V[] v;
36 |
37 | /**
38 | * the length of every key
39 | */
40 | protected int[] l;
41 |
42 | /**
43 | * the size of base and check array
44 | */
45 | protected int size;
46 |
47 | /**
48 | * Parses the text and collects every keyword hit in the order it is encountered
49 | * @param text The text
50 | * @return a list of hits, one per keyword occurrence; empty if nothing matched
51 | */
52 | public List> parseText(String text)
53 | {
54 | int position = 1; // exclusive end index of a hit that ends at the current character
55 | int currentState = 0; // start at the root state
56 | List> collectedEmits = new LinkedList>();
57 | for (int i = 0; i < text.length(); ++i)
58 | {
59 | currentState = getState(currentState, text.charAt(i));
60 | storeEmits(position, currentState, collectedEmits);
61 | ++position;
62 | }
63 |
64 | return collectedEmits;
65 | }
66 |
67 | /**
68 | * Parses the text, reporting each keyword hit to the processor as soon as it is found
69 | * @param text The text
70 | * @param processor A processor which handles the output
71 | */
72 | public void parseText(String text, IHit processor)
73 | {
74 | int position = 1; // exclusive end index of a hit that ends at the current character
75 | int currentState = 0; // start at the root state
76 | for (int i = 0; i < text.length(); ++i)
77 | {
78 | currentState = getState(currentState, text.charAt(i));
79 | int[] hitArray = output[currentState]; // indexes into v and l of the keywords ending at this state, or null
80 | if (hitArray != null)
81 | {
82 | for (int hit : hitArray)
83 | {
84 | processor.hit(position - l[hit], position, v[hit]);
85 | }
86 | }
87 | ++position;
88 | }
89 | }
90 |
91 | /**
92 | * Parses a char array, reporting each keyword hit to the processor as soon as it is found
93 | * @param text The text
94 | * @param processor A processor which handles the output
95 | */
96 | public void parseText(char[] text, IHit processor)
97 | {
98 | int position = 1; // exclusive end index of a hit that ends at the current character
99 | int currentState = 0; // start at the root state
100 | for (char c : text)
101 | {
102 | currentState = getState(currentState, c);
103 | int[] hitArray = output[currentState]; // indexes into v and l of the keywords ending at this state, or null
104 | if (hitArray != null)
105 | {
106 | for (int hit : hitArray)
107 | {
108 | processor.hit(position - l[hit], position, v[hit]);
109 | }
110 | }
111 | ++position;
112 | }
113 | }
114 |
115 | /**
116 | * Parses a char array, reporting each keyword hit together with its value index to the processor
117 | * @param text The text
118 | * @param processor A processor which handles the output
119 | */
120 | public void parseText(char[] text, IHitFull processor)
121 | {
122 | int position = 1; // exclusive end index of a hit that ends at the current character
123 | int currentState = 0; // start at the root state
124 | for (char c : text)
125 | {
126 | currentState = getState(currentState, c);
127 | int[] hitArray = output[currentState]; // indexes into v and l of the keywords ending at this state, or null
128 | if (hitArray != null)
129 | {
130 | for (int hit : hitArray)
131 | {
132 | processor.hit(position - l[hit], position, v[hit], hit);
133 | }
134 | }
135 | ++position;
136 | }
137 | }
138 |
139 |
140 | /**
141 | * Writes the arrays to the stream in the order base, check, fail, output, l, v (the size field is not written)
142 | * @param out An ObjectOutputStream object
143 | * @throws IOException Some IOException
144 | */
145 | public void save(ObjectOutputStream out) throws IOException
146 | {
147 | out.writeObject(base);
148 | out.writeObject(check);
149 | out.writeObject(fail);
150 | out.writeObject(output);
151 | out.writeObject(l);
152 | out.writeObject(v);
153 | }
154 |
155 | /**
156 | * Reads the arrays from the stream in the order save writes them: base, check, fail, output, l, v. NOTE(review): this is native Java deserialization, so only feed it data from a trusted source.
157 | * @param in An ObjectInputStream object
158 | * @throws IOException propagated from in.readObject()
159 | * @throws ClassNotFoundException propagated from in.readObject()
160 | */
161 | public void load(ObjectInputStream in) throws IOException, ClassNotFoundException
162 | {
163 | base = (int[]) in.readObject();
164 | check = (int[]) in.readObject();
165 | fail = (int[]) in.readObject();
166 | output = (int[][]) in.readObject();
167 | l = (int[]) in.readObject();
168 | v = (V[]) in.readObject();
169 | }
170 |
171 | /**
172 | * Get value by a String key, just like a map.get() method
173 | * @param key The key
174 | * @return the value stored for the key, or null if exactMatchSearch does not find the key
175 | */
176 | public V get(String key)
177 | {
178 | int index = exactMatchSearch(key);
179 | if (index >= 0)
180 | {
181 | return v[index];
182 | }
183 |
184 | return null;
185 | }
186 |
187 | /**
188 | * Pick the value by index in value array
189 | * Note that, for efficiency, this method does NOT check the parameter; an out-of-range index fails on the array access
190 | * @param index The index
191 | * @return The value
192 | */
193 | public V get(int index)
194 | {
195 | return v[index];
196 | }
197 |
198 | /**
199 | * Callback that handles each keyword hit found while parsing
200 | */
201 | public interface IHit
202 | {
203 | /**
204 | * Hit a keyword, you can use some code like text.substring(begin, end) to get the keyword
205 | * @param begin the beginning index, inclusive.
206 | * @param end the ending index, exclusive.
207 | * @param value the value assigned to the keyword
208 | */
209 | void hit(int begin, int end, V value);
210 | }
211 |
212 | /**
213 | * Callback that handles each keyword hit found while parsing and also receives the index of the value
214 | */
215 | public interface IHitFull
216 | {
217 | /**
218 | * Hit a keyword, you can use some code like text.substring(begin, end) to get the keyword
219 | * @param begin the beginning index, inclusive.
220 | * @param end the ending index, exclusive.
221 | * @param value the value assigned to the keyword
222 | * @param index the index of the value assigned to the keyword, you can use the integer as a perfect hash value
223 | */
224 | void hit(int begin, int end, V value, int index);
225 | }
226 |
227 | /**
228 | * A single match result; instances are created by storeEmits and returned from parseText(String)
229 | *
230 | * @param the value type
231 | */
232 | public class Hit
233 | {
234 | /**
235 | * the beginning index, inclusive.
236 | */
237 | public final int begin;
238 | /**
239 | * the ending index, exclusive.
240 | */
241 | public final int end;
242 | /**
243 | * the value assigned to the keyword
244 | */
245 | public final V value;
246 |
247 | public Hit(int begin, int end, V value)
248 | {
249 | this.begin = begin;
250 | this.end = end;
251 | this.value = value;
252 | }
253 |
254 | @Override
255 | public String toString()
256 | {
257 | return String.format("[%d:%d]=%s", begin, end, value);
258 | }
259 | }
260 |
261 | /**
262 | * Moves to the next state for a character, following failure links when there is no direct transition
263 | *
264 | * @param currentState the index of the state to start from
265 | * @param character the character to consume
266 | * @return the index of the state reached; never -1, because the loop continues until a transition succeeds
267 | */
268 | private int getState(int currentState, char character)
269 | {
270 | int newCurrentState = transitionWithRoot(currentState, character); // first try the success (goto) transition
271 | while (newCurrentState == -1) // if that fails, follow the failure link and try again
272 | {
273 | currentState = fail[currentState];
274 | newCurrentState = transitionWithRoot(currentState, character);
275 | }
276 | return newCurrentState;
277 | }
278 |
279 | /**
280 | * Appends one Hit to the list for every keyword recorded in the output table of the given state
281 | *
282 | * @param position the exclusive end index of the hits in the text
283 | * @param currentState the index of the state whose output entry is read
284 | * @param collectedEmits the list the hits are appended to
285 | */
286 | private void storeEmits(int position, int currentState, List> collectedEmits)
287 | {
288 | int[] hitArray = output[currentState]; // null when no keyword ends at this state
289 | if (hitArray != null)
290 | {
291 | for (int hit : hitArray)
292 | {
293 | collectedEmits.add(new Hit(position - l[hit], position, v[hit])); // begin = end minus the keyword length
294 | }
295 | }
296 | }
297 |
298 | /**
299 | * Follows the transition for c, using current itself as the base offset (unlike transitionWithRoot, which reads base[nodePos])
300 | *
301 | * @param current the base offset to transition from
302 | * @param c the character to transition on
303 | * @return base[current + c + 1] if check at that slot equals current, otherwise -1
304 | */
305 | protected int transition(int current, char c)
306 | {
307 | int b = current;
308 | int p;
309 |
310 | p = b + c + 1;
311 | if (b == check[p])
312 | b = base[p];
313 | else
314 | return -1;
315 |
316 | p = b;
317 | return p;
318 | }
319 |
320 | /**
321 | * transition of a state, if the state is root and it failed, then returns the root
322 | *
323 | * @param nodePos the index of the state to transition from (0 is the root)
324 | * @param c the character to transition on
325 | * @return the index of the target state; 0 if the transition fails at the root; -1 if it fails at any other state
326 | */
327 | protected int transitionWithRoot(int nodePos, char c)
328 | {
329 | int b = base[nodePos];
330 | int p;
331 |
332 | p = b + c + 1; // slot of the child reached by c: base offset plus (character code + 1)
333 | if (b != check[p])
334 | {
335 | if (nodePos == 0) return 0;
336 | return -1;
337 | }
338 |
339 | return p;
340 | }
341 |
342 |
343 | /**
344 | * Builds an AhoCorasickDoubleArrayTrie from a map by delegating to a new Builder
345 | * @param map a map containing key-value pairs
346 | */
347 | public void build(Map map)
348 | {
349 | new Builder().build(map);
350 | }
351 |
352 |
353 | /**
354 | * match exactly by a key
355 | *
356 | * @param key the key
357 | * @return the index of the key (usable as a perfect hash value), or -1 if the key is not found
358 | */
359 | public int exactMatchSearch(String key)
360 | {
361 | return exactMatchSearch(key, 0, 0, 0);
362 | }
363 |
364 | /**
365 | * match exactly by a key
366 | *
367 | * @param key the key
368 | * @param pos the index of the first character of the key to examine
369 | * @param len the exclusive end index of the characters to examine (the loop runs while i < len); a value <= 0 means key.length()
370 | * @param nodePos the index of the node to start from; a value <= 0 means the root (0)
371 | * @return the value index of the key, or -1 if it is not found
372 | */
373 | private int exactMatchSearch(String key, int pos, int len, int nodePos)
374 | {
375 | if (len <= 0)
376 | len = key.length();
377 | if (nodePos <= 0)
378 | nodePos = 0;
379 |
380 | int result = -1;
381 |
382 | char[] keyChars = key.toCharArray();
383 |
384 | int b = base[nodePos];
385 | int p;
386 |
387 | for (int i = pos; i < len; i++)
388 | {
389 | p = b + (int) (keyChars[i]) + 1;
390 | if (b == check[p])
391 | b = base[p];
392 | else
393 | return result;
394 | }
395 |
396 | p = b; // slot of the end-of-key child (sibling key 0) of the node reached
397 | int n = base[p];
398 | if (b == check[p] && n < 0)
399 | {
400 | result = -n - 1; // leaves store -(value index) - 1 in base
401 | }
402 | return result;
403 | }
404 |
405 | /**
406 | * match exactly by a key
407 | *
408 | * @param keyChars the char array of the key
409 | * @param pos the begin index of char array
410 | * @param len the length of the key; note the loop uses it as the exclusive end index (i < len), so the two only agree when pos is 0
411 | * @param nodePos the starting position of the node for searching
412 | * @return the value index of the key; -1 means the key was not found
413 | */
414 | private int exactMatchSearch(char[] keyChars, int pos, int len, int nodePos)
415 | {
416 | int result = -1;
417 |
418 | int b = base[nodePos];
419 | int p;
420 |
421 | for (int i = pos; i < len; i++)
422 | {
423 | p = b + (int) (keyChars[i]) + 1;
424 | if (b == check[p])
425 | b = base[p];
426 | else
427 | return result;
428 | }
429 |
430 | p = b; // slot of the end-of-key child (sibling key 0) of the node reached
431 | int n = base[p];
432 | if (b == check[p] && n < 0)
433 | {
434 | result = -n - 1; // leaves store -(value index) - 1 in base
435 | }
436 | return result;
437 | }
438 |
439 | // /**
440 | // * Just for debug when I wrote it
441 | // */
442 | // public void debug()
443 | // {
444 | // System.out.println("base:");
445 | // for (int i = 0; i < base.length; i++)
446 | // {
447 | // if (base[i] < 0)
448 | // {
449 | // System.out.println(i + " : " + -base[i]);
450 | // }
451 | // }
452 | //
453 | // System.out.println("output:");
454 | // for (int i = 0; i < output.length; i++)
455 | // {
456 | // if (output[i] != null)
457 | // {
458 | // System.out.println(i + " : " + Arrays.toString(output[i]));
459 | // }
460 | // }
461 | //
462 | // System.out.println("fail:");
463 | // for (int i = 0; i < fail.length; i++)
464 | // {
465 | // if (fail[i] != 0)
466 | // {
467 | // System.out.println(i + " : " + fail[i]);
468 | // }
469 | // }
470 | //
471 | // System.out.println(this);
472 | // }
473 | //
474 | // @Override
475 | // public String toString()
476 | // {
477 | // String infoIndex = "i = ";
478 | // String infoChar = "char = ";
479 | // String infoBase = "base = ";
480 | // String infoCheck = "check= ";
481 | // for (int i = 0; i < Math.min(base.length, 200); ++i)
482 | // {
483 | // if (base[i] != 0 || check[i] != 0)
484 | // {
485 | // infoChar += " " + (i == check[i] ? " ×" : (char) (i - check[i] - 1));
486 | // infoIndex += " " + String.format("%5d", i);
487 | // infoBase += " " + String.format("%5d", base[i]);
488 | // infoCheck += " " + String.format("%5d", check[i]);
489 | // }
490 | // }
491 | // return "DoubleArrayTrie:" +
492 | // "\n" + infoChar +
493 | // "\n" + infoIndex +
494 | // "\n" + infoBase +
495 | // "\n" + infoCheck + "\n" +
496 | //// "check=" + Arrays.toString(check) +
497 | //// ", base=" + Arrays.toString(base) +
498 | //// ", used=" + Arrays.toString(used) +
499 | // "size=" + size
500 | //// ", length=" + Arrays.toString(length) +
501 | //// ", value=" + Arrays.toString(value) +
502 | // ;
503 | // }
504 | //
505 | // /**
506 | // * 一个顺序输出变量名与变量值的调试类
507 | // */
508 | // private static class DebugArray
509 | // {
510 | // Map nameValueMap = new LinkedHashMap();
511 | //
512 | // public void add(String name, int value)
513 | // {
514 | // String valueInMap = nameValueMap.get(name);
515 | // if (valueInMap == null)
516 | // {
517 | // valueInMap = "";
518 | // }
519 | //
520 | // valueInMap += " " + String.format("%5d", value);
521 | //
522 | // nameValueMap.put(name, valueInMap);
523 | // }
524 | //
525 | // @Override
526 | // public String toString()
527 | // {
528 | // String text = "";
529 | // for (Map.Entry entry : nameValueMap.entrySet())
530 | // {
531 | // String name = entry.getKey();
532 | // String value = entry.getValue();
533 | // text += String.format("%-5s", name) + "= " + value + '\n';
534 | // }
535 | //
536 | // return text;
537 | // }
538 | //
539 | // public void println()
540 | // {
541 | // System.out.print(this);
542 | // }
543 | // }
544 |
545 | /**
546 | * Gets the number of keywords
547 | * @return the length of the value array v
548 | */
549 | public int size()
550 | {
551 | return v.length;
552 | }
553 |
554 | /**
555 | * A builder to build the AhoCorasickDoubleArrayTrie
556 | */
557 | private class Builder
558 | {
559 | /**
560 | * the root state of trie
561 | */
562 | private State rootState = new State();
563 | /**
564 | * whether the position has been used
565 | */
566 | private boolean used[];
567 | /**
568 | * the allocSize of the dynamic array
569 | */
570 | private int allocSize;
571 | /**
572 | * a parameter controls the memory growth speed of the dynamic array
573 | */
574 | private int progress;
575 | /**
576 | * the next position to check unused memory
577 | */
578 | private int nextCheckPos;
579 | /**
580 | * the size of the key-pair sets
581 | */
582 | private int keySize;
583 |
584 | /**
585 | * Build from a map
586 | * @param map a map containing key-value pairs; values and keys are read by separate iterations and paired by position
587 | */
588 | @SuppressWarnings("unchecked")
589 | public void build(Map map)
590 | {
591 | // save the values
592 | v = (V[]) map.values().toArray();
593 | l = new int[v.length];
594 | Set keySet = map.keySet();
595 | // build the ordinary trie of State nodes
596 | addAllKeyword(keySet);
597 | // build the double array trie on top of that trie
598 | buildDoubleArrayTrie(keySet.size());
599 | used = null;
600 | // build the failure table and merge the output table
601 | constructFailureStates();
602 | rootState = null;
603 | loseWeight();
604 | }
605 |
606 | /**
607 | * Collects the children of a parent node as (key, state) entries; real children get key = character + 1, and an acceptable parent also gets an extra entry with key 0
608 | *
609 | * @param parent parent node
610 | * @param siblings the list that receives the parent's child nodes, i.e. the siblings
611 | * @return the size of the siblings list after the entries were added
612 | */
613 | private int fetch(State parent, List> siblings)
614 | {
615 | if (parent.isAcceptable())
616 | {
617 | State fakeNode = new State(-(parent.getDepth() + 1)); // this node is a child of parent and also carries parent's output
618 | fakeNode.addEmit(parent.getLargestValueId());
619 | siblings.add(new AbstractMap.SimpleEntry(0, fakeNode));
620 | }
621 | for (Map.Entry entry : parent.getSuccess().entrySet())
622 | {
623 | siblings.add(new AbstractMap.SimpleEntry(entry.getKey() + 1, entry.getValue()));
624 | }
625 | return siblings.size();
626 | }
627 |
628 | /**
629 | * Adds a keyword to the trie, one state per character, and records its index and length
630 | *
631 | * @param keyword a keyword
632 | * @param index the index of the keyword
633 | */
634 | private void addKeyword(String keyword, int index)
635 | {
636 | State currentState = this.rootState;
637 | for (Character character : keyword.toCharArray())
638 | {
639 | currentState = currentState.addState(character);
640 | }
641 | currentState.addEmit(index);
642 | l[index] = keyword.length(); // kept so that a hit can compute its begin offset from its end offset
643 | }
644 |
645 | /**
646 | * Adds a collection of keywords, numbering them 0, 1, 2, ... in iteration order
647 | *
648 | * @param keywordSet the collection holding keywords
649 | */
650 | private void addAllKeyword(Collection keywordSet)
651 | {
652 | int i = 0;
653 | for (String keyword : keywordSet)
654 | {
655 | addKeyword(keyword, i++);
656 | }
657 | }
658 |
659 | /**
660 | * Builds the fail and output tables by walking the State trie breadth-first
661 | */
662 | private void constructFailureStates()
663 | {
664 | fail = new int[size + 1];
665 | fail[1] = base[0]; // NOTE(review): purpose not evident from this code; base[0] is set to 1 in buildDoubleArrayTrie
666 | output = new int[size + 1][];
667 | Queue queue = new LinkedBlockingDeque();
668 |
669 | // Step 1: set the failure of every depth-1 node to the root
670 | for (State depthOneState : this.rootState.getStates())
671 | {
672 | depthOneState.setFailure(this.rootState, fail);
673 | queue.add(depthOneState);
674 | constructOutput(depthOneState);
675 | }
676 |
677 | // Step 2: build the failure table for the nodes of depth > 1; this is a BFS
678 | while (!queue.isEmpty())
679 | {
680 | State currentState = queue.remove();
681 |
682 | for (Character transition : currentState.getTransitions())
683 | {
684 | State targetState = currentState.nextState(transition);
685 | queue.add(targetState);
686 |
687 | State traceFailureState = currentState.failure();
688 | while (traceFailureState.nextState(transition) == null) // climb the failure chain until a state can take this character (the root always can)
689 | {
690 | traceFailureState = traceFailureState.failure();
691 | }
692 | State newFailureState = traceFailureState.nextState(transition);
693 | targetState.setFailure(newFailureState, fail);
694 | targetState.addEmit(newFailureState.emit()); // the target also emits everything its failure state emits
695 | constructOutput(targetState);
696 | }
697 | }
698 | }
699 |
700 | /**
701 | * Copies the emits of a state into the output table at the state's index; does nothing if the state emits nothing
702 | */
703 | private void constructOutput(State targetState)
704 | {
705 | Collection emit = targetState.emit();
706 | if (emit == null || emit.size() == 0) return;
707 | int output[] = new int[emit.size()]; // local array; it shadows the outer output field
708 | Iterator it = emit.iterator();
709 | for (int i = 0; i < output.length; ++i)
710 | {
711 | output[i] = it.next();
712 | }
713 | AhoCorasickDoubleArrayTrie.this.output[targetState.getIndex()] = output;
714 | }
715 |
716 | private void buildDoubleArrayTrie(int keySize) // allocates the arrays and inserts the root's children, which recursively inserts the whole trie
717 | {
718 | progress = 0;
719 | this.keySize = keySize;
720 | resize(65536 * 32); // 32 "double-bytes" per the original note, i.e. 32 * 65536 initial slots
721 |
722 | base[0] = 1;
723 | nextCheckPos = 0;
724 |
725 | State root_node = this.rootState;
726 |
727 | List> siblings = new ArrayList>(root_node.getSuccess().entrySet().size());
728 | fetch(root_node, siblings);
729 | insert(siblings);
730 | }
731 |
732 | /**
733 | * Reallocates base, check and used with newSize entries, copying over the first allocSize entries of each
734 | *
735 | * @param newSize the new number of entries
736 | * @return the new allocSize, equal to newSize
737 | */
738 | private int resize(int newSize)
739 | {
740 | int[] base2 = new int[newSize];
741 | int[] check2 = new int[newSize];
742 | boolean used2[] = new boolean[newSize];
743 | if (allocSize > 0)
744 | {
745 | System.arraycopy(base, 0, base2, 0, allocSize);
746 | System.arraycopy(check, 0, check2, 0, allocSize);
747 | System.arraycopy(used, 0, used2, 0, allocSize);
748 | }
749 |
750 | base = base2;
751 | check = check2;
752 | used = used2;
753 |
754 | return allocSize = newSize;
755 | }
756 |
757 | /**
758 | * Inserts the siblings into the double array trie, then recursively inserts the children of each sibling
759 | *
760 | * @param siblings the siblings being inserted
761 | * @return begin, the offset chosen for these siblings (the sibling with key k is placed at begin + k)
762 | */
763 | private int insert(List> siblings)
764 | {
765 | int begin = 0;
766 | int pos = Math.max(siblings.get(0).getKey() + 1, nextCheckPos) - 1;
767 | int nonzero_num = 0;
768 | int first = 0;
769 |
770 | if (allocSize <= pos)
771 | resize(pos + 1);
772 |
773 | outer:
774 | // The goal of this loop is to find n free slots satisfying base[begin + a1...an] == 0, where a1...an are the n nodes in siblings (the code below tests check[], not base[])
775 | while (true)
776 | {
777 | pos++;
778 |
779 | if (allocSize <= pos)
780 | resize(pos + 1);
781 |
782 | if (check[pos] != 0)
783 | {
784 | nonzero_num++;
785 | continue;
786 | }
787 | else if (first == 0)
788 | {
789 | nextCheckPos = pos;
790 | first = 1;
791 | }
792 |
793 | begin = pos - siblings.get(0).getKey(); // distance of the current position from the first sibling
794 | if (allocSize <= (begin + siblings.get(siblings.size() - 1).getKey()))
795 | {
796 | // progress can be zero // the + 1 guards against a division by zero
797 | double l = (1.05 > 1.0 * keySize / (progress + 1)) ? 1.05 : 1.0 * keySize / (progress + 1);
798 | resize((int) (allocSize * l));
799 | }
800 |
801 | if (used[begin])
802 | continue;
803 |
804 | for (int i = 1; i < siblings.size(); i++)
805 | if (check[begin + siblings.get(i).getKey()] != 0)
806 | continue outer;
807 |
808 | break;
809 | }
810 |
811 | // -- Simple heuristics --
812 | // if the percentage of non-empty contents in check between the
813 | // index
814 | // 'next_check_pos' and 'check' is greater than some constant value
815 | // (e.g. 0.9),
816 | // new 'next_check_pos' index is written by 'check'.
817 | if (1.0 * nonzero_num / (pos - nextCheckPos + 1) >= 0.95)
818 | nextCheckPos = pos; // if at least 95% of the slots from next_check_pos to pos are occupied, the next insertion starts searching directly from pos
819 | used[begin] = true;
820 |
821 | size = (size > begin + siblings.get(siblings.size() - 1).getKey() + 1) ? size : begin + siblings.get(siblings.size() - 1).getKey() + 1;
822 |
823 | for (Map.Entry sibling : siblings)
824 | {
825 | check[begin + sibling.getKey()] = begin;
826 | }
827 |
828 | for (Map.Entry sibling : siblings)
829 | {
830 | List> new_siblings = new ArrayList>(sibling.getValue().getSuccess().entrySet().size() + 1);
831 |
832 | if (fetch(sibling.getValue(), new_siblings) == 0) // the end of a word that is not a prefix of any other word, i.e. a leaf node
833 | {
834 | base[begin + sibling.getKey()] = (-sibling.getValue().getLargestValueId() - 1);
835 | progress++;
836 | }
837 | else
838 | {
839 | int h = insert(new_siblings); // dfs
840 | base[begin + sibling.getKey()] = h;
841 | }
842 | sibling.getValue().setIndex(begin + sibling.getKey());
843 | }
844 | return begin;
845 | }
846 |
847 | /**
848 | * Frees unneeded memory by replacing base and check with arrays of size + 65535 entries that hold the first size entries
849 | */
850 | private void loseWeight()
851 | {
852 | int nbase[] = new int[size + 65535]; // NOTE(review): lookups index base + c + 1, which reaches base + 65536 for c = 0xFFFF; confirm that 65535 slots of padding is always enough
853 | System.arraycopy(base, 0, nbase, 0, size);
854 | base = nbase;
855 |
856 | int ncheck[] = new int[size + 65535];
857 | System.arraycopy(check, 0, ncheck, 0, size);
858 | check = ncheck;
859 | }
860 | }
861 | }
862 |
--------------------------------------------------------------------------------