├── pom.xml ├── README.md └── src ├── main └── java │ └── com │ └── hankcs │ └── algorithm │ ├── State.java │ └── AhoCorasickDoubleArrayTrie.java └── test └── java └── TestAhoCorasickDoubleArrayTrie.java /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | AhoCorasickDoubleArrayTrie 8 | aho-corasick-double-array-trie 9 | 1.0.1 10 | 11 | 12 | 13 | 14 | 15 | org.apache.maven.plugins 16 | maven-compiler-plugin 17 | ${maven-compiler-plugin.version} 18 | 19 | ${project.build.sourceEncoding} 20 | ${java.version} 21 | ${java.version} 22 | true 23 | true 24 | true 25 | 26 | 27 | 28 | 29 | org.apache.maven.plugins 30 | maven-jar-plugin 31 | ${maven-jar-plugin.version} 32 | 33 | 34 | **/*.properties 35 | 36 | 37 | 38 | 39 | 40 | org.apache.maven.plugins 41 | maven-surefire-plugin 42 | ${maven-surefire-plugin.version} 43 | 44 | true 45 | 46 | 47 | 48 | 49 | org.apache.maven.plugins 50 | maven-resources-plugin 51 | ${maven-resources-plugin.version} 52 | 53 | ${project.build.sourceEncoding} 54 | 55 | 56 | 57 | 58 | maven-source-plugin 59 | ${maven-source-plugin.version} 60 | 61 | 62 | attach-sources 63 | 64 | jar 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | org.codehaus.sonar 73 | sonar-maven3-plugin 74 | ${sonar-maven3-plugin.version} 75 | 76 | 77 | 78 | 79 | 80 | 1.6 81 | UTF-8 82 | 83 | 3.0 84 | 2.4 85 | 2.14 86 | 2.6 87 | 2.2.1 88 | 3.5 89 | 90 | 3.8.1 91 | 92 | 93 | 94 | 95 | junit 96 | junit 97 | ${junit.version} 98 | test 99 | 100 | 101 | org.ahocorasick 102 | ahocorasick 103 | 0.2.3 104 | test 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | AhoCorasickDoubleArrayTrie 2 | ============ 3 | 4 | An extremely fast implementation of Aho Corasick algorithm based on Double Array Trie structure. 
Its speed is 1.7 to 4.5 times that of naive implementations; perhaps it's the fastest implementation so far ;-) 5 | 6 | Introduction 7 | ------------ 8 | You may have heard that the Aho-Corasick algorithm is fast for parsing text with a huge dictionary, for example: 9 | * looking for certain words in texts in order to URL link or emphasize them 10 | * adding semantics to plain text 11 | * checking against a dictionary to see if syntactic errors were made 12 | 13 | But most implementations use a `TreeMap` to store the *goto* structure, which costs `O(ln(t))` time per transition, where `t` is the largest amount of a word's common prefixes. The final complexity is `O(n * ln(t))`, and since `t > 2`, `n * ln(t) > n`. Others use a `HashMap`, which wastes too much memory and is still slow. 14 | 15 | I improved it by replacing the `XXXMap` with a Double Array Trie, whose transition time complexity is just `O(1)`; thus we get a total complexity of exactly `O(n)` and strike a perfect balance between time and memory. Yes, its speed is not related to the length, language or common prefixes of the words in the dictionary. 16 | 17 | This implementation has been widely used in my [HanLP: Han Language Processing](https://github.com/hankcs/HanLP) package. I hope it can serve as a common data structure library in projects handling text or NLP tasks. 
18 | 19 | Usage 20 | ----- 21 | Setting up the `AhoCorasickDoubleArrayTrie` is a piece of cake: 22 | ```java 23 | // Collect test data set 24 | TreeMap<String, String> map = new TreeMap<String, String>(); 25 | String[] keyArray = new String[] 26 | { 27 | "hers", 28 | "his", 29 | "she", 30 | "he" 31 | }; 32 | for (String key : keyArray) 33 | { 34 | map.put(key, key); 35 | } 36 | // Build an AhoCorasickDoubleArrayTrie 37 | AhoCorasickDoubleArrayTrie<String> acdat = new AhoCorasickDoubleArrayTrie<String>(); 38 | acdat.build(map); 39 | // Test it 40 | final String text = "uhers"; 41 | List<AhoCorasickDoubleArrayTrie<String>.Hit<String>> wordList = acdat.parseText(text); 42 | ``` 43 | 44 | Of course, there remain many useful methods to be discovered; feel free to try: 45 | * Use a `Map<String, SomeObject>` to assign a `SomeObject` as value to a keyword. 46 | * Store the `AhoCorasickDoubleArrayTrie` to disk by calling the `save` method. 47 | * Restore the `AhoCorasickDoubleArrayTrie` from disk by calling the `load` method. 48 | 49 | In other situations you probably do not need a huge wordList; in that case, please try this: 50 | 51 | ```java 52 | acdat.parseText(text, new AhoCorasickDoubleArrayTrie.IHit<String>() 53 | { 54 | @Override 55 | public void hit(int begin, int end, String value) 56 | { 57 | System.out.printf("[%d:%d]=%s\n", begin, end, value); 58 | } 59 | }); 60 | ``` 61 | 62 | or a lambda function 63 | ``` 64 | acdat.parseText(text, (begin, end, value) -> { 65 | System.out.printf("[%d:%d]=%s\n", begin, end, value); 66 | }); 67 | ``` 68 | 69 | Comparison 70 | ----- 71 | I compared my AhoCorasickDoubleArrayTrie with robert-bor's aho-corasick. ACDAT stands for AhoCorasickDoubleArrayTrie and Naive stands for aho-corasick. The result is: 72 | ``` 73 | Parsing English document which contains 3409283 characters, with a dictionary of 127142 words. 
74 | Naive ACDAT 75 | time 554 290 76 | char/s 6153940.43 11756148.28 77 | rate 1.00 1.91 78 | =========================================================================== 79 | Parsing Chinese document which contains 1290573 characters, with a dictionary of 146047 words. 80 | Naive ACDAT 81 | time 269 56 82 | char/s 4797669.14 23045946.43 83 | rate 1.00 4.80 84 | =========================================================================== 85 | ``` 86 | 87 | In English test, AhoCorasickDoubleArrayTrie is 1.91 times faster. When it comes to Chinese, AhoCorasickDoubleArrayTrie is 4.80 times faster. 88 | Feel free to re-run this test in TestAhoCorasickDoubleArrayTrie, the test data is ready for you. 89 | 90 | Thanks 91 | ----- 92 | This project is inspired by [aho-corasick](https://github.com/robert-bor/aho-corasick) and [darts-clone-java](https://github.com/hiroshi-manabe/darts-clone-java). 93 | Many thanks! 94 | 95 | License 96 | ------- 97 | Licensed under the Apache License, Version 2.0 (the "License"); 98 | you may not use this file except in compliance with the License. 99 | You may obtain a copy of the License at 100 | 101 | http://www.apache.org/licenses/LICENSE-2.0 102 | 103 | Unless required by applicable law or agreed to in writing, software 104 | distributed under the License is distributed on an "AS IS" BASIS, 105 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 106 | See the License for the specific language governing permissions and 107 | limitations under the License. 108 | -------------------------------------------------------------------------------- /src/main/java/com/hankcs/algorithm/State.java: -------------------------------------------------------------------------------- 1 | package com.hankcs.algorithm; 2 | 3 | import java.util.*; 4 | 5 | /** 6 | *

7 | * A state has the following functions 8 | *

9 | *

10 | *

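success; transitions successfully to another state
failure; if the string cannot be followed any further, jumps to a shallower node
emits; a pattern string has been hit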

15 | *

16 | *

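17 | * The root node is slightly different: it has no failure function; its “failure” means transitioning to the next state along the string path. All other nodes have a failure state. 18 | *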

19 | * 20 | * @author Robert Bor 21 | */ 22 | public class State 23 | { 24 | 25 | /** 26 | * Length of the pattern string, which is also the depth of this state 27 | */ 28 | protected final int depth; 29 | 30 | /** 31 | * The fail function: the state to jump to when there is no match. 32 | */ 33 | private State failure = null; 34 | 35 | /** 36 | * Pattern strings recorded for this state, i.e. emitted whenever this state is reachable (stays null until the first addEmit) 37 | */ 38 | private Set emits = null; 39 | /** 40 | * The goto table, also called the transition function: moves to the next state according to the next character of the string 41 | */ 42 | private Map success = new TreeMap(); 43 | 44 | /** 45 | * The corresponding index in the double array 46 | */ 47 | private int index; 48 | 49 | /** 50 | * Constructs a node of depth 0 51 | */ 52 | public State() 53 | { 54 | this(0); 55 | } 56 | 57 | /** 58 | * Constructs a node of the given depth 59 | * @param depth the depth of the node 60 | */ 61 | public State(int depth) 62 | { 63 | this.depth = depth; 64 | } 65 | 66 | /** 67 | * Gets the depth of this node 68 | * @return the depth 69 | */ 70 | public int getDepth() 71 | { 72 | return this.depth; 73 | } 74 | 75 | /** 76 | * Adds a matched pattern string (this state corresponds to that pattern string); the emits set is created on first use, ordered with Collections.reverseOrder() 77 | * @param keyword the index of the pattern string 78 | */ 79 | public void addEmit(int keyword) 80 | { 81 | if (this.emits == null) 82 | { 83 | this.emits = new TreeSet(Collections.reverseOrder()); 84 | } 85 | this.emits.add(keyword); 86 | } 87 | 88 | /** 89 | * Gets the largest value 90 | * @return the first element of the reverse-ordered emits set, or null when there are no emits 91 | */ 92 | public Integer getLargestValueId() 93 | { 94 | if (emits == null || emits.size() == 0) return null; 95 | 96 | return emits.iterator().next(); 97 | } 98 | 99 | /** 100 | * Adds several matched pattern strings 101 | * @param emits the indexes of the pattern strings to add 102 | */ 103 | public void addEmit(Collection emits) 104 | { 105 | for (int emit : emits) 106 | { 107 | addEmit(emit); 108 | } 109 | } 110 | 111 | /** 112 | * Gets the pattern string(s) this node represents 113 | * @return the emits, or an empty list when none have been added 114 | */ 115 | public Collection emit() 116 | { 117 | return this.emits == null ?
Collections.emptyList() : this.emits; 118 | } 119 | 120 | /** 121 | * Whether this is a terminal (accepting) state 122 | * @return true when depth is greater than 0 and emits is not null 123 | */ 124 | public boolean isAcceptable() 125 | { 126 | return this.depth > 0 && this.emits != null; 127 | } 128 | 129 | /** 130 | * Gets the failure state 131 | * @return the failure state, null if none has been set 132 | */ 133 | public State failure() 134 | { 135 | return this.failure; 136 | } 137 | 138 | /** 139 | * Sets the failure state and writes its index into the fail table at this state's own index 140 | * @param failState the failure state 141 | */ 142 | public void setFailure(State failState, int fail[]) 143 | { 144 | this.failure = failState; 145 | fail[index] = failState.index; 146 | } 147 | 148 | /** 149 | * Transitions to the next state 150 | * @param character the character to transition on 151 | * @param ignoreRootState whether to ignore the root node; should be true when the root node itself is the caller, otherwise false 152 | * @return the transition result 153 | */ 154 | private State nextState(Character character, boolean ignoreRootState) 155 | { 156 | State nextState = this.success.get(character); 157 | if (!ignoreRootState && nextState == null && this.depth == 0) 158 | { 159 | nextState = this; 160 | } 161 | return nextState; 162 | } 163 | 164 | /** 165 | * Transitions on character; a failed transition from the root node (depth 0) returns the root itself, so the root never returns null 166 | * @param character the character to transition on 167 | * @return the next state; null only for a non-root node without that transition 168 | */ 169 | public State nextState(Character character) 170 | { 171 | return nextState(character, false); 172 | } 173 | 174 | /** 175 | * Transitions on character; a failed transition returns null for any node 176 | * @param character the character to transition on 177 | * @return the next state, or null when there is no such transition 178 | */ 179 | public State nextStateIgnoreRootState(Character character) 180 | { 181 | return nextState(character, true); 182 | } 183 | 184 | /** Returns the child state reached by character, creating it with depth + 1 when it does not exist yet. */ public State addState(Character character) 185 | { 186 | State nextState = nextStateIgnoreRootState(character); 187 | if (nextState == null) 188 | { 189 | nextState = new State(this.depth + 1); 190 | this.success.put(character, nextState); 191 | } 192 | return nextState; 193 | } 194 | 195 | /** Returns the child states, i.e. the values of the goto table. */ public Collection getStates() 196 | { 197 | return this.success.values(); 198 | } 199 | 200 | /** Returns the transition characters, i.e. the keys of the goto table. */ public Collection getTransitions() 201 | { 202 | return this.success.keySet(); 203 | } 204 | /** Debug representation; it appends the failure state itself, so the failure chain is printed recursively. */ 205 | @Override 206 | public String toString() 207 | { 208 | final
StringBuilder sb = new StringBuilder("State{"); 209 | sb.append("depth=").append(depth); 210 | sb.append(", ID=").append(index); 211 | sb.append(", emits=").append(emits); 212 | sb.append(", success=").append(success.keySet()); 213 | sb.append(", failureID=").append(failure == null ? "-1" : failure.index); 214 | sb.append(", failure=").append(failure); 215 | sb.append('}'); 216 | return sb.toString(); 217 | } 218 | 219 | /** 220 | * Gets the goto table 221 | * @return the success map itself (not a copy) 222 | */ 223 | public Map getSuccess() 224 | { 225 | return success; 226 | } 227 | 228 | /** Returns this state's index in the double array. */ public int getIndex() 229 | { 230 | return index; 231 | } 232 | 233 | /** Sets this state's index in the double array. */ public void setIndex(int index) 234 | { 235 | this.index = index; 236 | } 237 | } 238 | -------------------------------------------------------------------------------- /src/test/java/TestAhoCorasickDoubleArrayTrie.java: -------------------------------------------------------------------------------- 1 | /* 2 | *

3 | * He Han 4 | * hankcs.cn@gmail.com 5 | * 2015/4/6 12:42 6 | * 7 | * 8 | * Copyright (c) 2003-2014, �Ϻ��ԭ��Ϣ�Ƽ��޹�˾. All Right Reserved, http://www.linrunsoft.com/ 9 | * This source is subject to the LinrunSpace License. Please contact �Ϻ��ԭ��Ϣ�Ƽ��޹�˾ to get more information. 10 | * 11 | */ 12 | 13 | import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie; 14 | import junit.framework.TestCase; 15 | import org.ahocorasick.trie.Trie; 16 | 17 | import java.io.BufferedReader; 18 | import java.io.IOException; 19 | import java.io.InputStreamReader; 20 | import java.util.*; 21 | 22 | /** 23 | * @author hankcs 24 | */ 25 | public class TestAhoCorasickDoubleArrayTrie extends TestCase 26 | { 27 | public void testBuildAndParseSimply() throws Exception 28 | { 29 | // Collect test data set 30 | TreeMap map = new TreeMap(); 31 | String[] keyArray = new String[] 32 | { 33 | "hers", 34 | "his", 35 | "she", 36 | "he" 37 | }; 38 | for (String key : keyArray) 39 | { 40 | map.put(key, key); 41 | } 42 | // Build an AhoCorasickDoubleArrayTrie 43 | AhoCorasickDoubleArrayTrie acdat = new AhoCorasickDoubleArrayTrie(); 44 | acdat.build(map); 45 | // Test it 46 | final String text = "uhers"; 47 | acdat.parseText(text, new AhoCorasickDoubleArrayTrie.IHit() 48 | { 49 | @Override 50 | public void hit(int begin, int end, String value) 51 | { 52 | System.out.printf("[%d:%d]=%s\n", begin, end, value); 53 | assertEquals(text.substring(begin, end), value); 54 | } 55 | }); 56 | List.Hit> wordList = acdat.parseText(text); 57 | System.out.println(wordList); 58 | } 59 | 60 | public void testBuildAndParseWithBigFile() throws Exception 61 | { 62 | // Load test data from disk 63 | Set dictionary = loadDictionary("cn/dictionary.txt"); 64 | final String text = loadText("cn/text.txt"); 65 | // You can use any type of Map to hold data 66 | Map map = new TreeMap(); 67 | // Map map = new HashMap(); 68 | // Map map = new LinkedHashMap(); 69 | for (String key : dictionary) 70 | { 71 | map.put(key, key); 72 | } 
73 | // Build an AhoCorasickDoubleArrayTrie 74 | AhoCorasickDoubleArrayTrie acdat = new AhoCorasickDoubleArrayTrie(); 75 | acdat.build(map); 76 | // Test it 77 | acdat.parseText(text, new AhoCorasickDoubleArrayTrie.IHit() 78 | { 79 | @Override 80 | public void hit(int begin, int end, String value) 81 | { 82 | assertEquals(text.substring(begin, end), value); 83 | } 84 | }); 85 | } 86 | 87 | private String loadText(String path) throws IOException 88 | { 89 | StringBuilder sbText = new StringBuilder(); 90 | BufferedReader br = new BufferedReader(new InputStreamReader(Thread.currentThread().getContextClassLoader().getResourceAsStream(path), "UTF-8")); 91 | String line; 92 | while ((line = br.readLine()) != null) 93 | { 94 | sbText.append(line).append("\n"); 95 | } 96 | br.close(); 97 | 98 | return sbText.toString(); 99 | } 100 | 101 | private Set loadDictionary(String path) throws IOException 102 | { 103 | Set dictionary = new TreeSet(); 104 | BufferedReader br = new BufferedReader(new InputStreamReader(Thread.currentThread().getContextClassLoader().getResourceAsStream(path), "UTF-8")); 105 | String line; 106 | while ((line = br.readLine()) != null) 107 | { 108 | dictionary.add(line); 109 | } 110 | br.close(); 111 | 112 | return dictionary; 113 | } 114 | 115 | private void runTest(String dictionaryPath, String textPath) throws IOException 116 | { 117 | Set dictionary = loadDictionary(dictionaryPath); 118 | String text = loadText(textPath); 119 | // Build a ahoCorasickNaive implemented by robert-bor 120 | Trie ahoCorasickNaive = new Trie(); 121 | for (String word : dictionary) 122 | { 123 | ahoCorasickNaive.addKeyword(word); 124 | } 125 | ahoCorasickNaive.parseText(""); // More fairly, robert-bor's implementation needs to call this to build ac automata. 
126 | // Build a AhoCorasickDoubleArrayTrie implemented by hankcs 127 | AhoCorasickDoubleArrayTrie ahoCorasickDoubleArrayTrie = new AhoCorasickDoubleArrayTrie(); 128 | TreeMap dictionaryMap = new TreeMap(); 129 | for (String word : dictionary) 130 | { 131 | dictionaryMap.put(word, word); // we use the same text as the property of a word 132 | } 133 | ahoCorasickDoubleArrayTrie.build(dictionaryMap); 134 | // Let's test the speed of the two Aho-Corasick automata 135 | System.out.printf("Parsing document which contains %d characters, with a dictionary of %d words.\n", text.length(), dictionary.size()); 136 | long start = System.currentTimeMillis(); 137 | ahoCorasickNaive.parseText(text); 138 | long costTimeNaive = System.currentTimeMillis() - start; 139 | start = System.currentTimeMillis(); 140 | ahoCorasickDoubleArrayTrie.parseText(text, new AhoCorasickDoubleArrayTrie.IHit() 141 | { 142 | @Override 143 | public void hit(int begin, int end, String value) 144 | { 145 | 146 | } 147 | }); 148 | long costTimeACDAT = System.currentTimeMillis() - start; 149 | System.out.printf("%-15s\t%-15s\t%-15s\n", "", "Naive", "ACDAT"); 150 | System.out.printf("%-15s\t%-15d\t%-15d\n", "time", costTimeNaive, costTimeACDAT); 151 | System.out.printf("%-15s\t%-15.2f\t%-15.2f\n", "char/s", (text.length() / (costTimeNaive / 1000.0)), (text.length() / (costTimeACDAT / 1000.0))); 152 | System.out.printf("%-15s\t%-15.2f\t%-15.2f\n", "rate", 1.0, costTimeNaive / (double) costTimeACDAT); 153 | System.out.println("==========================================================================="); 154 | } 155 | 156 | /** 157 | * Compare my AhoCorasickDoubleArrayTrie with robert-bor's aho-corasick, notice that robert-bor's aho-corasick is 158 | * compiled under jdk1.8, so you will need jdk1.8 to run this test
159 | * To avoid JVM wasting time on allocating memory, please use -Xms512m -Xmx512m -Xmn256m . 160 | * @throws Exception 161 | */ 162 | public void testBenchmark() throws Exception 163 | { 164 | runTest("en/dictionary.txt", "en/text.txt"); 165 | runTest("cn/dictionary.txt", "cn/text.txt"); 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /src/main/java/com/hankcs/algorithm/AhoCorasickDoubleArrayTrie.java: -------------------------------------------------------------------------------- 1 | package com.hankcs.algorithm; 2 | 3 | import java.io.IOException; 4 | import java.io.ObjectInputStream; 5 | import java.io.ObjectOutputStream; 6 | import java.util.*; 7 | import java.util.concurrent.LinkedBlockingDeque; 8 | 9 | /** 10 | * An implemention of Aho Corasick algorithm based on Double Array Trie 11 | * 12 | * @author hankcs 13 | */ 14 | public class AhoCorasickDoubleArrayTrie 15 | { 16 | /** 17 | * check array of the Double Array Trie structure 18 | */ 19 | protected int check[]; 20 | /** 21 | * base array of the Double Array Trie structure 22 | */ 23 | protected int base[]; 24 | /** 25 | * fail table of the Aho Corasick automata 26 | */ 27 | int fail[]; 28 | /** 29 | * out table of the Aho Corasick automata 30 | */ 31 | int[][] output; 32 | /** 33 | * outer value array 34 | */ 35 | protected V[] v; 36 | 37 | /** 38 | * the length of every key 39 | */ 40 | protected int[] l; 41 | 42 | /** 43 | * the size of base and check array 44 | */ 45 | protected int size; 46 | 47 | /** 48 | * Parse text 49 | * @param text The text 50 | * @return a list of outputs 51 | */ 52 | public List> parseText(String text) 53 | { 54 | int position = 1; 55 | int currentState = 0; 56 | List> collectedEmits = new LinkedList>(); 57 | for (int i = 0; i < text.length(); ++i) 58 | { 59 | currentState = getState(currentState, text.charAt(i)); 60 | storeEmits(position, currentState, collectedEmits); 61 | ++position; 62 | } 63 | 64 | return 
collectedEmits; 65 | } 66 | 67 | /** 68 | * Parse text 69 | * @param text The text 70 | * @param processor A processor which handles the output 71 | */ 72 | public void parseText(String text, IHit processor) 73 | { 74 | int position = 1; 75 | int currentState = 0; 76 | for (int i = 0; i < text.length(); ++i) 77 | { 78 | currentState = getState(currentState, text.charAt(i)); 79 | int[] hitArray = output[currentState]; 80 | if (hitArray != null) 81 | { 82 | for (int hit : hitArray) 83 | { 84 | processor.hit(position - l[hit], position, v[hit]); 85 | } 86 | } 87 | ++position; 88 | } 89 | } 90 | 91 | /** 92 | * Parse text 93 | * @param text The text 94 | * @param processor A processor which handles the output 95 | */ 96 | public void parseText(char[] text, IHit processor) 97 | { 98 | int position = 1; 99 | int currentState = 0; 100 | for (char c : text) 101 | { 102 | currentState = getState(currentState, c); 103 | int[] hitArray = output[currentState]; 104 | if (hitArray != null) 105 | { 106 | for (int hit : hitArray) 107 | { 108 | processor.hit(position - l[hit], position, v[hit]); 109 | } 110 | } 111 | ++position; 112 | } 113 | } 114 | 115 | /** 116 | * Parse text 117 | * @param text The text 118 | * @param processor A processor which handles the output 119 | */ 120 | public void parseText(char[] text, IHitFull processor) 121 | { 122 | int position = 1; 123 | int currentState = 0; 124 | for (char c : text) 125 | { 126 | currentState = getState(currentState, c); 127 | int[] hitArray = output[currentState]; 128 | if (hitArray != null) 129 | { 130 | for (int hit : hitArray) 131 | { 132 | processor.hit(position - l[hit], position, v[hit], hit); 133 | } 134 | } 135 | ++position; 136 | } 137 | } 138 | 139 | 140 | /** 141 | * Save 142 | * @param out An ObjectOutputStream object 143 | * @throws IOException Some IOException 144 | */ 145 | public void save(ObjectOutputStream out) throws IOException 146 | { 147 | out.writeObject(base); 148 | out.writeObject(check); 149 | 
out.writeObject(fail); 150 | out.writeObject(output); 151 | out.writeObject(l); 152 | out.writeObject(v); 153 | } 154 | 155 | /** 156 | * Load 157 | * @param in An ObjectInputStream object 158 | * @throws IOException 159 | * @throws ClassNotFoundException 160 | */ 161 | public void load(ObjectInputStream in) throws IOException, ClassNotFoundException 162 | { 163 | base = (int[]) in.readObject(); 164 | check = (int[]) in.readObject(); 165 | fail = (int[]) in.readObject(); 166 | output = (int[][]) in.readObject(); 167 | l = (int[]) in.readObject(); 168 | v = (V[]) in.readObject(); 169 | } 170 | 171 | /** 172 | * Get value by a String key, just like a map.get() method 173 | * @param key The key 174 | * @return 175 | */ 176 | public V get(String key) 177 | { 178 | int index = exactMatchSearch(key); 179 | if (index >= 0) 180 | { 181 | return v[index]; 182 | } 183 | 184 | return null; 185 | } 186 | 187 | /** 188 | * Pick the value by index in value array
189 | * Notice that to be more efficiently, this method DONOT check the parameter 190 | * @param index The index 191 | * @return The value 192 | */ 193 | public V get(int index) 194 | { 195 | return v[index]; 196 | } 197 | 198 | /** 199 | * Processor handles the output when hit a keyword 200 | */ 201 | public interface IHit 202 | { 203 | /** 204 | * Hit a keyword, you can use some code like text.substring(begin, end) to get the keyword 205 | * @param begin the beginning index, inclusive. 206 | * @param end the ending index, exclusive. 207 | * @param value the value assigned to the keyword 208 | */ 209 | void hit(int begin, int end, V value); 210 | } 211 | 212 | /** 213 | * Processor handles the output when hit a keyword, with more detail 214 | */ 215 | public interface IHitFull 216 | { 217 | /** 218 | * Hit a keyword, you can use some code like text.substring(begin, end) to get the keyword 219 | * @param begin the beginning index, inclusive. 220 | * @param end the ending index, exclusive. 221 | * @param value the value assigned to the keyword 222 | * @param index the index of the value assigned to the keyword, you can use the integer as a perfect hash value 223 | */ 224 | void hit(int begin, int end, V value, int index); 225 | } 226 | 227 | /** 228 | * A result output 229 | * 230 | * @param the value type 231 | */ 232 | public class Hit 233 | { 234 | /** 235 | * the beginning index, inclusive. 236 | */ 237 | public final int begin; 238 | /** 239 | * the ending index, exclusive. 
240 | */ 241 | public final int end; 242 | /** 243 | * the value assigned to the keyword 244 | */ 245 | public final V value; 246 | 247 | public Hit(int begin, int end, V value) 248 | { 249 | this.begin = begin; 250 | this.end = end; 251 | this.value = value; 252 | } 253 | 254 | @Override 255 | public String toString() 256 | { 257 | return String.format("[%d:%d]=%s", begin, end, value); 258 | } 259 | } 260 | 261 | /** 262 | * Transition to the next state; when the goto transition fails, follow the fail table until a transition succeeds 263 | * 264 | * @param currentState the state to start from 265 | * @param character the input character 266 | * @return the state reached (never -1: the loop only exits once transitionWithRoot returns a state) 267 | */ 268 | private int getState(int currentState, char character) 269 | { 270 | int newCurrentState = transitionWithRoot(currentState, character); // first try the success (goto) transition 271 | while (newCurrentState == -1) // if the transition failed, jump via the failure table 272 | { 273 | currentState = fail[currentState]; 274 | newCurrentState = transitionWithRoot(currentState, character); 275 | } 276 | return newCurrentState; 277 | } 278 | 279 | /** 280 | * Store output: adds one Hit to collectedEmits for every entry of output[currentState] 281 | * 282 | * @param position the end index given to each Hit; the begin index is position - l[hit] 283 | * @param currentState the state whose output is stored 284 | * @param collectedEmits the list the hits are appended to 285 | */ 286 | private void storeEmits(int position, int currentState, List> collectedEmits) 287 | { 288 | int[] hitArray = output[currentState]; 289 | if (hitArray != null) 290 | { 291 | for (int hit : hitArray) 292 | { 293 | collectedEmits.add(new Hit(position - l[hit], position, v[hit])); 294 | } 295 | } 296 | } 297 | 298 | /** 299 | * transition of a state 300 | * 301 | * @param current the value compared against check[p], where p = current + c + 1 302 | * @param c the character to transition on 303 | * @return base[p] when check[p] equals current, otherwise -1 304 | */ 305 | protected int transition(int current, char c) 306 | { 307 | int b = current; 308 | int p; 309 | 310 | p = b + c + 1; 311 | if (b == check[p]) 312 | b = base[p]; 313 | else 314 | return -1; 315 | 316 | p = b; 317 | return p; 318 | } 319 | 320 | /** 321 | * transition of a state, if the state is root and it failed, then returns the root 322 | * 323 | * @param nodePos 324 | * @param c 325 | * @return 326 | */ 327 | protected int transitionWithRoot(int nodePos, char c) 328 | { 329 | int b =
base[nodePos]; 330 | int p; 331 | 332 | p = b + c + 1; 333 | if (b != check[p]) 334 | { 335 | if (nodePos == 0) return 0; 336 | return -1; 337 | } 338 | 339 | return p; 340 | } 341 | 342 | 343 | /** 344 | * Build a AhoCorasickDoubleArrayTrie from a map 345 | * @param map a map containing key-value pairs 346 | */ 347 | public void build(Map map) 348 | { 349 | new Builder().build(map); 350 | } 351 | 352 | 353 | /** 354 | * match exactly by a key 355 | * 356 | * @param key the key 357 | * @return the index of the key, you can use it as a perfect hash function 358 | */ 359 | public int exactMatchSearch(String key) 360 | { 361 | return exactMatchSearch(key, 0, 0, 0); 362 | } 363 | 364 | /** 365 | * match exactly by a key 366 | * 367 | * @param key 368 | * @param pos 369 | * @param len 370 | * @param nodePos 371 | * @return 372 | */ 373 | private int exactMatchSearch(String key, int pos, int len, int nodePos) 374 | { 375 | if (len <= 0) 376 | len = key.length(); 377 | if (nodePos <= 0) 378 | nodePos = 0; 379 | 380 | int result = -1; 381 | 382 | char[] keyChars = key.toCharArray(); 383 | 384 | int b = base[nodePos]; 385 | int p; 386 | 387 | for (int i = pos; i < len; i++) 388 | { 389 | p = b + (int) (keyChars[i]) + 1; 390 | if (b == check[p]) 391 | b = base[p]; 392 | else 393 | return result; 394 | } 395 | 396 | p = b; 397 | int n = base[p]; 398 | if (b == check[p] && n < 0) 399 | { 400 | result = -n - 1; 401 | } 402 | return result; 403 | } 404 | 405 | /** 406 | * match exactly by a key 407 | * 408 | * @param keyChars the char array of the key 409 | * @param pos the begin index of char array 410 | * @param len the length of the key 411 | * @param nodePos the starting position of the node for searching 412 | * @return the value index of the key, minus indicates null 413 | */ 414 | private int exactMatchSearch(char[] keyChars, int pos, int len, int nodePos) 415 | { 416 | int result = -1; 417 | 418 | int b = base[nodePos]; 419 | int p; 420 | 421 | for (int i = pos; i < len; 
i++) 422 | { 423 | p = b + (int) (keyChars[i]) + 1; 424 | if (b == check[p]) 425 | b = base[p]; 426 | else 427 | return result; 428 | } 429 | 430 | p = b; 431 | int n = base[p]; 432 | if (b == check[p] && n < 0) 433 | { 434 | result = -n - 1; 435 | } 436 | return result; 437 | } 438 | 439 | // /** 440 | // * Just for debug when I wrote it 441 | // */ 442 | // public void debug() 443 | // { 444 | // System.out.println("base:"); 445 | // for (int i = 0; i < base.length; i++) 446 | // { 447 | // if (base[i] < 0) 448 | // { 449 | // System.out.println(i + " : " + -base[i]); 450 | // } 451 | // } 452 | // 453 | // System.out.println("output:"); 454 | // for (int i = 0; i < output.length; i++) 455 | // { 456 | // if (output[i] != null) 457 | // { 458 | // System.out.println(i + " : " + Arrays.toString(output[i])); 459 | // } 460 | // } 461 | // 462 | // System.out.println("fail:"); 463 | // for (int i = 0; i < fail.length; i++) 464 | // { 465 | // if (fail[i] != 0) 466 | // { 467 | // System.out.println(i + " : " + fail[i]); 468 | // } 469 | // } 470 | // 471 | // System.out.println(this); 472 | // } 473 | // 474 | // @Override 475 | // public String toString() 476 | // { 477 | // String infoIndex = "i = "; 478 | // String infoChar = "char = "; 479 | // String infoBase = "base = "; 480 | // String infoCheck = "check= "; 481 | // for (int i = 0; i < Math.min(base.length, 200); ++i) 482 | // { 483 | // if (base[i] != 0 || check[i] != 0) 484 | // { 485 | // infoChar += " " + (i == check[i] ? 
" ×" : (char) (i - check[i] - 1)); 486 | // infoIndex += " " + String.format("%5d", i); 487 | // infoBase += " " + String.format("%5d", base[i]); 488 | // infoCheck += " " + String.format("%5d", check[i]); 489 | // } 490 | // } 491 | // return "DoubleArrayTrie：" + 492 | // "\n" + infoChar + 493 | // "\n" + infoIndex + 494 | // "\n" + infoBase + 495 | // "\n" + infoCheck + "\n" + 496 | //// "check=" + Arrays.toString(check) + 497 | //// ", base=" + Arrays.toString(base) + 498 | //// ", used=" + Arrays.toString(used) + 499 | // "size=" + size 500 | //// ", length=" + Arrays.toString(length) + 501 | //// ", value=" + Arrays.toString(value) + 502 | // ; 503 | // } 504 | // 505 | // /** 506 | // * 一个顺序输出变量名与变量值的调试类 507 | // */ 508 | // private static class DebugArray 509 | // { 510 | // Map nameValueMap = new LinkedHashMap(); 511 | // 512 | // public void add(String name, int value) 513 | // { 514 | // String valueInMap = nameValueMap.get(name); 515 | // if (valueInMap == null) 516 | // { 517 | // valueInMap = ""; 518 | // } 519 | // 520 | // valueInMap += " " + String.format("%5d", value); 521 | // 522 | // nameValueMap.put(name, valueInMap); 523 | // } 524 | // 525 | // @Override 526 | // public String toString() 527 | // { 528 | // String text = ""; 529 | // for (Map.Entry entry : nameValueMap.entrySet()) 530 | // { 531 | // String name = entry.getKey(); 532 | // String value = entry.getValue(); 533 | // text += String.format("%-5s", name) + "= " + value + '\n'; 534 | // } 535 | // 536 | // return text; 537 | // } 538 | // 539 | // public void println() 540 | // { 541 | // System.out.print(this); 542 | // } 543 | // } 544 | 545 | /** 546 | * Get the size of the keywords 547 | * @return 548 | */ 549 | public int size() 550 | { 551 | return v.length; 552 | } 553 | 554 | /** 555 | * A builder to build the AhoCorasickDoubleArrayTrie 556 | */ 557 | private class Builder 558 | { 559 | /** 560 | * the root state of trie 561 | */ 562 | private State rootState = new State(); 
563 | /** 564 | * whether the position has been used 565 | */ 566 | private boolean used[]; 567 | /** 568 | * the allocSize of the dynamic array 569 | */ 570 | private int allocSize; 571 | /** 572 | * a parameter controls the memory growth speed of the dynamic array 573 | */ 574 | private int progress; 575 | /** 576 | * the next position to check unused memory 577 | */ 578 | private int nextCheckPos; 579 | /** 580 | * the size of the key-pair sets 581 | */ 582 | private int keySize; 583 | 584 | /** 585 | * Build from a map: stores the values in v, allocates l with one slot per value, adds every key to the trie, builds the double array, then the failure and output tables 586 | * @param map a map containing key-value pairs 587 | */ 588 | @SuppressWarnings("unchecked") 589 | public void build(Map map) 590 | { 591 | // save the values 592 | v = (V[]) map.values().toArray(); 593 | l = new int[v.length]; 594 | Set keySet = map.keySet(); 595 | // build the ordinary (binary-search) trie 596 | addAllKeyword(keySet); 597 | // build the double array trie on top of that trie 598 | buildDoubleArrayTrie(keySet.size()); 599 | used = null; 600 | // build the failure table and merge the output table 601 | constructFailureStates(); 602 | rootState = null; 603 | loseWeight(); 604 | } 605 | 606 | /** 607 | * fetch siblings of a parent node 608 | * 609 | * @param parent parent node 610 | * @param siblings parent node's child nodes, i . e . 
the siblings 611 | * @return the amount of the siblings 612 | */ 613 | private int fetch(State parent, List> siblings) 614 | { 615 | if (parent.isAcceptable()) 616 | { 617 | State fakeNode = new State(-(parent.getDepth() + 1)); // 此节点是parent的子节点，同时具备parent的输出 618 | fakeNode.addEmit(parent.getLargestValueId()); 619 | siblings.add(new AbstractMap.SimpleEntry(0, fakeNode)); 620 | } 621 | for (Map.Entry entry : parent.getSuccess().entrySet()) 622 | { 623 | siblings.add(new AbstractMap.SimpleEntry(entry.getKey() + 1, entry.getValue())); 624 | } 625 | return siblings.size(); 626 | } 627 | 628 | /** 629 | * add a keyword 630 | * 631 | * @param keyword a keyword 632 | * @param index the index of the keyword 633 | */ 634 | private void addKeyword(String keyword, int index) 635 | { 636 | State currentState = this.rootState; 637 | for (Character character : keyword.toCharArray()) 638 | { 639 | currentState = currentState.addState(character); 640 | } 641 | currentState.addEmit(index); 642 | l[index] = keyword.length(); 643 | } 644 | 645 | /** 646 | * add a collection of keywords 647 | * 648 | * @param keywordSet the collection holding keywords 649 | */ 650 | private void addAllKeyword(Collection keywordSet) 651 | { 652 | int i = 0; 653 | for (String keyword : keywordSet) 654 | { 655 | addKeyword(keyword, i++); 656 | } 657 | } 658 | 659 | /** 660 | * construct failure table 661 | */ 662 | private void constructFailureStates() 663 | { 664 | fail = new int[size + 1]; 665 | fail[1] = base[0]; 666 | output = new int[size + 1][]; 667 | Queue queue = new LinkedBlockingDeque(); 668 | 669 | // 第一步，将深度为1的节点的failure设为根节点 670 | for (State depthOneState : this.rootState.getStates()) 671 | { 672 | depthOneState.setFailure(this.rootState, fail); 673 | queue.add(depthOneState); 674 | constructOutput(depthOneState); 675 | } 676 | 677 | // 第二步，为深度 > 1 的节点建立failure表，这是一个bfs 678 | while (!queue.isEmpty()) 679 | { 680 | State currentState = queue.remove(); 681 | 682 | for (Character transition : 
currentState.getTransitions()) 683 | { 684 | State targetState = currentState.nextState(transition); 685 | queue.add(targetState); 686 | 687 | State traceFailureState = currentState.failure(); 688 | while (traceFailureState.nextState(transition) == null) 689 | { 690 | traceFailureState = traceFailureState.failure(); 691 | } 692 | State newFailureState = traceFailureState.nextState(transition); 693 | targetState.setFailure(newFailureState, fail); 694 | targetState.addEmit(newFailureState.emit()); 695 | constructOutput(targetState); 696 | } 697 | } 698 | } 699 | 700 | /** 701 | * construct output table 702 | */ 703 | private void constructOutput(State targetState) 704 | { 705 | Collection emit = targetState.emit(); 706 | if (emit == null || emit.size() == 0) return; 707 | int output[] = new int[emit.size()]; 708 | Iterator it = emit.iterator(); 709 | for (int i = 0; i < output.length; ++i) 710 | { 711 | output[i] = it.next(); 712 | } 713 | AhoCorasickDoubleArrayTrie.this.output[targetState.getIndex()] = output; 714 | } 715 | 716 | private void buildDoubleArrayTrie(int keySize) 717 | { 718 | progress = 0; 719 | this.keySize = keySize; 720 | resize(65536 * 32); // 32个双字节 721 | 722 | base[0] = 1; 723 | nextCheckPos = 0; 724 | 725 | State root_node = this.rootState; 726 | 727 | List> siblings = new ArrayList>(root_node.getSuccess().entrySet().size()); 728 | fetch(root_node, siblings); 729 | insert(siblings); 730 | } 731 | 732 | /** 733 | * allocate the memory of the dynamic array 734 | * 735 | * @param newSize 736 | * @return 737 | */ 738 | private int resize(int newSize) 739 | { 740 | int[] base2 = new int[newSize]; 741 | int[] check2 = new int[newSize]; 742 | boolean used2[] = new boolean[newSize]; 743 | if (allocSize > 0) 744 | { 745 | System.arraycopy(base, 0, base2, 0, allocSize); 746 | System.arraycopy(check, 0, check2, 0, allocSize); 747 | System.arraycopy(used, 0, used2, 0, allocSize); 748 | } 749 | 750 | base = base2; 751 | check = check2; 752 | used = used2; 
753 | 754 | return allocSize = newSize; 755 | } 756 | 757 | /** 758 | * insert the siblings to double array trie 759 | * 760 | * @param siblings the siblings being inserted 761 | * @return the position to insert them 762 | */ 763 | private int insert(List> siblings) 764 | { 765 | int begin = 0; 766 | int pos = Math.max(siblings.get(0).getKey() + 1, nextCheckPos) - 1; 767 | int nonzero_num = 0; 768 | int first = 0; 769 | 770 | if (allocSize <= pos) 771 | resize(pos + 1); 772 | 773 | outer: 774 | // 此循环体的目标是找出满足base[begin + a1...an] == 0的n个空闲空间,a1...an是siblings中的n个节点 775 | while (true) 776 | { 777 | pos++; 778 | 779 | if (allocSize <= pos) 780 | resize(pos + 1); 781 | 782 | if (check[pos] != 0) 783 | { 784 | nonzero_num++; 785 | continue; 786 | } 787 | else if (first == 0) 788 | { 789 | nextCheckPos = pos; 790 | first = 1; 791 | } 792 | 793 | begin = pos - siblings.get(0).getKey(); // 当前位置离第一个兄弟节点的距离 794 | if (allocSize <= (begin + siblings.get(siblings.size() - 1).getKey())) 795 | { 796 | // progress can be zero // 防止progress产生除零错误 797 | double l = (1.05 > 1.0 * keySize / (progress + 1)) ? 1.05 : 1.0 * keySize / (progress + 1); 798 | resize((int) (allocSize * l)); 799 | } 800 | 801 | if (used[begin]) 802 | continue; 803 | 804 | for (int i = 1; i < siblings.size(); i++) 805 | if (check[begin + siblings.get(i).getKey()] != 0) 806 | continue outer; 807 | 808 | break; 809 | } 810 | 811 | // -- Simple heuristics -- 812 | // if the percentage of non-empty contents in check between the 813 | // index 814 | // 'next_check_pos' and 'check' is greater than some constant value 815 | // (e.g. 0.9), 816 | // new 'next_check_pos' index is written by 'check'. 817 | if (1.0 * nonzero_num / (pos - nextCheckPos + 1) >= 0.95) 818 | nextCheckPos = pos; // 从位置 next_check_pos 开始到 pos 间，如果已占用的空间在95%以上，下次插入节点时，直接从 pos 位置处开始查找 819 | used[begin] = true; 820 | 821 | size = (size > begin + siblings.get(siblings.size() - 1).getKey() + 1) ? 
size : begin + siblings.get(siblings.size() - 1).getKey() + 1; 822 | 823 | for (Map.Entry sibling : siblings) 824 | { 825 | check[begin + sibling.getKey()] = begin; 826 | } 827 | 828 | for (Map.Entry sibling : siblings) 829 | { 830 | List> new_siblings = new ArrayList>(sibling.getValue().getSuccess().entrySet().size() + 1); 831 | 832 | if (fetch(sibling.getValue(), new_siblings) == 0) // 一个词的终止且不为其他词的前缀，其实就是叶子节点 833 | { 834 | base[begin + sibling.getKey()] = (-sibling.getValue().getLargestValueId() - 1); 835 | progress++; 836 | } 837 | else 838 | { 839 | int h = insert(new_siblings); // dfs 840 | base[begin + sibling.getKey()] = h; 841 | } 842 | sibling.getValue().setIndex(begin + sibling.getKey()); 843 | } 844 | return begin; 845 | } 846 | 847 | /** 848 | * free the unnecessary memory 849 | */ 850 | private void loseWeight() 851 | { 852 | int nbase[] = new int[size + 65535]; 853 | System.arraycopy(base, 0, nbase, 0, size); 854 | base = nbase; 855 | 856 | int ncheck[] = new int[size + 65535]; 857 | System.arraycopy(check, 0, ncheck, 0, size); 858 | check = ncheck; 859 | } 860 | } 861 | } 862 | --------------------------------------------------------------------------------