(SIZE);
115 | int i = 0;
116 | for (State state : success)
117 | {
118 | if (state != null) stateList.add((char) i);
119 | ++i;
120 | }
121 | return stateList;
122 | }
123 | }
124 |
--------------------------------------------------------------------------------
/src/main/java/pers/hai/framework/ahocorasick/trie/Emit.java:
--------------------------------------------------------------------------------
1 | package pers.hai.framework.ahocorasick.trie;
2 |
3 | import pers.hai.framework.ahocorasick.interval.Interval;
4 | import pers.hai.framework.ahocorasick.interval.Intervalable;
5 |
6 | /**
7 | * A single keyword match result: the interval of the hit in the text plus the keyword that was hit.
8 | */
9 | public class Emit extends Interval implements Intervalable
10 | {
11 | /**
12 | * The keyword (pattern string) that was matched.
13 | */
14 | private final String keyword;
15 |
16 | /**
17 | * Creates a keyword match result; start and end are handed to the Interval constructor.
18 | * @param start start position of the match
19 | * @param end end position of the match (Trie.storeEmits passes the index of the last matched character)
20 | * @param keyword the matched keyword
21 | */
22 | public Emit(final int start, final int end, final String keyword)
23 | {
24 | super(start, end);
25 | this.keyword = keyword;
26 | }
27 |
28 | /**
29 | * Returns the keyword this match corresponds to.
30 | * @return the matched keyword
31 | */
32 | public String getKeyword()
33 | {
34 | return this.keyword;
35 | }
36 | // Renders as the superclass string, then "=", then the keyword.
37 | @Override
38 | public String toString()
39 | {
40 | return super.toString() + "=" + this.keyword;
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/src/main/java/pers/hai/framework/ahocorasick/trie/FragmentToken.java:
--------------------------------------------------------------------------------
1 | package pers.hai.framework.ahocorasick.trie;
2 |
3 | public class FragmentToken extends Token // Token for a piece of text that is not a keyword match.
4 | {
5 | // Created by Trie.createFragment for the text between, or after, keyword matches.
6 | public FragmentToken(String fragment)
7 | {
8 | super(fragment);
9 | }
10 |
11 | @Override
12 | public boolean isMatch()
13 | {
14 | return false; // a fragment never represents a match
15 | }
16 |
17 | @Override
18 | public Emit getEmit()
19 | {
20 | return null; // a fragment has no Emit; callers must handle null
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/src/main/java/pers/hai/framework/ahocorasick/trie/MatchToken.java:
--------------------------------------------------------------------------------
1 | package pers.hai.framework.ahocorasick.trie;
2 |
3 | /**
4 | * A token for a piece of text that matched a keyword; it carries the Emit describing the match.
5 | */
6 | public class MatchToken extends Token
7 | {
8 | // The match result this token was built from.
9 | private Emit emit;
10 | // fragment is the matched text, emit the corresponding match result (see Trie.createMatch).
11 | public MatchToken(String fragment, Emit emit)
12 | {
13 | super(fragment);
14 | this.emit = emit;
15 | }
16 |
17 | @Override
18 | public boolean isMatch()
19 | {
20 | return true; // a MatchToken always represents a match
21 | }
22 |
23 | @Override
24 | public Emit getEmit()
25 | {
26 | return this.emit; // the Emit supplied to the constructor
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/src/main/java/pers/hai/framework/ahocorasick/trie/State.java:
--------------------------------------------------------------------------------
1 | package pers.hai.framework.ahocorasick.trie;
2 |
3 | import java.util.*;
4 |
5 | /**
6 | *
7 | * A state provides the following functions:
8 | *
9 | *
10 | *
11 | * - success; a successful transition to another state
12 | * - failure; when the input cannot be followed along the string, jump to a shallower node
13 | * - emits; a keyword has been hit
14 | *
15 | *
16 | *
17 | * The root node is slightly different: it has no failure function; its "failure" means transitioning to the next state along the string path. All other nodes have a failure state.
18 | *
19 | *
20 | * @author Robert Bor
21 | */
22 | public abstract class State {
23 |
24 | /**
25 | * Length of the keyword prefix this state stands for, which is also the depth of this state.
26 | */
27 | protected final int depth;
28 |
29 | /**
30 | * Only set on the root node (depth 0), where it refers to the node itself; null on every other node.
31 | */
32 | protected final State rootState;
33 |
34 | /**
35 | * The fail function: the state to jump to when no transition matches. Stays null until setFailure is called.
36 | */
37 | private State failure = null;
38 |
39 | /**
40 | * Keywords recorded for this state; created lazily by addEmit. NOTE(review): the element type argument seems to be missing in this listing (usage below iterates Strings) - confirm.
41 | */
42 | private Set emits = null;
43 |
44 | /**
45 | * Creates a node of depth 0 (a root node).
46 | */
47 | public State()
48 | {
49 | this(0);
50 | }
51 |
52 | /**
53 | * Creates a node of the given depth; a depth of 0 makes the node its own rootState.
54 | * @param depth depth of the new node
55 | */
56 | public State(int depth)
57 | {
58 | this.depth = depth;
59 | this.rootState = depth == 0 ? this : null;
60 | }
61 |
62 | /**
63 | * Returns the depth of this node.
64 | * @return the depth given at construction
65 | */
66 | public int getDepth()
67 | {
68 | return this.depth;
69 | }
70 |
71 | /**
72 | * Adds a matched keyword (this state corresponds to that keyword). The backing TreeSet is created on first use.
73 | * @param keyword the keyword to record
74 | */
75 | public void addEmit(String keyword)
76 | {
77 | if (this.emits == null)
78 | {
79 | this.emits = new TreeSet();
80 | }
81 | this.emits.add(keyword);
82 | }
83 |
84 | /**
85 | * Adds several matched keywords, one at a time via addEmit(String).
86 | * @param emits the keywords to record
87 | */
88 | public void addEmit(Collection emits)
89 | {
90 | for (String emit : emits)
91 | {
92 | addEmit(emit);
93 | }
94 | }
95 |
96 | /**
97 | * Returns the keyword(s) this node stands for.
98 | * @return the recorded keywords, or an empty list when none were added (never null)
99 | */
100 | public Collection emit()
101 | {
102 | return this.emits == null ? Collections.emptyList() : this.emits;
103 | }
104 |
105 | /**
106 | * Returns the failure state.
107 | * @return the state set through setFailure, or null if none was set
108 | */
109 | public State failure()
110 | {
111 | return this.failure;
112 | }
113 |
114 | /**
115 | * Sets the failure state.
116 | * @param failState the state to fall back to
117 | */
118 | public void setFailure(State failState)
119 | {
120 | this.failure = failState;
121 | }
122 |
123 | /**
124 | * Transitions to the next state (based on the success table).
125 | * @param character the character to transition on
126 | * @return the resulting state
127 | */
128 | public abstract State nextState(Character character);
129 |
130 | /**
131 | * Transitions to the next state, ignoring the root node (in UnicodeState: no fallback to the root, so the result may be null).
132 | * @param character the character to transition on
133 | * @return the resulting state
134 | */
135 | public abstract State nextStateIgnoreRootState(Character character);
136 |
137 | /**
138 | * Adds a state to the success function.
139 | * @param character the character leading to the state
140 | * @return the state reached on that character (UnicodeState reuses an existing one, otherwise creates it)
141 | */
142 | public abstract State addState(Character character);
143 |
144 | /**
145 | * Returns the success states.
146 | * @return the states directly reachable from this one
147 | */
148 | public abstract Collection getStates();
149 |
150 | /**
151 | * Returns the characters on which this state can transition to a next state.
152 | * @return the transition characters
153 | */
154 | public abstract Collection getTransitions();
155 | }
156 |
--------------------------------------------------------------------------------
/src/main/java/pers/hai/framework/ahocorasick/trie/Token.java:
--------------------------------------------------------------------------------
1 | package pers.hai.framework.ahocorasick.trie;
2 |
3 | /**
4 | * A piece of text produced by the tokenizer; subclasses say whether it is a keyword match.
5 | */
6 | public abstract class Token
7 | {
8 | /**
9 | * The text of this piece.
10 | */
11 | private String fragment;
12 | // Stores the given text as this token's fragment.
13 | public Token(String fragment)
14 | {
15 | this.fragment = fragment;
16 | }
17 | // Returns the text given at construction.
18 | public String getFragment()
19 | {
20 | return this.fragment;
21 | }
22 | // True for MatchToken, false for FragmentToken.
23 | public abstract boolean isMatch();
24 | // The match behind this token; FragmentToken returns null.
25 | public abstract Emit getEmit();
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/src/main/java/pers/hai/framework/ahocorasick/trie/Trie.java:
--------------------------------------------------------------------------------
1 | package pers.hai.framework.ahocorasick.trie;
2 |
3 | import pers.hai.framework.ahocorasick.interval.IntervalTree;
4 | import pers.hai.framework.ahocorasick.interval.Intervalable;
5 |
6 | import java.util.ArrayList;
7 | import java.util.Collection;
8 | import java.util.List;
9 | import java.util.Queue;
10 | import java.util.concurrent.LinkedBlockingDeque;
11 |
12 | /**
13 | * Based on the Aho-Corasick white paper, Bell Laboratories:
14 | * ftp://163.13.200.222/assistant/bearhero/prog/%A8%E4%A5%A6/ac_bm.pdf
15 | *
16 | * @author Robert Bor
17 | */
18 | public class Trie {
19 | // Matching options (overlaps, whole words, case-insensitivity).
20 | private TrieConfig trieConfig;
21 | // Root of the trie; AsciiState or UnicodeState depending on the constructor.
22 | private State rootState;
23 |
24 | /**
25 | * Whether the failure table has been built.
26 | */
27 | private boolean failureStatesConstructed = false;
28 |
29 | /**
30 | * Creates a trie with the given configuration; delegates with ascii = true.
31 | *
32 | * @param trieConfig the matching options to use
33 | */
34 | public Trie(TrieConfig trieConfig) {
35 | this(trieConfig, true);
36 | }
37 | // Creates a trie with the given configuration and root-state flavour.
38 | public Trie(TrieConfig trieConfig, boolean ascii) {
39 | this.trieConfig = trieConfig;
40 | if (ascii) { // AsciiState root when ascii is true, UnicodeState root otherwise
41 | this.rootState = new AsciiState();
42 | } else {
43 | this.rootState = new UnicodeState();
44 | }
45 | }
46 |
47 | /**
48 | * Creates a trie with the default configuration.
49 | */
50 | public Trie() {
51 | this(new TrieConfig());
52 | }
53 |
54 | /**
55 | * Creates a trie with the default configuration.
56 | *
57 | * @param ascii
58 | * whether this is an ASCII trie (per the original note: true is optimised for ASCII, otherwise Unicode is supported)
59 | */
60 | public Trie(boolean ascii) {
61 | this(new TrieConfig(), ascii);
62 | }
63 |
64 | /**
65 | * Turns on case-insensitive matching. NOTE(review): parseText lower-cases the text, but addKeyword stores keywords unchanged, so keywords containing upper-case letters presumably never match - confirm.
66 | *
67 | * @return this trie, for chaining
68 | */
69 | public Trie caseInsensitive() {
70 | this.trieConfig.setCaseInsensitive(true);
71 | return this;
72 | }
73 |
74 | /**
75 | * Disallows keyword matches that overlap each other in position.
76 | *
77 | * @return this trie, for chaining
78 | */
79 | public Trie removeOverlaps() {
80 | this.trieConfig.setAllowOverlaps(false);
81 | return this;
82 | }
83 | // Restricts results to whole-word matches; returns this trie for chaining.
84 | public Trie onlyWholeWords() {
85 | this.trieConfig.setOnlyWholeWords(true);
86 | return this;
87 | }
88 |
89 | /**
90 | * Adds a keyword; null and empty keywords are ignored. NOTE(review): failureStatesConstructed is not reset here, so keywords added after the first parseText call presumably get no failure links - confirm.
91 | *
92 | * @param keyword the keyword to add
93 | */
94 | public void addKeyword(String keyword) {
95 | if (keyword == null || keyword.length() == 0) {
96 | return;
97 | }
98 | State currentState = this.rootState;
99 | for (Character character : keyword.toCharArray()) {
100 | currentState = currentState.addState(character);
101 | }
102 | currentState.addEmit(keyword);
103 | }
104 |
105 | /**
106 | * A tokenizer: splits the text into MatchToken pieces (the emits from parseText) and FragmentToken pieces (the text before, between and after them).
107 | *
108 | * @param text
109 | * the text to tokenize
110 | * @return the tokens, in the order parseText returned the emits
111 | */
112 | public Collection tokenize(String text) {
113 |
114 | Collection tokens = new ArrayList();
115 |
116 | Collection collectedEmits = parseText(text);
117 | int lastCollectedPosition = -1;
118 | for (Emit emit : collectedEmits) {
119 | if (emit.getStart() - lastCollectedPosition > 1) { // unmatched text lies before this emit
120 | tokens.add(createFragment(emit, text, lastCollectedPosition));
121 | }
122 | tokens.add(createMatch(emit, text));
123 | lastCollectedPosition = emit.getEnd();
124 | }
125 | if (text.length() - lastCollectedPosition > 1) { // unmatched text remains after the last emit
126 | tokens.add(createFragment(null, text, lastCollectedPosition));
127 | }
128 |
129 | return tokens;
130 | }
131 | // Fragment from just after lastCollectedPosition up to the emit's start, or to the end of the text when emit is null.
132 | private Token createFragment(Emit emit, String text,
133 | int lastCollectedPosition) {
134 | return new FragmentToken(text.substring(lastCollectedPosition + 1,
135 | emit == null ? text.length() : emit.getStart()));
136 | }
137 | // Match token covering the emit's start..end (end inclusive, hence the + 1).
138 | private Token createMatch(Emit emit, String text) {
139 | return new MatchToken(
140 | text.substring(emit.getStart(), emit.getEnd() + 1), emit);
141 | }
142 |
143 | /**
144 | * Pattern matching: walks the text through the automaton and collects every keyword hit, then applies the whole-word and overlap options.
145 | *
146 | * @param text
147 | * the text to search
148 | * @return the keyword matches found
149 | */
150 | @SuppressWarnings("unchecked")
151 | public Collection parseText(String text) {
152 | checkForConstructedFailureStates(); // builds the failure table on first use
153 |
154 | int position = 0;
155 | State currentState = this.rootState;
156 | List collectedEmits = new ArrayList();
157 | for (Character character : text.toCharArray()) {
158 | if (trieConfig.isCaseInsensitive()) {
159 | character = Character.toLowerCase(character);
160 | }
161 | currentState = getState(currentState, character);
162 | storeEmits(position, currentState, collectedEmits);
163 | ++position;
164 | }
165 |
166 | if (trieConfig.isOnlyWholeWords()) {
167 | removePartialMatches(text, collectedEmits);
168 | }
169 |
170 | if (!trieConfig.isAllowOverlaps()) {
171 | IntervalTree intervalTree = new IntervalTree(
172 | (List) (List>) collectedEmits); // NOTE(review): "(List>)" is not valid Java; generic type arguments appear to have been lost in this listing - confirm against the repository
173 | intervalTree.removeOverlaps(
174 | (List) (List>) collectedEmits); // return value unused; presumably removeOverlaps edits the list in place (IntervalTree not shown) - confirm
175 | }
176 |
177 | return collectedEmits;
178 | }
179 |
180 | /**
181 | * Removes partial-word matches: an emit is kept only if the character before its start and the character after its end are each absent (text boundary) or not alphabetic.
182 | *
183 | * @param searchText the text that was searched
184 | * @param collectedEmits the matches to filter; modified in place
185 | */
186 | private void removePartialMatches(String searchText,
187 | List collectedEmits) {
188 | long size = searchText.length();
189 | List removeEmits = new ArrayList();
190 | for (Emit emit : collectedEmits) {
191 | if ((emit.getStart() == 0 || !Character
192 | .isAlphabetic(searchText.charAt(emit.getStart() - 1)))
193 | && (emit.getEnd() + 1 == size || !Character.isAlphabetic(
194 | searchText.charAt(emit.getEnd() + 1)))) {
195 | continue;
196 | }
197 | removeEmits.add(emit);
198 | }
199 |
200 | for (Emit removeEmit : removeEmits) {
201 | collectedEmits.remove(removeEmit);
202 | }
203 | }
204 |
205 | /**
206 | * Transitions to the next state. The loop relies on some state along the failure chain accepting the character (UnicodeState's root falls back to itself; presumably AsciiState does too - confirm).
207 | *
208 | * @param currentState
209 | * the current state
210 | * @param character
211 | * the character to consume
212 | * @return the state reached
213 | */
214 | private static State getState(State currentState, Character character) {
215 | State newCurrentState = currentState.nextState(character); // first try the success transition
216 | while (newCurrentState == null) // if that fails, follow the failure links
217 | {
218 | currentState = currentState.failure();
219 | newCurrentState = currentState.nextState(character);
220 | }
221 | return newCurrentState;
222 | }
223 |
224 | /**
225 | * Checks whether the failure table has been built, and builds it if not.
226 | */
227 | private void checkForConstructedFailureStates() {
228 | if (!this.failureStatesConstructed) {
229 | constructFailureStates();
230 | }
231 | }
232 |
233 | /**
234 | * Builds the failure table, and copies each failure state's emits onto the state that points to it.
235 | */
236 | private void constructFailureStates() {
237 | Queue queue = new LinkedBlockingDeque();
238 |
239 | // Step 1: set the failure of every depth-1 node to the root
240 | for (State depthOneState : this.rootState.getStates()) {
241 | depthOneState.setFailure(this.rootState);
242 | queue.add(depthOneState);
243 | }
244 | this.failureStatesConstructed = true;
245 |
246 | // Step 2: build the failure table for nodes of depth > 1; this is a BFS
247 | while (!queue.isEmpty()) {
248 | State currentState = queue.remove();
249 |
250 | for (Character transition : currentState.getTransitions()) {
251 | State targetState = currentState.nextState(transition);
252 | queue.add(targetState);
253 |
254 | State traceFailureState = currentState.failure();
255 | while (traceFailureState.nextState(transition) == null) {
256 | traceFailureState = traceFailureState.failure();
257 | }
258 | State newFailureState = traceFailureState.nextState(transition);
259 | targetState.setFailure(newFailureState);
260 | targetState.addEmit(newFailureState.emit());
261 | }
262 | }
263 | }
264 |
265 | /**
266 | * Stores the match results: one Emit per keyword recorded on the current state.
267 | *
268 | * @param position
269 | * the index of the current character, used as the inclusive end of each match (start = position - keyword length + 1)
270 | * @param currentState
271 | * the current state
272 | * @param collectedEmits
273 | * the list the matches are appended to
274 | */
275 | private static void storeEmits(int position, State currentState,
276 | List collectedEmits) {
277 | Collection emits = currentState.emit();
278 | if (emits != null && !emits.isEmpty()) {
279 | for (String emit : emits) {
280 | collectedEmits.add(
281 | new Emit(position - emit.length() + 1, position, emit));
282 | }
283 | }
284 | }
285 |
286 | }
287 |
--------------------------------------------------------------------------------
/src/main/java/pers/hai/framework/ahocorasick/trie/TrieConfig.java:
--------------------------------------------------------------------------------
1 | package pers.hai.framework.ahocorasick.trie;
2 |
3 | /**
4 | * Matching options for a Trie.
5 | */
6 | public class TrieConfig
7 | {
8 | /**
9 | * Allow overlapping matches (default true).
10 | */
11 | private boolean allowOverlaps = true;
12 |
13 | /**
14 | * Match whole words only (default false).
15 | */
16 | private boolean onlyWholeWords = false;
17 |
18 | /**
19 | * Case-insensitive matching (default false).
20 | */
21 | private boolean caseInsensitive = false;
22 |
23 | /**
24 | * Whether overlapping matches are allowed.
25 | *
26 | * @return true if overlapping matches are allowed
27 | */
28 | public boolean isAllowOverlaps()
29 | {
30 | return allowOverlaps;
31 | }
32 |
33 | /**
34 | * Sets whether overlapping matches are allowed.
35 | *
36 | * @param allowOverlaps true to allow overlapping matches
37 | */
38 | public void setAllowOverlaps(boolean allowOverlaps)
39 | {
40 | this.allowOverlaps = allowOverlaps;
41 | }
42 |
43 | /**
44 | * Whether only whole words are matched.
45 | *
46 | * @return true if only whole words are matched
47 | */
48 | public boolean isOnlyWholeWords()
49 | {
50 | return onlyWholeWords;
51 | }
52 |
53 | /**
54 | * Sets whether only whole words are matched.
55 | *
56 | * @param onlyWholeWords true to match whole words only
57 | */
58 | public void setOnlyWholeWords(boolean onlyWholeWords)
59 | {
60 | this.onlyWholeWords = onlyWholeWords;
61 | }
62 |
63 | /**
64 | * Whether matching is case-insensitive.
65 | *
66 | * @return true if matching is case-insensitive
67 | */
68 | public boolean isCaseInsensitive()
69 | {
70 | return caseInsensitive;
71 | }
72 |
73 | /**
74 | * Sets whether matching is case-insensitive.
75 | *
76 | * @param caseInsensitive true for case-insensitive matching
77 | */
78 | public void setCaseInsensitive(boolean caseInsensitive)
79 | {
80 | this.caseInsensitive = caseInsensitive;
81 | }
82 | }
83 |
--------------------------------------------------------------------------------
/src/main/java/pers/hai/framework/ahocorasick/trie/UnicodeState.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * He Han
4 | * hankcs.cn@gmail.com
5 | * 2014/10/31 21:25
6 | *
7 | *
8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
10 | *
11 | */
12 | package pers.hai.framework.ahocorasick.trie;
13 |
14 | import java.util.Collection;
15 | import java.util.Map;
16 | import java.util.TreeMap;
17 |
18 | /**
19 | * @author hankcs
20 | */
21 | public class UnicodeState extends State
22 | {
23 | /**
24 | * The goto table, also called the transition function: maps the next character of the string to the next state. Backed by a TreeMap.
25 | */
26 | private Map success = new TreeMap();
27 | // Creates a depth-0 (root) state.
28 | public UnicodeState()
29 | {
30 | super();
31 | }
32 | // Creates a state of the given depth.
33 | public UnicodeState(int depth)
34 | {
35 | super(depth);
36 | }
37 |
38 | /**
39 | * Transitions to the next state.
40 | * @param character the character to transition on
41 | * @param ignoreRootState whether to skip the root fallback; when false and this state is the root (rootState != null), a missing transition yields the root itself instead of null
42 | * @return the resulting state, or null when there is no transition and no fallback applies
43 | */
44 | private State nextState(Character character, boolean ignoreRootState)
45 | {
46 | State nextState = this.success.get(character);
47 | if (!ignoreRootState && nextState == null && this.rootState != null)
48 | {
49 | nextState = this.rootState;
50 | }
51 | return nextState;
52 | }
53 |
54 | @Override
55 | public State nextState(Character character)
56 | {
57 | return nextState(character, false); // root fallback enabled
58 | }
59 |
60 | @Override
61 | public State nextStateIgnoreRootState(Character character)
62 | {
63 | return nextState(character, true); // plain table lookup, may return null
64 | }
65 |
66 | @Override
67 | public State addState(Character character)
68 | {
69 | State nextState = nextStateIgnoreRootState(character); // reuse the existing child if there is one
70 | if (nextState == null)
71 | {
72 | nextState = new UnicodeState(this.depth + 1);
73 | this.success.put(character, nextState);
74 | }
75 | return nextState;
76 | }
77 |
78 | @Override
79 | public Collection getStates()
80 | {
81 | return this.success.values(); // the map's own values view, not a copy
82 | }
83 |
84 | @Override
85 | public Collection getTransitions()
86 | {
87 | return this.success.keySet(); // the map's own key view, not a copy
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qwhai/jac-core/2de4c497fa4ce8f045f7d53441fa152f632c3ac5/src/main/resources/log4j.properties
--------------------------------------------------------------------------------
/src/test/java/pers/hai/framework/ahocorasick/AhoCorasickTest.java:
--------------------------------------------------------------------------------
1 | package pers.hai.framework.ahocorasick;
2 |
3 | import pers.hai.framework.ahocorasick.trie.Emit;
4 | import pers.hai.framework.ahocorasick.trie.Trie;
5 | import pers.hai.framework.ahocorasick.trie.TrieConfig;
6 | import org.apache.log4j.Logger;
7 | import org.junit.Test;
8 |
9 | import java.util.Collection;
10 | import java.util.HashSet;
11 | import java.util.Set;
12 |
13 | /**
14 | * @Description Smoke tests for Trie.parseText: each test builds a trie, parses a text and logs the emits; nothing is asserted.
15 | * @Author: Q-WHai
16 | * @Date: Created in 17:12 2019/05/06
17 | */
18 | public class AhoCorasickTest {
19 |
20 | private final Logger logger = Logger.getLogger(AhoCorasickTest.class);
21 |
22 | @Test
23 | public void test1() { // default trie: Trie() delegates with ascii = true and a default TrieConfig
24 | String text = "Hello world, Hello java.";
25 | Set words = new HashSet<>(){{
26 | add("Hello");
27 | add("World");
28 | add("java");
29 | }};
30 |
31 | Trie trie = new Trie();
32 | for (String word : words) {
33 | trie.addKeyword(word);
34 | }
35 |
36 | Collection emits = trie.parseText(text);
37 | for (Emit emit : emits) {
38 | logger.info(emit);
39 | }
40 | }
41 |
42 | @Test
43 | public void test2() { // Unicode trie (ascii = false) with the default TrieConfig
44 | String text = "基于Java实现AhoCorasick自动机框架";
45 | Set words = new HashSet<>(){{
46 | add("基于");
47 | add("AhoCorasick");
48 | add("自动机");
49 | }};
50 |
51 | Trie trie = new Trie(false);
52 | for (String word : words) {
53 | trie.addKeyword(word);
54 | }
55 |
56 | Collection emits = trie.parseText(text);
57 | for (Emit emit : emits) {
58 | logger.info(emit);
59 | }
60 | }
61 |
62 | @Test
63 | public void test3() { // ascii trie with overlaps removed, whole words only, case-insensitive
64 | String text = "Hello world, Hello java.";
65 | Set words = new HashSet<>(){{
66 | add("hello");
67 | add("World");
68 | add("java");
69 | }};
70 |
71 | TrieConfig config = new TrieConfig();
72 | config.setAllowOverlaps(false);
73 | config.setOnlyWholeWords(true);
74 | config.setCaseInsensitive(true);
75 |
76 | Trie trie = new Trie(config);
77 | for (String word : words) {
78 | trie.addKeyword(word); // NOTE(review): keywords are stored as given, so "World" presumably cannot match the lower-cased text - confirm
79 | }
80 |
81 | Collection emits = trie.parseText(text);
82 | for (Emit emit : emits) {
83 | logger.info(emit);
84 | }
85 | }
86 |
87 | @Test
88 | public void test4() { // Unicode trie with overlaps removed, whole words only, case-insensitive
89 | String text = "基于Java实现AhoCorasick自动机框架";
90 | Set words = new HashSet<>(){{
91 | add("基于");
92 | add("ahocorasick");
93 | add("自动机");
94 | add("java");
95 | }};
96 |
97 | TrieConfig config = new TrieConfig();
98 | config.setAllowOverlaps(false);
99 | config.setOnlyWholeWords(true);
100 | config.setCaseInsensitive(true);
101 |
102 | Trie trie = new Trie(config, false);
103 | for (String word : words) {
104 | trie.addKeyword(word);
105 | }
106 |
107 | Collection emits = trie.parseText(text);
108 | for (Emit emit : emits) {
109 | logger.info(emit);
110 | }
111 | }
112 | }
113 |
--------------------------------------------------------------------------------