├── .gitignore ├── jac-core.iml ├── jac.core.iml ├── pom.xml ├── readme.md └── src ├── main ├── java │ └── pers │ │ └── hai │ │ └── framework │ │ └── ahocorasick │ │ ├── interval │ │ ├── Interval.java │ │ ├── IntervalNode.java │ │ ├── IntervalTree.java │ │ ├── Intervalable.java │ │ ├── IntervalableComparatorByPosition.java │ │ └── IntervalableComparatorBySize.java │ │ └── trie │ │ ├── AsciiState.java │ │ ├── Emit.java │ │ ├── FragmentToken.java │ │ ├── MatchToken.java │ │ ├── State.java │ │ ├── Token.java │ │ ├── Trie.java │ │ ├── TrieConfig.java │ │ └── UnicodeState.java └── resources │ └── log4j.properties └── test └── java └── pers └── hai └── framework └── ahocorasick └── AhoCorasickTest.java /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | /.idea 25 | /target 26 | -------------------------------------------------------------------------------- /jac-core.iml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /jac.core.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 4.0.0 6 | 7 | org.hai 8 | jac.core 9 | 0.9.0 10 | 11 | jac.core 12 | 13 | http://www.example.com 14 | 15 | 16 | UTF-8 17 | 12 18 | 12 19 | 4.11 20 | 1.2.17 21 
| 22 | 23 | 24 | 25 | junit 26 | junit 27 | ${junit.version} 28 | 29 | 30 | 31 | 32 | log4j 33 | log4j 34 | ${log4j.version} 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | maven-clean-plugin 44 | 3.1.0 45 | 46 | 47 | 48 | maven-resources-plugin 49 | 3.0.2 50 | 51 | 52 | maven-compiler-plugin 53 | 3.8.0 54 | 55 | 56 | maven-surefire-plugin 57 | 2.22.1 58 | 59 | 60 | maven-jar-plugin 61 | 3.0.2 62 | 63 | 64 | maven-install-plugin 65 | 2.5.2 66 | 67 | 68 | maven-deploy-plugin 69 | 2.8.2 70 | 71 | 72 | 73 | maven-site-plugin 74 | 3.7.1 75 | 76 | 77 | maven-project-info-reports-plugin 78 | 3.0.0 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ## jac-core 2 | > 基于Java实现AhoCorasick自动机框架 3 | 4 | [![lang](https://img.shields.io/badge/lang-java-brightgreen.svg)]() 5 | [![ide](https://img.shields.io/badge/ide-IntelliJ%20IDEA-brightgreen.svg)]() 6 | [![maven](https://img.shields.io/badge/maven-3.6.0-brightgreen.svg)]() 7 | 8 | ### Examples 9 | 10 | **1.General match method** 11 | ```java 12 | String text = "Hello world, Hello java."; 13 | Set words = new HashSet<>(){{ 14 | add("Hello"); 15 | add("World"); 16 | add("java"); 17 | }}; 18 | 19 | Trie trie = new Trie(); 20 | for (String word : words) { 21 | trie.addKeyword(word); 22 | } 23 | 24 | Collection emits = trie.parseText(text); 25 | for (Emit emit : emits) { 26 | logger.info(emit); 27 | } 28 | ``` 29 | **Result** 30 | ``` 31 | 0:4=Hello 32 | 13:17=Hello 33 | 19:22=java 34 | ``` 35 | 36 | **2.Chinese match method** 37 | ```java 38 | String text = "基于Java实现AhoCorasick自动机框架"; 39 | Set words = new HashSet<>(){{ 40 | add("基于"); 41 | add("AhoCorasick"); 42 | add("自动机"); 43 | }}; 44 | 45 | Trie trie = new Trie(false); 46 | for (String word : words) { 47 | trie.addKeyword(word); 48 | } 49 | 50 | Collection emits = trie.parseText(text); 51 | for (Emit emit : emits) { 52 | 
logger.info(emit); 53 | } 54 | ``` 55 | **Result** 56 | ``` 57 | 0:1=基于 58 | 8:18=AhoCorasick 59 | 19:21=自动机 60 | ``` 61 | 62 | **3.Match by `TrieConfig`** 63 | ```java 64 | String text = "Hello world, Hello java."; 65 | Set words = new HashSet<>(){{ 66 | add("hello"); 67 | add("World"); 68 | add("java"); 69 | }}; 70 | 71 | TrieConfig config = new TrieConfig(); 72 | config.setAllowOverlaps(false); 73 | config.setOnlyWholeWords(true); 74 | config.setCaseInsensitive(true); 75 | 76 | Trie trie = new Trie(config); 77 | for (String word : words) { 78 | trie.addKeyword(word); 79 | } 80 | 81 | Collection emits = trie.parseText(text); 82 | for (Emit emit : emits) { 83 | logger.info(emit); 84 | } 85 | ``` 86 | **Result** 87 | ``` 88 | 0:4=hello 89 | 13:17=hello 90 | 19:22=java 91 | ``` 92 | 93 | ---------------------------------------------- 94 | 95 | - [Blogcsdn](https://qwhai.blog.csdn.net/) 96 | - 《[深入理解Aho-Corasick自动机算法](https://qwhai.blog.csdn.net/article/details/49335051)》 97 | - [Github](https://github.com/qwhaib) 98 | -------------------------------------------------------------------------------- /src/main/java/pers/hai/framework/ahocorasick/interval/Interval.java: -------------------------------------------------------------------------------- 1 | package pers.hai.framework.ahocorasick.interval; 2 | 3 | /** 4 | * 区间 5 | */ 6 | public class Interval implements Intervalable 7 | { 8 | /** 9 | * 起点 10 | */ 11 | private int start; 12 | /** 13 | * 终点 14 | */ 15 | private int end; 16 | 17 | /** 18 | * 构造一个区间 19 | * @param start 20 | * @param end 21 | */ 22 | public Interval(final int start, final int end) 23 | { 24 | this.start = start; 25 | this.end = end; 26 | } 27 | 28 | public int getStart() 29 | { 30 | return this.start; 31 | } 32 | 33 | public int getEnd() 34 | { 35 | return this.end; 36 | } 37 | 38 | public int size() 39 | { 40 | return end - start + 1; 41 | } 42 | 43 | /** 44 | * 是否与另一个区间交叉(有一部分重叠) 45 | * @param other 46 | * @return 47 | */ 48 | public boolean 
overlapsWith(Interval other) 49 | { 50 | return this.start <= other.getEnd() && 51 | this.end >= other.getStart(); 52 | } 53 | 54 | /** 55 | * Tells whether this interval covers the given point (both bounds inclusive). 56 | * @param point position to test 57 | * @return {@code true} when {@code start <= point <= end} 58 | */ 59 | public boolean overlapsWith(int point) 60 | { 61 | return this.start <= point && point <= this.end; 62 | } 63 | /** Equal to any Intervalable that has the same start and end. */ 64 | @Override 65 | public boolean equals(Object o) 66 | { 67 | if (!(o instanceof Intervalable)) 68 | { 69 | return false; 70 | } 71 | Intervalable other = (Intervalable) o; 72 | return this.start == other.getStart() && 73 | this.end == other.getEnd(); 74 | } 75 | 76 | @Override 77 | public int hashCode() 78 | { 79 | return this.start % 100 + this.end % 100; 80 | } 81 | /** Orders by start, then by end. Integer.compare is used instead of subtraction so that extreme values cannot overflow and flip the sign. A non-Intervalable argument yields -1. */ 82 | @Override 83 | public int compareTo(Object o) 84 | { 85 | if (!(o instanceof Intervalable)) 86 | { 87 | return -1; 88 | } 89 | Intervalable other = (Intervalable) o; 90 | int comparison = Integer.compare(this.start, other.getStart()); 91 | return comparison != 0 ? comparison : Integer.compare(this.end, other.getEnd()); 92 | } 93 | 94 | @Override 95 | public String toString() 96 | { 97 | return this.start + ":" + this.end; 98 | } 99 | 100 | } 101 | -------------------------------------------------------------------------------- /src/main/java/pers/hai/framework/ahocorasick/interval/IntervalNode.java: -------------------------------------------------------------------------------- 1 | package pers.hai.framework.ahocorasick.interval; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collections; 5 | import java.util.List; 6 | 7 | /** 8 | * A node of the interval tree: it holds the intervals that contain its median point and links to a left and a right child node. 9 | */ 10 | public class IntervalNode 11 | { 12 | /** 13 | * Search direction. 14 | */ 15 | private enum Direction 16 | { 17 | LEFT, RIGHT 18 | } 19 | 20 | /** 21 | * Child node built from the intervals that end before the median point; null when there are none. 22 | */ 23 | private IntervalNode left = null; 24 | /** 25 | * Child node built from the intervals that start after the median point; null when there are none. 26 | */ 27 | private IntervalNode right = null; 28 | /** 29 | * Median point of this node. 30 | */ 31 | private int point; 32 | /** 33 | * Intervals stored in this node, i.e. those that contain the median point. 34 | */ 35 | private List intervals = new ArrayList(); 36 | 37 | /** 38 | * Builds a node from the given intervals. 39 | * @param 
intervals 40 | */ 41 | public IntervalNode(List intervals) 42 | { 43 | this.point = determineMedian(intervals); 44 | 45 | List toLeft = new ArrayList(); // intervals lying entirely left of the median point 46 | List toRight = new ArrayList(); // intervals lying entirely right of the median point 47 | 48 | for (Intervalable interval : intervals) 49 | { 50 | if (interval.getEnd() < this.point) 51 | { 52 | toLeft.add(interval); 53 | } 54 | else if (interval.getStart() > this.point) 55 | { 56 | toRight.add(interval); 57 | } 58 | else 59 | { 60 | this.intervals.add(interval); 61 | } 62 | } 63 | 64 | if (toLeft.size() > 0) 65 | { 66 | this.left = new IntervalNode(toLeft); 67 | } 68 | if (toRight.size() > 0) 69 | { 70 | this.right = new IntervalNode(toRight); 71 | } 72 | } 73 | 74 | /** 75 | * Computes the median point: the midpoint (integer division) between the smallest start and the largest end. -1 is used as the "not set yet" marker for both bounds. 76 | * @param intervals the intervals to inspect 77 | * @return the coordinate of the median point 78 | */ 79 | public int determineMedian(List intervals) 80 | { 81 | int start = -1; 82 | int end = -1; 83 | for (Intervalable interval : intervals) 84 | { 85 | int currentStart = interval.getStart(); 86 | int currentEnd = interval.getEnd(); 87 | if (start == -1 || currentStart < start) 88 | { 89 | start = currentStart; 90 | } 91 | if (end == -1 || currentEnd > end) 92 | { 93 | end = currentEnd; 94 | } 95 | } 96 | return (start + end) / 2; 97 | } 98 | 99 | /** 100 | * Finds the intervals in this subtree that overlap the given interval. 101 | * @param interval the query interval 102 | * @return the overlapping intervals, without any that equals the query itself 103 | */ 104 | public List findOverlaps(Intervalable interval) 105 | { 106 | 107 | List overlaps = new ArrayList(); 108 | 109 | if (this.point < interval.getStart()) 110 | { 111 | // the query lies right of the median point: search the right subtree and this node's own intervals 112 | addToOverlaps(interval, overlaps, findOverlappingRanges(this.right, interval)); 113 | addToOverlaps(interval, overlaps, checkForOverlapsToTheRight(interval)); 114 | } 115 | else if (this.point > interval.getEnd()) 116 | { 117 | // the query lies left of the median point: search the left subtree and this node's own intervals 118 | addToOverlaps(interval, overlaps, findOverlappingRanges(this.left, interval)); 119 | addToOverlaps(interval, overlaps, checkForOverlapsToTheLeft(interval)); 120 | } 121 | else 122 | { 123 | // otherwise the query covers the median point: take this node's intervals and search both subtrees 124 | addToOverlaps(interval, overlaps, this.intervals); 125 | 
addToOverlaps(interval, overlaps, findOverlappingRanges(this.left, interval)); 126 | addToOverlaps(interval, overlaps, findOverlappingRanges(this.right, interval)); 127 | } 128 | 129 | return overlaps; 130 | } 131 | 132 | /** 133 | * Appends candidates to the overlap list, skipping any candidate that equals the query interval. 134 | * @param interval the query interval 135 | * @param overlaps the overlap list being built 136 | * @param newOverlaps the candidates to append 137 | */ 138 | protected void addToOverlaps(Intervalable interval, List overlaps, List newOverlaps) 139 | { 140 | for (Intervalable currentInterval : newOverlaps) 141 | { 142 | if (!currentInterval.equals(interval)) 143 | { 144 | overlaps.add(currentInterval); 145 | } 146 | } 147 | } 148 | 149 | /** 150 | * Checks this node's own intervals against a query that lies left of the median point. 151 | * @param interval the query interval 152 | * @return this node's intervals whose start is not after the query's end 153 | */ 154 | protected List checkForOverlapsToTheLeft(Intervalable interval) 155 | { 156 | return checkForOverlaps(interval, Direction.LEFT); 157 | } 158 | 159 | /** 160 | * Checks this node's own intervals against a query that lies right of the median point. 161 | * @param interval the query interval 162 | * @return this node's intervals whose end is not before the query's start 163 | */ 164 | protected List checkForOverlapsToTheRight(Intervalable interval) 165 | { 166 | return checkForOverlaps(interval, Direction.RIGHT); 167 | } 168 | 169 | /** 170 | * Collects the intervals stored in this node that overlap the query. 171 | * @param interval the query interval 172 | * @param direction LEFT keeps intervals with start <= query end; RIGHT keeps intervals with end >= query start 173 | * @return the matching intervals of this node (child nodes are not visited) 174 | */ 175 | protected List checkForOverlaps(Intervalable interval, Direction direction) 176 | { 177 | 178 | List overlaps = new ArrayList(); 179 | for (Intervalable currentInterval : this.intervals) 180 | { 181 | switch (direction) 182 | { 183 | case LEFT: 184 | if (currentInterval.getStart() <= interval.getEnd()) 185 | { 186 | overlaps.add(currentInterval); 187 | } 188 | break; 189 | case RIGHT: 190 | if (currentInterval.getEnd() >= interval.getStart()) 191 | { 192 | overlaps.add(currentInterval); 193 | } 194 | break; 195 | } 196 | } 197 | return overlaps; 198 | } 199 | 200 | /** 201 | * Null-safe wrapper around IntervalNode.findOverlaps(Intervalable), so callers need no null check on a missing child. 202 | * @see IntervalNode#findOverlaps(Intervalable) 203 | * @param node the node to search, may be null 204 | * @param interval the query interval 205 | * @return the node's overlaps, or an empty list when the node is null 206 | */ 207 | 
protected static List findOverlappingRanges(IntervalNode node, Intervalable interval) 208 | { 209 | if (node != null) 210 | { 211 | return node.findOverlaps(interval); 212 | } 213 | return Collections.emptyList(); 214 | } 215 | 216 | } 217 | -------------------------------------------------------------------------------- /src/main/java/pers/hai/framework/ahocorasick/interval/IntervalTree.java: -------------------------------------------------------------------------------- 1 | package pers.hai.framework.ahocorasick.interval; 2 | 3 | import java.util.Collections; 4 | import java.util.List; 5 | import java.util.Set; 6 | import java.util.TreeSet; 7 | 8 | /** 9 | * Interval tree used to detect overlapping intervals. 10 | */ 11 | public class IntervalTree 12 | { 13 | /** 14 | * Root node of the tree. 15 | */ 16 | private IntervalNode rootNode = null; 17 | 18 | /** 19 | * Builds the tree from the given intervals. 20 | * 21 | * @param intervals the intervals to index 22 | */ 23 | public IntervalTree(List intervals) 24 | { 25 | this.rootNode = new IntervalNode(intervals); 26 | } 27 | 28 | /** 29 | * Removes overlapping intervals from the list in place; larger intervals are visited first, so they win over the smaller ones they overlap. 30 | * 31 | * @param intervals the list to prune; it is sorted and modified by this call 32 | * @return the same list instance, pruned and ordered by start position 33 | */ 34 | public List removeOverlaps(List intervals) 35 | { 36 | 37 | // sort by size (largest first), then by start position 38 | Collections.sort(intervals, new IntervalableComparatorBySize()); 39 | 40 | Set removeIntervals = new TreeSet(); 41 | 42 | for (Intervalable interval : intervals) 43 | { 44 | // an interval already marked for removal cannot knock out others: skip it 45 | if (removeIntervals.contains(interval)) 46 | { 47 | continue; 48 | } 49 | 50 | // otherwise mark every interval that overlaps it for removal 51 | removeIntervals.addAll(findOverlaps(interval)); 52 | } 53 | 54 | // drop all intervals that were marked 55 | for (Intervalable removeInterval : removeIntervals) 56 | { 57 | intervals.remove(removeInterval); 58 | } 59 | 60 | // sort the survivors by start position 61 | Collections.sort(intervals, new IntervalableComparatorByPosition()); 62 | 63 | return intervals; 64 | } 65 | 66 | /** 67 | * Finds the intervals overlapping the given one. 68 | * 69 | * @param interval the query interval 70 | * @return the list of overlapping intervals 71 | */ 72 | public List findOverlaps(Intervalable interval) 73 | { 74 | return rootNode.findOverlaps(interval); 75 | } 76 | 77 | } 78 | 
-------------------------------------------------------------------------------- /src/main/java/pers/hai/framework/ahocorasick/interval/Intervalable.java: -------------------------------------------------------------------------------- 1 | package pers.hai.framework.ahocorasick.interval; 2 | 3 | /** 4 | * Contract for an interval with a start, an end and a size. 5 | */ 6 | @SuppressWarnings("rawtypes") 7 | public interface Intervalable extends Comparable { 8 | /** 9 | * Start position. 10 | * 11 | * @return the start position 12 | */ 13 | int getStart(); 14 | 15 | /** 16 | * End position. 17 | * 18 | * @return the end position 19 | */ 20 | int getEnd(); 21 | 22 | /** 23 | * Length of the interval. 24 | * 25 | * @return the length 26 | */ 27 | int size(); 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/pers/hai/framework/ahocorasick/interval/IntervalableComparatorByPosition.java: -------------------------------------------------------------------------------- 1 | package pers.hai.framework.ahocorasick.interval; 2 | 3 | import java.util.Comparator; 4 | 5 | /** 6 | * Orders intervals by ascending start position; Integer.compare avoids the overflow that subtraction can produce. 7 | */ 8 | public class IntervalableComparatorByPosition implements Comparator 9 | { 10 | @Override 11 | public int compare(Intervalable intervalable, Intervalable intervalable2) 12 | { 13 | return Integer.compare(intervalable.getStart(), intervalable2.getStart()); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/pers/hai/framework/ahocorasick/interval/IntervalableComparatorBySize.java: -------------------------------------------------------------------------------- 1 | package pers.hai.framework.ahocorasick.interval; 2 | 3 | import java.util.Comparator; 4 | 5 | /** 6 | * Orders intervals by size, largest first; intervals of equal size are ordered by ascending start position. 7 | */ 8 | public class IntervalableComparatorBySize implements Comparator 9 | { 10 | @Override 11 | public int compare(Intervalable intervalable, Intervalable intervalable2) 12 | { 13 | int comparison = Integer.compare(intervalable2.size(), intervalable.size()); 14 | if (comparison == 0) 15 | { 16 | comparison = intervalable.getStart() - 
intervalable2.getStart(); 17 | } 18 | return comparison; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/pers/hai/framework/ahocorasick/trie/AsciiState.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * He Han 4 | * hankcs.cn@gmail.com 5 | * 2014/10/31 21:24 6 | * 7 | * 8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/ 9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information. 10 | * 11 | */ 12 | package pers.hai.framework.ahocorasick.trie; 13 | 14 | import java.util.ArrayList; 15 | import java.util.Collection; 16 | import java.util.List; 17 | 18 | /** 19 | * Aho-Corasick automaton node optimized for single-byte characters: transitions are kept in a 256-slot array. 20 | * @author hankcs 21 | */ 22 | public class AsciiState extends State 23 | { 24 | static final int SIZE = 256; 25 | /** 26 | * The goto table (transition function): maps the next character of the input to the next state. 27 | */ 28 | private State[] success = new State[SIZE]; 29 | 30 | /** 31 | * Looks up the transition for a character; the character is reduced to its low byte to index the table. 32 | * @param character the character to follow 33 | * @param ignoreRootState when false, a state whose rootState is set (the root) returns the root instead of null for a missing transition; when true, a missing transition is reported as null 34 | * @return the next state, or null when there is none (and the root fallback does not apply) 35 | */ 36 | private State nextState(Character character, boolean ignoreRootState) 37 | { 38 | State nextState = this.success[character & 0xff]; 39 | if (!ignoreRootState && nextState == null && this.rootState != null) 40 | { 41 | nextState = this.rootState; 42 | } 43 | return nextState; 44 | } 45 | 46 | /** 47 | * @see AsciiState 48 | */ 49 | public AsciiState() 50 | { 51 | } 52 | /** Creates a node at the given depth. */ 53 | public AsciiState(int depth) 54 | { 55 | super(depth); 56 | } 57 | 58 | /** 59 | * Moves to the next state through the goto table, with the root fallback enabled. 60 | * @param character the character to follow 61 | * @return the next state 62 | */ 63 | public State nextState(Character character) 64 | { 65 | return nextState(character, false); 66 | } 67 | 68 | /** 69 | * Moves to the next state without the root fallback. 70 | * @param character the character to follow 71 | * @return the next state, or null when no transition exists 72 | */ 73 | public State nextStateIgnoreRootState(Character character) 74 | { 75 | return 
nextState(character, true); 76 | } 77 | 78 | /** 79 | * Returns the child state for the character, creating it when absent. The new state is stored under the character's low byte, the same index the lookups read, so a character above 0xFF shares a slot with its low-byte counterpart instead of overrunning the 256-slot table. 80 | * @param character the character of the transition 81 | * @return the existing or newly created child state 82 | */ 83 | public State addState(Character character) 84 | { 85 | State nextState = nextStateIgnoreRootState(character); 86 | if (nextState == null) 87 | { 88 | nextState = new AsciiState(this.depth + 1); 89 | this.success[character & 0xff] = nextState; 90 | } 91 | return nextState; 92 | } 93 | 94 | /** 95 | * Returns the child states reachable through the goto table. 96 | * @return the non-null entries of the table, in index order 97 | */ 98 | public Collection getStates() 99 | { 100 | List stateList = new ArrayList(SIZE); 101 | for (State state : success) 102 | { 103 | if (state != null) stateList.add(state); 104 | } 105 | return stateList; 106 | } 107 | 108 | /** 109 | * Returns the characters that have an outgoing transition from this state. 110 | * @return the table indexes holding a state, as chars 111 | */ 112 | public Collection getTransitions() 113 | { 114 | List stateList = new ArrayList(SIZE); 115 | int i = 0; 116 | for (State state : success) 117 | { 118 | if (state != null) stateList.add((char) i); 119 | ++i; 120 | } 121 | return stateList; 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /src/main/java/pers/hai/framework/ahocorasick/trie/Emit.java: -------------------------------------------------------------------------------- 1 | package pers.hai.framework.ahocorasick.trie; 2 | 3 | import pers.hai.framework.ahocorasick.interval.Interval; 4 | import pers.hai.framework.ahocorasick.interval.Intervalable; 5 | 6 | /** 7 | * A single keyword match: an interval of the text plus the keyword found there. 8 | */ 9 | public class Emit extends Interval implements Intervalable 10 | { 11 | /** 12 | * The matched keyword. 13 | */ 14 | private final String keyword; 15 | 16 | /** 17 | * Creates a match result. 18 | * @param start start position 19 | * @param end end position 20 | * @param keyword the matched keyword 21 | */ 22 | public Emit(final int start, final int end, final String keyword) 23 | { 24 | super(start, end); 25 | this.keyword = keyword; 26 | } 27 | 28 | /** 29 | * Returns the keyword of this match. 30 | * @return the keyword 31 | */ 32 | public String getKeyword() 33 | { 34 | return this.keyword; 35 | } 36 | 37 | @Override 38 | public String 
toString() 39 | { 40 | return super.toString() + "=" + this.keyword; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/pers/hai/framework/ahocorasick/trie/FragmentToken.java: -------------------------------------------------------------------------------- 1 | package pers.hai.framework.ahocorasick.trie; 2 | /** Token for a stretch of text that is not a match: isMatch() is false and getEmit() is null. */ 3 | public class FragmentToken extends Token 4 | { 5 | 6 | public FragmentToken(String fragment) 7 | { 8 | super(fragment); 9 | } 10 | 11 | @Override 12 | public boolean isMatch() 13 | { 14 | return false; 15 | } 16 | 17 | @Override 18 | public Emit getEmit() 19 | { 20 | return null; 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/pers/hai/framework/ahocorasick/trie/MatchToken.java: -------------------------------------------------------------------------------- 1 | package pers.hai.framework.ahocorasick.trie; 2 | 3 | /** 4 | * Token for a fragment of text that matched a keyword; it carries the Emit describing the match. 5 | */ 6 | public class MatchToken extends Token 7 | { 8 | 9 | private Emit emit; 10 | 11 | public MatchToken(String fragment, Emit emit) 12 | { 13 | super(fragment); 14 | this.emit = emit; 15 | } 16 | 17 | @Override 18 | public boolean isMatch() 19 | { 20 | return true; 21 | } 22 | 23 | @Override 24 | public Emit getEmit() 25 | { 26 | return this.emit; 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/pers/hai/framework/ahocorasick/trie/State.java: -------------------------------------------------------------------------------- 1 | package pers.hai.framework.ahocorasick.trie; 2 | 3 | import java.util.*; 4 | 5 | /** 6 | *

7 | * 一个状态有如下几个功能 8 | * 9 | * 10 | * 11 | * - success; 成功转移到另一个状态 12 | * - failure; 不可顺着字符串跳转的话,则跳转到一个浅一点的节点 13 | * - emits; 命中一个模式串 14 | * 15 | * 16 | * 17 | * 根节点稍有不同,根节点没有 failure 功能,它的“failure”指的是按照字符串路径转移到下一个状态。其他节点则都有failure状态。 18 | *
19 | * 20 | * @author Robert Bor 21 | */ 22 | public abstract class State { 23 | 24 | /** 25 | * Depth of this state, which is also the length of the keyword prefix that leads to it. 26 | */ 27 | protected final int depth; 28 | 29 | /** 30 | * Set only on the root (depth 0), where it refers to the root itself; null on every other state. 31 | */ 32 | protected final State rootState; 33 | 34 | /** 35 | * Failure link: the state to fall back to when no transition matches. 36 | */ 37 | private State failure = null; 38 | 39 | /** 40 | * Keywords recognized when this state is reached; stays null until the first keyword is added. 41 | */ 42 | private Set emits = null; 43 | 44 | /** 45 | * Creates a node of depth 0. 46 | */ 47 | public State() 48 | { 49 | this(0); 50 | } 51 | 52 | /** 53 | * Creates a node of the given depth; a depth of 0 makes the node its own rootState. 54 | * @param depth the depth of the node 55 | */ 56 | public State(int depth) 57 | { 58 | this.depth = depth; 59 | this.rootState = depth == 0 ? this : null; 60 | } 61 | 62 | /** 63 | * Returns the depth of this node. 64 | * @return the depth 65 | */ 66 | public int getDepth() 67 | { 68 | return this.depth; 69 | } 70 | 71 | /** 72 | * Registers a keyword that is matched when this state is reached; the backing set is created on first use. 73 | * @param keyword the keyword to register 74 | */ 75 | public void addEmit(String keyword) 76 | { 77 | if (this.emits == null) 78 | { 79 | this.emits = new TreeSet(); 80 | } 81 | this.emits.add(keyword); 82 | } 83 | 84 | /** 85 | * Registers several matched keywords. 86 | * @param emits the keywords to register 87 | */ 88 | public void addEmit(Collection emits) 89 | { 90 | for (String emit : emits) 91 | { 92 | addEmit(emit); 93 | } 94 | } 95 | 96 | /** 97 | * Returns the keyword(s) this state stands for. 98 | * @return the registered keywords, or an empty list when none was registered 99 | */ 100 | public Collection emit() 101 | { 102 | return this.emits == null ? 
Collections.emptyList() : this.emits; 103 | } 104 | 105 | /** 106 | * Returns the failure state. 107 | * @return the state to fall back to, or null when none has been set 108 | */ 109 | public State failure() 110 | { 111 | return this.failure; 112 | } 113 | 114 | /** 115 | * Sets the failure state. 116 | * @param failState the state to fall back to 117 | */ 118 | public void setFailure(State failState) 119 | { 120 | this.failure = failState; 121 | } 122 | 123 | /** 124 | * Moves to the next state through the goto (success) table. 125 | * @param character the character to follow 126 | * @return the resulting state 127 | */ 128 | public abstract State nextState(Character character); 129 | 130 | /** 131 | * Moves to the next state, ignoring the root fallback. 132 | * @param character the character to follow 133 | * @return the resulting state 134 | */ 135 | public abstract State nextStateIgnoreRootState(Character character); 136 | 137 | /** 138 | * Adds a state to the goto (success) table. 139 | * @param character the character of the transition 140 | * @return the state reached through that character 141 | */ 142 | public abstract State addState(Character character); 143 | 144 | /** 145 | * Returns the states of the goto (success) table. 146 | * @return the child states 147 | */ 148 | public abstract Collection getStates(); 149 | 150 | /** 151 | * Returns the characters that lead to a next state. 152 | * @return the transition characters 153 | */ 154 | public abstract Collection getTransitions(); 155 | } 156 | -------------------------------------------------------------------------------- /src/main/java/pers/hai/framework/ahocorasick/trie/Token.java: -------------------------------------------------------------------------------- 1 | package pers.hai.framework.ahocorasick.trie; 2 | 3 | /** 4 | * A fragment of text produced by the tokenizer. 5 | */ 6 | public abstract class Token 7 | { 8 | /** 9 | * The text of this fragment. 10 | */ 11 | private String fragment; 12 | 13 | public Token(String fragment) 14 | { 15 | this.fragment = fragment; 16 | } 17 | 18 | public String getFragment() 19 | { 20 | return this.fragment; 21 | } 22 | 23 | public abstract boolean isMatch(); 24 | 25 | public abstract Emit getEmit(); 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/pers/hai/framework/ahocorasick/trie/Trie.java: -------------------------------------------------------------------------------- 1 | package 
pers.hai.framework.ahocorasick.trie; 2 | 3 | import pers.hai.framework.ahocorasick.interval.IntervalTree; 4 | import pers.hai.framework.ahocorasick.interval.Intervalable; 5 | 6 | import java.util.ArrayList; 7 | import java.util.Collection; 8 | import java.util.List; 9 | import java.util.Queue; 10 | import java.util.concurrent.LinkedBlockingDeque; 11 | 12 | /** 13 | * Based on the Aho-Corasick white paper, Bell Labs: 14 | * ftp://163.13.200.222/assistant/bearhero/prog/%A8%E4%A5%A6/ac_bm.pdf 15 | * 16 | * @author Robert Bor 17 | */ 18 | public class Trie { 19 | 20 | private TrieConfig trieConfig; 21 | 22 | private State rootState; 23 | 24 | /** 25 | * Whether the failure table has been built. 26 | */ 27 | private boolean failureStatesConstructed = false; 28 | 29 | /** 30 | * Builds a trie with the given configuration, using the ASCII-optimized states. 31 | * 32 | * @param trieConfig the matching configuration 33 | */ 34 | public Trie(TrieConfig trieConfig) { 35 | this(trieConfig, true); 36 | } 37 | /** Builds a trie with the given configuration; ascii selects AsciiState (true) or UnicodeState (false) for the root. */ 38 | public Trie(TrieConfig trieConfig, boolean ascii) { 39 | this.trieConfig = trieConfig; 40 | if (ascii) { 41 | this.rootState = new AsciiState(); 42 | } else { 43 | this.rootState = new UnicodeState(); 44 | } 45 | } 46 | 47 | /** 48 | * Builds a trie with the default configuration. 49 | */ 50 | public Trie() { 51 | this(new TrieConfig()); 52 | } 53 | 54 | /** 55 | * Builds a trie with the default configuration. 56 | * 57 | * @param ascii 58 | * whether to build an ASCII trie (true uses the ASCII-optimized states, false supports Unicode) 59 | */ 60 | public Trie(boolean ascii) { 61 | this(new TrieConfig(), ascii); 62 | } 63 | 64 | /** 65 | * Turns on case-insensitive matching. 66 | * 67 | * @return this trie, for call chaining 68 | */ 69 | public Trie caseInsensitive() { 70 | this.trieConfig.setCaseInsensitive(true); 71 | return this; 72 | } 73 | 74 | /** 75 | * Disallows matches that overlap each other in position. 76 | * 77 | * @return this trie, for call chaining 78 | */ 79 | public Trie removeOverlaps() { 80 | this.trieConfig.setAllowOverlaps(false); 81 | return this; 82 | } 83 | /** Restricts the results to whole-word matches; returns this trie for call chaining. */ 84 | public Trie onlyWholeWords() { 85 | this.trieConfig.setOnlyWholeWords(true); 86 | return this; 87 | } 88 | 89 | /** 90 | * Adds a keyword (pattern) to the trie; a null or empty keyword is ignored. 91 | * 92 | * @param keyword the keyword to add 93 | */ 94 | public void addKeyword(String keyword) { 95 | if (keyword == null || keyword.length() == 0) { 
96 | return; 97 | } 98 | State currentState = this.rootState; 99 | for (Character character : keyword.toCharArray()) { 100 | currentState = currentState.addState(character); 101 | } 102 | currentState.addEmit(keyword); 103 | } 104 | 105 | /** 106 | * Tokenizer: splits the text into match tokens and the unmatched fragments between them. 107 | * 108 | * @param text 109 | * the text to tokenize 110 | * @return the match and fragment tokens, in the order the matches were reported by parseText 111 | */ 112 | public Collection tokenize(String text) { 113 | 114 | Collection tokens = new ArrayList(); 115 | 116 | Collection collectedEmits = parseText(text); 117 | int lastCollectedPosition = -1; 118 | for (Emit emit : collectedEmits) { 119 | if (emit.getStart() - lastCollectedPosition > 1) { 120 | tokens.add(createFragment(emit, text, lastCollectedPosition)); 121 | } 122 | tokens.add(createMatch(emit, text)); 123 | lastCollectedPosition = emit.getEnd(); 124 | } 125 | if (text.length() - lastCollectedPosition > 1) { 126 | tokens.add(createFragment(null, text, lastCollectedPosition)); 127 | } 128 | 129 | return tokens; 130 | } 131 | /** Builds the fragment that starts right after the previous match and runs up to the given match, or to the end of the text when emit is null. */ 132 | private Token createFragment(Emit emit, String text, 133 | int lastCollectedPosition) { 134 | return new FragmentToken(text.substring(lastCollectedPosition + 1, 135 | emit == null ? 
text.length() : emit.getStart())); 136 | } 137 | /** Builds the token for the span of text covered by the match (start and end inclusive). */ 138 | private Token createMatch(Emit emit, String text) { 139 | return new MatchToken( 140 | text.substring(emit.getStart(), emit.getEnd() + 1), emit); 141 | } 142 | 143 | /** 144 | * Runs the pattern match over the text: feeds it through the automaton character by character, then applies the whole-word and no-overlap filters when they are configured. NOTE(review): in case-insensitive mode only the text is lower-cased here and addKeyword stores keywords unchanged, so a keyword containing upper-case letters looks unable to match - confirm this is intended. 145 | * 146 | * @param text 147 | * the text to search 148 | * @return the matches found, as Emit objects 149 | */ 150 | @SuppressWarnings("unchecked") 151 | public Collection parseText(String text) { 152 | checkForConstructedFailureStates(); 153 | 154 | int position = 0; 155 | State currentState = this.rootState; 156 | List collectedEmits = new ArrayList(); 157 | for (Character character : text.toCharArray()) { 158 | if (trieConfig.isCaseInsensitive()) { 159 | character = Character.toLowerCase(character); 160 | } 161 | currentState = getState(currentState, character); 162 | storeEmits(position, currentState, collectedEmits); 163 | ++position; 164 | } 165 | 166 | if (trieConfig.isOnlyWholeWords()) { 167 | removePartialMatches(text, collectedEmits); 168 | } 169 | 170 | if (!trieConfig.isAllowOverlaps()) { 171 | IntervalTree intervalTree = new IntervalTree( 172 | (List) (List) collectedEmits); 173 | intervalTree.removeOverlaps( 174 | (List) (List) collectedEmits); 175 | } 176 | 177 | return collectedEmits; 178 | } 179 | 180 | /** 181 | * Removes matches that are only part of a word: a match is kept only when the characters directly before and after it are absent or non-alphabetic. 182 | * 183 | * @param searchText the text that was searched 184 | * @param collectedEmits the matches to filter, modified in place 185 | */ 186 | private void removePartialMatches(String searchText, 187 | List collectedEmits) { 188 | long size = searchText.length(); 189 | List removeEmits = new ArrayList(); 190 | for (Emit emit : collectedEmits) { 191 | if ((emit.getStart() == 0 || !Character 192 | .isAlphabetic(searchText.charAt(emit.getStart() - 1))) 193 | && (emit.getEnd() + 1 == size || !Character.isAlphabetic( 194 | searchText.charAt(emit.getEnd() + 1)))) { 195 | continue; 196 | } 197 | removeEmits.add(emit); 198 | } 199 | 200 | for (Emit removeEmit : removeEmits) { 201 | collectedEmits.remove(removeEmit); 202 | } 203 | } 204 | 205 | /** 206 | * Moves to the next state for a character. 207 | * 208 | * @param currentState 
209 | * 当前状态 210 | * @param character 211 | * 接受字符 212 | * @return 跳转结果 213 | */ 214 | private static State getState(State currentState, Character character) { 215 | State newCurrentState = currentState.nextState(character); // first try the goto (success) transition 216 | while (newCurrentState == null) // no transition: follow failure links until one exists 217 | { 218 | currentState = currentState.failure(); 219 | newCurrentState = currentState.nextState(character); 220 | } 221 | return newCurrentState; 222 | } 223 | 224 | /** 225 | * Builds the failure table if it has not been built yet. NOTE(review): nothing in the code shown resets the flag, so keywords added after the first parseText would get no failure links - confirm. 226 | */ 227 | private void checkForConstructedFailureStates() { 228 | if (!this.failureStatesConstructed) { 229 | constructFailureStates(); 230 | } 231 | } 232 | 233 | /** 234 | * Builds the failure table with a breadth-first walk over the trie. 235 | */ 236 | private void constructFailureStates() { 237 | Queue queue = new LinkedBlockingDeque(); 238 | 239 | // step 1: every state of depth 1 fails back to the root 240 | for (State depthOneState : this.rootState.getStates()) { 241 | depthOneState.setFailure(this.rootState); 242 | queue.add(depthOneState); 243 | } 244 | this.failureStatesConstructed = true; 245 | 246 | // step 2: breadth-first search to set the failure link of every state deeper than 1 247 | while (!queue.isEmpty()) { 248 | State currentState = queue.remove(); 249 | 250 | for (Character transition : currentState.getTransitions()) { 251 | State targetState = currentState.nextState(transition); 252 | queue.add(targetState); 253 | 254 | State traceFailureState = currentState.failure(); 255 | while (traceFailureState.nextState(transition) == null) { 256 | traceFailureState = traceFailureState.failure(); 257 | } 258 | State newFailureState = traceFailureState.nextState(transition); 259 | targetState.setFailure(newFailureState); 260 | targetState.addEmit(newFailureState.emit()); 261 | } 262 | } 263 | } 264 | 265 | /** 266 | * Records the matches that end at the current position. 267 | * 268 | * @param position 269 | * index of the character just consumed; it is used as the inclusive end of each match 270 | * @param currentState 271 | * the state reached after consuming that character 272 | * @param collectedEmits 273 | * the list that receives the new Emit objects 274 | */ 275 | private static void storeEmits(int position, State currentState, 276 | List collectedEmits) { 277 | 
Collection emits = currentState.emit(); 278 | if (emits != null && !emits.isEmpty()) { 279 | for (String emit : emits) { 280 | collectedEmits.add( 281 | new Emit(position - emit.length() + 1, position, emit)); 282 | } 283 | } 284 | } 285 | 286 | } 287 | -------------------------------------------------------------------------------- /src/main/java/pers/hai/framework/ahocorasick/trie/TrieConfig.java: -------------------------------------------------------------------------------- 1 | package pers.hai.framework.ahocorasick.trie; 2 | 3 | /** 4 | * 配置 5 | */ 6 | public class TrieConfig 7 | { 8 | /** 9 | * 允许重叠 10 | */ 11 | private boolean allowOverlaps = true; 12 | 13 | /** 14 | * 只匹配完整单词 15 | */ 16 | private boolean onlyWholeWords = false; 17 | 18 | /** 19 | * 大小写不敏感 20 | */ 21 | private boolean caseInsensitive = false; 22 | 23 | /** 24 | * 是否允许重叠 25 | * 26 | * @return 27 | */ 28 | public boolean isAllowOverlaps() 29 | { 30 | return allowOverlaps; 31 | } 32 | 33 | /** 34 | * 设置是否允许重叠 35 | * 36 | * @param allowOverlaps 37 | */ 38 | public void setAllowOverlaps(boolean allowOverlaps) 39 | { 40 | this.allowOverlaps = allowOverlaps; 41 | } 42 | 43 | /** 44 | * 是否只匹配完整单词 45 | * 46 | * @return 47 | */ 48 | public boolean isOnlyWholeWords() 49 | { 50 | return onlyWholeWords; 51 | } 52 | 53 | /** 54 | * 设置是否只匹配完整单词 55 | * 56 | * @param onlyWholeWords 57 | */ 58 | public void setOnlyWholeWords(boolean onlyWholeWords) 59 | { 60 | this.onlyWholeWords = onlyWholeWords; 61 | } 62 | 63 | /** 64 | * 是否大小写敏感 65 | * 66 | * @return 67 | */ 68 | public boolean isCaseInsensitive() 69 | { 70 | return caseInsensitive; 71 | } 72 | 73 | /** 74 | * 设置大小写敏感 75 | * 76 | * @param caseInsensitive 77 | */ 78 | public void setCaseInsensitive(boolean caseInsensitive) 79 | { 80 | this.caseInsensitive = caseInsensitive; 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/main/java/pers/hai/framework/ahocorasick/trie/UnicodeState.java: 
--------------------------------------------------------------------------------
/*
 *
 * He Han
 * hankcs.cn@gmail.com
 * 2014/10/31 21:25
 *
 *
 * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
 * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
 *
 */
package pers.hai.framework.ahocorasick.trie;

import java.util.Collection;
import java.util.Map;
import java.util.TreeMap;

/**
 * A trie state that keeps its outgoing transitions in a sorted map keyed by
 * character.
 *
 * @author hankcs
 */
public class UnicodeState extends State
{
    /**
     * The goto table (transition function): maps the next character to the
     * state it leads to.
     */
    private final Map<Character, State> success = new TreeMap<>();

    public UnicodeState()
    {
        super();
    }

    public UnicodeState(int depth)
    {
        super(depth);
    }

    /**
     * Looks up the state reached by consuming the given character.
     *
     * @param character the character to transition on
     * @param ignoreRootState if {@code false} and there is no transition for
     *        the character, {@code this.rootState} is returned when it is set;
     *        if {@code true}, a missing transition yields {@code null}
     * @return the target state, or {@code null} if there is none
     */
    private State nextState(Character character, boolean ignoreRootState)
    {
        State nextState = this.success.get(character);
        if (!ignoreRootState && nextState == null && this.rootState != null)
        {
            nextState = this.rootState;
        }
        return nextState;
    }

    @Override
    public State nextState(Character character)
    {
        return nextState(character, false);
    }

    @Override
    public State nextStateIgnoreRootState(Character character)
    {
        return nextState(character, true);
    }

    /**
     * Returns the existing child state for the character, creating it one
     * level deeper than this state if it does not exist yet.
     */
    @Override
    public State addState(Character character)
    {
        State nextState = nextStateIgnoreRootState(character);
        if (nextState == null)
        {
            nextState = new UnicodeState(this.depth + 1);
            this.success.put(character, nextState);
        }
        return nextState;
    }

    /**
     * Returns a live view of the states directly reachable from this state.
     */
    @Override
    public Collection<State> getStates()
    {
        return this.success.values();
    }

    /**
     * Returns a live view of the characters this state has transitions for.
     */
    @Override
    public Collection<Character> getTransitions()
    {
        return this.success.keySet();
    }
}

--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qwhai/jac-core/2de4c497fa4ce8f045f7d53441fa152f632c3ac5/src/main/resources/log4j.properties

--------------------------------------------------------------------------------
/src/test/java/pers/hai/framework/ahocorasick/AhoCorasickTest.java:
--------------------------------------------------------------------------------
package pers.hai.framework.ahocorasick;

import pers.hai.framework.ahocorasick.trie.Emit;
import pers.hai.framework.ahocorasick.trie.Trie;
import pers.hai.framework.ahocorasick.trie.TrieConfig;
import org.apache.log4j.Logger;
import org.junit.Test;

import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

/**
 * Smoke tests that build a {@link Trie} from a few keywords, run
 * {@code parseText} over a sample sentence and log every emit. The tests make
 * no assertions; they only check that parsing completes.
 *
 * @Author: Q-WHai
 * @Date: Created in 17:12 2019/05/06
 */
public class AhoCorasickTest {

    private final Logger logger = Logger.getLogger(AhoCorasickTest.class);

    /** Default trie on an English sentence. */
    @Test
    public void test1() {
        String text = "Hello world, Hello java.";
        Set<String> words = new HashSet<>(Set.of("Hello", "World", "java"));

        parseAndLog(new Trie(), words, text);
    }

    /** Trie created with {@code new Trie(false)} on mixed Chinese/English text. */
    @Test
    public void test2() {
        String text = "基于Java实现AhoCorasick自动机框架";
        Set<String> words = new HashSet<>(Set.of("基于", "AhoCorasick", "自动机"));

        parseAndLog(new Trie(false), words, text);
    }

    /** No overlaps, whole words only, case-insensitive, English sentence. */
    @Test
    public void test3() {
        String text = "Hello world, Hello java.";
        Set<String> words = new HashSet<>(Set.of("hello", "World", "java"));

        parseAndLog(new Trie(strictConfig()), words, text);
    }

    /** Same options as {@link #test3()} on mixed Chinese/English text. */
    @Test
    public void test4() {
        String text = "基于Java实现AhoCorasick自动机框架";
        Set<String> words =
                new HashSet<>(Set.of("基于", "ahocorasick", "自动机", "java"));

        parseAndLog(new Trie(strictConfig(), false), words, text);
    }

    /** Creates a config with overlaps off, whole words only and case ignored. */
    private static TrieConfig strictConfig() {
        TrieConfig config = new TrieConfig();
        config.setAllowOverlaps(false);
        config.setOnlyWholeWords(true);
        config.setCaseInsensitive(true);
        return config;
    }

    /** Adds the keywords to the trie, parses the text and logs each emit. */
    private void parseAndLog(Trie trie, Set<String> words, String text) {
        for (String word : words) {
            trie.addKeyword(word);
        }

        Collection<Emit> emits = trie.parseText(text);
        for (Emit emit : emits) {
            logger.info(emit);
        }
    }
}

--------------------------------------------------------------------------------