├── README.md ├── .gitignore ├── LICENSE └── pyAhocorasick └── pyAhocorasick.py /README.md: -------------------------------------------------------------------------------- 1 | pyAhocorasick 2 | ============= 3 | 4 | a pure python Aho-corasick algorithm implementation 5 | 6 | 以下是中文翻译: 7 | 没别的,就是支持unicode utf-8 中文。其他的可参考其他项目,欢迎中国程序男加入。 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | __pycache__ 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | nosetests.xml 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Tonyzhang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
# -*- encoding=utf-8 -*-
#
# MIT License disclaimer (continued from the repository LICENSE file):
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
'''
Pure-Python Aho-Corasick multi-keyword matcher.

Created on Mar 15, 2014

@author: tonyzhang
'''

from collections import deque

__all__ = ['Ahocorasick', ]


class Node(object):
    '''
    A single state of the keyword trie / matching automaton.
    '''

    def __init__(self):
        # Outgoing transitions: symbol -> child Node.
        self.next = {}
        # Failure link, filled in by Ahocorasick.make(); None on the root.
        self.fail = None
        # True when the path from the root to this node spells a keyword.
        self.isWord = False
        # Number of symbols on the path from the root to this node.
        self.depth = 0
        # Closest node on the failure chain that ends a keyword, or None.
        self.output = None


class Ahocorasick(object):
    '''
    Keyword automaton: add keywords with addWord(), then search() or
    replace() them in a text.
    '''

    def __init__(self):
        self.__root = Node()
        # True when keywords were added since the links were last built.
        self.__dirty = False

    def addWord(self, word):
        '''
        Add a keyword to the trie.

        @param word: the keyword (any sequence of hashable symbols, e.g. a
                     unicode string); an empty keyword is ignored because
                     it would match at every position.
        '''
        if not word:
            return
        node = self.__root
        for symbol in word:
            child = node.next.get(symbol)
            if child is None:
                child = Node()
                child.depth = node.depth + 1
                node.next[symbol] = child
            node = child
        node.isWord = True
        self.__dirty = True

    def make(self):
        '''
        Build the failure and output links of the automaton.

        Nodes are visited breadth-first so that the failure link of every
        shallower node is final before a deeper node walks through it.
        Calling make() again recomputes every link from scratch.
        '''
        root = self.__root
        root.fail = None
        root.output = None

        queue = deque()
        for child in root.next.values():
            # Depth-1 states always fall back to the root.
            child.fail = root
            child.output = None
            queue.append(child)

        while queue:
            node = queue.popleft()
            for symbol, child in node.next.items():
                # Follow the parent's failure chain until a state with a
                # transition on `symbol` is found (or the chain runs out).
                probe = node.fail
                while probe is not None and symbol not in probe.next:
                    probe = probe.fail
                if probe is None:
                    child.fail = root
                else:
                    child.fail = probe.next[symbol]
                # Shortcut to the nearest keyword that is a proper suffix
                # of this state, so search() can list suffix matches.
                if child.fail.isWord:
                    child.output = child.fail
                else:
                    child.output = child.fail.output
                queue.append(child)

        self.__dirty = False

    def search(self, content):
        '''
        Find every occurrence of every keyword in content.

        The links are (re)built automatically when keywords were added
        since the last make() call.

        @param content: the text to scan
        @return: a list of (start, end) tuples with inclusive indexes,
                 ordered by end index; overlapping matches are all reported
        '''
        if self.__dirty:
            self.make()

        root = self.__root
        state = root
        result = []
        for position, symbol in enumerate(content):
            # Fall back along failure links until a transition exists.
            while state is not root and symbol not in state.next:
                state = state.fail
            state = state.next.get(symbol, root)

            # Report the keyword ending in this state (if any) and every
            # shorter keyword that is a suffix of it.
            hit = state if state.isWord else state.output
            while hit is not None:
                result.append((position - hit.depth + 1, position))
                hit = hit.output
        return result

    def replace(self, content):
        '''
        Mask every keyword occurrence in content with '*' characters.

        @param content: the text to filter
        @return: content itself when nothing matches, otherwise a copy in
                 which each matched character is replaced by u'*'
        '''
        matches = self.search(content)
        if not matches:
            return content
        chars = list(content)
        for start, end in matches:
            for index in range(start, end + 1):
                chars[index] = u'*'
        return u''.join(chars)


if __name__ == '__main__':
    ah = Ahocorasick()
    ah.addWord(u'测试')
    ah.addWord(u"我是")
    ah.make()
    print(ah.search(u'测试123我是好人'))