├── README.md ├── SuffixTree ├── __init__.py ├── __init__.pyc ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── modules.cpython-35.pyc │ └── suffixtree.cpython-35.pyc ├── modules.py └── suffixtree.py ├── __pycache__ └── suffixtree.cpython-35.pyc └── test.py /README.md: -------------------------------------------------------------------------------- 1 | This module is an optimized implementation of Ukkonen's suffix tree algorithm in Python, which **will** provide most of the important text-processing functionality, such as: 2 | 3 | 4 | ### Search for strings: 5 | `✓` Check if a string ***P*** of length ***m*** is a substring in ***O(m)*** time. 6 | `✓` Find the first occurrence of the patterns ***P1,... ,Pq*** of total length ***m*** as substrings in ***O(m)*** time. 7 | 8 | `✓` Find all ***z*** occurrences of the patterns ***P1,... ,Pq*** of total length ***m*** as substrings in ***O(m+z)*** time. 9 | - Search for a regular expression ***P*** in expected time sublinear in ***n***. 10 | - For each suffix of a pattern ***P***, find the length of the longest match between a prefix of ***P[i... m]*** and a substring in ***D*** in ![image](https://cloud.githubusercontent.com/assets/5694520/22856327/5881bd04-f0a4-11e6-9d9a-e01fc0c15dd2.png) time. This is termed the matching statistics for ***P***. 11 | 12 | ### Find properties of the strings: 13 | - Find the longest common substrings of the string ***Si*** and ***Sj*** in ![image](https://cloud.githubusercontent.com/assets/5694520/22856331/72a43c66-f0a4-11e6-8f06-4c8ea987c79c.png) time. 14 | - Find all maximal pairs, maximal repeats or supermaximal repeats in ![image](https://cloud.githubusercontent.com/assets/5694520/22856334/861ff74e-f0a4-11e6-9ff7-9629c4d1d69b.png) time. 
15 | - Find the Lempel–Ziv decomposition in ![image](https://cloud.githubusercontent.com/assets/5694520/22856287/8bdbe630-f0a3-11e6-8611-de6c0a40932c.png) time.[10] 16 | - Find the longest repeated substrings in ![image](https://cloud.githubusercontent.com/assets/5694520/22856287/8bdbe630-f0a3-11e6-8611-de6c0a40932c.png) time. 17 | - Find the most frequently occurring substrings of a minimum length in ![image](https://cloud.githubusercontent.com/assets/5694520/22856287/8bdbe630-f0a3-11e6-8611-de6c0a40932c.png) time. 18 | - Find the shortest strings from ![image](https://cloud.githubusercontent.com/assets/5694520/22856282/7e4d4fe0-f0a3-11e6-915e-1c9dfcd679bf.png) that do not occur in ***D*** in ***O(n+z)*** time, if there are ***z*** such strings. 19 | - Find the shortest substrings occurring only once in ![image](https://cloud.githubusercontent.com/assets/5694520/22856287/8bdbe630-f0a3-11e6-8611-de6c0a40932c.png) time. 20 | - Find, for each ***i*** the shortest substrings of ***Si*** not occurring elsewhere in ***D*** in ![image](https://cloud.githubusercontent.com/assets/5694520/22856287/8bdbe630-f0a3-11e6-8611-de6c0a40932c.png) time. 21 | 22 | ## sources: 23 | 24 | - http://web.stanford.edu/~mjkay/gusfield.pdf 25 | - On–line construction of suffix trees. 
Esko Ukkonen 26 | - http://www.geeksforgeeks.org/ukkonens-suffix-tree-construction-part-6/ 27 | -------------------------------------------------------------------------------- /SuffixTree/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kasravnd/SuffixTree/1aca5a6eed0ce7c8f29a29f257cbab53c29b7b14/SuffixTree/__init__.py -------------------------------------------------------------------------------- /SuffixTree/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kasravnd/SuffixTree/1aca5a6eed0ce7c8f29a29f257cbab53c29b7b14/SuffixTree/__init__.pyc -------------------------------------------------------------------------------- /SuffixTree/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kasravnd/SuffixTree/1aca5a6eed0ce7c8f29a29f257cbab53c29b7b14/SuffixTree/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /SuffixTree/__pycache__/modules.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kasravnd/SuffixTree/1aca5a6eed0ce7c8f29a29f257cbab53c29b7b14/SuffixTree/__pycache__/modules.cpython-35.pyc -------------------------------------------------------------------------------- /SuffixTree/__pycache__/suffixtree.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kasravnd/SuffixTree/1aca5a6eed0ce7c8f29a29f257cbab53c29b7b14/SuffixTree/__pycache__/suffixtree.cpython-35.pyc -------------------------------------------------------------------------------- /SuffixTree/modules.py: -------------------------------------------------------------------------------- 1 | class Base: 2 | def __init__(self, 
class Base:
    """Common state shared by the suffix-tree query helpers.

    Args:
        tree: A suffix tree object exposing ``_string`` (the indexed text)
            and ``root`` (the root node, or ``None`` when the tree has not
            been built yet).
    """

    def __init__(self, tree):
        self.tree = tree
        self.main_string = tree._string
        self.root = tree.root


class CheckSubString(Base):
    """Look up a pattern in a suffix tree.

    Args:
        tree: The suffix tree to search (see ``Base``).
        sub_string: The pattern to look for.
        findall: When true, ``check`` returns a list with the start index of
            every occurrence found instead of a single index.
    """

    def __init__(self, tree, sub_string, findall=False):
        super(CheckSubString, self).__init__(tree)
        self.sub_string = sub_string
        self.latest_index = 0
        self.findall = findall
        self.continue_flag = False
        self.sub_length = len(sub_string)

    def traverse(self, node, sub_string):
        """Match ``sub_string`` along the edges below ``node``.

        ``sub_string`` is expected to be the still-unmatched tail of
        ``self.sub_string``; the characters already matched are taken to be
        ``self.sub_length - len(sub_string)``.

        Returns:
            ``-1`` when the pattern does not occur.  Otherwise the start
            index of an occurrence in the main string, or, when
            ``self.findall`` is set, the list built by ``find_all_match``.
        """
        total = len(sub_string)
        # Characters of the full pattern consumed before this call.
        matched_before = self.sub_length - total
        offset = 0

        while offset < total:
            # Each child edge starts with a unique character, so the only
            # candidate edge is the one keyed by the next pattern character.
            child = node.children.get(sub_string[offset])
            if child is None:
                # No edge at this level starts with the next character.
                return -1

            start, end = child.start, child.end
            edge_length = end - start + 1
            remaining = total - offset
            span = min(edge_length, remaining)

            # Verify the characters on the edge instead of trusting only its
            # first character; skipping this check yields false positives.
            piece = sub_string[offset:offset + span]
            if not self.main_string.startswith(piece, start):
                return -1

            if remaining <= edge_length:
                # The pattern ends on (or exactly at the end of) this edge.
                if self.findall:
                    return self.find_all_match(child, remaining)
                return start - (matched_before + offset)

            # The whole edge matched; continue below it with the rest of
            # the pattern.
            node = child
            offset += edge_length

        # Only reached when called with an empty ``sub_string``: nothing is
        # left to match, so the pattern is taken to end exactly at ``node``.
        if self.findall:
            if node.leaf:
                return [node.start - self.sub_length]
            return list(self._leaf_positions(node, self.sub_length))
        return node.start - self.sub_length

    def check(self):
        """Search the tree for ``self.sub_string``.

        Returns:
            ``-1`` if the tree has no root, the pattern is not a ``str``, or
            the pattern does not occur; ``0`` for the empty pattern (every
            string starts with the empty string); otherwise the result of
            ``traverse`` started at the root.
        """
        if self.root is None:
            return -1
        if not isinstance(self.sub_string, str):
            return -1
        if not self.sub_string:
            # Every string starts with an empty string
            return 0

        return self.traverse(self.root, self.sub_string)

    def find_all_match(self, node, sub_length):
        """Return the start index of every occurrence found below ``node``.

        Args:
            node: The node whose incoming edge holds the end of the pattern.
            sub_length: Number of pattern characters that were still
                unmatched when the search entered ``node``'s edge.

        Returns:
            A list of start indices, one per leaf at or below ``node``.

        NOTE: occurrences are read off the leaves, so a suffix that does not
        end in a leaf is not reported; indexing text that ends with a unique
        terminator (the sample script uses ``'$'``) avoids that.
        """
        matched_before = self.sub_length - sub_length
        if node.leaf:
            return [node.start - matched_before]
        # String depth of ``node`` is everything matched before its edge
        # plus the full edge, which may be longer than the pattern's tail.
        depth = matched_before + (node.end - node.start + 1)
        return list(self._leaf_positions(node, depth))

    def _leaf_positions(self, node, depth):
        """Yield ``leaf.start - depth_of_parent`` for each leaf below ``node``.

        ``depth`` is the number of characters on the path from the root to
        the end of ``node``'s edge.  Uses an explicit stack so deep trees do
        not hit the recursion limit; children are visited in dict order.
        """
        stack = [(child, depth)
                 for child in reversed(list(node.children.values()))]
        while stack:
            child, parent_depth = stack.pop()
            if child.leaf:
                yield child.start - parent_depth
            else:
                child_depth = parent_depth + (child.end - child.start + 1)
                stack.extend(
                    (grandchild, child_depth)
                    for grandchild in reversed(list(child.children.values()))
                )
# Legacy module-level mirror of the most recent phase position.  Every tree
# now tracks its own ``leafEnd``; this global is only kept up to date for
# code that still reads it and as a fallback for leaves created without an
# owning tree.
leafEnd = -1


class Node:
    """The Suffix-tree's node.

    Args:
        leaf: True when the node is a leaf.
        owner: Optional tree that created the node.  A leaf reports its
            ``end`` as ``owner.leafEnd`` so that all leaves of one tree grow
            together without affecting leaves of other trees.
    """

    # Nodes compare by value and are mutable, so they stay unhashable.
    __hash__ = None

    def __init__(self, leaf, owner=None):
        self._owner = owner
        self.children = {}
        self.leaf = leaf
        # For leaf nodes, the index of the suffix spelled by the path from
        # the root to this leaf.
        self.suffixIndex = None
        self.start = None
        self.end = None
        self.suffixLink = None

    @property
    def end(self):
        """Last index of the edge label; leaves follow the tree's leafEnd."""
        if self.leaf:
            owner = self._owner
            return leafEnd if owner is None else owner.leafEnd
        return self._end

    @end.setter
    def end(self, value):
        self._end = value

    def _key(self):
        return (self.start, self.end, self.suffixIndex)

    def __eq__(self, node):
        if not isinstance(node, Node):
            # Lets comparisons such as ``node == None`` evaluate to False
            # instead of raising AttributeError.
            return NotImplemented
        return self._key() == node._key()

    def __ne__(self, node):
        if not isinstance(node, Node):
            return NotImplemented
        return self._key() != node._key()


class SuffixTree:
    """Suffix tree built with Ukkonen's on-line algorithm.

    Args:
        data: The text to index.  Call ``build_suffix_tree`` to construct
            the tree before querying it.
    """

    def __init__(self, data):
        """Initiate the tree."""
        self._string = data
        self.lastNewNode = None
        self.activeNode = None
        # activeEdge is represented as an input string character index
        # (not the character itself).
        self.activeEdge = -1
        self.activeLength = 0
        # remainingSuffixCount tells how many suffixes are yet to be added
        # to the tree.
        self.remainingSuffixCount = 0
        self.rootEnd = None
        self.splitEnd = None
        self.size = -1  # Length of input string
        self.root = None
        # End index shared by every leaf edge of THIS tree.
        self.leafEnd = -1

    def edge_length(self, node):
        """Return the number of characters on the edge leading to ``node``."""
        return node.end - node.start + 1

    def walk_down(self, current_node):
        """Walk down from current node.

        activePoint change for walk down (APCFWD) using the Skip/Count
        trick (Trick 1).  If activeLength is at least the length of the edge
        to ``current_node``, make that node the activeNode and adjust
        activeEdge and activeLength so they describe the same activePoint.

        Returns:
            True when the active point was moved, False otherwise.
        """
        length = self.edge_length(current_node)
        if self.activeLength >= length:
            self.activeEdge += length
            self.activeLength -= length
            self.activeNode = current_node
            return True
        return False

    def new_node(self, start, end=None, leaf=False):
        """Create a node whose incoming edge is ``_string[start:end + 1]``.

        The suffix link points to the current root (``None`` while the root
        itself is being created) and may be changed by a later extension.
        ``suffixIndex`` starts at -1; leaves receive their real suffix index
        once ``build_suffix_tree`` has processed every phase.
        """
        node = Node(leaf, self)
        node.suffixLink = self.root
        node.start = start
        node.end = end
        node.suffixIndex = -1
        return node

    def extend_suffix_tree(self, pos):
        """Run phase ``pos``: add character ``_string[pos]`` to the tree."""
        global leafEnd
        # Extension Rule 1: extend every leaf created so far.  The position
        # is stored per tree so that several trees can coexist; the module
        # global is only mirrored for backward compatibility.
        self.leafEnd = pos
        leafEnd = pos
        # A new suffix joins the list of suffixes yet to be added.
        self.remainingSuffixCount += 1
        # No internal node is waiting for its suffix link at phase start.
        self.lastNewNode = None
        text = self._string

        # Add all suffixes (yet to be added) one by one in tree
        while self.remainingSuffixCount > 0:
            if self.activeLength == 0:
                self.activeEdge = pos  # APCFALZ
            edge_char = text[self.activeEdge]
            _next = self.activeNode.children.get(edge_char)

            if _next is None:
                # No outgoing edge starts with activeEdge.
                # Extension Rule 2 (a new leaf edge gets created).
                self.activeNode.children[edge_char] = self.new_node(
                    pos, leaf=True)
                # An internal node waiting for its suffix link now points
                # to the current activeNode.
                if self.lastNewNode is not None:
                    self.lastNewNode.suffixLink = self.activeNode
                    self.lastNewNode = None
            else:
                # There is an outgoing edge starting with activeEdge.
                if self.walk_down(_next):
                    # Start again from _next (the new activeNode).
                    continue
                # Extension Rule 3 (current character is already on the
                # edge).
                if text[_next.start + self.activeLength] == text[pos]:
                    # Root is always created first, so identity is enough
                    # to recognise it.
                    if (self.lastNewNode is not None
                            and self.activeNode is not self.root):
                        self.lastNewNode.suffixLink = self.activeNode
                        self.lastNewNode = None
                    # APCFER3
                    self.activeLength += 1
                    # STOP all further processing in this phase.
                    break
                # The activePoint is in the middle of the edge and the
                # current character is not on it: split the edge.  This is
                # Extension Rule 2 with a new internal node and a new leaf.
                self.splitEnd = _next.start + self.activeLength - 1
                split = self.new_node(_next.start, self.splitEnd)
                self.activeNode.children[edge_char] = split
                # New leaf coming out of the new internal node.
                split.children[text[pos]] = self.new_node(pos, leaf=True)
                _next.start += self.activeLength
                split.children[text[_next.start]] = _next
                # An internal node created earlier in this phase now links
                # to the newly created internal node.
                if self.lastNewNode is not None:
                    self.lastNewNode.suffixLink = split
                # The new internal node waits for its own suffix link
                # (currently pointing at the root).
                self.lastNewNode = split

            # One suffix got added in tree, decrement the count of suffixes
            # yet to be added.
            self.remainingSuffixCount -= 1
            if self.activeNode is self.root and self.activeLength > 0:
                # APCFER2C1
                self.activeLength -= 1
                self.activeEdge = pos - self.remainingSuffixCount + 1
            elif self.activeNode is not self.root:
                # APCFER2C2
                self.activeNode = self.activeNode.suffixLink

    def walk_dfs(self, current):
        """Yield the edge label of ``current`` and of every node below it.

        Labels come out in depth-first pre-order, children in dict order.
        An explicit stack is used so deep trees do not hit the recursion
        limit.
        """
        stack = [current]
        while stack:
            node = stack.pop()
            yield self._string[node.start: node.end + 1]
            stack.extend(reversed(list(node.children.values())))

    def _set_suffix_indices(self):
        """Store on every leaf the start index of the suffix it spells."""
        stack = [(self.root, 0)]
        while stack:
            node, depth = stack.pop()
            for child in node.children.values():
                if child.leaf:
                    # A leaf edge starts ``depth`` characters into its
                    # suffix.
                    child.suffixIndex = child.start - depth
                else:
                    stack.append((child, depth + self.edge_length(child)))

    def build_suffix_tree(self):
        """Build the tree for the stored string.

        All construction state is reset first, so calling this method again
        rebuilds the tree from scratch instead of corrupting it.
        """
        self.size = len(self._string)
        self.lastNewNode = None
        self.activeEdge = -1
        self.activeLength = 0
        self.remainingSuffixCount = 0
        self.splitEnd = None
        self.leafEnd = -1

        # Root is a special node with start and end indices as -1, as it
        # has no parent from where an edge comes to root.  Clearing
        # self.root first gives the new root a None suffix link.
        self.rootEnd = -1
        self.root = None
        self.root = self.new_node(-1, self.rootEnd)
        self.activeNode = self.root  # First activeNode will be root
        for i in range(self.size):
            self.extend_suffix_tree(i)
        self._set_suffix_indices()

    def __str__(self):
        """Return the edge labels in DFS order, one per line.

        Matches what ``print_dfs`` prints; empty when the tree is unbuilt.
        """
        if self.root is None:
            return ""
        return "\n".join(map(str, self.walk_dfs(self.root)))

    def print_dfs(self):
        """Print every edge label in DFS order, one per line."""
        for sub in self.walk_dfs(self.root):
            print(sub)
from SuffixTree.modules import CheckSubString

# Demo script: build a suffix tree over a sample string and query it.
# The trailing '$' ends the sample text with a character that occurs nowhere
# else in it.
s = "abcabxabcd$"
tree = SuffixTree(s)
tree.build_suffix_tree()
# With findall=True, check() returns a list of start indices rather than a
# single index ('abx' occurs once in s, at index 3).
a = CheckSubString(tree, 'abx', findall=True)
print(a.check())

# tree.print_dfs()

# NOTE(review): the string below looks like a saved tree dump (edge label
# followed by a bracketed number, presumably the suffix index). It is not
# what this script prints as written: print_dfs() is commented out above
# and the only print call shows the result of check(). Confirm whether it
# should be kept.
"""
output:

$ [10]
ab [-1]

c [-1]

abxabcd$ [0]
d$ [6]
xabcd$ [3]
b [-1]

c [-1]

abxabcd$ [1]
d$ [7]
xabcd$ [4]
c [-1]

abxabcd$ [2]
d$ [8]
d$ [9]
xabcd$ [5]
"""