├── LICENSE ├── README.md └── suffixtree.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 mutux (Pan Du) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ukkonen's Suffix Tree Algorithm in Python Complete Version 2 | A suffix tree algorithm implemented in Python. It might be the most complete version online, even more complete than the one demonstrated on Stack Overflow. 3 | 4 | I underestimated the complexity of the algorithm and just wanted to have some fun. A primitive implementation was done in a couple of hours, and the demonstration example on Stack Overflow works just fine.
Then, when I wanted to try some more complicated examples, I kept hitting the wall time and time again. It annoyed me, and it cost me several days of trying different situations when constructing a suffix tree. 5 | 6 | Finally, this version came out. I think **all the situations** explained in the questions and answers had already been encountered and covered in the algorithm above before I read the full post. 7 | 8 | I also wrote a blog post explaining the implementation details, with flowcharts and explanations, on my blog [MuTuX](http://www.mutux.com/2017/07/19/suffix-tree-implementation-move-all.html "mutux's blog on text mining"). 9 | 10 | 11 | ## Examples 12 | 13 | ``` 14 | docs = ['abcabxabcd', 'dedododeeodoeodooedeeododooodoede$', 'ooooooooo', 'mississippi'] 15 | for text in docs: 16 | tree, pst = build(text, regularize=True) 17 | Node.draw(tree, pst, ed='#') 18 | ``` 19 | 20 | The running results: 21 | ``` 22 | abcabxabcd 23 | ● (0) 24 | | 25 | | ab 26 | +----------------● (4->6) 27 | | | 28 | | | xabcd 29 | | +---------------● (5) 30 | | | 31 | | | c 32 | | +---------------● (9->11) 33 | | | 34 | | | abxabcd 35 | | +---------------● (1) 36 | | | 37 | | | d 38 | | +---------------● (10) 39 | | 40 | | c 41 | +----------------● (13) 42 | | | 43 | | | abxabcd 44 | | +---------------● (3) 45 | | | 46 | | | d 47 | | +---------------● (14) 48 | | 49 | | b 50 | +----------------● (6) 51 | | | 52 | | | xabcd 53 | | +---------------● (7) 54 | | | 55 | | | c 56 | | +---------------● (11->13) 57 | | | 58 | | | abxabcd 59 | | +---------------● (2) 60 | | | 61 | | | d 62 | | +---------------● (12) 63 | | 64 | | xabcd 65 | +----------------● (8) 66 | | 67 | | d 68 | +----------------● (15) 69 | 70 | dedododeeodoeodooedeeododooodoede$ 71 | ● (0) 72 | | 73 | | e 74 | +----------------------------------------● (28) 75 | | | 76 | | | $ 77 | | +---------------------------------------● (71) 78 | | | 79 | | | eodo 80 | | +---------------------------------------● (48->37) 81 | | | 82 
| | | eodooedeeododooodoede$ 83 | | +---------------------------------------● (29) 84 | | | 85 | | | dooodoede$ 86 | | +---------------------------------------● (49) 87 | | | 88 | | | d 89 | | +---------------------------------------● (44->19) 90 | | | 91 | | | ododeeodoeodooedeeododooodoede$ 92 | | +---------------------------------------● (18) 93 | | | 94 | | | e 95 | | +---------------------------------------● (68->26) 96 | | | 97 | | | eododooodoede$ 98 | | +---------------------------------------● (45) 99 | | | 100 | | | $ 101 | | +---------------------------------------● (69) 102 | | | 103 | | | odo 104 | | +---------------------------------------● (37->31) 105 | | | 106 | | | eodooedeeododooodoede$ 107 | | +---------------------------------------● (30) 108 | | | 109 | | | dooodoede$ 110 | | +---------------------------------------● (50) 111 | | | 112 | | | oedeeododooodoede$ 113 | | +---------------------------------------● (38) 114 | | 115 | | d 116 | +----------------------------------------● (19) 117 | | | 118 | | | e 119 | | +---------------------------------------● (26->28) 120 | | | 121 | | | dododeeodoeodooedeeododooodoede$ 122 | | +---------------------------------------● (17) 123 | | | 124 | | | $ 125 | | +---------------------------------------● (70) 126 | | | 127 | | | eodo 128 | | +---------------------------------------● (46->48) 129 | | | 130 | | | eodooedeeododooodoede$ 131 | | +---------------------------------------● (27) 132 | | | 133 | | | dooodoede$ 134 | | +---------------------------------------● (47) 135 | | | 136 | | | o 137 | | +---------------------------------------● (33->35) 138 | | | 139 | | | e 140 | | +---------------------------------------● (64->42) 141 | | | 142 | | | de$ 143 | | +---------------------------------------● (65) 144 | | | 145 | | | odooedeeododooodoede$ 146 | | +---------------------------------------● (34) 147 | | | 148 | | | d 149 | | +---------------------------------------● (22->24) 150 | | | 151 | | | 
eeodoeodooedeeododooodoede$ 152 | | +---------------------------------------● (23) 153 | | | 154 | | | o 155 | | +---------------------------------------● (53->31) 156 | | | 157 | | | deeodoeodooedeeododooodoede$ 158 | | +---------------------------------------● (20) 159 | | | 160 | | | oodoede$ 161 | | +---------------------------------------● (54) 162 | | | 163 | | | o 164 | | +---------------------------------------● (57->59) 165 | | | 166 | | | edeeododooodoede$ 167 | | +---------------------------------------● (40) 168 | | | 169 | | | odoede$ 170 | | +---------------------------------------● (58) 171 | | 172 | | o 173 | +----------------------------------------● (35) 174 | | | 175 | | | e 176 | | +---------------------------------------● (42->28) 177 | | | 178 | | | odooedeeododooodoede$ 179 | | +---------------------------------------● (36) 180 | | | 181 | | | de 182 | | +---------------------------------------● (66->68) 183 | | | 184 | | | eododooodoede$ 185 | | +---------------------------------------● (43) 186 | | | 187 | | | $ 188 | | +---------------------------------------● (67) 189 | | | 190 | | | d 191 | | +---------------------------------------● (24->19) 192 | | | 193 | | | eeodoeodooedeeododooodoede$ 194 | | +---------------------------------------● (25) 195 | | | 196 | | | o 197 | | +---------------------------------------● (31->33) 198 | | | 199 | | | e 200 | | +---------------------------------------● (62->64) 201 | | | 202 | | | de$ 203 | | +---------------------------------------● (63) 204 | | | 205 | | | odooedeeododooodoede$ 206 | | +---------------------------------------● (32) 207 | | | 208 | | | d 209 | | +---------------------------------------● (51->22) 210 | | | 211 | | | eeodoeodooedeeododooodoede$ 212 | | +---------------------------------------● (21) 213 | | | 214 | | | ooodoede$ 215 | | +---------------------------------------● (52) 216 | | | 217 | | | o 218 | | +---------------------------------------● (55->57) 219 | | | 220 | | | 
edeeododooodoede$ 221 | | +---------------------------------------● (39) 222 | | | 223 | | | odoede$ 224 | | +---------------------------------------● (56) 225 | | | 226 | | | o 227 | | +---------------------------------------● (59->35) 228 | | | 229 | | | edeeododooodoede$ 230 | | +---------------------------------------● (41) 231 | | | 232 | | | doede$ 233 | | +---------------------------------------● (61) 234 | | | 235 | | | odoede$ 236 | | +---------------------------------------● (60) 237 | | 238 | | $ 239 | +----------------------------------------● (72) 240 | 241 | ooooooooo$ 242 | ● (0) 243 | | 244 | | o 245 | +----------------● (89) 246 | | | 247 | | | $ 248 | | +---------------● (90) 249 | | | 250 | | | o 251 | | +---------------● (87->89) 252 | | | 253 | | | $ 254 | | +---------------● (88) 255 | | | 256 | | | o 257 | | +---------------● (85->87) 258 | | | 259 | | | $ 260 | | +---------------● (86) 261 | | | 262 | | | o 263 | | +---------------● (83->85) 264 | | | 265 | | | $ 266 | | +---------------● (84) 267 | | | 268 | | | o 269 | | +---------------● (81->83) 270 | | | 271 | | | $ 272 | | +---------------● (82) 273 | | | 274 | | | o 275 | | +---------------● (79->81) 276 | | | 277 | | | $ 278 | | +---------------● (80) 279 | | | 280 | | | o 281 | | +---------------● (77->79) 282 | | | 283 | | | $ 284 | | +---------------● (78) 285 | | | 286 | | | o 287 | | +---------------● (75->77) 288 | | | 289 | | | $ 290 | | +---------------● (76) 291 | | | 292 | | | o$ 293 | | +---------------● (74) 294 | | 295 | | $ 296 | +----------------● (91) 297 | 298 | mississippi$ 299 | ● (0) 300 | | 301 | | i 302 | +------------------● (104) 303 | | | 304 | | | ppi$ 305 | | +-----------------● (105) 306 | | | 307 | | | $ 308 | | +-----------------● (109) 309 | | | 310 | | | ssi 311 | | +-----------------● (98->100) 312 | | | 313 | | | ppi$ 314 | | +-----------------● (99) 315 | | | 316 | | | ssippi$ 317 | | +-----------------● (94) 318 | | 319 | | p 320 | 
# -*- coding: utf-8 -*-
# ---------------------------------------------------------------------------
# Tail of README.md, kept from the original listing: the end of the sample
# output for 'mississippi$', followed by the closing section of the README.
#
#   +------------------● (107)
#   |                  |
#   |                  |   i$
#   |                  +-----------------● (108)
#   |                  |
#   |                  |   pi$
#   |                  +-----------------● (106)
#   |
#   |   s
#   +------------------● (96)
#   |                  |
#   |                  |   i
#   |                  +-----------------● (102->104)
#   |                  |
#   |                  |   ppi$
#   |                  +-----------------● (103)
#   |                  |
#   |                  |   ssippi$
#   |                  +-----------------● (97)
#   |                  |
#   |                  |   si
#   |                  +-----------------● (100->102)
#   |                  |
#   |                  |   ppi$
#   |                  +-----------------● (101)
#   |                  |
#   |                  |   ssippi$
#   |                  +-----------------● (95)
#   |
#   |   mississippi$
#   +------------------● (93)
#   |
#   |   $
#   +------------------● (110)
#
#   ## Finally
#   Have fun!
# ---------------------------------------------------------------------------
# /suffixtree.py
"""Suffix tree construction (Ukkonen-style) with an ASCII-art renderer.

An edge is stored as a 4-tuple ``(anode, start, end, bnode)``:

* ``anode`` -- the node the edge leaves,
* ``start`` / ``end`` -- inclusive indices into the text; the edge label is
  ``chars[start:end + 1]``.  ``end`` may be the marker ``'#'``, meaning the
  edge is open-ended and currently runs up to the position being processed,
* ``bnode`` -- the node the edge enters.

The module runs unchanged on Python 2 and Python 3: every ``print`` call is
given exactly one string argument, so it behaves the same as a statement and
as a function.
"""

# Marker stored as the end index of an open-ended (leaf) edge.
_OPEN_END = '#'


class Node(object):
    """A suffix-tree node.

    Attributes:
        parentkey: ``(parent_node, first_char_of_incoming_edge)`` or ``None``.
        outedges: dict mapping the first character of an outgoing edge to its
            edge tuple, or ``None`` while the node has no outgoing edge.
        suffixlink: the node this node's suffix link points to, or ``None``.
        id: sequence number taken from the class-wide counter ``__num__``.
    """

    # Class-wide id counter.  It is never reset, so ids keep increasing across
    # all trees built in the same process.
    __num__ = -1

    def __init__(self, parentkey, outedges, suffixlink=None):
        self.parentkey = parentkey
        self.outedges = outedges
        self.suffixlink = suffixlink
        Node.__num__ += 1
        self.id = Node.__num__

    def getoutedges(self):
        """Return the outgoing-edge dict, or ``None`` if there are no edges."""
        return self.outedges

    def setoutedge(self, key, edge):
        """Store *edge* under *key*, creating the edge dict on first use.

        *edge* is a 4-tuple ``(anode, label_start_index, label_end_index,
        bnode)``.  It is unpacked in the body because tuple parameters in the
        signature are not valid Python 3.
        """
        anode, label_start_index, label_end_index, bnode = edge
        if self.outedges is None:
            self.outedges = {}
        self.outedges[key] = (anode, label_start_index, label_end_index, bnode)

    def getoutedge(self, key):
        """Return the edge stored under *key*, or ``None`` if there is none.

        A node without any outgoing edge (``outedges is None``) also yields
        ``None`` instead of raising ``TypeError``.
        """
        if self.outedges is None:
            return None
        return self.outedges.get(key)

    def getparentkey(self):
        """Return the ``(parent_node, edge_key)`` pair recorded for this node."""
        return self.parentkey

    # Historical (misspelled) name, kept so existing callers keep working.
    getparenkey = getparentkey

    def setparentkey(self, parentkey):
        """Record the ``(parent_node, edge_key)`` pair for this node."""
        self.parentkey = parentkey

    def getsuffixlink(self):
        """Return the suffix-link target, or ``None``."""
        return self.suffixlink

    def setsuffixlink(self, node):
        """Point this node's suffix link at *node*."""
        self.suffixlink = node

    def getid(self):
        """Return this node's sequence number."""
        return self.id

    @staticmethod
    def __draw__(rnode, chars, v, ed='#'):
        """Print the subtree below *rnode*, which must have outgoing edges.

        Args:
            rnode: node whose outgoing edges are drawn.
            chars: the text the edge indices refer to.
            v: nesting depth of *rnode* (0 for the root); it controls the
                indentation.
            ed: with the default ``'#'`` an open-ended edge is drawn up to the
                end of *chars*.  Any other value replaces the open end index
                and is used in ``chars[s:ed + 1]``, so it has to be an int.
        """
        text_len = len(chars)
        maxlen = text_len + 6  # width of one indentation column

        # At depth 0 edges leading to internal nodes are drawn before edges
        # leading to leaves; at every deeper level leaves come first.
        first, second = [], []
        for edge in rnode.getoutedges().items():
            child_is_leaf = edge[1][3].getoutedges() is None
            if v == 0:
                (second if child_is_leaf else first).append(edge)
            else:
                (first if child_is_leaf else second).append(edge)

        pad = " " * maxlen * v
        for _key, (_parent, s, t, node) in first + second:
            if t == _OPEN_END:
                t = text_len if ed == '#' else ed
            linkid = ''
            if node.getsuffixlink() is not None:
                linkid = '->' + str(node.getsuffixlink().getid())
            label = chars[s:t + 1]
            tag = '● ' + '(' + str(node.getid()) + linkid + ')'

            if v == 0:
                print(pad + '|')
                print(pad + '|' + ' ' * 3 + label)
                print('+' + pad + '-' + '-' * (maxlen - 1) + tag)
            else:
                print('|' + pad + '|')
                print('|' + pad + '|' + ' ' * 3 + label)
                print('|' + pad + '+' + '-' * (maxlen - 1) + tag)
            if node.getoutedges() is not None:
                Node.__draw__(node, chars, v + 1, ed)

    @staticmethod
    def draw(root, chars, ed='#'):
        """Print *chars* followed by an ASCII drawing of the tree at *root*.

        The root is always labelled ``(0)``, whatever its actual id is.
        """
        # One pre-built string reproduces the output of the former statement
        # ``print '\n', chars, '\n● (0)'`` (including the blank after chars).
        print('\n' + chars + ' ' + '\n● (0)')
        Node.__draw__(root, chars, 0, ed)


def _closed_end(end, ind):
    """Return *end*, with the open-end marker replaced by the index *ind*."""
    return ind if end == _OPEN_END else end


def _add_leaf(parent, chars, ind):
    """Hang a new open-ended leaf edge starting at *ind* under *parent*.

    The edge is keyed by ``chars[ind]``.  Returns the new leaf node.
    """
    leaf = Node(None, None, None)
    leaf.setparentkey((parent, chars[ind]))
    parent.setoutedge(chars[ind], (parent, ind, _OPEN_END, leaf))
    return leaf


def build(chars, regularize=False):
    """Build a suffix tree for *chars* one character at a time.

    Args:
        chars: the text to index.
        regularize: when true and suffixes are still pending after the last
            character, a ``'$'`` terminator is appended to the text and
            processed as well.  The terminator is appended at most once; the
            previous behaviour of appending it repeatedly never terminated
            for texts such as ``'$$'``.

    Returns:
        ``(root, chars)`` where *chars* is the text actually indexed, i.e.
        including the terminator if one was appended.
    """
    root = Node(None, None, None)
    actnode = root      # active node
    actkey = ''         # first character of the active edge ('' = on a node)
    actlen = 0          # characters matched along the active edge
    remainder = 0       # suffixes still waiting to be inserted explicitly
    terminated = False  # whether '$' has already been appended
    ind = 0
    while ind < len(chars):
        ch = chars[ind]
        if remainder == 0:
            edges = actnode.getoutedges()
            if edges is not None and ch in edges:
                # ch starts an existing edge: begin walking along it.
                actkey = ch
                actlen = 1
                remainder = 1
                _anode, start, end, bnode = actnode.getoutedge(actkey)
                end = _closed_end(end, ind)
                if end - start + 1 == actlen:
                    # The edge is a single character long: land on its node.
                    actnode = bnode
                    actkey = ''
                    actlen = 0
            else:
                _add_leaf(actnode, chars, ind)
        elif actkey == '' and actlen == 0:
            # The active point sits exactly on a node.
            remainder += 1
            if ch in actnode.getoutedges():
                actkey = ch
                actlen = 1
            else:
                remainder, actnode, actkey, actlen = unfold(
                    root, chars, ind, remainder, actnode, actkey, actlen)
        else:
            # The active point sits inside an edge.
            _anode, start, end, bnode = actnode.getoutedge(actkey)
            end = _closed_end(end, ind)
            compareposition = start + actlen
            mismatch = chars[compareposition] != ch
            remainder += 1
            if mismatch:
                remainder, actnode, actkey, actlen = unfold(
                    root, chars, ind, remainder, actnode, actkey, actlen)
            elif compareposition < end:
                actlen += 1
            else:
                # The match reached the end of the edge: move to its node.
                actnode = bnode
                if compareposition == end:
                    actlen = 0
                    actkey = ''
                else:
                    actlen = 1
                    actkey = ch
        ind += 1
        if ind == len(chars) and remainder > 0 and regularize and not terminated:
            # Growing the text makes the loop run for the terminator as well.
            chars = chars + '$'
            terminated = True
    return root, chars


def unfold(root, chars, ind, remainder, actnode, actkey, actlen):
    """Insert the pending suffixes that end at position *ind*.

    Each round looks at ``remains`` (the last *remainder* characters up to
    and including ``chars[ind]``), repositions the active point with ``hop``
    and ``step``, then adds a leaf on a node or splits an edge.  Internal
    nodes that received a leaf in consecutive rounds are chained with suffix
    links.  The loop stops early when the pending suffix is found to be in
    the tree already.

    Returns:
        ``(remainder, actnode, actkey, actlen)`` -- the updated active point.
    """
    prenode = None  # internal node that got a leaf in an earlier round
    aleaf = None    # most recently created leaf; deliberately kept across rounds
    while remainder > 0:
        remains = chars[ind - remainder + 1:ind + 1]
        actlen_re = len(remains) - 1 - actlen
        actnode, actkey, actlen, actlen_re = hop(
            ind, actnode, actkey, actlen, remains, actlen_re)
        lost, actnode, actkey, actlen, actlen_re = step(
            chars, ind, actnode, actkey, actlen, remains, actlen_re)
        if lost:
            # The rest of the suffix is already present below actnode.
            if actlen == 1 and prenode is not None and actnode is not root:
                prenode.setsuffixlink(actnode)
            return remainder, actnode, actkey, actlen

        if actlen == 0:
            # On a node: add a leaf unless the edge already exists.
            if remains[actlen_re] not in actnode.getoutedges():
                aleaf = _add_leaf(actnode, chars, ind)
        else:
            # Inside an edge: split it where the texts diverge.
            _anode, start, end, bnode = actnode.getoutedge(actkey)
            if remains[actlen_re + actlen] == chars[start + actlen]:
                return remainder, actnode, actkey, actlen
            newnode = Node(None, None, None)
            actnode.setoutedge(
                actkey, (actnode, start, start + actlen - 1, newnode))
            newnode.setparentkey((actnode, actkey))
            newnode.setoutedge(
                chars[start + actlen], (newnode, start + actlen, end, bnode))
            aleaf = _add_leaf(newnode, chars, ind)

        if aleaf is not None:
            leaf_parent = aleaf.getparentkey()[0]
            if leaf_parent is not root:
                if prenode is not None:
                    prenode.setsuffixlink(leaf_parent)
                prenode = leaf_parent

        if actnode is root and remainder > 1:
            # At the root the next, shorter suffix starts one character later.
            actkey = remains[1]
            actlen -= 1
        link = actnode.getsuffixlink()
        actnode = link if link is not None else root
        remainder -= 1
    return remainder, actnode, actkey, actlen


def step(chars, ind, actnode, actkey, actlen, remains, ind_remainder):
    """Check whether the unmatched part of *remains* already lies on an edge.

    The unmatched part is ``remains[ind_remainder:]``.  It is compared with
    the label of the active edge (``actlen > 0``) or, when on a node, with the
    edge that starts with ``remains[ind_remainder]``.

    Returns:
        ``(found, actnode, actkey, actlen, ind_remainder)``.  When *found* is
        true, *actkey* and *actlen* describe the matched prefix of that edge;
        otherwise the arguments are returned unchanged.
    """
    rem_label = remains[ind_remainder:]
    if actlen > 0:
        edge = actnode.getoutedge(actkey)
    elif (ind_remainder < len(remains)
            and remains[ind_remainder] in actnode.getoutedges()):
        edge = actnode.getoutedge(remains[ind_remainder])
    else:
        return False, actnode, actkey, actlen, ind_remainder

    _anode, start, end, _bnode = edge
    end = _closed_end(end, ind)
    edgelabel = chars[start:end + 1]
    if edgelabel.startswith(rem_label):
        return True, actnode, rem_label[0], len(rem_label), ind_remainder
    return False, actnode, actkey, actlen, ind_remainder


def hop(ind, actnode, actkey, actlen, remains, ind_remainder):
    """Walk the active point down while *actlen* covers whole edges.

    While *actlen* exceeds the length of the active edge, the edge is crossed:
    the active node becomes the edge's target, *actlen* shrinks by the edge
    length and *ind_remainder* advances by the same amount.  If *actlen* ends
    up equal to the edge length, the active point is placed on the target
    node (``actkey == ''``, ``actlen == 0``).

    Returns:
        ``(actnode, actkey, actlen, ind_remainder)``.
    """
    if actlen == 0 or actkey == '':
        return actnode, actkey, actlen, ind_remainder

    _anode, start, end, bnode = actnode.getoutedge(actkey)
    edgelength = _closed_end(end, ind) - start + 1
    while actlen > edgelength:
        actnode = bnode
        ind_remainder += edgelength
        actkey = remains[ind_remainder]
        actlen -= edgelength
        _anode, start, end, bnode = actnode.getoutedge(actkey)
        edgelength = _closed_end(end, ind) - start + 1
    if actlen == edgelength:
        actnode = bnode
        actkey = ''
        actlen = 0
        ind_remainder += edgelength
    return actnode, actkey, actlen, ind_remainder


if __name__ == "__main__":
    docs = ['abcabxabcd', 'dedododeeodoeodooedeeododooodoede$', 'ooooooooo', 'mississippi']
    for text in docs:
        tree, pst = build(text, regularize=True)
        Node.draw(tree, pst, ed='#')