├── LICENSE
├── README.md
├── suffix_tree.py
├── test.txt
└── test_suffix_tree.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2012 Ken Van Haren
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining
 4 | a copy of this software and associated documentation files (the
 5 | "Software"), to deal in the Software without restriction, including
 6 | without limitation the rights to use, copy, modify, merge, publish,
 7 | distribute, sublicense, and/or sell copies of the Software, and to
 8 | permit persons to whom the Software is furnished to do so, subject to
 9 | the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Suffix Trees in Python
 2 | ================================
 3 | 
 4 | Based off of Mark Nelson's C++ implementation of Ukkonen's algorithm. Ukkonen's
 5 | algorithm gives a O(n) + O(k) contruction time for a suffix tree, where n is 
 6 | the length of the string and k is the size of the alphabet of that string. 
 7 | Ukkonen's is an online algorithm, processing the input sequentially and producing 
 8 | a valid suffix tree at each character.
 9 | 
10 | How to use
11 | ----------
12 | 
13 | 	string = "I need to be searched!"
14 |     tree = SuffixTree(string)
15 | 	index_of_need = tree.find_substring("need")
16 | 
17 | Usage note
18 | ----------
19 | 
20 | This library is mostly an academic exercise. 
21 | If you need an efficient library
22 | I would recommend a python-wrapped c implementation, 
23 | such as [this one](http://www.daimi.au.dk/~mailund/suffix_tree.html).
24 | 


--------------------------------------------------------------------------------
/suffix_tree.py:
--------------------------------------------------------------------------------
  1 | class Node(object):
  2 |     """A node in the suffix tree. 
  3 |     
  4 |     suffix_node
  5 |         the index of a node with a matching suffix, representing a suffix link.
  6 |         -1 indicates this node has no suffix link.
  7 |     """
  8 |     def __init__(self):
  9 |         self.suffix_node = -1   
 10 | 
 11 |     def __repr__(self):
 12 |         return "Node(suffix link: %d)"%self.suffix_node
 13 | 
 14 | class Edge(object):
 15 |     """An edge in the suffix tree.
 16 |     
 17 |     first_char_index
 18 |         index of start of string part represented by this edge
 19 |         
 20 |     last_char_index
 21 |         index of end of string part represented by this edge
 22 |         
 23 |     source_node_index
 24 |         index of source node of edge
 25 |     
 26 |     dest_node_index
 27 |         index of destination node of edge
 28 |     """
 29 |     def __init__(self, first_char_index, last_char_index, source_node_index, dest_node_index):
 30 |         self.first_char_index = first_char_index
 31 |         self.last_char_index = last_char_index
 32 |         self.source_node_index = source_node_index
 33 |         self.dest_node_index = dest_node_index
 34 |         
 35 |     @property
 36 |     def length(self):
 37 |         return self.last_char_index - self.first_char_index
 38 | 
 39 |     def __repr__(self):
 40 |         return 'Edge(%d, %d, %d, %d)'% (self.source_node_index, self.dest_node_index 
 41 |                                         ,self.first_char_index, self.last_char_index )
 42 | 
 43 | 
 44 | class Suffix(object):
 45 |     """Represents a suffix from first_char_index to last_char_index.
 46 |     
 47 |     source_node_index
 48 |         index of node where this suffix starts
 49 |     
 50 |     first_char_index
 51 |         index of start of suffix in string
 52 |         
 53 |     last_char_index
 54 |         index of end of suffix in string
 55 |     """
 56 |     def __init__(self, source_node_index, first_char_index, last_char_index):
 57 |         self.source_node_index = source_node_index
 58 |         self.first_char_index = first_char_index
 59 |         self.last_char_index = last_char_index
 60 |         
 61 |     @property
 62 |     def length(self):
 63 |         return self.last_char_index - self.first_char_index
 64 |                 
 65 |     def explicit(self):
 66 |         """A suffix is explicit if it ends on a node. first_char_index
 67 |         is set greater than last_char_index to indicate this.
 68 |         """
 69 |         return self.first_char_index > self.last_char_index
 70 |     
 71 |     def implicit(self):
 72 |         return self.last_char_index >= self.first_char_index
 73 | 
 74 |         
 75 | class SuffixTree(object):
 76 |     """A suffix tree for string matching. Uses Ukkonen's algorithm
 77 |     for construction.
 78 |     """
 79 |     def __init__(self, string, case_insensitive=False):
 80 |         """
 81 |         string
 82 |             the string for which to construct a suffix tree
 83 |         """
 84 |         self.string = string
 85 |         self.case_insensitive = case_insensitive
 86 |         self.N = len(string) - 1
 87 |         self.nodes = [Node()]
 88 |         self.edges = {}
 89 |         self.active = Suffix(0, 0, -1)
 90 |         if self.case_insensitive:
 91 |             self.string = self.string.lower()
 92 |         for i in range(len(string)):
 93 |             self._add_prefix(i)
 94 |     
 95 |     def __repr__(self):
 96 |         """ 
 97 |         Lists edges in the suffix tree
 98 |         """
 99 |         curr_index = self.N
100 |         s = "\tStart \tEnd \tSuf \tFirst \tLast \tString\n"
101 |         values = list(self.edges.values())
102 |         values.sort(key=lambda x: x.source_node_index)
103 |         for edge in values:
104 |             if edge.source_node_index == -1:
105 |                 continue
106 |             s += "\t%s \t%s \t%s \t%s \t%s \t"%(edge.source_node_index
107 |                     ,edge.dest_node_index 
108 |                     ,self.nodes[edge.dest_node_index].suffix_node 
109 |                     ,edge.first_char_index
110 |                     ,edge.last_char_index)
111 |                     
112 |             
113 |             top = min(curr_index, edge.last_char_index)
114 |             s += self.string[edge.first_char_index:top+1] + "\n"
115 |         return s
116 |             
117 |     def _add_prefix(self, last_char_index):
118 |         """The core construction method.
119 |         """
120 |         last_parent_node = -1
121 |         while True:
122 |             parent_node = self.active.source_node_index
123 |             if self.active.explicit():
124 |                 if (self.active.source_node_index, self.string[last_char_index]) in self.edges:
125 |                     # prefix is already in tree
126 |                     break
127 |             else:
128 |                 e = self.edges[self.active.source_node_index, self.string[self.active.first_char_index]]
129 |                 if self.string[e.first_char_index + self.active.length + 1] == self.string[last_char_index]:
130 |                     # prefix is already in tree
131 |                     break
132 |                 parent_node = self._split_edge(e, self.active)
133 |         
134 | 
135 |             self.nodes.append(Node())
136 |             e = Edge(last_char_index, self.N, parent_node, len(self.nodes) - 1)
137 |             self._insert_edge(e)
138 |             
139 |             if last_parent_node > 0:
140 |                 self.nodes[last_parent_node].suffix_node = parent_node
141 |             last_parent_node = parent_node
142 |             
143 |             if self.active.source_node_index == 0:
144 |                 self.active.first_char_index += 1
145 |             else:
146 |                 self.active.source_node_index = self.nodes[self.active.source_node_index].suffix_node
147 |             self._canonize_suffix(self.active)
148 |         if last_parent_node > 0:
149 |             self.nodes[last_parent_node].suffix_node = parent_node
150 |         self.active.last_char_index += 1
151 |         self._canonize_suffix(self.active)
152 |         
153 |     def _insert_edge(self, edge):
154 |         self.edges[(edge.source_node_index, self.string[edge.first_char_index])] = edge
155 |         
156 |     def _remove_edge(self, edge):
157 |         self.edges.pop((edge.source_node_index, self.string[edge.first_char_index]))
158 |         
159 |     def _split_edge(self, edge, suffix):
160 |         self.nodes.append(Node())
161 |         e = Edge(edge.first_char_index, edge.first_char_index + suffix.length, suffix.source_node_index, len(self.nodes) - 1)
162 |         self._remove_edge(edge)
163 |         self._insert_edge(e)
164 |         self.nodes[e.dest_node_index].suffix_node = suffix.source_node_index  ### need to add node for each edge
165 |         edge.first_char_index += suffix.length + 1
166 |         edge.source_node_index = e.dest_node_index
167 |         self._insert_edge(edge)
168 |         return e.dest_node_index
169 | 
170 |     def _canonize_suffix(self, suffix):
171 |         """This canonizes the suffix, walking along its suffix string until it 
172 |         is explicit or there are no more matched nodes.
173 |         """
174 |         if not suffix.explicit():
175 |             e = self.edges[suffix.source_node_index, self.string[suffix.first_char_index]]
176 |             if e.length <= suffix.length:
177 |                 suffix.first_char_index += e.length + 1
178 |                 suffix.source_node_index = e.dest_node_index
179 |                 self._canonize_suffix(suffix)
180 |  
181 | 
182 |     # Public methods
183 |     def find_substring(self, substring):
184 |         """Returns the index of substring in string or -1 if it
185 |         is not found.
186 |         """
187 |         if not substring:
188 |             return -1
189 |         if self.case_insensitive:
190 |             substring = substring.lower()
191 |         curr_node = 0
192 |         i = 0
193 |         while i < len(substring):
194 |             edge = self.edges.get((curr_node, substring[i]))
195 |             if not edge:
196 |                 return -1
197 |             ln = min(edge.length + 1, len(substring) - i)
198 |             if substring[i:i + ln] != self.string[edge.first_char_index:edge.first_char_index + ln]:
199 |                 return -1
200 |             i += edge.length + 1
201 |             curr_node = edge.dest_node_index
202 |         return edge.first_char_index - len(substring) + ln
203 |         
204 |     def has_substring(self, substring):
205 |         return self.find_substring(substring) != -1
206 | 
207 |         
208 | 


--------------------------------------------------------------------------------
/test.txt:
--------------------------------------------------------------------------------
  1 | In computer science, a suffix tree (also called PAT tree or, in an earlier form, position tree) is a data structure that presents the suffixes of a given string in a way that allows for a particularly fast implementation of many important string operations.
  2 | The suffix tree for a string S is a tree whose edges are labeled with strings, such that each suffix of S corresponds to exactly one path from the tree's root to a leaf. It is thus a radix tree (more specifically, a Patricia tree) for the suffixes of S.
  3 | Constructing such a tree for the string S takes time and space linear in the length of S. Once constructed, several operations can be performed quickly, for instance locating a substring in S, locating a substring if a certain number of mistakes are allowed, locating matches for a regular expression pattern etc. Suffix trees also provided one of the first linear-time solutions for the longest common substring problem. These speedups come at a cost: storing a string's suffix tree typically requires significantly more space than storing the string itself.
  4 | Contents [hide]
  5 | 1 History
  6 | 2 Definition
  7 | 3 Generalised suffix tree
  8 | 4 Functionality
  9 | 5 Applications
 10 | 6 Implementation
 11 | 7 External construction
 12 | 8 See also
 13 | 9 References
 14 | 10 External links
 15 | [edit]History
 16 | 
 17 | The concept was first introduced as a position tree by Weiner in 1973,[1] which Donald Knuth subsequently characterized as "Algorithm of the Year 1973". The construction was greatly simplified by McCreight in 1976 [2] , and also by Ukkonen in 1995.[3][4] Ukkonen provided the first linear-time online-construction of suffix trees, now known as Ukkonen's algorithm. These algorithms are all linear-time for constant-size alphabet, and have worst-case running time of O(nlogn) in general.
 18 | In 1997, Farach[5] gave the first suffix tree construction algorithm that is optimal for all alphabets. In particular, this is the first linear-time algorithm for strings drawn from an alphabet of integers in a polynomial range. This latter algorithm has become the basis for new algorithms for constructing both suffix trees and suffix arrays, for example, in external memory, compressed, succinct, etc.
 19 | [edit]Definition
 20 | 
 21 | The suffix tree for the string S of length n is defined as a tree such that ([6] page 90):
 22 | the paths from the root to the leaves have a one-to-one relationship with the suffixes of S,
 23 | edges spell non-empty strings,
 24 | and all internal nodes (except perhaps the root) have at least two children.
 25 | Since such a tree does not exist for all strings, S is padded with a terminal symbol not seen in the string (usually denoted $). This ensures that no suffix is a prefix of another, and that there will be n leaf nodes, one for each of the n suffixes of S. Since all internal non-root nodes are branching, there can be at most n −  1 such nodes, and n + (n − 1) + 1 = 2n nodes in total (n leaves, n − 1 internal nodes, 1 root).
 26 | Suffix links are a key feature for older linear-time construction algorithms, although most newer algorithms, which are based on Farach's algorithm, dispense with suffix links. In a complete suffix tree, all internal non-root nodes have a suffix link to another internal node. If the path from the root to a node spells the string χα, where χ is a single character and α is a string (possibly empty), it has a suffix link to the internal node representing α. See for example the suffix link from the node for ANA to the node for NA in the figure above. Suffix links are also used in some algorithms running on the tree.
 27 | 
 28 | A suffix tree for a string S of length n can be built in Θ(n) time, if the letters come from an alphabet of integers in a polynomial range (in particular, this is true for constant-sized alphabets).[5] For larger alphabets, the running time is dominated by first sorting the letters to bring them into a range of size O(n); in general, this takes O(nlogn) time. The costs below are given under the assumption that the alphabet is constant.
 29 | Assume that a suffix tree has been built for the string S of length n, or that a generalised suffix tree has been built for the set of strings  of total length . You can:
 30 | Search for strings:
 31 | Check if a string P of length m is a substring in O(m) time ([6] page 92).
 32 | Find the first occurrence of the patterns  of total length m as substrings in O(m) time.
 33 | Find all z occurrences of the patterns  of total length m as substrings in O(m + z) time ([6] page 123).
 34 | Search for a regular expression P in time expected sublinear in n ([7]).
 35 | Find for each suffix of a pattern P, the length of the longest match between a prefix of  and a substring in D in Θ(m) time ([6] page 132). This is termed the matching statistics for P.
 36 | Find properties of the strings:
 37 | Find the longest common substrings of the string Si and Sj in Θ(ni + nj) time ([6] page 125).
 38 | Find all maximal pairs, maximal repeats or supermaximal repeats in Θ(n + z) time ([6] page 144).
 39 | Find the Lempel–Ziv decomposition in Θ(n) time ([6] page 166).
 40 | Find the longest repeated substrings in Θ(n) time.
 41 | Find the most frequently occurring substrings of a minimum length in Θ(n) time.
 42 | Find the shortest strings from Σ that do not occur in D, in O(n + z) time, if there are z such strings.
 43 | Find the shortest substrings occurring only once in Θ(n) time.
 44 | Find, for each i, the shortest substrings of Si not occurring elsewhere in D in Θ(n) time.
 45 | The suffix tree can be prepared for constant time lowest common ancestor retrieval between nodes in Θ(n) time ([6] chapter 8). You can then also:
 46 | Find the longest common prefix between the suffixes Si[p..ni] and Sj[q..nj] in Θ(1) ([6] page 196).
 47 | Search for a pattern P of length m with at most k mismatches in O(kn + z) time, where z is the number of hits ([6] page 200).
 48 | Find all z maximal palindromes in Θ(n)([6] page 198), or Θ(gn) time if gaps of length g are allowed, or Θ(kn) if k mismatches are allowed ([6] page 201).
 49 | Find all z tandem repeats in O(nlogn + z), and k-mismatch tandem repeats in O(knlog(n / k) + z) ([6] page 204).
 50 | Find the longest substrings common to at least k strings in D for  in Θ(n) time ([6] page 205).
 51 | [edit]Applications
 52 | 
 53 | Suffix trees can be used to solve a large number of string problems that occur in text-editing, free-text search, computational biology and other application areas.[8] Primary applications include:[8]
 54 | String search, in O(m) complexity, where m is the length of the sub-string (but with initial O(n) time required to build the suffix tree for the string)
 55 | Finding the longest repeated substring
 56 | Finding the longest common substring
 57 | Finding the longest palindrome in a string
 58 | Suffix trees are often used in bioinformatics applications, searching for patterns in DNA or protein sequences (which can be viewed as long strings of characters). The ability to search efficiently with mismatches might be considered their greatest strength. Suffix trees are also used in data compression; they can be used to find repeated data, and can be used for the sorting stage of the Burrows–Wheeler transform. Variants of the LZW compression schemes use suffix trees (LZSS). A suffix tree is also used in suffix tree clustering, a data clustering algorithm used in some search engines (first introduced in [9]).
 59 | [edit]Implementation
 60 | 
 61 | If each node and edge can be represented in Θ(1) space, the entire tree can be represented in Θ(n) space. The total length of all the strings on all of the edges in the tree is O(n2), but each edge can be stored as the position and length of a substring of S, giving a total space usage of Θ(n) computer words. The worst-case space usage of a suffix tree is seen with a fibonacci word, giving the full 2n nodes.
 62 | An important choice when making a suffix tree implementation is the parent-child relationships between nodes. The most common is using linked lists called sibling lists. Each node has a pointer to its first child, and to the next node in the child list it is a part of. Hash maps, sorted/unsorted arrays (with array doubling), and balanced search trees may also be used, giving different running time properties. We are interested in:
 63 | The cost of finding the child on a given character.
 64 | The cost of inserting a child.
 65 | The cost of enlisting all children of a node (divided by the number of children in the table below).
 66 | Let σ be the size of the alphabet. Then you have the following costs:
 67 | Lookup	Insertion	Traversal
 68 | Sibling lists / unsorted arrays	O(σ)	Θ(1)	Θ(1)
 69 | Hash maps	Θ(1)	Θ(1)	O(σ)
 70 | Balanced search tree	O(logσ)	O(logσ)	O(1)
 71 | Sorted arrays	O(logσ)	O(σ)	O(1)
 72 | Hash maps + sibling lists	O(1)	O(1)	O(1)
 73 | Note that the insertion cost is amortised, and that the costs for hashing are given perfect hashing.
 74 | The large amount of information in each edge and node makes the suffix tree very expensive, consuming about ten to twenty times the memory size of the source text in good implementations. The suffix array reduces this requirement to a factor of four, and researchers have continued to find smaller indexing structures.
 75 | [edit]External construction
 76 | 
 77 | Suffix trees quickly outgrow the main memory on standard machines for sequence collections in the order of gigabytes. As such, their construction calls for external memory approaches.
 78 | There are theoretical results for constructing suffix trees in external memory. The algorithm by Farach et al. [10] is theoretically optimal, with an I/O complexity equal to that of sorting. However, as discussed for example in ,[11] the overall intricacy of this algorithm has prevented, so far, its practical implementation.
 79 | On the other hand, there have been practical works for constructing disk-based suffix trees which scale to (few) GB/hours. The state of the art methods are TDD ,[12] TRELLIS [13] , DiGeST ,[14] and B2ST .[15]
 80 | TDD and TRELLIS scale up to the entire human genome – approximately 3GB – resulting in a disk-based suffix tree of a size in the tens of gigabytes,.[12][13] However, these methods cannot handle efficiently collections of sequences exceeding 3GB.[14] DiGeST performs significantly better and is able to handle collections of sequences in the order of 6GB in about 6 hours.[14] The source code and documentation for the latter is available from [16] . All these methods can efficiently build suffix trees for the case when the tree does not fit in main memory, but the input does. The most recent method, B2ST,[15] scales to handle inputs that do not fit in main memory.
 81 | [edit]See also
 82 | 
 83 | Suffix array
 84 | Generalised suffix tree
 85 | [edit]References
 86 | 
 87 | ^ P. Weiner (1973). "Linear pattern matching algorithm". 14th Annual IEEE Symposium on Switching and Automata Theory. pp. 1–11. doi:10.1109/SWAT.1973.13.
 88 | ^ Edward M. McCreight (1976). "A Space-Economical Suffix Tree Construction Algorithm". Journal of the ACM 23 (2): 262–272. doi:10.1145/321941.321946.
 89 | ^ E. Ukkonen (1995). "On-line construction of suffix trees". Algorithmica 14 (3): 249–260. doi:10.1007/BF01206331.
 90 | ^ R. Giegerich and S. Kurtz (1997). "From Ukkonen to McCreight and Weiner: A Unifying View of Linear-Time Suffix Tree Construction". Algorithmica 19 (3): 331–353. doi:10.1007/PL00009177.
 91 | ^ a b M. Farach (1997). "Optimal Suffix Tree Construction with Large Alphabets". FOCS: 137–143.
 92 | ^ a b c d e f g h i j k l m n Gusfield, Dan (1999) [1997]. Algorithms on Strings, Trees and Sequences: Computer Science and Computational Biology. USA: Cambridge University Press. ISBN 0-521-58519-8.
 93 | ^ Ricardo A. Baeza-Yates and Gaston H. Gonnet (1996). "Fast text searching for regular expressions or automaton searching on tries". Journal of the ACM (ACM Press) 43 (6): 915–936. doi:10.1145/235809.235810.
 94 | ^ a b Allison, L.. "Suffix Trees". Retrieved 2008-10-14.
 95 | ^ Oren Zamir and Oren Etzioni (1998). "Web document clustering: a feasibility demonstration". SIGIR '98: Proceedings of the 21st annual international ACM SIGIR conference on Research and development in information retrieval. ACM. pp. 46–54.
 96 | ^ Martin Farach-Colton, Paolo Ferragina, S. Muthukrishnan (2000). "On the sorting-complexity of suffix tree construction.". J. Acm 47(6) 47 (6): 987–1011. doi:10.1145/355541.355547.
 97 | ^ Smyth, William (2003). Computing Patterns in Strings. Addison-Wesley.
 98 | ^ a b Sandeep Tata, Richard A. Hankins, and Jignesh M. Patel (2003). "Practical Suffix Tree Construction". VLDB '03: Proceedings of the 30th International Conference on Very Large Data Bases. Morgan Kaufmann. pp. 36–47.
 99 | ^ a b Benjarath Phoophakdee and Mohammed J. Zaki (2007). "Genome-scale disk-based suffix tree indexing". SIGMOD '07: Proceedings of the ACM SIGMOD International Conference on Management of Data. ACM. pp. 833–844.
100 | ^ a b c Marina Barsky, Ulrike Stege, Alex Thomo, and Chris Upton (2008). "A new method for indexing genomes using on-disk suffix trees". CIKM '08: Proceedings of the 17th ACM Conference on Information and Knowledge Management. ACM. pp. 649–658.
101 | ^ a b Marina Barsky, Ulrike Stege, Alex Thomo, and Chris Upton (2009). "Suffix trees for very large genomic sequences". CIKM '09: Proceedings of the 18th ACM Conference on Information and Knowledge Management. ACM.
102 | ^ "The disk-based suffix tree for pattern search in sequenced genomes". Retrieved 2009-10-15.
103 | [edit]External links
104 | 
105 | 
106 | This article's use of external links may not follow Wikipedia's policies or guidelines. Please improve this article by removing excessive and inappropriate external links. (August 2010)
107 | Suffix Trees by Dr. Sartaj Sahni (CISE Department Chair at University of Florida)
108 | Suffix Trees by Lloyd Allison
109 | NIST's Dictionary of Algorithms and Data Structures: Suffix Tree
110 | suffix_tree ANSI C implementation of a Suffix Tree
111 | libstree, a generic suffix tree library written in C
112 | Tree::Suffix, a Perl binding to libstree
113 | Strmat a faster generic suffix tree library written in C (uses arrays instead of linked lists)
114 | SuffixTree a Python binding to Strmat
115 | Universal Data Compression Based on the Burrows-Wheeler Transformation: Theory and Practice, application of suffix trees in the BWT
116 | Theory and Practice of Succinct Data Structures, C++ implementation of a compressed suffix tree]
117 | Practical Algorithm Template Library, a C++ library with suffix tree
118 | 
119 | In computer science, a suffix tree (also called PAT tree or, in an earlier form, position tree) is a data structure that presents the suffixes of a given string in a way that allows for a particularly fast implementation of many important string operations.
120 | The suffix tree for a string S is a tree whose edges are labeled with strings, such that each suffix of S corresponds to exactly one path from the tree's root to a leaf. It is thus a radix tree (more specifically, a Patricia tree) for the suffixes of S.
121 | Constructing such a tree for the string S takes time and space linear in the length of S. Once constructed, several operations can be performed quickly, for instance locating a substring in S, locating a substring if a certain number of mistakes are allowed, locating matches for a regular expression pattern etc. Suffix trees also provided one of the first linear-time solutions for the longest common substring problem. These speedups come at a cost: storing a string's suffix tree typically requires significantly more space than storing the string itself.
122 | Contents [hide]
123 | 1 History
124 | 2 Definition
125 | 3 Generalised suffix tree
126 | 4 Functionality
127 | 5 Applications
128 | 6 Implementation
129 | 7 External construction
130 | 8 See also
131 | 9 References
132 | 10 External links
133 | [edit]History
134 | 
135 | The concept was first introduced as a position tree by Weiner in 1973,[1] which Donald Knuth subsequently characterized as "Algorithm of the Year 1973". The construction was greatly simplified by McCreight in 1976 [2] , and also by Ukkonen in 1995.[3][4] Ukkonen provided the first linear-time online-construction of suffix trees, now known as Ukkonen's algorithm. These algorithms are all linear-time for constant-size alphabet, and have worst-case running time of O(nlogn) in general.
136 | In 1997, Farach[5] gave the first suffix tree construction algorithm that is optimal for all alphabets. In particular, this is the first linear-time algorithm for strings drawn from an alphabet of integers in a polynomial range. This latter algorithm has become the basis for new algorithms for constructing both suffix trees and suffix arrays, for example, in external memory, compressed, succinct, etc.
137 | [edit]Definition
138 | 
139 | The suffix tree for the string S of length n is defined as a tree such that ([6] page 90):
140 | the paths from the root to the leaves have a one-to-one relationship with the suffixes of S,
141 | edges spell non-empty strings,
142 | and all internal nodes (except perhaps the root) have at least two children.
143 | Since such a tree does not exist for all strings, S is padded with a terminal symbol not seen in the string (usually denoted $). This ensures that no suffix is a prefix of another, and that there will be n leaf nodes, one for each of the n suffixes of S. Since all internal non-root nodes are branching, there can be at most n −  1 such nodes, and n + (n − 1) + 1 = 2n nodes in total (n leaves, n − 1 internal nodes, 1 root).
144 | Suffix links are a key feature for older linear-time construction algorithms, although most newer algorithms, which are based on Farach's algorithm, dispense with suffix links. In a complete suffix tree, all internal non-root nodes have a suffix link to another internal node. If the path from the root to a node spells the string χα, where χ is a single character and α is a string (possibly empty), it has a suffix link to the internal node representing α. See for example the suffix link from the node for ANA to the node for NA in the figure above. Suffix links are also used in some algorithms running on the tree.
145 | 
146 | A suffix tree for a string S of length n can be built in Θ(n) time, if the letters come from an alphabet of integers in a polynomial range (in particular, this is true for constant-sized alphabets).[5] For larger alphabets, the running time is dominated by first sorting the letters to bring them into a range of size O(n); in general, this takes O(nlogn) time. The costs below are given under the assumption that the alphabet is constant.
147 | Assume that a suffix tree has been built for the string S of length n, or that a generalised suffix tree has been built for the set of strings  of total length . You can:
148 | Search for strings:
149 | Check if a string P of length m is a substring in O(m) time ([6] page 92).
150 | Find the first occurrence of the patterns  of total length m as substrings in O(m) time.
151 | Find all z occurrences of the patterns  of total length m as substrings in O(m + z) time ([6] page 123).
152 | Search for a regular expression P in time expected sublinear in n ([7]).
153 | Find for each suffix of a pattern P, the length of the longest match between a prefix of  and a substring in D in Θ(m) time ([6] page 132). This is termed the matching statistics for P.
154 | Find properties of the strings:
155 | Find the longest common substrings of the string Si and Sj in Θ(ni + nj) time ([6] page 125).
156 | Find all maximal pairs, maximal repeats or supermaximal repeats in Θ(n + z) time ([6] page 144).
157 | Find the Lempel–Ziv decomposition in Θ(n) time ([6] page 166).
158 | Find the longest repeated substrings in Θ(n) time.
159 | Find the most frequently occurring substrings of a minimum length in Θ(n) time.
160 | Find the shortest strings from Σ that do not occur in D, in O(n + z) time, if there are z such strings.
161 | Find the shortest substrings occurring only once in Θ(n) time.
162 | Find, for each i, the shortest substrings of Si not occurring elsewhere in D in Θ(n) time.
163 | The suffix tree can be prepared for constant time lowest common ancestor retrieval between nodes in Θ(n) time ([6] chapter 8). You can then also:
164 | Find the longest common prefix between the suffixes Si[p..ni] and Sj[q..nj] in Θ(1) ([6] page 196).
165 | Search for a pattern P of length m with at most k mismatches in O(kn + z) time, where z is the number of hits ([6] page 200).
166 | Find all z maximal palindromes in Θ(n)([6] page 198), or Θ(gn) time if gaps of length g are allowed, or Θ(kn) if k mismatches are allowed ([6] page 201).
167 | Find all z tandem repeats in O(nlogn + z), and k-mismatch tandem repeats in O(knlog(n / k) + z) ([6] page 204).
168 | Find the longest substrings common to at least k strings in D for  in Θ(n) time ([6] page 205).
169 | [edit]Applications
170 | 
171 | Suffix trees can be used to solve a large number of string problems that occur in text-editing, free-text search, computational biology and other application areas.[8] Primary applications include:[8]
172 | String search, in O(m) complexity, where m is the length of the sub-string (but with initial O(n) time required to build the suffix tree for the string)
173 | Finding the longest repeated substring
174 | Finding the longest common substring
175 | Finding the longest palindrome in a string
176 | Suffix trees are often used in bioinformatics applications, searching for patterns in DNA or protein sequences (which can be viewed as long strings of characters). The ability to search efficiently with mismatches might be considered their greatest strength. Suffix trees are also used in data compression; they can be used to find repeated data, and can be used for the sorting stage of the Burrows–Wheeler transform. Variants of the LZW compression schemes use suffix trees (LZSS). A suffix tree is also used in suffix tree clustering, a data clustering algorithm used in some search engines (first introduced in [9]).
177 | [edit]Implementation
178 | 
179 | If each node and edge can be represented in Θ(1) space, the entire tree can be represented in Θ(n) space. The total length of all the strings on all of the edges in the tree is O(n2), but each edge can be stored as the position and length of a substring of S, giving a total space usage of Θ(n) computer words. The worst-case space usage of a suffix tree is seen with a fibonacci word, giving the full 2n nodes.
180 | An important choice when making a suffix tree implementation is the parent-child relationships between nodes. The most common is using linked lists called sibling lists. Each node has a pointer to its first child, and to the next node in the child list it is a part of. Hash maps, sorted/unsorted arrays (with array doubling), and balanced search trees may also be used, giving different running time properties. We are interested in:
181 | The cost of finding the child on a given character.
182 | The cost of inserting a child.
183 | The cost of enlisting all children of a node (divided by the number of children in the table below).
184 | Let σ be the size of the alphabet. Then you have the following costs:
185 | Lookup	Insertion	Traversal
186 | Sibling lists / unsorted arrays	O(σ)	Θ(1)	Θ(1)
187 | Hash maps	Θ(1)	Θ(1)	O(σ)
188 | Balanced search tree	O(logσ)	O(logσ)	O(1)
189 | Sorted arrays	O(logσ)	O(σ)	O(1)
190 | Hash maps + sibling lists	O(1)	O(1)	O(1)
191 | Note that the insertion cost is amortised, and that the costs for hashing are given perfect hashing.
192 | The large amount of information in each edge and node makes the suffix tree very expensive, consuming about ten to twenty times the memory size of the source text in good implementations. The suffix array reduces this requirement to a factor of four, and researchers have continued to find smaller indexing structures.
193 | [edit]External construction
194 | 
195 | Suffix trees quickly outgrow the main memory on standard machines for sequence collections in the order of gigabytes. As such, their construction calls for external memory approaches.
196 | There are theoretical results for constructing suffix trees in external memory. The algorithm by Farach et al. [10] is theoretically optimal, with an I/O complexity equal to that of sorting. However, as discussed for example in ,[11] the overall intricacy of this algorithm has prevented, so far, its practical implementation.
197 | On the other hand, there have been practical works for constructing disk-based suffix trees which scale to (few) GB/hours. The state of the art methods are TDD ,[12] TRELLIS [13] , DiGeST ,[14] and B2ST .[15]
198 | TDD and TRELLIS scale up to the entire human genome – approximately 3GB – resulting in a disk-based suffix tree of a size in the tens of gigabytes,.[12][13] However, these methods cannot handle efficiently collections of sequences exceeding 3GB.[14] DiGeST performs significantly better and is able to handle collections of sequences in the order of 6GB in about 6 hours.[14] The source code and documentation for the latter is available from [16] . All these methods can efficiently build suffix trees for the case when the tree does not fit in main memory, but the input does. The most recent method, B2ST,[15] scales to handle inputs that do not fit in main memory.
199 | [edit]See also
200 | 
201 | Suffix array
202 | Generalised suffix tree
203 | [edit]References
204 | 
205 | ^ P. Weiner (1973). "Linear pattern matching algorithm". 14th Annual IEEE Symposium on Switching and Automata Theory. pp. 1–11. doi:10.1109/SWAT.1973.13.
206 | ^ Edward M. McCreight (1976). "A Space-Economical Suffix Tree Construction Algorithm". Journal of the ACM 23 (2): 262–272. doi:10.1145/321941.321946.
207 | ^ E. Ukkonen (1995). "On-line construction of suffix trees". Algorithmica 14 (3): 249–260. doi:10.1007/BF01206331.
208 | ^ R. Giegerich and S. Kurtz (1997). "From Ukkonen to McCreight and Weiner: A Unifying View of Linear-Time Suffix Tree Construction". Algorithmica 19 (3): 331–353. doi:10.1007/PL00009177.
209 | ^ a b M. Farach (1997). "Optimal Suffix Tree Construction with Large Alphabets". FOCS: 137–143.
210 | ^ a b c d e f g h i j k l m n Gusfield, Dan (1999) [1997]. Algorithms on Strings, Trees and Sequences: Computer Science and Computational Biology. USA: Cambridge University Press. ISBN 0-521-58519-8.
211 | ^ Ricardo A. Baeza-Yates and Gaston H. Gonnet (1996). "Fast text searching for regular expressions or automaton searching on tries". Journal of the ACM (ACM Press) 43 (6): 915–936. doi:10.1145/235809.235810.
212 | ^ a b Allison, L.. "Suffix Trees". Retrieved 2008-10-14.
213 | ^ Oren Zamir and Oren Etzioni (1998). "Web document clustering: a feasibility demonstration". SIGIR '98: Proceedings of the 21st annual international ACM SIGIR conference on Research and development in information retrieval. ACM. pp. 46–54.
214 | ^ Martin Farach-Colton, Paolo Ferragina, S. Muthukrishnan (2000). "On the sorting-complexity of suffix tree construction.". J. Acm 47(6) 47 (6): 987–1011. doi:10.1145/355541.355547.
215 | ^ Smyth, William (2003). Computing Patterns in Strings. Addison-Wesley.
216 | ^ a b Sandeep Tata, Richard A. Hankins, and Jignesh M. Patel (2003). "Practical Suffix Tree Construction". VLDB '03: Proceedings of the 30th International Conference on Very Large Data Bases. Morgan Kaufmann. pp. 36–47.
217 | ^ a b Benjarath Phoophakdee and Mohammed J. Zaki (2007). "Genome-scale disk-based suffix tree indexing". SIGMOD '07: Proceedings of the ACM SIGMOD International Conference on Management of Data. ACM. pp. 833–844.
218 | ^ a b c Marina Barsky, Ulrike Stege, Alex Thomo, and Chris Upton (2008). "A new method for indexing genomes using on-disk suffix trees". CIKM '08: Proceedings of the 17th ACM Conference on Information and Knowledge Management. ACM. pp. 649–658.
219 | ^ a b Marina Barsky, Ulrike Stege, Alex Thomo, and Chris Upton (2009). "Suffix trees for very large genomic sequences". CIKM '09: Proceedings of the 18th ACM Conference on Information and Knowledge Management. ACM.
220 | ^ "The disk-based suffix tree for pattern search in sequenced genomes". Retrieved 2009-10-15.
221 | [edit]External links
222 | 
223 | 
224 | This article's use of external links may not follow Wikipedia's policies or guidelines. Please improve this article by removing excessive and inappropriate external links. (August 2010)
225 | Suffix Trees by Dr. Sartaj Sahni (CISE Department Chair at University of Florida)
226 | Suffix Trees by Lloyd Allison
227 | NIST's Dictionary of Algorithms and Data Structures: Suffix Tree
228 | suffix_tree ANSI C implementation of a Suffix Tree
229 | libstree, a generic suffix tree library written in C
230 | Tree::Suffix, a Perl binding to libstree
231 | Strmat a faster generic suffix tree library written in C (uses arrays instead of linked lists)
232 | SuffixTree a Python binding to Strmat
233 | Universal Data Compression Based on the Burrows-Wheeler Transformation: Theory and Practice, application of suffix trees in the BWT
234 | Theory and Practice of Succinct Data Structures, C++ implementation of a compressed suffix tree]
235 | Practical Algorithm Template Library, a C++ library with suffix treeIn computer science, a suffix tree (also called PAT tree or, in an earlier form, position tree) is a data structure that presents the suffixes of a given string in a way that allows for a particularly fast implementation of many important string operations.
236 | The suffix tree for a string S is a tree whose edges are labeled with strings, such that each suffix of S corresponds to exactly one path from the tree's root to a leaf. It is thus a radix tree (more specifically, a Patricia tree) for the suffixes of S.
237 | Constructing such a tree for the string S takes time and space linear in the length of S. Once constructed, several operations can be performed quickly, for instance locating a substring in S, locating a substring if a certain number of mistakes are allowed, locating matches for a regular expression pattern etc. Suffix trees also provided one of the first linear-time solutions for the longest common substring problem. These speedups come at a cost: storing a string's suffix tree typically requires significantly more space than storing the string itself.
238 | Contents [hide]
239 | 1 History
240 | 2 Definition
241 | 3 Generalised suffix tree
242 | 4 Functionality
243 | 5 Applications
244 | 6 Implementation
245 | 7 External construction
246 | 8 See also
247 | 9 References
248 | 10 External links
249 | [edit]History
250 | 
251 | The concept was first introduced as a position tree by Weiner in 1973,[1] which Donald Knuth subsequently characterized as "Algorithm of the Year 1973". The construction was greatly simplified by McCreight in 1976 [2] , and also by Ukkonen in 1995.[3][4] Ukkonen provided the first linear-time online-construction of suffix trees, now known as Ukkonen's algorithm. These algorithms are all linear-time for constant-size alphabet, and have worst-case running time of O(nlogn) in general.
252 | In 1997, Farach[5] gave the first suffix tree construction algorithm that is optimal for all alphabets. In particular, this is the first linear-time algorithm for strings drawn from an alphabet of integers in a polynomial range. This latter algorithm has become the basis for new algorithms for constructing both suffix trees and suffix arrays, for example, in external memory, compressed, succinct, etc.
253 | [edit]Definition
254 | 
255 | The suffix tree for the string S of length n is defined as a tree such that ([6] page 90):
256 | the paths from the root to the leaves have a one-to-one relationship with the suffixes of S,
257 | edges spell non-empty strings,
258 | and all internal nodes (except perhaps the root) have at least two children.
259 | Since such a tree does not exist for all strings, S is padded with a terminal symbol not seen in the string (usually denoted $). This ensures that no suffix is a prefix of another, and that there will be n leaf nodes, one for each of the n suffixes of S. Since all internal non-root nodes are branching, there can be at most n −  1 such nodes, and n + (n − 1) + 1 = 2n nodes in total (n leaves, n − 1 internal nodes, 1 root).
260 | Suffix links are a key feature for older linear-time construction algorithms, although most newer algorithms, which are based on Farach's algorithm, dispense with suffix links. In a complete suffix tree, all internal non-root nodes have a suffix link to another internal node. If the path from the root to a node spells the string χα, where χ is a single character and α is a string (possibly empty), it has a suffix link to the internal node representing α. See for example the suffix link from the node for ANA to the node for NA in the figure above. Suffix links are also used in some algorithms running on the tree.
261 | 
262 | A suffix tree for a string S of length n can be built in Θ(n) time, if the letters come from an alphabet of integers in a polynomial range (in particular, this is true for constant-sized alphabets).[5] For larger alphabets, the running time is dominated by first sorting the letters to bring them into a range of size O(n); in general, this takes O(nlogn) time. The costs below are given under the assumption that the alphabet is constant.
263 | Assume that a suffix tree has been built for the string S of length n, or that a generalised suffix tree has been built for the set of strings  of total length . You can:
264 | Search for strings:
265 | Check if a string P of length m is a substring in O(m) time ([6] page 92).
266 | Find the first occurrence of the patterns  of total length m as substrings in O(m) time.
267 | Find all z occurrences of the patterns  of total length m as substrings in O(m + z) time ([6] page 123).
268 | Search for a regular expression P in time expected sublinear in n ([7]).
269 | Find for each suffix of a pattern P, the length of the longest match between a prefix of  and a substring in D in Θ(m) time ([6] page 132). This is termed the matching statistics for P.
270 | Find properties of the strings:
271 | Find the longest common substrings of the string Si and Sj in Θ(ni + nj) time ([6] page 125).
272 | Find all maximal pairs, maximal repeats or supermaximal repeats in Θ(n + z) time ([6] page 144).
273 | Find the Lempel–Ziv decomposition in Θ(n) time ([6] page 166).
274 | Find the longest repeated substrings in Θ(n) time.
275 | Find the most frequently occurring substrings of a minimum length in Θ(n) time.
276 | Find the shortest strings from Σ that do not occur in D, in O(n + z) time, if there are z such strings.
277 | Find the shortest substrings occurring only once in Θ(n) time.
278 | Find, for each i, the shortest substrings of Si not occurring elsewhere in D in Θ(n) time.
279 | The suffix tree can be prepared for constant time lowest common ancestor retrieval between nodes in Θ(n) time ([6] chapter 8). You can then also:
280 | Find the longest common prefix between the suffixes Si[p..ni] and Sj[q..nj] in Θ(1) ([6] page 196).
281 | Search for a pattern P of length m with at most k mismatches in O(kn + z) time, where z is the number of hits ([6] page 200).
282 | Find all z maximal palindromes in Θ(n)([6] page 198), or Θ(gn) time if gaps of length g are allowed, or Θ(kn) if k mismatches are allowed ([6] page 201).
283 | Find all z tandem repeats in O(nlogn + z), and k-mismatch tandem repeats in O(knlog(n / k) + z) ([6] page 204).
284 | Find the longest substrings common to at least k strings in D for  in Θ(n) time ([6] page 205).
285 | [edit]Applications
286 | 
287 | Suffix trees can be used to solve a large number of string problems that occur in text-editing, free-text search, computational biology and other application areas.[8] Primary applications include:[8]
288 | String search, in O(m) complexity, where m is the length of the sub-string (but with initial O(n) time required to build the suffix tree for the string)
289 | Finding the longest repeated substring
290 | Finding the longest common substring
291 | Finding the longest palindrome in a string
292 | Suffix trees are often used in bioinformatics applications, searching for patterns in DNA or protein sequences (which can be viewed as long strings of characters). The ability to search efficiently with mismatches might be considered their greatest strength. Suffix trees are also used in data compression; they can be used to find repeated data, and can be used for the sorting stage of the Burrows–Wheeler transform. Variants of the LZW compression schemes use suffix trees (LZSS). A suffix tree is also used in suffix tree clustering, a data clustering algorithm used in some search engines (first introduced in [9]).
293 | [edit]Implementation
294 | 
295 | If each node and edge can be represented in Θ(1) space, the entire tree can be represented in Θ(n) space. The total length of all the strings on all of the edges in the tree is O(n2), but each edge can be stored as the position and length of a substring of S, giving a total space usage of Θ(n) computer words. The worst-case space usage of a suffix tree is seen with a fibonacci word, giving the full 2n nodes.
296 | An important choice when making a suffix tree implementation is the parent-child relationships between nodes. The most common is using linked lists called sibling lists. Each node has a pointer to its first child, and to the next node in the child list it is a part of. Hash maps, sorted/unsorted arrays (with array doubling), and balanced search trees may also be used, giving different running time properties. We are interested in:
297 | The cost of finding the child on a given character.
298 | The cost of inserting a child.
299 | The cost of enlisting all children of a node (divided by the number of children in the table below).
300 | Let σ be the size of the alphabet. Then you have the following costs:
301 | Lookup	Insertion	Traversal
302 | Sibling lists / unsorted arrays	O(σ)	Θ(1)	Θ(1)
303 | Hash maps	Θ(1)	Θ(1)	O(σ)
304 | Balanced search tree	O(logσ)	O(logσ)	O(1)
305 | Sorted arrays	O(logσ)	O(σ)	O(1)
306 | Hash maps + sibling lists	O(1)	O(1)	O(1)
307 | Note that the insertion cost is amortised, and that the costs for hashing are given perfect hashing.
308 | The large amount of information in each edge and node makes the suffix tree very expensive, consuming about ten to twenty times the memory size of the source text in good implementations. The suffix array reduces this requirement to a factor of four, and researchers have continued to find smaller indexing structures.
309 | [edit]External construction
310 | 
311 | Suffix trees quickly outgrow the main memory on standard machines for sequence collections in the order of gigabytes. As such, their construction calls for external memory approaches.
312 | There are theoretical results for constructing suffix trees in external memory. The algorithm by Farach et al. [10] is theoretically optimal, with an I/O complexity equal to that of sorting. However, as discussed for example in ,[11] the overall intricacy of this algorithm has prevented, so far, its practical implementation.
313 | On the other hand, there have been practical works for constructing disk-based suffix trees which scale to (few) GB/hours. The state of the art methods are TDD ,[12] TRELLIS [13] , DiGeST ,[14] and B2ST .[15]
314 | TDD and TRELLIS scale up to the entire human genome – approximately 3GB – resulting in a disk-based suffix tree of a size in the tens of gigabytes,.[12][13] However, these methods cannot handle efficiently collections of sequences exceeding 3GB.[14] DiGeST performs significantly better and is able to handle collections of sequences in the order of 6GB in about 6 hours.[14] The source code and documentation for the latter is available from [16] . All these methods can efficiently build suffix trees for the case when the tree does not fit in main memory, but the input does. The most recent method, B2ST,[15] scales to handle inputs that do not fit in main memory.
315 | [edit]See also
316 | 
317 | Suffix array
318 | Generalised suffix tree
319 | [edit]References
320 | 
321 | ^ P. Weiner (1973). "Linear pattern matching algorithm". 14th Annual IEEE Symposium on Switching and Automata Theory. pp. 1–11. doi:10.1109/SWAT.1973.13.
322 | ^ Edward M. McCreight (1976). "A Space-Economical Suffix Tree Construction Algorithm". Journal of the ACM 23 (2): 262–272. doi:10.1145/321941.321946.
323 | ^ E. Ukkonen (1995). "On-line construction of suffix trees". Algorithmica 14 (3): 249–260. doi:10.1007/BF01206331.
324 | ^ R. Giegerich and S. Kurtz (1997). "From Ukkonen to McCreight and Weiner: A Unifying View of Linear-Time Suffix Tree Construction". Algorithmica 19 (3): 331–353. doi:10.1007/PL00009177.
325 | ^ a b M. Farach (1997). "Optimal Suffix Tree Construction with Large Alphabets". FOCS: 137–143.
326 | ^ a b c d e f g h i j k l m n Gusfield, Dan (1999) [1997]. Algorithms on Strings, Trees and Sequences: Computer Science and Computational Biology. USA: Cambridge University Press. ISBN 0-521-58519-8.
327 | ^ Ricardo A. Baeza-Yates and Gaston H. Gonnet (1996). "Fast text searching for regular expressions or automaton searching on tries". Journal of the ACM (ACM Press) 43 (6): 915–936. doi:10.1145/235809.235810.
328 | ^ a b Allison, L.. "Suffix Trees". Retrieved 2008-10-14.
329 | ^ Oren Zamir and Oren Etzioni (1998). "Web document clustering: a feasibility demonstration". SIGIR '98: Proceedings of the 21st annual international ACM SIGIR conference on Research and development in information retrieval. ACM. pp. 46–54.
330 | ^ Martin Farach-Colton, Paolo Ferragina, S. Muthukrishnan (2000). "On the sorting-complexity of suffix tree construction.". J. Acm 47(6) 47 (6): 987–1011. doi:10.1145/355541.355547.
331 | ^ Smyth, William (2003). Computing Patterns in Strings. Addison-Wesley.
332 | ^ a b Sandeep Tata, Richard A. Hankins, and Jignesh M. Patel (2003). "Practical Suffix Tree Construction". VLDB '03: Proceedings of the 30th International Conference on Very Large Data Bases. Morgan Kaufmann. pp. 36–47.
333 | ^ a b Benjarath Phoophakdee and Mohammed J. Zaki (2007). "Genome-scale disk-based suffix tree indexing". SIGMOD '07: Proceedings of the ACM SIGMOD International Conference on Management of Data. ACM. pp. 833–844.
334 | ^ a b c Marina Barsky, Ulrike Stege, Alex Thomo, and Chris Upton (2008). "A new method for indexing genomes using on-disk suffix trees". CIKM '08: Proceedings of the 17th ACM Conference on Information and Knowledge Management. ACM. pp. 649–658.
335 | ^ a b Marina Barsky, Ulrike Stege, Alex Thomo, and Chris Upton (2009). "Suffix trees for very large genomic sequences". CIKM '09: Proceedings of the 18th ACM Conference on Information and Knowledge Management. ACM.
336 | ^ "The disk-based suffix tree for pattern search in sequenced genomes". Retrieved 2009-10-15.
337 | [edit]External links
338 | 
339 | 


--------------------------------------------------------------------------------
/test_suffix_tree.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | from suffix_tree import SuffixTree
 3 | 
 4 | 
 5 | class SuffixTreeTest(unittest.TestCase):
 6 |     """Some functional tests.
 7 |     """
 8 |     def test_empty_string(self):
 9 |         st = SuffixTree('')
10 |         self.assertEqual(st.find_substring('not there'), -1)
11 |         self.assertEqual(st.find_substring(''), -1)
12 |         self.assertFalse(st.has_substring('not there'))
13 |         self.assertFalse(st.has_substring(''))
14 |         
15 |     def test_repeated_string(self):
16 |         st = SuffixTree("aaa")
17 |         self.assertEqual(st.find_substring('a'), 0)
18 |         self.assertEqual(st.find_substring('aa'), 0)
19 |         self.assertEqual(st.find_substring('aaa'), 0)
20 |         self.assertEqual(st.find_substring('b'), -1)
21 |         self.assertTrue(st.has_substring('a'))
22 |         self.assertTrue(st.has_substring('aa'))
23 |         self.assertTrue(st.has_substring('aaa'))
24 |         
25 |         self.assertFalse(st.has_substring('aaaa'))
26 |         self.assertFalse(st.has_substring('b'))
27 |         #case sensitive by default
28 |         self.assertFalse(st.has_substring('A'))
29 |         
30 |     def test_long_string(self):
31 |         f = open("test.txt")
32 |         st = SuffixTree(f.read())
33 |         self.assertEqual(st.find_substring('Ukkonen'), 1498)
34 |         self.assertEqual(st.find_substring('Optimal'), 11131)
35 |         self.assertFalse(st.has_substring('ukkonen'))
36 |         
37 |     def test_case_sensitivity(self):
38 |         f = open("test.txt")
39 |         st = SuffixTree(f.read(), case_insensitive=True)
40 |         self.assertEqual(st.find_substring('ukkonen'), 1498)
41 |         self.assertEqual(st.find_substring('Optimal'), 1830)
42 | 
43 |     def test_repr(self):
44 |         st = SuffixTree("t")
45 |         output = '\tStart \tEnd \tSuf \tFirst \tLast \tString\n\t0 \t1 \t-1 \t0 \t0 \tt\n'
46 |         import pdb;pdb.set_trace()
47 |         self.assertEqual(st.__repr__(), output)
48 | 
49 | if __name__ == '__main__':
50 |     unittest.main()
51 | 
52 | 


--------------------------------------------------------------------------------