├── README.md
├── Cargo.toml
├── .gitignore
├── src
│   ├── disjoint_set.rs
│   └── lib.rs
└── tests
    └── integration_test.rs

/README.md:
--------------------------------------------------------------------------------
# rust-generalized-suffix-tree
Implementation of Generalized Suffix Tree using Ukkonen's algorithm in Rust
--------------------------------------------------------------------------------

/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "generalized_suffix_tree"
license = "MIT"
description = "Implementation of Generalized Suffix Tree using Ukkonen's algorithm in Rust"
version = "1.2.1"
authors = ["Xun Li "]
edition = "2018"
repository = "https://github.com/lxfind/rust-generalized-suffix-tree"

[dependencies]
mediumvec = "1.0.4"

[dev-dependencies]
rand = "0.7.0"
--------------------------------------------------------------------------------

/.gitignore:
--------------------------------------------------------------------------------
# Generated by Cargo
# will have compiled files and executables
/target/

# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock

# These are backup files generated by rustfmt
**/*.rs.bk


#Added by cargo
#
#already existing elements are commented out

/target
#**/*.rs.bk
#Cargo.lock

#VS Code
.vscode
--------------------------------------------------------------------------------

/src/disjoint_set.rs:
--------------------------------------------------------------------------------
/// This is a simple implementation of a Disjoint Set, which allows for
/// efficient operations that involve set union and ancestor finding.
/// We need this to implement Tarjan's algorithm for computing
/// least common ancestors, which is needed to compute the longest common
/// substring.
pub struct DisjointSet {
    ancestors: Vec<usize>,
}

impl DisjointSet {
    pub fn new(size: usize) -> Self {
        let mut ancestors = Vec::with_capacity(size);
        for i in 0..size {
            // MakeSet(i)
            ancestors.push(i);
        }
        Self { ancestors }
    }

    pub fn find_set(&mut self, index: usize) -> usize {
        let mut ret = self.ancestors[index];
        if ret != index {
            ret = self.find_set(ret);
            self.ancestors[index] = ret;
        }
        ret
    }

    /// Merge two sets. Always merge `v` into `u` by assuming that `u` has higher rank.
    /// This is not optimal, but it is suitable for the purpose of this code.
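    /// In the Tarjan LCA traversal in `lib.rs`, `u` is always the DFS parent of `v`, and
    /// `find_set` climbs this parent chain (with path compression) to the deepest node whose
    /// subtree is still being visited, which is exactly the lowest common ancestor needed there.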
    pub fn union(&mut self, u: usize, v: usize) {
        if self.ancestors[u] == self.ancestors[v] {
            return;
        }
        self.ancestors[v] = u;
    }
}
--------------------------------------------------------------------------------

/tests/integration_test.rs:
--------------------------------------------------------------------------------
use generalized_suffix_tree;

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_is_suffix() {
        let mut tree = generalized_suffix_tree::GeneralizedSuffixTree::new();
        let s1 = "ABCABXABCD";
        tree.add_string(String::from(s1), '$');
        for i in 0..s1.len() {
            assert!(tree.is_suffix(&s1[i..]), "{} should be a suffix", &s1[i..]);
        }
        assert!(!tree.is_suffix("A"));
        assert!(!tree.is_suffix("BC"));

        let s2 = "XABCDABCA";
        tree.add_string(String::from(s2), '#');
        for i in 0..s1.len() {
            assert!(tree.is_suffix(&s1[i..]), "{} should be a suffix", &s1[i..]);
        }
        for i in 0..s2.len() {
            assert!(tree.is_suffix(&s2[i..]), "{} should be a suffix", &s2[i..]);
        }
        assert!(!tree.is_suffix("BC"));
    }

    #[test]
    fn test_is_substr() {
        let mut tree = generalized_suffix_tree::GeneralizedSuffixTree::new();
        let s1 = "ABCABXABCD";
        tree.add_string(String::from(s1), '$');
        for i in 0..s1.len() {
            for j in i..s1.len() {
                assert!(
                    tree.is_substr(&s1[i..(j + 1)]),
                    "{} should be a substring",
                    &s1[i..(j + 1)]
                );
            }
        }
        assert!(!tree.is_substr("ABD"));
        assert!(!tree.is_substr("XB"));

        let s2 = "XABCDABCA";
        tree.add_string(String::from(s2), '#');
        for i in 0..s1.len() {
            for j in i..s1.len() {
                assert!(
                    tree.is_substr(&s1[i..(j + 1)]),
                    "{} should be a substring",
                    &s1[i..(j + 1)]
                );
            }
        }
        for i in 0..s2.len() {
            for j in i..s2.len() {
                assert!(
                    tree.is_substr(&s2[i..(j + 1)]),
                    "{} should be a substring",
                    &s2[i..(j + 1)]
                );
            }
        }
        assert!(!tree.is_suffix("BC"));
    }

    #[test]
    fn test_longest_common_substring_all() {
        {
            let mut tree = generalized_suffix_tree::GeneralizedSuffixTree::new();
            tree.add_string(String::from("ABCABD"), '$');
            tree.add_string(String::from("ABDABCA"), '#');
            tree.pretty_print();
            assert_eq!(tree.longest_common_substring_all(), "ABCA");
        }
        {
            let mut tree = generalized_suffix_tree::GeneralizedSuffixTree::new();
            tree.add_string(String::from("VOTEFORTHEGREATALBANIAFORYOU"), '$');
            tree.add_string(String::from("CHOOSETHEGREATALBANIANFUTURE"), '#');
            assert_eq!(tree.longest_common_substring_all(), "THEGREATALBANIA");
        }
    }

    #[test]
    fn test_longest_common_substring_with() {
        {
            let mut tree = generalized_suffix_tree::GeneralizedSuffixTree::new();
            tree.add_string(String::from("VOTEFORTHEGREATALBANIAFORYOU"), '$');
            let test_str = String::from("CHOOSETHEGREATALBANIANFUTURE");
            assert_eq!(
                tree.longest_common_substring_with(&test_str),
                "THEGREATALBANIA"
            );
            tree.add_string(test_str, '#');
            let test_str = String::from("VOTECHOOSEGREATALBANIATHEFUTURE");
            assert_eq!(
                tree.longest_common_substring_with(&test_str),
                "EGREATALBANIA"
            );
        }
        {
            let mut tree = generalized_suffix_tree::GeneralizedSuffixTree::new();
            tree.add_string(String::from("HHDBBCIAAE"), '$');
            let test_str = String::from("AAFJEHDAEG");
            assert_eq!(tree.longest_common_substring_with(&test_str).len(), 2);
        }
    }

    fn gen_random_string(len: usize, alphabet_size: usize) -> String {
        let mut s = String::new();
        for _ in 0..len {
            let ch = (rand::random::<u8>() % alphabet_size as u8) + 'A' as u8;
            s.push(ch as char);
        }
        s
    }

    #[test]
    #[ignore]
    fn test_longest_common_substring_cross_check() {
        for _ in 0..10000 {
            let s1 = gen_random_string(100, 10);
            let s2 = gen_random_string(100, 10);
            let result1 = {
                let mut tree = generalized_suffix_tree::GeneralizedSuffixTree::new();
                tree.add_string(s1.clone(), '$');
                tree.add_string(s2.clone(), '#');
                tree.longest_common_substring_all()
            };
            let result2 = {
                let mut tree = generalized_suffix_tree::GeneralizedSuffixTree::new();
                tree.add_string(s1.clone(), '$');
                tree.longest_common_substring_with(&s2)
            };
            let result3 = {
                let mut tree = generalized_suffix_tree::GeneralizedSuffixTree::new();
                tree.add_string(s2.clone(), '$');
                tree.longest_common_substring_with(&s1)
            };
            assert_eq!(result1.len(), result2.len());
            assert_eq!(result1.len(), result3.len());
        }
    }
}
--------------------------------------------------------------------------------

/src/lib.rs:
--------------------------------------------------------------------------------
//! A Generalized Suffix Tree implementation using Ukkonen's algorithm.
mod disjoint_set;

use std::collections::HashMap;

use mediumvec::{Vec32, vec32};

type NodeID = u32;
type StrID = u32;
type IndexType = u32;
type CharType = u8;

// Special nodes.
const ROOT: NodeID = 0;
const SINK: NodeID = 1;
const INVALID: NodeID = NodeID::max_value();

/// This structure represents a slice of a string.
#[derive(Debug, Clone)]
struct MappedSubstring {
    /// Unique ID of the string it's slicing, which can be used to locate the string from the tree's string storage.
    str_id: StrID,

    /// Index of the first character of the slice.
    start: IndexType,

    /// One past the index of the last character of the slice.
    /// e.g. when `end` is equal to `start`, this is an empty slice.
    /// Note that `end` here always represents a meaningful index, unlike in the original algorithm where a slice could potentially be open-ended.
    /// Such open-endedness allows for online construction of the tree. Here I chose to not support online construction for convenience. It's possible
    /// to support it by changing `end`'s type to `Option<IndexType>`.
    end: IndexType,
}

impl MappedSubstring {
    const fn new(str_id: StrID, start: IndexType, end: IndexType) -> Self {
        Self { str_id, start, end }
    }

    const fn is_empty(&self) -> bool {
        self.start == self.end
    }

    const fn len(&self) -> IndexType {
        self.end - self.start
    }
}

/// This is a node in the tree. `transitions` represents all the possible
/// transitions from this node to other nodes, indexed by the first character
/// (as a raw byte) of the string slice that the transition represents.
/// `suffix_link` contains the suffix link of this node (a term used in the
/// context of Ukkonen's algorithm).
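/// (The suffix link of the node reached by reading some string `xw`, where `x` is a single
/// character, points to the node reached by reading `w`.)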
/// `substr` stores the slice of the string that the transition from the parent
/// node represents. By doing so we avoid having an explicit edge data type.
#[derive(Debug)]
struct Node {
    transitions: HashMap<CharType, NodeID>,

    suffix_link: NodeID,

    /// The slice of the string this node represents.
    substr: MappedSubstring,
}

impl Node {
    fn new(str_id: StrID, start: IndexType, end: IndexType) -> Self {
        Self {
            transitions: HashMap::new(),
            suffix_link: INVALID,
            substr: MappedSubstring::new(str_id, start, end),
        }
    }

    fn get_suffix_link(&self) -> NodeID {
        assert!(self.suffix_link != INVALID, "Invalid suffix link");
        self.suffix_link
    }
}

/// A data structure used to store the current state during Ukkonen's algorithm.
struct ReferencePoint {
    /// The active node.
    node: NodeID,

    /// The current string we are processing.
    str_id: StrID,

    /// The active point.
    index: IndexType,
}

impl ReferencePoint {
    const fn new(node: NodeID, str_id: StrID, index: IndexType) -> Self {
        Self {
            node,
            str_id,
            index,
        }
    }
}

/// This is the generalized suffix tree, implemented using Ukkonen's Algorithm.
/// One important modification to the algorithm is that this is no longer an online
/// algorithm, i.e. it only accepts strings fully provided to the suffix tree, instead
/// of being able to process each string as a stream. This is not a fundamental limitation and can be supported.
///
/// # Examples
///
/// ```
/// use generalized_suffix_tree::GeneralizedSuffixTree;
/// let mut tree = GeneralizedSuffixTree::new();
/// tree.add_string(String::from("ABCDABCE"), '$');
/// tree.add_string(String::from("CDEFDEFG"), '#');
/// println!("{}", tree.is_suffix("BCE"));
/// ```
#[derive(Debug)]
pub struct GeneralizedSuffixTree {
    node_storage: Vec32<Node>,
    str_storage: Vec32<String>,
}

impl Default for GeneralizedSuffixTree {
    fn default() -> Self {
        // Set the slice of root to be [0, 1) to allow it to consume one character whenever we are transitioning from sink to root.
        // No other node will ever transition to root so this won't affect anything else.
        let mut root = Node::new(0, 0, 1);
        let mut sink = Node::new(0, 0, 0);

        root.suffix_link = SINK;
        sink.suffix_link = ROOT;

        let node_storage: Vec32<Node> = vec32![root, sink];
        Self {
            node_storage,
            str_storage: Vec32::new(),
        }
    }
}

impl GeneralizedSuffixTree {
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Add a new string to the generalized suffix tree.
    pub fn add_string(&mut self, mut s: String, term: char) {
        self.validate_string(&s, term);

        let str_id = self.str_storage.len() as StrID;

        // Add a unique terminator character to the end of the string.
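        // The unique terminator guarantees that no suffix of this string is a prefix of
        // another of its suffixes, so every suffix ends at its own leaf. It also identifies
        // the string later on: a leaf edge ending in this terminator belongs to this string.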
        s.push(term);

        self.str_storage.push(s);
        self.process_suffixes(str_id);
    }

    fn validate_string(&self, s: &str, term: char) {
        assert!(s.len() <= IndexType::max_value() as usize);
        assert!(term.is_ascii(), "Only accept ASCII terminator");
        assert!(
            !s.contains(term),
            "String should not contain terminator character"
        );
        for existing_str in &self.str_storage {
            assert!(
                !existing_str.contains(term),
                "Any existing string should not contain terminator character"
            );
        }
    }

    /// Find the longest common substring among all strings in the suffix tree.
    /// This function can be used when you already have a suffix tree built,
    /// and need to know the longest common substring.
    /// It can be trivially extended to support the longest common substring among
    /// `K` strings.
    #[must_use]
    pub fn longest_common_substring_all(&self) -> String {
        let mut disjoint_set = disjoint_set::DisjointSet::new(self.node_storage.len());

        // prev_node stores the most recent occurrence of a leaf that belongs to each string.
        // We use the terminator character (which uniquely represents a string) as the key.
        let mut prev_node: HashMap<CharType, NodeID> = HashMap::new();

        // lca_cnt[v] means the total number of times that the lca of two nodes is node v.
        let mut lca_cnt: Vec32<usize> = vec32![0; self.node_storage.len()];

        let mut longest_str: (Vec32<&MappedSubstring>, IndexType) = (Vec32::new(), 0);
        let mut cur_str: (Vec32<&MappedSubstring>, IndexType) = (Vec32::new(), 0);
        self.longest_common_substring_all_rec(
            &mut disjoint_set,
            &mut prev_node,
            &mut lca_cnt,
            ROOT,
            &mut longest_str,
            &mut cur_str,
        );

        let mut result = String::new();
        for s in longest_str.0 {
            result.push_str(self.get_string_slice_short(s));
        }
        result
    }

    /// A recursive DFS that does a couple of things in one run:
    /// - Obtain each pair of leaves that belong to the same string and are
    /// consecutive in DFS visits. (stored in `prev_node`)
    /// - Tarjan's Algorithm to compute the least common ancestor for each
    /// of the above pairs. (information stored in `disjoint_set`)
    /// - Maintain the number of times an LCA lands on each node. (`lca_cnt`)
    /// This function returns a tuple:
    /// - Total number of leaves in the subtree.
    /// - Sum of all LCA counts from each node in the subtree,
    /// including the node itself.
    /// These two numbers can be used to compute the number of unique strings
    /// that occur in the subtree, which can be used to check whether we found
    /// a common substring.
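    /// For example, if a subtree contains three leaves, two of which come from the same string,
    /// exactly one LCA of a consecutive same-string pair falls inside that subtree, so
    /// 3 - 1 = 2 unique strings occur in it.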
    /// Details of the algorithm can be found here:
    ///
    fn longest_common_substring_all_rec<'a>(
        &'a self,
        disjoint_set: &mut disjoint_set::DisjointSet,
        prev_node: &mut HashMap<CharType, NodeID>,
        lca_cnt: &mut Vec32<usize>,
        node: NodeID,
        longest_str: &mut (Vec32<&'a MappedSubstring>, IndexType),
        cur_str: &mut (Vec32<&'a MappedSubstring>, IndexType),
    ) -> (usize, usize) {
        let mut total_leaf = 0;
        let mut total_correction = 0;
        for target_node in self.get_node(node).transitions.values() {
            if *target_node == INVALID {
                continue;
            }
            let slice = &self.get_node(*target_node).substr;
            if slice.end as usize == self.get_string(slice.str_id).len() {
                // target_node is a leaf node.
                total_leaf += 1;
                let last_ch = self.get_char(slice.str_id, slice.end - 1);
                if let Some(prev) = prev_node.get(&last_ch) {
                    let lca = disjoint_set.find_set(*prev as usize);
                    lca_cnt[lca as usize] += 1;
                }
                prev_node.insert(last_ch, *target_node);
            } else {
                cur_str.0.push(slice);
                cur_str.1 += slice.len();
                let result = self.longest_common_substring_all_rec(
                    disjoint_set,
                    prev_node,
                    lca_cnt,
                    *target_node,
                    longest_str,
                    cur_str,
                );
                total_leaf += result.0;
                total_correction += result.1;

                cur_str.0.pop();
                cur_str.1 -= slice.len();
            }

            disjoint_set.union(node as usize, *target_node as usize);
        }
        total_correction += lca_cnt[node as usize];
        let unique_str_cnt = total_leaf - total_correction;
        if unique_str_cnt == self.str_storage.len() {
            // This node represents a substring that is common among all strings.
            if cur_str.1 > longest_str.1 {
                *longest_str = cur_str.clone();
            }
        }
        (total_leaf, total_correction)
    }

    /// Find the longest common substring between the string `s` and the strings in the suffix tree.
    /// This function allows us to compute this without adding `s` to the tree.
    #[must_use]
    pub fn longest_common_substring_with<'a>(&self, s: &'a str) -> &'a str {
        let mut longest_start: IndexType = 0;
        let mut longest_len: IndexType = 0;
        let mut cur_start: IndexType = 0;
        let mut cur_len: IndexType = 0;
        let mut node: NodeID = ROOT;

        let chars = s.as_bytes();
        let mut index = 0;
        let mut active_length = 0;
        while index < chars.len() {
            let target_node_id = self.transition(node, chars[index - active_length as usize]);
            if target_node_id != INVALID {
                let slice = &self.get_node(target_node_id).substr;
                while index != chars.len()
                    && active_length < slice.len()
                    && self.get_char(slice.str_id, active_length + slice.start) == chars[index]
                {
                    index += 1;
                    active_length += 1;
                }

                let final_len = cur_len + active_length;
                if final_len > longest_len {
                    longest_len = final_len;
                    longest_start = cur_start;
                }

                if index == chars.len() {
                    break;
                }

                if active_length == slice.len() {
                    // We can keep following this route.
                    node = target_node_id;
                    cur_len = final_len;
                    active_length = 0;
                    continue;
                }
            }
            // There was a mismatch.
            cur_start += 1;
            if cur_start as usize > index {
                index += 1;
                continue;
            }
            // We want to follow a different path with one less character from the start.
            let suffix_link = self.get_node(node).suffix_link;
            if suffix_link != INVALID && suffix_link != SINK {
                assert!(cur_len > 0);
                node = suffix_link;
                cur_len -= 1;
            } else {
                node = ROOT;
                active_length = active_length + cur_len - 1;
                cur_len = 0;
            }
            while active_length > 0 {
                assert!(((cur_start + cur_len) as usize) < chars.len());
                let target_node_id = self.transition(node, chars[(cur_start + cur_len) as usize]);
                assert!(target_node_id != INVALID);
                let slice = &self.get_node(target_node_id).substr;
                if active_length < slice.len() {
                    break;
                }
                active_length -= slice.len();
                cur_len += slice.len();
                node = target_node_id;
            }
        }
        &s[longest_start as usize..(longest_start + longest_len) as usize]
    }

    /// Checks whether a given string `s` is a suffix of any of the strings in the suffix tree.
    #[must_use]
    pub fn is_suffix(&self, s: &str) -> bool {
        self.is_suffix_or_substr(s, false)
    }

    /// Checks whether a given string `s` is a substring of any of the strings
    /// in the suffix tree.
    #[must_use]
    pub fn is_substr(&self, s: &str) -> bool {
        self.is_suffix_or_substr(s, true)
    }

    #[must_use]
    fn is_suffix_or_substr(&self, s: &str, check_substr: bool) -> bool {
        for existing_str in &self.str_storage {
            assert!(
                !s.contains(existing_str.chars().last().unwrap()),
                "Queried string cannot contain terminator char"
            );
        }
        let mut node = ROOT;
        let mut index = 0;
        let chars = s.as_bytes();
        while index < s.len() {
            let target_node = self.transition(node, chars[index]);
            if target_node == INVALID {
                return false;
            }
            let slice = &self.get_node(target_node).substr;
            for i in slice.start..slice.end {
                if index == s.len() {
                    let is_suffix = i as usize == self.get_string(slice.str_id).len() - 1;
                    return check_substr || is_suffix;
                }
                if chars[index] != self.get_char(slice.str_id, i) {
                    return false;
                }
                index += 1;
            }
            node = target_node;
        }
        let mut is_suffix = false;
        for s in &self.str_storage {
            // The last character of each string is a terminator. We use that
            // to look up in the current transitions to determine if we have
            // reached the end of any string. If needed, we are also able to
            // return which string the queried string is a suffix of.
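            // Because each terminator is unique to its string, checking the transition on that
            // byte is equivalent to asking whether this node marks the end of that particular string.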
            if self.transition(node, *s.as_bytes().last().unwrap()) != INVALID {
                is_suffix = true;
                break;
            }
        }

        check_substr || is_suffix
    }

    pub fn pretty_print(&self) {
        self.print_recursive(ROOT, 0);
    }

    fn print_recursive(&self, node: NodeID, space_count: u32) {
        for target_node in self.get_node(node).transitions.values() {
            if *target_node == INVALID {
                continue;
            }
            for _ in 0..space_count {
                print!(" ");
            }
            let slice = &self.get_node(*target_node).substr;
            println!(
                "{}",
                self.get_string_slice(slice.str_id, slice.start, slice.end),
            );
            self.print_recursive(*target_node, space_count + 4);
        }
    }

    fn process_suffixes(&mut self, str_id: StrID) {
        let mut active_point = ReferencePoint::new(ROOT, str_id, 0);
        for i in 0..self.get_string(str_id).len() {
            let mut cur_str =
                MappedSubstring::new(str_id, active_point.index, (i + 1) as IndexType);
            active_point = self.update(active_point.node, &cur_str);
            cur_str.start = active_point.index;
            active_point = self.canonize(active_point.node, &cur_str);
        }
    }

    fn update(&mut self, node: NodeID, cur_str: &MappedSubstring) -> ReferencePoint {
        assert!(!cur_str.is_empty());

        let mut cur_str = cur_str.clone();

        let mut oldr = ROOT;

        let mut split_str = cur_str.clone();
        split_str.end -= 1;

        let last_ch = self.get_char(cur_str.str_id, cur_str.end - 1);

        let mut active_point = ReferencePoint::new(node, cur_str.str_id, cur_str.start);

        let mut r = node;

        let mut is_endpoint = self.test_and_split(node, &split_str, last_ch, &mut r);
        while !is_endpoint {
            let str_len = self.get_string(active_point.str_id).len() as IndexType;
            let leaf_node =
                self.create_node_with_slice(active_point.str_id, cur_str.end - 1, str_len);
            self.set_transition(r, last_ch, leaf_node);
            if oldr != ROOT {
                self.get_node_mut(oldr).suffix_link = r;
            }
            oldr = r;
            let suffix_link = self.get_node(active_point.node).get_suffix_link();
            active_point = self.canonize(suffix_link, &split_str);
            split_str.start = active_point.index;
            cur_str.start = active_point.index;
            is_endpoint = self.test_and_split(active_point.node, &split_str, last_ch, &mut r);
        }
        if oldr != ROOT {
            self.get_node_mut(oldr).suffix_link = active_point.node;
        }
        active_point
    }

    fn test_and_split(
        &mut self,
        node: NodeID,
        split_str: &MappedSubstring,
        ch: CharType,
        r: &mut NodeID,
    ) -> bool {
        if split_str.is_empty() {
            *r = node;
            return self.transition(node, ch) != INVALID;
        }
        let first_ch = self.get_char(split_str.str_id, split_str.start);

        let target_node_id = self.transition(node, first_ch);
        let target_node_slice = self.get_node(target_node_id).substr.clone();

        let split_index = target_node_slice.start + split_str.len();
        let ref_ch = self.get_char(target_node_slice.str_id, split_index);

        if ref_ch == ch {
            *r = node;
            return true;
        }
        // Split target_node into two nodes by inserting r in the middle.
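        // `r` takes over the first part of the edge label (`split_str`); the old target keeps
        // the remainder starting at `split_index`, and the parent's transition on `first_ch`
        // now points to `r`, which in turn points to the old target on `ref_ch`.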
        *r = self.create_node_with_slice(split_str.str_id, split_str.start, split_str.end);
        self.set_transition(*r, ref_ch, target_node_id);
        self.set_transition(node, first_ch, *r);
        self.get_node_mut(target_node_id).substr.start = split_index;

        false
    }

    fn canonize(&mut self, mut node: NodeID, cur_str: &MappedSubstring) -> ReferencePoint {
        let mut cur_str = cur_str.clone();
        loop {
            if cur_str.is_empty() {
                return ReferencePoint::new(node, cur_str.str_id, cur_str.start);
            }

            let ch = self.get_char(cur_str.str_id, cur_str.start);

            let target_node = self.transition(node, ch);
            if target_node == INVALID {
                break;
            }
            let slice = &self.get_node(target_node).substr;
            if slice.len() > cur_str.len() {
                break;
            }
            cur_str.start += slice.len();
            node = target_node;
        }
        ReferencePoint::new(node, cur_str.str_id, cur_str.start)
    }

    fn create_node_with_slice(
        &mut self,
        str_id: StrID,
        start: IndexType,
        end: IndexType,
    ) -> NodeID {
        let node = Node::new(str_id, start, end);
        self.node_storage.push(node);

        (self.node_storage.len() - 1) as NodeID
    }

    fn get_node(&self, node_id: NodeID) -> &Node {
        &self.node_storage[node_id as usize]
    }

    fn get_node_mut(&mut self, node_id: NodeID) -> &mut Node {
        &mut self.node_storage[node_id as usize]
    }

    fn get_string(&self, str_id: StrID) -> &str {
        &self.str_storage[str_id as usize]
    }

    fn get_string_slice(&self, str_id: StrID, start: IndexType, end: IndexType) -> &str {
        &self.get_string(str_id)[start as usize..end as usize]
    }

    fn get_string_slice_short(&self, slice: &MappedSubstring) -> &str {
        self.get_string_slice(slice.str_id, slice.start, slice.end)
    }

    fn transition(&self, node: NodeID, ch: CharType) -> NodeID {
        if node == SINK {
            // SINK always transitions to ROOT.
            return ROOT;
        }
        match self.get_node(node).transitions.get(&ch) {
            None => INVALID,
            Some(x) => *x,
        }
    }

    fn set_transition(&mut self, node: NodeID, ch: CharType, target_node: NodeID) {
        self.get_node_mut(node).transitions.insert(ch, target_node);
    }

    fn get_char(&self, str_id: StrID, index: IndexType) -> u8 {
        assert!((index as usize) < self.get_string(str_id).len());
        self.get_string(str_id).as_bytes()[index as usize]
    }
}
--------------------------------------------------------------------------------
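Below is a minimal usage sketch of the public API (a hypothetical examples/usage.rs, not a file in the repository). It reuses inputs and expected outputs that already appear in the integration tests above, so the assertions reflect behavior the test suite itself checks.

/examples/usage.rs (hypothetical):
--------------------------------------------------------------------------------
use generalized_suffix_tree::GeneralizedSuffixTree;

fn main() {
    let mut tree = GeneralizedSuffixTree::new();
    // Each string is stored together with its own terminator character,
    // which must not occur in any string kept in the tree.
    tree.add_string(String::from("ABCABD"), '$');

    // Longest common substring with a query string, computed without inserting it.
    assert_eq!(tree.longest_common_substring_with("ABDABCA"), "ABCA");

    // Add a second string and ask for the longest substring common to all stored strings.
    tree.add_string(String::from("ABDABCA"), '#');
    assert_eq!(tree.longest_common_substring_all(), "ABCA");

    // Suffix and substring membership queries.
    assert!(tree.is_suffix("ABD"));      // a suffix of "ABCABD"
    assert!(tree.is_substr("DAB"));      // a substring of "ABDABCA"
    assert!(!tree.is_substr("ABCABDA"));
}
--------------------------------------------------------------------------------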