├── crates ├── divsuftest │ ├── .gitignore │ ├── crosscheck │ │ ├── rust │ │ └── c │ ├── Cargo.toml │ └── src │ │ └── main.rs ├── divsufsort │ ├── fuzz │ │ ├── .gitignore │ │ ├── fuzz_targets │ │ │ └── fuzz_target_1.rs │ │ ├── Cargo.toml │ │ └── Cargo.lock │ ├── src │ │ ├── testdata │ │ │ ├── fuzz1 │ │ │ ├── fuzz2 │ │ │ ├── fuzz3 │ │ │ ├── crash-04dc74e45e66386a3312a5a5825b020bcadc175c │ │ │ ├── crash-16356e91966a827f79e49167170194fc3088a7ab │ │ │ ├── crash-4f8c31dec8c3678a07e0fbacc6bd69e7cc9037fb │ │ │ ├── crash-8765ef2258178ca027876eab83e01d6d58db9ca0 │ │ │ ├── crash-90b42d1c55ee90a8b004fb9db1853429ceb4c4ba │ │ │ ├── crash-c792e788de61771b6cd65c1aa5670c62e57a33c4 │ │ │ ├── crash-ce407adf7cf638d3fa89b5637a94355d7d658872 │ │ │ └── crash-cf8673530fdca659e0ddf070b4718b9c0bb504ec │ │ ├── lib.rs │ │ ├── crosscheck.rs │ │ ├── common.rs │ │ └── divsufsort.rs │ ├── papers │ │ └── dismantling-divsufsort-2017.pdf │ ├── Cargo.toml │ ├── LICENSE │ └── README.md ├── dc3 │ ├── README.md │ ├── Cargo.toml │ └── src │ │ └── lib.rs ├── sacabase │ ├── README.md │ ├── Cargo.toml │ └── src │ │ └── lib.rs ├── cdivsufsort │ ├── Cargo.toml │ ├── build.rs │ ├── src │ │ └── lib.rs │ ├── LICENSE │ ├── c-sources │ │ ├── LICENSE │ │ ├── config.h │ │ ├── divsufsort.h │ │ ├── divsufsort_private.h │ │ ├── utils.c │ │ ├── divsufsort.c │ │ ├── trsort.c │ │ └── sssort.c │ └── README.md └── sacapart │ ├── Cargo.toml │ ├── README.md │ └── src │ └── lib.rs ├── .gitignore ├── .travis.yml ├── Cargo.toml ├── shell.nix ├── README.md └── Cargo.lock /crates/divsuftest/.gitignore: -------------------------------------------------------------------------------- 1 | /wavelet-matrix-rs 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /testdata 3 | /crosscheck 4 | **/*.rs.bk 5 | -------------------------------------------------------------------------------- 
/crates/divsufsort/fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target 3 | corpus 4 | artifacts 5 | -------------------------------------------------------------------------------- /crates/dc3/README.md: -------------------------------------------------------------------------------- 1 | 2 | # dc3 3 | 4 | An incomplete, exploratory implementation of DC-3. 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | 3 | rust: 4 | - stable 5 | 6 | os: 7 | - linux 8 | - windows 9 | - osx 10 | -------------------------------------------------------------------------------- /crates/divsufsort/src/testdata/fuzz1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fasterthanlime/stringsearch/HEAD/crates/divsufsort/src/testdata/fuzz1 -------------------------------------------------------------------------------- /crates/divsufsort/src/testdata/fuzz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fasterthanlime/stringsearch/HEAD/crates/divsufsort/src/testdata/fuzz2 -------------------------------------------------------------------------------- /crates/divsufsort/src/testdata/fuzz3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fasterthanlime/stringsearch/HEAD/crates/divsufsort/src/testdata/fuzz3 -------------------------------------------------------------------------------- /crates/dc3/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "dc3" 3 | version = "0.1.0" 4 | authors = ["Amos Wenger "] 5 | edition = "2018" 6 | -------------------------------------------------------------------------------- 
/crates/divsufsort/papers/dismantling-divsufsort-2017.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fasterthanlime/stringsearch/HEAD/crates/divsufsort/papers/dismantling-divsufsort-2017.pdf -------------------------------------------------------------------------------- /crates/divsufsort/fuzz/fuzz_targets/fuzz_target_1.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | use libfuzzer_sys::fuzz_target; 3 | 4 | fuzz_target!(|data: &[u8]| { 5 | divsufsort::sort(data); 6 | }); 7 | -------------------------------------------------------------------------------- /crates/sacabase/README.md: -------------------------------------------------------------------------------- 1 | 2 | # sacabase 3 | 4 | Base types and functions for suffix arrays. 5 | 6 | These are meant to be used when implementing a SACA in Rust, to make substring 7 | search easy. 8 | 9 | -------------------------------------------------------------------------------- /crates/divsufsort/src/testdata/crash-04dc74e45e66386a3312a5a5825b020bcadc175c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fasterthanlime/stringsearch/HEAD/crates/divsufsort/src/testdata/crash-04dc74e45e66386a3312a5a5825b020bcadc175c -------------------------------------------------------------------------------- /crates/divsufsort/src/testdata/crash-16356e91966a827f79e49167170194fc3088a7ab: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fasterthanlime/stringsearch/HEAD/crates/divsufsort/src/testdata/crash-16356e91966a827f79e49167170194fc3088a7ab -------------------------------------------------------------------------------- /crates/divsufsort/src/testdata/crash-4f8c31dec8c3678a07e0fbacc6bd69e7cc9037fb: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fasterthanlime/stringsearch/HEAD/crates/divsufsort/src/testdata/crash-4f8c31dec8c3678a07e0fbacc6bd69e7cc9037fb -------------------------------------------------------------------------------- /crates/divsufsort/src/testdata/crash-8765ef2258178ca027876eab83e01d6d58db9ca0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fasterthanlime/stringsearch/HEAD/crates/divsufsort/src/testdata/crash-8765ef2258178ca027876eab83e01d6d58db9ca0 -------------------------------------------------------------------------------- /crates/divsufsort/src/testdata/crash-90b42d1c55ee90a8b004fb9db1853429ceb4c4ba: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fasterthanlime/stringsearch/HEAD/crates/divsufsort/src/testdata/crash-90b42d1c55ee90a8b004fb9db1853429ceb4c4ba -------------------------------------------------------------------------------- /crates/divsufsort/src/testdata/crash-c792e788de61771b6cd65c1aa5670c62e57a33c4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fasterthanlime/stringsearch/HEAD/crates/divsufsort/src/testdata/crash-c792e788de61771b6cd65c1aa5670c62e57a33c4 -------------------------------------------------------------------------------- /crates/divsufsort/src/testdata/crash-ce407adf7cf638d3fa89b5637a94355d7d658872: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fasterthanlime/stringsearch/HEAD/crates/divsufsort/src/testdata/crash-ce407adf7cf638d3fa89b5637a94355d7d658872 -------------------------------------------------------------------------------- /crates/divsufsort/src/testdata/crash-cf8673530fdca659e0ddf070b4718b9c0bb504ec: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fasterthanlime/stringsearch/HEAD/crates/divsufsort/src/testdata/crash-cf8673530fdca659e0ddf070b4718b9c0bb504ec -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "crates/sacabase", 4 | "crates/sacapart", 5 | "crates/divsufsort", 6 | "crates/cdivsufsort", 7 | "crates/divsuftest", 8 | "crates/dc3", 9 | ] 10 | 11 | [profile.release] 12 | debug = true 13 | 14 | 15 | -------------------------------------------------------------------------------- /crates/divsuftest/crosscheck/rust: -------------------------------------------------------------------------------- 1 | :: sssort(A) 2 | 7 1 2 3 4 6 3 | :: sssort(B) 4 | 4 6 -2 -4 0 7 5 | pascal limit=1 first=3 last=6 6 | insertionsort last-first=3 7 | pascal limit=-3 first=3 last=6 8 | first"] 5 | edition = "2018" 6 | 7 | [features] 8 | crosscheck = ["divsufsort/crosscheck", "cdivsufsort/crosscheck"] 9 | 10 | [dependencies] 11 | divsufsort = { path = "../divsufsort" } 12 | cdivsufsort = { path = "../cdivsufsort" } 13 | suffix_array = "0.4.0" 14 | better-panic = "0.2.0" 15 | size_format = "1.0.2" 16 | cli-table = "0.2.0" 17 | pico-args = "0.3.0" 18 | failure = "0.1.6" 19 | -------------------------------------------------------------------------------- /crates/sacabase/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sacabase" 3 | version = "2.0.0" 4 | authors = ["Amos Wenger "] 5 | edition = "2018" 6 | 7 | description = "Base types and functions for suffix arrays and longest substring search" 8 | repository = "https://github.com/fasterthanlime/stringsearch" 9 | readme = "README.md" 10 | keywords = ["saca", "suffix", "search", "index"] 11 | categories = ["algorithms", "data-structures"] 12 | license = "MIT" 13 | 14 | [dependencies] 15 | num-traits = "0.2.9" 16 | 
-------------------------------------------------------------------------------- /crates/divsufsort/fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | 2 | [package] 3 | name = "divsufsort-fuzz" 4 | version = "0.0.0" 5 | authors = ["Automatically generated"] 6 | publish = false 7 | edition = "2018" 8 | 9 | [package.metadata] 10 | cargo-fuzz = true 11 | 12 | [dependencies.divsufsort] 13 | path = ".." 14 | [dependencies.libfuzzer-sys] 15 | git = "https://github.com/rust-fuzz/libfuzzer-sys.git" 16 | 17 | # Prevent this from interfering with workspaces 18 | [workspace] 19 | members = ["."] 20 | 21 | [[bin]] 22 | name = "fuzz_target_1" 23 | path = "fuzz_targets/fuzz_target_1.rs" 24 | -------------------------------------------------------------------------------- /crates/cdivsufsort/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cdivsufsort" 3 | version = "2.0.0" 4 | authors = ["Amos Wenger "] 5 | edition = "2018" 6 | 7 | description = "Rust bindings for Yuta Mori's divsufsort" 8 | repository = "https://github.com/fasterthanlime/stringsearch" 9 | readme = "README.md" 10 | keywords = ["saca", "suffix", "search", "index", "divsufsort"] 11 | categories = ["algorithms", "api-bindings"] 12 | license = "MIT" 13 | 14 | [features] 15 | crosscheck = [] 16 | 17 | [dependencies] 18 | sacabase = { path = "../sacabase", version = "2.0.0" } 19 | 20 | [build-dependencies] 21 | cc = "1.0.47" 22 | -------------------------------------------------------------------------------- /crates/divsufsort/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "divsufsort" 3 | version = "2.0.0" 4 | authors = ["Amos Wenger "] 5 | edition = "2018" 6 | 7 | description = "Rust port of Yuta Mori's divsufsort" 8 | repository = "https://github.com/fasterthanlime/stringsearch" 9 | readme = "README.md" 10 | keywords = 
["saca", "suffix", "search", "index", "divsufsort"] 11 | categories = ["algorithms", "data-structures"] 12 | license = "MIT" 13 | 14 | [features] 15 | crosscheck = ["once_cell"] 16 | 17 | [dependencies] 18 | once_cell = { version = "1.2.0", optional = true } 19 | sacabase = { path = "../sacabase", version = "2.0.0" } 20 | -------------------------------------------------------------------------------- /crates/sacapart/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sacapart" 3 | version = "2.0.0" 4 | authors = ["Amos Wenger "] 5 | edition = "2018" 6 | 7 | description = "Partitioned suffix arrays, for use with `sacabase`" 8 | repository = "https://github.com/fasterthanlime/stringsearch" 9 | readme = "README.md" 10 | keywords = ["saca", "suffix", "search", "index"] 11 | categories = ["algorithms", "data-structures"] 12 | license = "MIT" 13 | 14 | [dependencies] 15 | sacabase = { path = "../sacabase", version = "2.0.0" } 16 | num-traits = "0.2.9" 17 | rayon = "1.2.1" 18 | 19 | [dev-dependencies] 20 | divsufsort = { path = "../divsufsort", version = "2.0.0" } 21 | -------------------------------------------------------------------------------- /shell.nix: -------------------------------------------------------------------------------- 1 | { pkgs ? 
import {} }: 2 | 3 | let 4 | moz_overlay = import (builtins.fetchTarball "https://github.com/mozilla/nixpkgs-mozilla/archive/master.tar.gz"); 5 | nixpkgs = import { overlays = [ moz_overlay ]; }; 6 | custom = import (builtins.fetchTarball { 7 | name = "killercup-nixpkgs"; 8 | url = "https://github.com/killercup/nixpkgs/archive/cargo-fuzz-0.5.4.tar.gz"; 9 | }) {}; 10 | in pkgs.mkShell { 11 | buildInputs = with pkgs; [ 12 | git 13 | (nixpkgs.rustChannelOf { date = "2019-10-23"; channel = "nightly"; }).rust 14 | custom.pkgs.cargo-fuzz 15 | ]; 16 | 17 | RUSTFLAGS="-C link-arg=-fuse-ld=gold"; 18 | RUST_BACKTRACE = 1; 19 | } 20 | 21 | -------------------------------------------------------------------------------- /crates/cdivsufsort/build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | let mut build = cc::Build::new(); 3 | 4 | build.flag("-DHAVE_CONFIG_H=1").warnings(false); 5 | 6 | let profile = std::env::var("PROFILE").unwrap(); 7 | match profile.as_str() { 8 | "debug" => { 9 | // muffin 10 | } 11 | "release" => { 12 | if build.get_compiler().is_like_msvc() { 13 | build.flag("/Oi").flag("/Ot").flag("/Ox").flag("/Oy"); 14 | } 15 | } 16 | _ => {} 17 | }; 18 | if std::env::var("CARGO_FEATURE_CROSSCHECK").is_ok() { 19 | build.flag("-DENABLE_CROSSCHECK=1"); 20 | } 21 | 22 | build 23 | .file("c-sources/divsufsort.c") 24 | .file("c-sources/sssort.c") 25 | .file("c-sources/trsort.c") 26 | .file("c-sources/utils.c"); 27 | 28 | build.compile("libdivsufsort.a"); 29 | } 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # suffixsearch 3 | 4 | [![Build Status](https://travis-ci.org/fasterthanlime/suffixsearch.svg?branch=master)](https://travis-ci.org/fasterthanlime/suffixsearch) 5 | 6 | A collection of SACAs (suffix array construction algorithms) and other 7 | methods of indexing and 
searching for substrings in all suffixes of a 8 | given input. 9 | 10 | ## Crates 11 | 12 | * [divsufsort](crates/divsufsort) is a Rust version of Yuta Mori's `libdivsufsort`, ported by hand 13 | * [cdivsufsort](crates/cdivsufsort) is Yuta Mori's original `libdivsufsort`, built with the `cc` crate 14 | * [divsuftest](crates/divsuftest) is a test executable that allows comparing against the 15 | above crates. 16 | * [dc3](crates/dc3) is a naive work-in-progress implementation of DC3 (Differential Cover, v=3) 17 | 18 | See the crates' README files for more information on their status, 19 | expected performance and licensing. 20 | -------------------------------------------------------------------------------- /crates/sacapart/README.md: -------------------------------------------------------------------------------- 1 | 2 | # sacapart 3 | 4 | Computing the suffix array (the lexicographic order of all suffixes of a 5 | text) is expensive, especially as the text gets large. 6 | 7 | Sometimes, for very large inputs, a compromise is possible. Instead of 8 | computing the suffix array of the *whole text*, we can compute the suffix 9 | array of the first half, and the suffix array of the second half. 10 | 11 | Memory usage remains roughly the same (depending on the SACA used), lookup 12 | time gets worse by a constant factor (the number of partitions), and, across 13 | partition boundaries, worse (shorter) matches are sometimes found. 14 | 15 | For some applications, like diffing very large files, this compromise makes 16 | sense. Read the docs and the tests to see if `sacapart` is right for you. 17 | 18 | Note: `sacapart` is meant to be used in conjunction with a SACA that supports 19 | `sacabase`, like `divsufsort`. 
20 | -------------------------------------------------------------------------------- /crates/cdivsufsort/src/lib.rs: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | fn divsufsort(T: *const u8, SA: *mut i32, n: i32) -> i32; 3 | pub fn dss_flush(); 4 | } 5 | 6 | /// Sort suffixes of `text` and store their lexicographic order 7 | /// in the given suffix array `sa`. 8 | /// Will panic if `sa.len()` != `text.len()`, or if `text.len()` is not smaller than `i32::max_value()` 9 | pub fn sort_in_place(text: &[u8], sa: &mut [i32]) { 10 | assert_eq!( 11 | text.len(), 12 | sa.len(), 13 | "text and suffix array should have same len" 14 | ); 15 | assert!( 16 | text.len() < i32::max_value() as usize, 17 | "text too large, should not exceed {} bytes", 18 | i32::max_value() - 1 19 | ); 20 | 21 | let ret = unsafe { divsufsort(text.as_ptr(), sa.as_mut_ptr(), text.len() as i32) }; 22 | assert_eq!(0, ret); 23 | } 24 | 25 | /// Sort suffixes of `text` into a newly allocated array and return it as a `sacabase::SuffixArray` 26 | pub fn sort<'a>(text: &'a [u8]) -> sacabase::SuffixArray { 27 | let mut sa = vec![0; text.len()]; 28 | sort_in_place(text, &mut sa); 29 | sacabase::SuffixArray::new(text, sa) 30 | } 31 | -------------------------------------------------------------------------------- /crates/cdivsufsort/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019 Amos Wenger All rights reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /crates/divsufsort/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019 Amos Wenger All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /crates/cdivsufsort/c-sources/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2003 Yuta Mori All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /crates/cdivsufsort/README.md: -------------------------------------------------------------------------------- 1 | 2 | # cdivsufsort 3 | 4 | This crate contains Yuta Mori's C codebase `libdivsufsort`, as found on: 5 | 6 | * 7 | 8 | ...and a minimal Rust interface to it. 9 | 10 | ## Changes 11 | 12 | There are no functional changes to the codebase, however: 13 | 14 | * Parts of the code have been formatted with clang-format (LLVM style) 15 | * Many of the loops (for, do..while) have been given names in comments, for 16 | ease of translation. 17 | * The codebase contains "cross-checking" facilities (the macros `crosscheck`, 18 | `SA_dump`, etc.) so its behavior can be compared with the Rust port. 19 | 20 | Cross-checking is only built when the `crosscheck` feature is enabled. It is 21 | not intended for general use, only for debugging the `divsufsort` crate. 22 | 23 | ## Further reading 24 | 25 | The divsufsort algorithm is based on "", 26 | 27 | ## Authors 28 | 29 | The original code was written by Yuta Mori, and its essence is not changed 30 | here. 31 | 32 | ## License 33 | 34 | `cdivsufsort` is released under the MIT license, same as the original. 35 | 36 | See the `LICENSE` and `c-sources/LICENSE` files for details. 37 | 38 | -------------------------------------------------------------------------------- /crates/divsufsort/README.md: -------------------------------------------------------------------------------- 1 | 2 | # divsufsort 3 | 4 | This crate contains a Rust handmade port of Yuta Mori's `libdivsufsort`, as found on: 5 | 6 | * 7 | 8 | ## Changes 9 | 10 | The main changes from the C codebase are as follows. 11 | 12 | Instead of passing pointers to T (the original text) and SA (the suffix array), 13 | slices and indices are passed instead. This sometimes involves adding more parameters 14 | to functions (like `tr_heapsort`). 
15 | 16 | Some macros (for stacks, used in `sssort` and `trsort`) have been replaced with 17 | proper Rust types. The `SAPtr` type is used to represent an index into `SA`. 18 | A/B/B* access has also been translated from C macros to Rust (inlined) functions. 19 | 20 | Cross-checking is only built when the `crosscheck` feature is enabled. It is 21 | not intended for general use, only for debugging the `divsufsort` crate. 22 | 23 | ## Authors 24 | 25 | The original C code was written by Yuta Mori. 26 | 27 | The port was done by hand, by [Amos Wenger](https://github.com/fasterthanlime). 28 | 29 | ## License 30 | 31 | `divsufsort` is released under the MIT license, same as the original. 32 | 33 | See the `LICENSE` file for details. 34 | -------------------------------------------------------------------------------- /crates/divsufsort/fuzz/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | [[package]] 4 | name = "arbitrary" 5 | version = "0.1.1" 6 | source = "registry+https://github.com/rust-lang/crates.io-index" 7 | 8 | [[package]] 9 | name = "autocfg" 10 | version = "0.1.7" 11 | source = "registry+https://github.com/rust-lang/crates.io-index" 12 | 13 | [[package]] 14 | name = "cc" 15 | version = "1.0.47" 16 | source = "registry+https://github.com/rust-lang/crates.io-index" 17 | 18 | [[package]] 19 | name = "divsufsort" 20 | version = "0.1.0" 21 | dependencies = [ 22 | "sacabase 0.1.0", 23 | ] 24 | 25 | [[package]] 26 | name = "divsufsort-fuzz" 27 | version = "0.0.0" 28 | dependencies = [ 29 | "divsufsort 0.1.0", 30 | "libfuzzer-sys 0.1.0 (git+https://github.com/rust-fuzz/libfuzzer-sys.git)", 31 | ] 32 | 33 | [[package]] 34 | name = "libfuzzer-sys" 35 | version = "0.1.0" 36 | source = "git+https://github.com/rust-fuzz/libfuzzer-sys.git#4ad88ec54fa2e7a0c877b6477cc86ea0de14a90d" 37 | dependencies = [ 38 | "arbitrary 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", 39 | "cc 1.0.47 (registry+https://github.com/rust-lang/crates.io-index)", 40 | ] 41 | 42 | [[package]] 43 | name = "num-traits" 44 | version = "0.2.9" 45 | source = "registry+https://github.com/rust-lang/crates.io-index" 46 | dependencies = [ 47 | "autocfg 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", 48 | ] 49 | 50 | [[package]] 51 | name = "sacabase" 52 | version = "0.1.0" 53 | dependencies = [ 54 | "num-traits 0.2.9 (registry+https://github.com/rust-lang/crates.io-index)", 55 | ] 56 | 57 | [metadata] 58 | "checksum arbitrary 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "6c7d1523aa3a127adf8b27af2404c03c12825b4c4d0698f01648d63fa9df62ee" 59 | "checksum autocfg 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "1d49d90015b3c36167a20fe2810c5cd875ad504b39cff3d4eae7977e6b7c1cb2" 60 | "checksum cc 1.0.47 (registry+https://github.com/rust-lang/crates.io-index)" = 
"aa87058dce70a3ff5621797f1506cb837edd02ac4c0ae642b4542dce802908b8" 61 | "checksum libfuzzer-sys 0.1.0 (git+https://github.com/rust-fuzz/libfuzzer-sys.git)" = "" 62 | "checksum num-traits 0.2.9 (registry+https://github.com/rust-lang/crates.io-index)" = "443c53b3c3531dfcbfa499d8893944db78474ad7a1d87fa2d94d1a2231693ac6" 63 | -------------------------------------------------------------------------------- /crates/divsufsort/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![allow(nonstandard_style)] 2 | #![allow(unused_variables)] 3 | #![allow(unused_parens)] 4 | #![allow(unused_mut)] 5 | #![allow(unused_imports)] 6 | #![allow(dead_code)] 7 | 8 | mod common; 9 | pub mod crosscheck; 10 | mod divsufsort; 11 | mod sssort; 12 | mod trsort; 13 | 14 | use common::Idx; 15 | use sacabase::SuffixArray; 16 | 17 | /// Sort suffixes of `text` and store their lexographic order 18 | /// in the given suffix array `sa`. 19 | /// Will panic if `sa.len()` != `text.len()` 20 | pub fn sort_in_place(text: &[u8], sa: &mut [Idx]) { 21 | divsufsort::divsufsort(text, sa); 22 | } 23 | 24 | //// Sort suffixes 25 | pub fn sort(text: &[u8]) -> sacabase::SuffixArray { 26 | let mut sa = vec![0; text.len()]; 27 | sort_in_place(text, &mut sa); 28 | sacabase::SuffixArray::new(text, sa) 29 | } 30 | 31 | #[cfg(test)] 32 | mod tests { 33 | #[test] 34 | fn fuzz1() { 35 | sort(include_bytes!("./testdata/fuzz1")); 36 | } 37 | 38 | #[test] 39 | fn fuzz2() { 40 | sort(include_bytes!("./testdata/fuzz2")); 41 | } 42 | 43 | #[test] 44 | fn fuzz3() { 45 | sort(include_bytes!("./testdata/fuzz3")); 46 | } 47 | 48 | #[test] 49 | fn fuzz_cf86735() { 50 | sort(include_bytes!("./testdata/crash-cf8673530fdca659e0ddf070b4718b9c0bb504ec")); 51 | } 52 | 53 | #[test] 54 | fn fuzz_ce407ad() { 55 | sort(include_bytes!("./testdata/crash-ce407adf7cf638d3fa89b5637a94355d7d658872")); 56 | } 57 | 58 | #[test] 59 | fn fuzz_c792e78() { 60 | 
sort(include_bytes!("./testdata/crash-c792e788de61771b6cd65c1aa5670c62e57a33c4")); 61 | } 62 | 63 | #[test] 64 | fn fuzz_90b42d1() { 65 | sort(include_bytes!("./testdata/crash-90b42d1c55ee90a8b004fb9db1853429ceb4c4ba")); 66 | } 67 | 68 | #[test] 69 | fn fuzz_8765ef2() { 70 | sort(include_bytes!("./testdata/crash-8765ef2258178ca027876eab83e01d6d58db9ca0")); 71 | } 72 | 73 | #[test] 74 | fn fuzz_4f8c31d() { 75 | sort(include_bytes!("./testdata/crash-4f8c31dec8c3678a07e0fbacc6bd69e7cc9037fb")); 76 | } 77 | 78 | #[test] 79 | fn fuzz_16356e9() { 80 | sort(include_bytes!("./testdata/crash-16356e91966a827f79e49167170194fc3088a7ab")); 81 | } 82 | 83 | #[test] 84 | fn shruggy() { 85 | sort(r#"¯\_(ツ)_/¯"#.as_bytes()); 86 | } 87 | 88 | fn sort(s: &[u8]) { 89 | let sa = super::sort(s); 90 | sa.verify().unwrap(); 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /crates/divsufsort/src/crosscheck.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "crosscheck")] 2 | use once_cell::sync::Lazy; 3 | 4 | use crate::common::{ABucket, BMixBucket, Idx, SuffixArray, ALPHABET_SIZE}; 5 | use std::{ 6 | fs::File, 7 | io::{BufWriter, Write}, 8 | sync::Mutex, 9 | }; 10 | 11 | #[cfg(feature = "crosscheck")] 12 | pub static CROSSCHECK_FILE: Lazy>> = Lazy::new(|| { 13 | std::fs::create_dir_all("crosscheck").unwrap(); 14 | Mutex::new(BufWriter::new(File::create("crosscheck/rust").unwrap())) 15 | }); 16 | 17 | #[macro_export] 18 | macro_rules! crosscheck { 19 | ($($arg: expr),*) => { 20 | #[cfg(feature = "crosscheck")] 21 | { 22 | use std::io::Write; 23 | let mut f = crate::crosscheck::CROSSCHECK_FILE.lock().unwrap(); 24 | writeln!(f, $($arg),*).unwrap(); 25 | } 26 | }; 27 | } 28 | 29 | pub fn flush() { 30 | #[cfg(feature = "crosscheck")] 31 | { 32 | let mut f = crate::crosscheck::CROSSCHECK_FILE.lock().unwrap(); 33 | f.flush().unwrap(); 34 | } 35 | } 36 | 37 | #[macro_export] 38 | macro_rules! 
SA_dump { 39 | ($SA: expr, $label: expr) => { 40 | #[cfg(feature = "crosscheck")] 41 | { 42 | use std::io::Write; 43 | let mut f = crate::crosscheck::CROSSCHECK_FILE.lock().unwrap(); 44 | 45 | writeln!(f, ":: {}", $label).unwrap(); 46 | for i in 0..$SA.0.len() { 47 | write!(f, "{} ", $SA.0[i]).unwrap(); 48 | if (i + 1) % 25 == 0 { 49 | writeln!(f).unwrap(); 50 | } 51 | } 52 | writeln!(f).unwrap(); 53 | } 54 | }; 55 | } 56 | 57 | #[macro_export] 58 | macro_rules! A_dump { 59 | ($A: expr, $label: expr) => { 60 | #[cfg(feature = "crosscheck")] 61 | { 62 | crosscheck!(":: {}", $label); 63 | crosscheck!("A = {:?}", $A.0); 64 | } 65 | }; 66 | } 67 | 68 | #[macro_export] 69 | macro_rules! BSTAR_dump { 70 | ($B: expr, $label: expr) => { 71 | #[cfg(feature = "crosscheck")] 72 | { 73 | use std::io::Write; 74 | let mut f = crate::crosscheck::CROSSCHECK_FILE.lock().unwrap(); 75 | 76 | writeln!(f, "{} B* dump:", $label).unwrap(); 77 | for ii in 0..(ALPHABET_SIZE as Idx) { 78 | for jj in 0..(ALPHABET_SIZE as Idx) { 79 | writeln!(f, "{} B*[{},{}]={}", $label, ii, jj, $B.bstar()[(ii, jj)]).unwrap(); 80 | } 81 | } 82 | } 83 | }; 84 | } 85 | -------------------------------------------------------------------------------- /crates/cdivsufsort/c-sources/config.h: -------------------------------------------------------------------------------- 1 | /* 2 | * config.h for libdivsufsort 3 | * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. 
4 | * 5 | * Permission is hereby granted, free of charge, to any person 6 | * obtaining a copy of this software and associated documentation 7 | * files (the "Software"), to deal in the Software without 8 | * restriction, including without limitation the rights to use, 9 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following 12 | * conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be 15 | * included in all copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | * OTHER DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | #ifndef _CONFIG_H 28 | #define _CONFIG_H 1 29 | 30 | #ifdef __cplusplus 31 | extern "C" { 32 | #endif /* __cplusplus */ 33 | 34 | /** Define to the version of this package. **/ 35 | #define PROJECT_VERSION_FULL "2.0.1-14-g5f60d6f" 36 | 37 | /** Define to 1 if you have the header files. 
**/ 38 | #define HAVE_INTTYPES_H 1 39 | #define HAVE_STDDEF_H 1 40 | #define HAVE_STDINT_H 1 41 | #define HAVE_STDLIB_H 1 42 | #define HAVE_STRING_H 1 43 | /* #undef HAVE_STRINGS_H */ 44 | #define HAVE_MEMORY_H 1 45 | #define HAVE_SYS_TYPES_H 1 46 | 47 | /** for WinIO **/ 48 | /* #undef HAVE_IO_H */ 49 | /* #undef HAVE_FCNTL_H */ 50 | /* #undef HAVE__SETMODE */ 51 | /* #undef HAVE_SETMODE */ 52 | /* #undef HAVE__FILENO */ 53 | /* #undef HAVE_FOPEN_S */ 54 | /* #undef HAVE__O_BINARY */ 55 | #ifndef HAVE__SETMODE 56 | # if HAVE_SETMODE 57 | # define _setmode setmode 58 | # define HAVE__SETMODE 1 59 | # endif 60 | # if HAVE__SETMODE && !HAVE__O_BINARY 61 | # define _O_BINARY 0 62 | # define HAVE__O_BINARY 1 63 | # endif 64 | #endif 65 | 66 | /** for inline **/ 67 | #ifndef INLINE 68 | # define INLINE inline 69 | #endif 70 | 71 | /** for VC++ warning **/ 72 | #ifdef _MSC_VER 73 | #pragma warning(disable: 4127) 74 | #endif 75 | 76 | 77 | #ifdef __cplusplus 78 | } /* extern "C" */ 79 | #endif /* __cplusplus */ 80 | 81 | #endif /* _CONFIG_H */ 82 | -------------------------------------------------------------------------------- /crates/sacabase/src/lib.rs: -------------------------------------------------------------------------------- 1 | use num_traits::ToPrimitive; 2 | use std::{cmp::min, fmt}; 3 | 4 | pub struct LongestCommonSubstring<'a> { 5 | pub text: &'a [u8], 6 | pub start: usize, 7 | pub len: usize, 8 | } 9 | 10 | impl<'a> fmt::Debug for LongestCommonSubstring<'a> { 11 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 12 | write!(f, "T[{}..{}]", self.start, self.start + self.len) 13 | } 14 | } 15 | 16 | impl<'a> LongestCommonSubstring<'a> { 17 | #[inline(always)] 18 | pub fn as_bytes(&self) -> &[u8] { 19 | &self.text[self.start..self.start + self.len] 20 | } 21 | } 22 | 23 | /// Returns the number of bytes `a` and `b` have in common. 
24 | /// Ex: `common_prefix_len("banana", "banter") = 3` 25 | #[inline(always)] 26 | pub fn common_prefix_len(a: &[u8], b: &[u8]) -> usize { 27 | // TODO: try to exploit SSE 4.2 28 | let n = min(a.len(), b.len()); 29 | for i in 0..n { 30 | if a[i] != b[i] { 31 | return i; 32 | } 33 | } 34 | n 35 | } 36 | 37 | /// Searches for the longest substring match for `needle` 38 | /// in `input`, using its suffix array `sa`. 39 | pub fn longest_substring_match<'a, Index>( 40 | text: &'a [u8], 41 | mut sa: &[Index], 42 | needle: &[u8], 43 | ) -> LongestCommonSubstring<'a> 44 | where 45 | Index: ToPrimitive, 46 | { 47 | macro_rules! sa { 48 | ($x: expr) => { 49 | sa[$x].to_usize().unwrap() 50 | }; 51 | } 52 | 53 | macro_rules! suff { 54 | ($x: expr) => { 55 | &text[sa!($x)..] 56 | }; 57 | } 58 | 59 | macro_rules! len { 60 | ($x: expr) => { 61 | common_prefix_len(suff!($x), needle) 62 | }; 63 | } 64 | 65 | macro_rules! lcs { 66 | ($start: expr, $len: expr) => { 67 | LongestCommonSubstring { 68 | text: text, 69 | start: $start, 70 | len: $len, 71 | } 72 | }; 73 | } 74 | 75 | loop { 76 | match sa.len() { 77 | 1 => { 78 | return lcs!(sa!(0), len!(0)); 79 | } 80 | 2 => { 81 | let x = len!(0); 82 | let y = len!(1); 83 | return if x > y { 84 | lcs!(sa!(0), x) 85 | } else { 86 | lcs!(sa!(1), y) 87 | }; 88 | } 89 | _ => { 90 | let mid = sa.len() / 2; 91 | if needle > suff!(mid) { 92 | sa = &sa[mid..]; 93 | } else { 94 | sa = &sa[..=mid]; 95 | } 96 | } 97 | } 98 | } 99 | } 100 | 101 | /// Error returned by `verify` when a suffix array is not sorted. 
102 | pub struct NotSorted { 103 | i: usize, 104 | j: usize, 105 | } 106 | 107 | impl fmt::Debug for NotSorted { 108 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 109 | write!( 110 | f, 111 | "invariant doesn't hold: suf(SA({})) < suf(SA({}))", 112 | self.i, self.j 113 | ) 114 | } 115 | } 116 | 117 | impl fmt::Display for NotSorted { 118 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 119 | fmt::Debug::fmt(self, f) 120 | } 121 | } 122 | 123 | impl std::error::Error for NotSorted {} 124 | 125 | /// Returns an error if `sa` is not the suffix array of `input`, 126 | /// Ok(()) otherwise. 127 | pub fn verify(input: &[u8], sa: &[Index]) -> Result<(), NotSorted> 128 | where 129 | Index: ToPrimitive, 130 | { 131 | macro_rules! sa { 132 | ($x: expr) => { 133 | sa[$x].to_usize().unwrap() 134 | }; 135 | } 136 | 137 | macro_rules! suff { 138 | ($x: expr) => { 139 | &input[sa!($x)..] 140 | }; 141 | } 142 | 143 | for i in 0..(input.len() - 1) { 144 | if !(suff!(i) < suff!(i + 1)) { 145 | return Err(NotSorted { i: i, j: i + 1 }); 146 | } 147 | } 148 | Ok(()) 149 | } 150 | 151 | /// A suffix array 152 | pub struct SuffixArray<'a, Index> 153 | where 154 | Index: ToPrimitive, 155 | { 156 | sa: Vec, 157 | text: &'a [u8], 158 | } 159 | 160 | pub trait StringIndex<'a> { 161 | /// Returns the longest substring that matches `needle` in text 162 | fn longest_substring_match(&self, needle: &[u8]) -> LongestCommonSubstring<'a>; 163 | } 164 | 165 | impl<'a, Index> SuffixArray<'a, Index> 166 | where 167 | Index: ToPrimitive, 168 | { 169 | /// Create an instance of SuffixArray, taking ownership of `sa` 170 | pub fn new(text: &'a [u8], sa: Vec) -> Self { 171 | Self { sa, text } 172 | } 173 | 174 | /// Return (text, sa), giving back ownership of `sa` 175 | pub fn into_parts(self) -> (&'a [u8], Vec) { 176 | (self.text, self.sa) 177 | } 178 | 179 | /// Verifies that this suffix array is sorted. 
180 | pub fn verify(&self) -> Result<(), NotSorted> { 181 | verify(self.text, &self.sa[..]) 182 | } 183 | 184 | /// Returns a reference to the text 185 | pub fn text(&self) -> &[u8] { 186 | return self.text; 187 | } 188 | } 189 | 190 | impl<'a, Index> StringIndex<'a> for SuffixArray<'a, Index> 191 | where 192 | Index: ToPrimitive, 193 | { 194 | fn longest_substring_match(&self, needle: &[u8]) -> LongestCommonSubstring<'a> { 195 | longest_substring_match(self.text, &self.sa[..], needle) 196 | } 197 | } 198 | 199 | #[cfg(test)] 200 | mod tests { 201 | #[test] 202 | fn it_works() { 203 | assert_eq!(2 + 2, 4); 204 | } 205 | } 206 | -------------------------------------------------------------------------------- /crates/cdivsufsort/c-sources/divsufsort.h: -------------------------------------------------------------------------------- 1 | /* 2 | * divsufsort.h for libdivsufsort 3 | * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. 4 | * 5 | * Permission is hereby granted, free of charge, to any person 6 | * obtaining a copy of this software and associated documentation 7 | * files (the "Software"), to deal in the Software without 8 | * restriction, including without limitation the rights to use, 9 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following 12 | * conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be 15 | * included in all copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | * OTHER DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | #ifndef _DIVSUFSORT_H 28 | #define _DIVSUFSORT_H 1 29 | 30 | #ifdef __cplusplus 31 | extern "C" { 32 | #endif /* __cplusplus */ 33 | 34 | #include 35 | 36 | #ifndef DIVSUFSORT_API 37 | # ifdef DIVSUFSORT_BUILD_DLL 38 | # define DIVSUFSORT_API 39 | # else 40 | # define DIVSUFSORT_API 41 | # endif 42 | #endif 43 | 44 | /*- Datatypes -*/ 45 | #ifndef SAUCHAR_T 46 | #define SAUCHAR_T 47 | typedef uint8_t sauchar_t; 48 | #endif /* SAUCHAR_T */ 49 | #ifndef SAINT_T 50 | #define SAINT_T 51 | typedef int32_t saint_t; 52 | #endif /* SAINT_T */ 53 | #ifndef SAIDX_T 54 | #define SAIDX_T 55 | typedef int32_t saidx_t; 56 | #endif /* SAIDX_T */ 57 | #ifndef PRIdSAINT_T 58 | #define PRIdSAINT_T PRId32 59 | #endif /* PRIdSAINT_T */ 60 | #ifndef PRIdSAIDX_T 61 | #define PRIdSAIDX_T PRId32 62 | #endif /* PRIdSAIDX_T */ 63 | 64 | 65 | /*- Prototypes -*/ 66 | 67 | /** 68 | * Constructs the suffix array of a given string. 69 | * @param T[0..n-1] The input string. 70 | * @param SA[0..n-1] The output array of suffixes. 71 | * @param n The length of the given string. 72 | * @return 0 if no error occurred, -1 or -2 otherwise. 73 | */ 74 | DIVSUFSORT_API 75 | saint_t 76 | divsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n); 77 | 78 | /** 79 | * Constructs the burrows-wheeler transformed string of a given string. 80 | * @param T[0..n-1] The input string. 81 | * @param U[0..n-1] The output string. (can be T) 82 | * @param A[0..n-1] The temporary array. (can be NULL) 83 | * @param n The length of the given string. 84 | * @return The primary index if no error occurred, -1 or -2 otherwise. 
85 | */ 86 | DIVSUFSORT_API 87 | saidx_t 88 | divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n); 89 | 90 | /** 91 | * Returns the version of the divsufsort library. 92 | * @return The version number string. 93 | */ 94 | DIVSUFSORT_API 95 | const char * 96 | divsufsort_version(void); 97 | 98 | 99 | /** 100 | * Constructs the burrows-wheeler transformed string of a given string and suffix array. 101 | * @param T[0..n-1] The input string. 102 | * @param U[0..n-1] The output string. (can be T) 103 | * @param SA[0..n-1] The suffix array. (can be NULL) 104 | * @param n The length of the given string. 105 | * @param idx The output primary index. 106 | * @return 0 if no error occurred, -1 or -2 otherwise. 107 | */ 108 | DIVSUFSORT_API 109 | saint_t 110 | bw_transform(const sauchar_t *T, sauchar_t *U, 111 | saidx_t *SA /* can NULL */, 112 | saidx_t n, saidx_t *idx); 113 | 114 | /** 115 | * Inverse BW-transforms a given BWTed string. 116 | * @param T[0..n-1] The input string. 117 | * @param U[0..n-1] The output string. (can be T) 118 | * @param A[0..n-1] The temporary array. (can be NULL) 119 | * @param n The length of the given string. 120 | * @param idx The primary index. 121 | * @return 0 if no error occurred, -1 or -2 otherwise. 122 | */ 123 | DIVSUFSORT_API 124 | saint_t 125 | inverse_bw_transform(const sauchar_t *T, sauchar_t *U, 126 | saidx_t *A /* can NULL */, 127 | saidx_t n, saidx_t idx); 128 | 129 | /** 130 | * Checks the correctness of a given suffix array. 131 | * @param T[0..n-1] The input string. 132 | * @param SA[0..n-1] The input suffix array. 133 | * @param n The length of the given string. 134 | * @param verbose The verbose mode. 135 | * @return 0 if no error occurred. 136 | */ 137 | DIVSUFSORT_API 138 | saint_t 139 | sufcheck(const sauchar_t *T, const saidx_t *SA, saidx_t n, saint_t verbose); 140 | 141 | /** 142 | * Search for the pattern P in the string T. 143 | * @param T[0..Tsize-1] The input string. 
144 | * @param Tsize The length of the given string. 145 | * @param P[0..Psize-1] The input pattern string. 146 | * @param Psize The length of the given pattern string. 147 | * @param SA[0..SAsize-1] The input suffix array. 148 | * @param SAsize The length of the given suffix array. 149 | * @param idx The output index. 150 | * @return The count of matches if no error occurred, -1 otherwise. 151 | */ 152 | DIVSUFSORT_API 153 | saidx_t 154 | sa_search(const sauchar_t *T, saidx_t Tsize, 155 | const sauchar_t *P, saidx_t Psize, 156 | const saidx_t *SA, saidx_t SAsize, 157 | saidx_t *left); 158 | 159 | /** 160 | * Search for the character c in the string T. 161 | * @param T[0..Tsize-1] The input string. 162 | * @param Tsize The length of the given string. 163 | * @param SA[0..SAsize-1] The input suffix array. 164 | * @param SAsize The length of the given suffix array. 165 | * @param c The input character. 166 | * @param idx The output index. 167 | * @return The count of matches if no error occurred, -1 otherwise. 
168 | */ 169 | DIVSUFSORT_API 170 | saidx_t 171 | sa_simplesearch(const sauchar_t *T, saidx_t Tsize, 172 | const saidx_t *SA, saidx_t SAsize, 173 | saint_t c, saidx_t *left); 174 | 175 | 176 | DIVSUFSORT_API 177 | void dss_flush(); 178 | 179 | 180 | #ifdef __cplusplus 181 | } /* extern "C" */ 182 | #endif /* __cplusplus */ 183 | 184 | #endif /* _DIVSUFSORT_H */ 185 | -------------------------------------------------------------------------------- /crates/divsuftest/src/main.rs: -------------------------------------------------------------------------------- 1 | use failure::Fallible; 2 | use size_format::SizeFormatterBinary; 3 | use std::{io::Write, process, time::Instant}; 4 | 5 | struct Args { 6 | free: Vec, 7 | } 8 | 9 | enum Command { 10 | Crosscheck, 11 | Bench, 12 | Run, 13 | } 14 | 15 | impl Command { 16 | fn parse(s: &str) -> Option { 17 | match s { 18 | "crosscheck" => Some(Self::Crosscheck), 19 | "bench" => Some(Self::Bench), 20 | "run" => Some(Self::Run), 21 | _ => None, 22 | } 23 | } 24 | } 25 | 26 | fn main() -> Fallible<()> { 27 | better_panic::install(); 28 | 29 | let args = pico_args::Arguments::from_env(); 30 | let args = Args { free: args.free()? 
}; 31 | 32 | if args.free.is_empty() { 33 | usage(); 34 | } 35 | let cmd = Command::parse(&args.free.get(0).unwrap_or_else(|| { 36 | usage(); 37 | unreachable!(); 38 | })) 39 | .expect("Command should be one of crosscheck bench or run"); 40 | 41 | let input_path = args.free.get(1).unwrap_or_else(|| { 42 | usage(); 43 | unreachable!(); 44 | }); 45 | let input_full = std::fs::read(input_path).unwrap(); 46 | let len = args 47 | .free 48 | .get(2) 49 | .map(parse_size) 50 | .unwrap_or_else(|| input_full.len()); 51 | let input = &input_full[..len]; 52 | println!( 53 | "Input is size {}B", 54 | SizeFormatterBinary::new(input.len() as u64) 55 | ); 56 | 57 | match cmd { 58 | Command::Crosscheck => { 59 | #[cfg(not(feature = "crosscheck"))] 60 | { 61 | println!( 62 | "Error: This version of divsuftest wasn't built with crosscheck enabled :(" 63 | ); 64 | println!("Bailing out."); 65 | process::exit(1); 66 | } 67 | 68 | #[cfg(feature = "crosscheck")] 69 | command_crosscheck(input)?; 70 | } 71 | Command::Bench => command_bench(input)?, 72 | Command::Run => command_run(input)?, 73 | } 74 | Ok(()) 75 | } 76 | 77 | fn usage() { 78 | println!("Usage: divsuftest bench|crosscheck|run INPUT [LENGTH]"); 79 | process::exit(1); 80 | } 81 | 82 | #[cfg(feature = "crosscheck")] 83 | fn command_crosscheck(input: &[u8]) { 84 | println!("Cross-checking!"); 85 | std::fs::create_dir_all("crosscheck").unwrap(); 86 | 87 | { 88 | println!("Running C version..."); 89 | let sa = cdivsufsort::sort(input); 90 | unsafe { 91 | cdivsufsort::dss_flush(); 92 | } 93 | println!("Verifying C result..."); 94 | sa.verify().expect("cdivsufsort should sort all suffixes"); 95 | } 96 | 97 | { 98 | let res = std::panic::catch_unwind(|| { 99 | println!("Running Rust version..."); 100 | std::thread::spawn(|| loop { 101 | std::thread::sleep(std::time::Duration::from_millis(500)); 102 | divsufsort::crosscheck::flush(); 103 | }); 104 | 105 | let sa = divsufsort::sort(input); 106 | 107 | println!("Verifying Rust 
result..."); 108 | sa.verify().expect("cdivsufsort should sort all suffixes"); 109 | }); 110 | divsufsort::crosscheck::flush(); 111 | res.unwrap() 112 | }; 113 | } 114 | 115 | fn command_run(input: &[u8]) -> Fallible<()> { 116 | let before = Instant::now(); 117 | divsufsort::sort(input); 118 | println!("Done in {:?}", before.elapsed()); 119 | 120 | Ok(()) 121 | } 122 | 123 | fn command_bench(input: &[u8]) -> Fallible<()> { 124 | #[cfg(debug_assertions)] 125 | { 126 | println!("=========================================="); 127 | println!("Warning: benchmarking with a debug build."); 128 | println!("This will be slow.."); 129 | println!("=========================================="); 130 | } 131 | 132 | #[cfg(feature = "crosscheck")] 133 | { 134 | println!("=========================================="); 135 | println!("Warning: benchmarking with crosscheck enabled."); 136 | println!("This will be slow.."); 137 | println!("=========================================="); 138 | } 139 | 140 | let flush = || { 141 | std::io::stdout().lock().flush().unwrap(); 142 | }; 143 | 144 | let mut datapoints = Vec::new(); 145 | let mut measure = |name: &'static str, f: &dyn Fn()| { 146 | print!("."); 147 | flush(); 148 | let before = Instant::now(); 149 | f(); 150 | datapoints.push((name, before.elapsed())) 151 | }; 152 | 153 | print!("measuring"); 154 | flush(); 155 | 156 | measure("c-divsufsort", &|| { 157 | cdivsufsort::sort(input); 158 | }); 159 | measure("divsufsort", &|| { 160 | divsufsort::sort(input); 161 | }); 162 | measure("saca-k", &|| { 163 | suffix_array::SuffixArray::new(input); 164 | }); 165 | 166 | println!("done!"); 167 | 168 | { 169 | use cli_table::{format::CellFormat, Cell, Row, Table}; 170 | let bold = CellFormat::builder().bold(true).build(); 171 | let regular = CellFormat::builder().build(); 172 | 173 | let mut rows = vec![Row::new(vec![ 174 | Cell::new("Algorithm", bold), 175 | Cell::new("Time", bold), 176 | Cell::new("Average speed", bold), 177 | ])]; 178 | for 
dp in datapoints { 179 | let bps = (input.len() as f64 / dp.1.as_secs_f64()) as u64; 180 | rows.push(Row::new(vec![ 181 | Cell::new(dp.0, regular), 182 | Cell::new(&format!("{:?}", dp.1), regular), 183 | Cell::new(&format!("{}B/s", SizeFormatterBinary::new(bps)), regular), 184 | ])); 185 | } 186 | 187 | Table::new(rows, Default::default()).print_stdout().unwrap(); 188 | } 189 | Ok(()) 190 | } 191 | 192 | fn parse_size>(input: I) -> usize { 193 | let mut factor = 1_usize; 194 | 195 | let input = input.as_ref().to_lowercase(); 196 | let input = if input.ends_with("k") { 197 | factor = 1024; 198 | input.trim_end_matches("k") 199 | } else if input.ends_with("m") { 200 | factor = 1024 * 1024; 201 | input.trim_end_matches("m") 202 | } else { 203 | &input[..] 204 | }; 205 | 206 | let size: usize = input.parse().unwrap(); 207 | size * factor 208 | } 209 | -------------------------------------------------------------------------------- /crates/sacapart/src/lib.rs: -------------------------------------------------------------------------------- 1 | use num_traits::ToPrimitive; 2 | use rayon::prelude::*; 3 | use sacabase::{LongestCommonSubstring, StringIndex, SuffixArray}; 4 | 5 | /// A partitioned suffix array, that is faster to construct but finds 6 | /// slightly worse matches in a slightly longer amount of time. 7 | /// 8 | /// Suffix sorting is an expensive operation that is hard to parallelize 9 | /// well. The idea behind a partitioned suffix array is to suffix sort 10 | /// multiple parts of a text (in parallel) rather than the full text. 11 | /// 12 | /// Using two partitions will result in *roughly* 2x faster construction 13 | /// (assuming there are two cores available), but search will now take 14 | /// O(2 * log n), and matches across the boundaries may be much worse. 15 | /// 16 | /// For example, the text "totor" may be partitioned into "tot" and "or". 
17 | /// Looking for matches for "tor" may well only return a substring of "to", 18 | /// at offset 0, because the first partition only has the suffixes "t", 19 | /// "to", and "tot". The second partition only has the suffixes "or" and "r". 20 | /// So, it finds the substring "(to)tor", tries to extend it to the right, 21 | /// and fails. It doesn't try to extend "to(t)or", because in the suffix 22 | /// array of that partition, that substring is a weaker match than "(to)tor". 23 | /// 24 | /// For some applications (like bsdiff-like algorithms), this is an acceptable 25 | /// tradeoff (the resulting patch will be slightly larger). For others, it isn't. 26 | pub struct PartitionedSuffixArray<'a, Index> 27 | where 28 | Index: ToPrimitive + Send, 29 | { 30 | partition_size: usize, 31 | text: &'a [u8], 32 | sas: Vec>, 33 | } 34 | 35 | impl<'a, Index> PartitionedSuffixArray<'a, Index> 36 | where 37 | Index: ToPrimitive + Send, 38 | { 39 | pub fn new(text: &'a [u8], num_partitions: usize, f: F) -> Self 40 | where 41 | F: Fn(&'a [u8]) -> SuffixArray<'a, Index> + Sync, 42 | { 43 | let partition_size = text.len() / num_partitions + 1; 44 | 45 | let mut sas: Vec<_> = text 46 | .par_chunks(text.len() / num_partitions + 1) 47 | .enumerate() 48 | .map(|(i, chunk)| (i, f(chunk))) 49 | .collect(); 50 | sas.sort_by(|(i, _), (j, _)| i.cmp(j)); 51 | let sas = sas.into_iter().map(|(_, chunk)| chunk).collect(); 52 | 53 | Self { 54 | partition_size, 55 | text, 56 | sas, 57 | } 58 | } 59 | 60 | pub fn num_partitions(&self) -> usize { 61 | self.sas.len() 62 | } 63 | } 64 | 65 | impl<'a, Index> StringIndex<'a> for PartitionedSuffixArray<'a, Index> 66 | where 67 | Index: ToPrimitive + Send, 68 | { 69 | fn longest_substring_match(&self, needle: &[u8]) -> LongestCommonSubstring<'a> { 70 | let mut best_lcs: Option = None; 71 | for (i, sa) in self.sas.iter().enumerate() { 72 | let mut lcs = sa.longest_substring_match(needle); 73 | let offset = i * self.partition_size; 74 | 75 | // if match 
reaches the end of the partition's text, it may be 76 | // extended. 77 | let may_extend = lcs.start + lcs.len == sa.text().len(); 78 | 79 | // start was relative to the partition's beginning, make it absolute 80 | lcs.start += offset; 81 | lcs.text = self.text; 82 | if may_extend { 83 | lcs.len = sacabase::common_prefix_len(&self.text[lcs.start..], needle); 84 | } 85 | 86 | let replace = match best_lcs { 87 | None => true, 88 | Some(ref prev_lcs) => lcs.len > prev_lcs.len, 89 | }; 90 | if replace { 91 | best_lcs.replace(lcs); 92 | } 93 | } 94 | best_lcs.expect( 95 | "partitioned suffix arrays should always find at least one longest common substring", 96 | ) 97 | } 98 | } 99 | 100 | #[cfg(test)] 101 | mod tests { 102 | use super::*; 103 | use sacabase::StringIndex; 104 | 105 | #[test] 106 | fn worse_test() { 107 | let input = "totor"; 108 | let sa_full = divsufsort::sort(input.as_bytes()); 109 | let sa_part = PartitionedSuffixArray::new(input.as_bytes(), 2, divsufsort::sort); 110 | 111 | let needle = "tor"; 112 | 113 | let full_match = sa_full.longest_substring_match(needle.as_bytes()); 114 | assert_eq!(needle.as_bytes(), full_match.as_bytes()); 115 | 116 | let part_match = sa_part.longest_substring_match(needle.as_bytes()); 117 | assert_eq!(needle[..2].as_bytes(), part_match.as_bytes()); 118 | 119 | let needle = "otor"; 120 | 121 | let full_match = sa_full.longest_substring_match(needle.as_bytes()); 122 | assert_eq!(needle.as_bytes(), full_match.as_bytes()); 123 | 124 | let part_match = sa_part.longest_substring_match(needle.as_bytes()); 125 | assert_eq!(needle.as_bytes(), part_match.as_bytes()); 126 | } 127 | 128 | #[test] 129 | fn equivalent_test() { 130 | let input = "This is a rather long text. We can probably find matches that span two partitions. Oh yes."; 131 | let sa_full = divsufsort::sort(input.as_bytes()); 132 | 133 | for &partitions in &[1, 2, 3] { 134 | println!("{} partitions", partitions); 135 | for needle in &[ 136 | "rather long", 137 | "text. 
We can", 138 | "We can probably find matches that span", 139 | ] { 140 | println!("needle: {:?}", needle); 141 | let sa_part = 142 | PartitionedSuffixArray::new(input.as_bytes(), partitions, divsufsort::sort); 143 | 144 | let full_match = sa_full.longest_substring_match(needle.as_bytes()); 145 | let part_match = sa_part.longest_substring_match(needle.as_bytes()); 146 | 147 | assert_eq!( 148 | full_match.as_bytes(), 149 | part_match.as_bytes(), 150 | "should find same match bytes for {:?}", 151 | needle 152 | ); 153 | assert_eq!( 154 | full_match.start, part_match.start, 155 | "should find same match start for {:?}", 156 | needle 157 | ); 158 | assert_eq!( 159 | full_match.len, part_match.len, 160 | "should find same match len for {:?}", 161 | needle 162 | ); 163 | } 164 | } 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /crates/dc3/src/lib.rs: -------------------------------------------------------------------------------- 1 | /// Lexicographic order for pairs 2 | #[inline(always)] 3 | fn leq2(a1: usize, a2: usize, b1: usize, b2: usize) -> bool { 4 | (a1 < b1) || (a1 == b1 && a2 <= b2) 5 | } 6 | 7 | /// Lexicographic order for triples 8 | #[inline(always)] 9 | fn leq3(a1: usize, a2: usize, b1: usize, b2: usize, a3: usize, b3: usize) -> bool { 10 | (a1 < b1) || (a1 == b1 && leq2(a2, a3, b2, b3)) 11 | } 12 | 13 | /// Stably sort a[0..n-1] to b[0..n-1] with keys in 0..K from r 14 | #[allow(non_snake_case)] 15 | fn radix_pass(a: &[usize], b: &mut [usize], r: &[usize], n: usize, K: usize) { 16 | // counter array 17 | let mut c = vec![0_usize; K + 1]; 18 | 19 | // count occurrences 20 | for i in 0..n { 21 | c[r[a[i]]] += 1; 22 | } 23 | 24 | // exclusive prefix sums 25 | { 26 | let mut sum = 0; 27 | for i in 0..=K { 28 | let t = c[i]; 29 | c[i] = sum; 30 | sum += t; 31 | } 32 | } 33 | 34 | // sort 35 | for i in 0..n { 36 | b[c[r[a[i]]]] = a[i]; 37 | c[r[a[i]]] += 1; 38 | } 39 | } 40 | 41 | /// Find the suffix array 
SA of T[0..n-1] in {1..K}^n 42 | /// require T[n]=T[n+1]=T[n+2]=0, n >= 2 43 | #[allow(non_snake_case)] 44 | pub fn suffix_array(T: &[usize], SA: &mut [usize], n: usize, K: usize) { 45 | let n0 = (n + 2) / 3; 46 | let n1 = (n + 1) / 3; 47 | let n2 = n / 3; 48 | let n02 = n0 + n2; 49 | 50 | let mut R = vec![0; n02 + 3]; 51 | R[n02] = 0; 52 | R[n02 + 1] = 0; 53 | R[n02 + 2] = 0; 54 | 55 | let mut SA12 = vec![0; n02 + 3]; 56 | let mut R0 = vec![0; n0]; 57 | let mut SA0 = vec![0; n0]; 58 | 59 | // Step 0: Construct sample 60 | // Generate positions of mod 1 and mod 2 suffixes 61 | // the "+(n0-n2)" adds a dummy mod 1 suffix if n%3 == 1 62 | { 63 | let mut j = 0; 64 | for i in 0..(n + (n0 - n1)) { 65 | if (i % 3) != 0 { 66 | R[j] = i; 67 | j += 1; 68 | } 69 | } 70 | } 71 | 72 | // Step 1: Sort sample suffixes 73 | // lsb radix sort the mod 1 and mod 2 triples 74 | radix_pass(&R[..], &mut SA12[..], &T[2..], n02, K); 75 | radix_pass(&SA12[..], &mut R[..], &T[1..], n02, K); 76 | radix_pass(&R[..], &mut SA12[..], &T[..], n02, K); 77 | 78 | // Find lexicographic names of triples and 79 | // write them to the correct places in R 80 | let mut name = 0; 81 | let mut c0 = 0; 82 | let mut c1 = 0; 83 | let mut c2 = 0; 84 | let mut first = true; 85 | for i in 0..n02 { 86 | if first || (T[SA12[i]] != c0 || T[SA12[i] + 1] != c1 || T[SA12[i] + 2] != c2) { 87 | first = false; 88 | name += 1; 89 | c0 = T[SA12[i] + 0]; 90 | c1 = T[SA12[i] + 1]; 91 | c2 = T[SA12[i] + 2]; 92 | } 93 | if SA12[i] % 3 == 1 { 94 | // write to R1 95 | R[SA12[i] / 3] = name; 96 | } else { 97 | // write to R2 98 | R[SA12[i] / 3 + n0] = name; 99 | } 100 | } 101 | 102 | // recurse if names are not yet unique 103 | if name < n02 { 104 | suffix_array(&R[..], &mut SA12[..], n02, name); 105 | // store unique names in R using the suffix array 106 | for i in 0..n02 { 107 | R[SA12[i]] = i + 1; 108 | } 109 | } else { 110 | // generate the suffix array of R directly 111 | for i in 0..n02 { 112 | SA12[R[i] - 1] = i; 113 | } 
114 | } 115 | 116 | // Step 2: sort nonsample suffixes 117 | // stably sort the mod 0 suffixes from SA12 by their first character 118 | { 119 | let mut j = 0; 120 | for i in 0..n02 { 121 | if SA12[i] < n0 { 122 | R0[j] = 3 * SA12[i]; 123 | j += 1; 124 | } 125 | } 126 | radix_pass(&R0[..], &mut SA0[..], T, n0, K); 127 | } 128 | 129 | // Step 3: merge 130 | // merge sorted SA0 suffixes and sorted SA12 suffixes 131 | { 132 | let mut p = 0; 133 | let mut t = n0 - n1; 134 | let mut k = 0; 135 | while k < n { 136 | macro_rules! get_i { 137 | () => { 138 | if SA12[t] < n0 { 139 | SA12[t] * 3 + 1 140 | } else { 141 | (SA12[t] - n0) * 3 + 2 142 | } 143 | }; 144 | } 145 | 146 | // pos of current offset 12 suffix 147 | let i = get_i!(); 148 | // pos of current offset 0 suffix 149 | let j = SA0[p]; 150 | 151 | let sa12_smaller = if SA12[t] < n0 { 152 | leq2(T[i], R[SA12[t] + n0], T[j], R[j / 3]) 153 | } else { 154 | leq3( 155 | T[i], 156 | T[i + 1], 157 | R[SA12[t] - n0 + 1], 158 | T[j], 159 | T[j + 1], 160 | R[j / 3 + n0], 161 | ) 162 | }; 163 | if sa12_smaller { 164 | // suffix from SA12 is smaller 165 | SA[k] = i; 166 | t += 1; 167 | if t == n02 { 168 | // done --- only SA0 suffixes left 169 | k += 1; 170 | while p < n0 { 171 | SA[k] = SA0[p]; 172 | p += 1; 173 | k += 1; 174 | } 175 | } 176 | } else { 177 | // suffix from SA0 is smaller 178 | SA[k] = j; 179 | p += 1; 180 | if p == n0 { 181 | // done ---- only SA12 suffixes left 182 | k += 1; 183 | while t < n02 { 184 | SA[k] = get_i!(); 185 | t += 1; 186 | k += 1; 187 | } 188 | } 189 | } 190 | k += 1; 191 | } 192 | } 193 | } 194 | 195 | #[cfg(test)] 196 | mod tests { 197 | use super::*; 198 | 199 | #[test] 200 | fn it_works() { 201 | let s = "Once upon a time, in a land most dreary"; 202 | let mut T = vec![0usize; s.len() + 3]; 203 | for (i, &b) in s.as_bytes().iter().enumerate() { 204 | T[i] = b as usize; 205 | } 206 | 207 | let n = s.len(); 208 | let mut SA = vec![0; n]; 209 | let K = *T.iter().max().unwrap(); 210 | 211 | 
suffix_array(&T[..], &mut SA[..], n, K); 212 | for i in 0..(n - 1) { 213 | println!("==============="); 214 | println!("suf(SA[{}]) = {:?}", i, &s[SA[i]..]); 215 | println!("suf(SA[{}]) = {:?}", i + 1, &s[SA[i + 1]..]); 216 | // FIXME: this is busted 217 | // assert!(s[SA[i]..] < s[SA[i + 1]..]) 218 | } 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /crates/cdivsufsort/c-sources/divsufsort_private.h: -------------------------------------------------------------------------------- 1 | /* 2 | * divsufsort_private.h for libdivsufsort 3 | * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. 4 | * 5 | * Permission is hereby granted, free of charge, to any person 6 | * obtaining a copy of this software and associated documentation 7 | * files (the "Software"), to deal in the Software without 8 | * restriction, including without limitation the rights to use, 9 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following 12 | * conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be 15 | * included in all copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | * OTHER DEALINGS IN THE SOFTWARE. 
25 | */ 26 | 27 | #ifndef _DIVSUFSORT_PRIVATE_H 28 | #define _DIVSUFSORT_PRIVATE_H 1 29 | 30 | #define HAVE_CONFIG_H 1 31 | 32 | #ifdef __cplusplus 33 | extern "C" { 34 | #endif /* __cplusplus */ 35 | 36 | #if HAVE_CONFIG_H 37 | # include "config.h" 38 | #endif 39 | #include 40 | #include 41 | #if HAVE_STRING_H 42 | # include 43 | #endif 44 | #if HAVE_STDLIB_H 45 | # include 46 | #endif 47 | #if HAVE_MEMORY_H 48 | # include 49 | #endif 50 | #if HAVE_STDDEF_H 51 | # include 52 | #endif 53 | #if HAVE_STRINGS_H 54 | # include 55 | #endif 56 | #if HAVE_INTTYPES_H 57 | # include 58 | #else 59 | # if HAVE_STDINT_H 60 | # include 61 | # endif 62 | #endif 63 | #if defined(BUILD_DIVSUFSORT64) 64 | # include "divsufsort64.h" 65 | # ifndef SAIDX_T 66 | # define SAIDX_T 67 | # define saidx_t saidx64_t 68 | # endif /* SAIDX_T */ 69 | # ifndef PRIdSAIDX_T 70 | # define PRIdSAIDX_T PRIdSAIDX64_T 71 | # endif /* PRIdSAIDX_T */ 72 | # define divsufsort divsufsort64 73 | # define divbwt divbwt64 74 | # define divsufsort_version divsufsort64_version 75 | # define bw_transform bw_transform64 76 | # define inverse_bw_transform inverse_bw_transform64 77 | # define sufcheck sufcheck64 78 | # define sa_search sa_search64 79 | # define sa_simplesearch sa_simplesearch64 80 | # define sssort sssort64 81 | # define trsort trsort64 82 | #else 83 | # include "divsufsort.h" 84 | #endif 85 | 86 | 87 | /*- Constants -*/ 88 | #if !defined(UINT8_MAX) 89 | # define UINT8_MAX (255) 90 | #endif /* UINT8_MAX */ 91 | #if defined(ALPHABET_SIZE) && (ALPHABET_SIZE < 1) 92 | # undef ALPHABET_SIZE 93 | #endif 94 | #if !defined(ALPHABET_SIZE) 95 | # define ALPHABET_SIZE (UINT8_MAX + 1) 96 | #endif 97 | /* for divsufsort.c */ 98 | #define BUCKET_A_SIZE (ALPHABET_SIZE) 99 | #define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE) 100 | /* for sssort.c */ 101 | #if defined(SS_INSERTIONSORT_THRESHOLD) 102 | # if SS_INSERTIONSORT_THRESHOLD < 1 103 | # undef SS_INSERTIONSORT_THRESHOLD 104 | # define 
SS_INSERTIONSORT_THRESHOLD (1) 105 | # endif 106 | #else 107 | # define SS_INSERTIONSORT_THRESHOLD (8) 108 | #endif 109 | #if defined(SS_BLOCKSIZE) 110 | # if SS_BLOCKSIZE < 0 111 | # undef SS_BLOCKSIZE 112 | # define SS_BLOCKSIZE (0) 113 | # elif 32768 <= SS_BLOCKSIZE 114 | # undef SS_BLOCKSIZE 115 | # define SS_BLOCKSIZE (32767) 116 | # endif 117 | #else 118 | # define SS_BLOCKSIZE (1024) 119 | #endif 120 | /* minstacksize = log(SS_BLOCKSIZE) / log(3) * 2 */ 121 | #if SS_BLOCKSIZE == 0 122 | # if defined(BUILD_DIVSUFSORT64) 123 | # define SS_MISORT_STACKSIZE (96) 124 | # else 125 | # define SS_MISORT_STACKSIZE (64) 126 | # endif 127 | #elif SS_BLOCKSIZE <= 4096 128 | # define SS_MISORT_STACKSIZE (16) 129 | #else 130 | # define SS_MISORT_STACKSIZE (24) 131 | #endif 132 | #if defined(BUILD_DIVSUFSORT64) 133 | # define SS_SMERGE_STACKSIZE (64) 134 | #else 135 | # define SS_SMERGE_STACKSIZE (32) 136 | #endif 137 | /* for trsort.c */ 138 | #define TR_INSERTIONSORT_THRESHOLD (8) 139 | #if defined(BUILD_DIVSUFSORT64) 140 | # define TR_STACKSIZE (96) 141 | #else 142 | # define TR_STACKSIZE (64) 143 | #endif 144 | 145 | /*- Cross-checking -*/ 146 | 147 | #ifdef ENABLE_CROSSCHECK 148 | 149 | extern FILE *CROSSCHECK_FILE; 150 | #define crosscheck(...) 
\ 151 | do { \ 152 | fprintf(CROSSCHECK_FILE, __VA_ARGS__); \ 153 | fprintf(CROSSCHECK_FILE, "\n"); \ 154 | } while (0) 155 | 156 | #define SA_dump(SA, start, len, label) \ 157 | do { \ 158 | fprintf(CROSSCHECK_FILE, ":: %s\n", label); \ 159 | for (int z = 0; z < len; z++) { \ 160 | fprintf(CROSSCHECK_FILE, "%d ", SA[start+z]); \ 161 | if ((z+1)%25==0) { \ 162 | fprintf(CROSSCHECK_FILE, "\n"); \ 163 | } \ 164 | } \ 165 | fprintf(CROSSCHECK_FILE, "\n"); \ 166 | } while (0); 167 | 168 | // #define SA_dump(SA, label) \ 169 | // do { \ 170 | // fprintf(CROSSCHECK_FILE, ":: %s\n", label); \ 171 | // fprintf(CROSSCHECK_FILE, "SA = ["); \ 172 | // for (int z = 0; z < n; z++) { \ 173 | // if (z == n - 1) { \ 174 | // fprintf(CROSSCHECK_FILE, "%d", SA[z]); \ 175 | // } else { \ 176 | // fprintf(CROSSCHECK_FILE, "%d, ", SA[z]); \ 177 | // } \ 178 | // } \ 179 | // fprintf(CROSSCHECK_FILE, "]\n"); \ 180 | // } while (0); 181 | 182 | #define A_dump(A, label) \ 183 | do { \ 184 | fprintf(CROSSCHECK_FILE, ":: %s\n", label); \ 185 | fprintf(CROSSCHECK_FILE, "A = ["); \ 186 | for (int z = 0; z < BUCKET_A_SIZE; z++) { \ 187 | if (z == BUCKET_A_SIZE - 1) { \ 188 | fprintf(CROSSCHECK_FILE, "%d", BUCKET_A(z)); \ 189 | } else { \ 190 | fprintf(CROSSCHECK_FILE, "%d, ", BUCKET_A(z)); \ 191 | } \ 192 | } \ 193 | fprintf(CROSSCHECK_FILE, "]\n"); \ 194 | } while (0); 195 | 196 | #define BSTAR_dump(label) \ 197 | do { \ 198 | crosscheck("%s B* dump:", label); \ 199 | for (int ii = 0; ii < ALPHABET_SIZE; ii++) { \ 200 | for (int jj = 0; jj < ALPHABET_SIZE; jj++) { \ 201 | crosscheck("%s B*[%d,%d]=%d", label, ii, jj, BUCKET_BSTAR(ii, jj)); \ 202 | } \ 203 | } \ 204 | } while (0); 205 | 206 | #else 207 | 208 | #define crosscheck(...) 
209 | #define SA_dump(SA, start, len, label) 210 | #define A_dump(A, label) 211 | #define BSTAR_dump(label) 212 | 213 | #endif // ENABLE_CROSSCHECK else 214 | 215 | /*- Macros -*/ 216 | #ifndef SWAP 217 | # define SWAP(_a, _b) do { t = (_a); (_a) = (_b); (_b) = t; } while(0) 218 | #endif /* SWAP */ 219 | #ifndef MIN 220 | # define MIN(_a, _b) (((_a) < (_b)) ? (_a) : (_b)) 221 | #endif /* MIN */ 222 | #ifndef MAX 223 | # define MAX(_a, _b) (((_a) > (_b)) ? (_a) : (_b)) 224 | #endif /* MAX */ 225 | #define STACK_PUSH(_a, _b, _c, _d)\ 226 | do {\ 227 | assert(ssize < STACK_SIZE);\ 228 | stack[ssize].a = (_a), stack[ssize].b = (_b),\ 229 | stack[ssize].c = (_c), stack[ssize++].d = (_d);\ 230 | } while(0) 231 | #define STACK_PUSH5(_a, _b, _c, _d, _e)\ 232 | do {\ 233 | assert(ssize < STACK_SIZE);\ 234 | stack[ssize].a = (_a), stack[ssize].b = (_b),\ 235 | stack[ssize].c = (_c), stack[ssize].d = (_d), stack[ssize++].e = (_e);\ 236 | } while(0) 237 | #define STACK_POP(_a, _b, _c, _d)\ 238 | do {\ 239 | assert(0 <= ssize);\ 240 | if(ssize == 0) { return; }\ 241 | (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\ 242 | (_c) = stack[ssize].c, (_d) = stack[ssize].d;\ 243 | } while(0) 244 | #define STACK_POP5(_a, _b, _c, _d, _e)\ 245 | do {\ 246 | assert(0 <= ssize);\ 247 | if(ssize == 0) { return; }\ 248 | (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\ 249 | (_c) = stack[ssize].c, (_d) = stack[ssize].d, (_e) = stack[ssize].e;\ 250 | } while(0) 251 | /* for divsufsort.c */ 252 | #define BUCKET_A(_c0) bucket_A[(_c0)] 253 | #if ALPHABET_SIZE == 256 254 | #define BUCKET_B(_c0, _c1) (bucket_B[((_c1) << 8) | (_c0)]) 255 | #define BUCKET_BSTAR(_c0, _c1) (bucket_B[((_c0) << 8) | (_c1)]) 256 | #else 257 | #define BUCKET_B(_c0, _c1) (bucket_B[(_c1) * ALPHABET_SIZE + (_c0)]) 258 | #define BUCKET_BSTAR(_c0, _c1) (bucket_B[(_c0) * ALPHABET_SIZE + (_c1)]) 259 | #endif 260 | 261 | 262 | /*- Private Prototypes -*/ 263 | /* sssort.c */ 264 | void 265 | sssort(const sauchar_t *Td, const 
// Shared primitive types for the suffix sorter: the text wrapper, suffix-array
// views, the `SAPtr` index newtype and the A / B / B* bucket tables.

use std::cmp::{Ordering, PartialEq, PartialOrd};
use std::fmt;
use std::ops::{self, Add, AddAssign, Div, Index, IndexMut, Sub, SubAssign};

/// A single input symbol.
pub type Char = u8;
/// Signed index type used for every position and count.
pub type Idx = i32;

pub const TR_INSERTIONSORT_THRESHOLD: Idx = 8;

pub const SS_INSERTIONSORT_THRESHOLD: Idx = 8;
pub const SS_BLOCKSIZE: Idx = 1024;

pub const ALPHABET_SIZE: usize = u8::max_value() as usize + 1;
pub const BUCKET_A_SIZE: usize = ALPHABET_SIZE;
pub const BUCKET_B_SIZE: usize = ALPHABET_SIZE * ALPHABET_SIZE;

/// Read-only input to suffix-sort.
pub struct Text<'a>(pub &'a [Char]);

impl<'a> Index<Idx> for Text<'a> {
    type Output = Char;

    fn index(&self, index: Idx) -> &Self::Output {
        &self.0[index as usize]
    }
}

impl<'a> Text<'a> {
    /// Returns the symbol at `i`, widened to [`Idx`].
    ///
    /// Panics if `i` is out of bounds; debug builds additionally report a
    /// negative `i` with a dedicated message.
    #[inline(always)]
    pub fn get(&self, i: Idx) -> Idx {
        debug_assert!(i >= 0, "assert violated: {} >= 0", i);
        self.0[i as usize] as Idx
    }

    /// Number of symbols, as an [`Idx`].
    ///
    /// The `as` cast truncates for slices longer than `Idx::MAX`.
    #[inline(always)]
    pub fn len(&self) -> Idx {
        self.0.len() as Idx
    }
}

/// Indexes of all suffixes in lexicographical order (mutable view).
#[derive(Debug)]
pub struct SuffixArray<'a>(pub &'a mut [Idx]);

impl<'a> SuffixArray<'a> {
    /// Swaps the entries at positions `a` and `b`.
    #[inline(always)]
    pub fn swap<A: Into<Idx>, B: Into<Idx>>(&mut self, a: A, b: B) {
        self.0.swap(a.into() as usize, b.into() as usize);
    }

    /// Reborrows the half-open sub-range `range.start..range.end`.
    pub fn range<I: Into<Idx>>(&mut self, range: ops::Range<I>) -> SuffixArray<'_> {
        let usize_range = (range.start.into() as usize)..(range.end.into() as usize);
        SuffixArray(&mut self.0[usize_range])
    }

    /// Reborrows the prefix `..range.end`.
    pub fn range_to<I: Into<Idx>>(&mut self, range: ops::RangeTo<I>) -> SuffixArray<'_> {
        let usize_range = ..(range.end.into() as usize);
        SuffixArray(&mut self.0[usize_range])
    }

    /// Reborrows the suffix `range.start..`.
    pub fn range_from<I: Into<Idx>>(&mut self, range: ops::RangeFrom<I>) -> SuffixArray<'_> {
        let usize_range = (range.start.into() as usize)..;
        SuffixArray(&mut self.0[usize_range])
    }

    /// Number of entries, as an [`Idx`].
    ///
    /// The `as` cast truncates for slices longer than `Idx::MAX`.
    pub fn len(&self) -> Idx {
        self.0.len() as Idx
    }
}

impl<'a> Index<Idx> for SuffixArray<'a> {
    type Output = Idx;

    fn index(&self, index: Idx) -> &Self::Output {
        debug_assert!(index >= 0, "assert violated: {} >= 0", index);
        &self.0[index as usize]
    }
}

impl<'a> IndexMut<Idx> for SuffixArray<'a> {
    fn index_mut(&mut self, index: Idx) -> &mut Self::Output {
        // Same diagnostic as the read path; the slice access below still
        // bounds-checks in release builds.
        debug_assert!(index >= 0, "assert violated: {} >= 0", index);
        &mut self.0[index as usize]
    }
}

impl<'a> Index<SAPtr> for SuffixArray<'a> {
    type Output = Idx;

    fn index(&self, index: SAPtr) -> &Self::Output {
        debug_assert!(index.0 >= 0, "assert violated: {} >= 0", index);
        &self.0[index.0 as usize]
    }
}

impl<'a> IndexMut<SAPtr> for SuffixArray<'a> {
    fn index_mut(&mut self, index: SAPtr) -> &mut Self::Output {
        debug_assert!(index.0 >= 0, "assert violated: {} >= 0", index);
        &mut self.0[index.0 as usize]
    }
}

// ---------- Immutable variant ----------- *shakes fist at borrowck*

/// Indexes of all suffixes in lexicographical order (shared view).
#[derive(Debug)]
pub struct SuffixArrayImm<'a>(pub &'a [Idx]);

impl<'a> Index<Idx> for SuffixArrayImm<'a> {
    type Output = Idx;

    fn index(&self, index: Idx) -> &Self::Output {
        &self.0[index as usize]
    }
}

impl<'a> Index<SAPtr> for SuffixArrayImm<'a> {
    type Output = Idx;

    fn index(&self, index: SAPtr) -> &Self::Output {
        &self.0[index.0 as usize]
    }
}

//-------------------------------------------
// Suffix array pointers
//-------------------------------------------

/// A position inside a suffix array, kept distinct from plain [`Idx`] values.
///
/// Arithmetic and comparisons are provided against both `SAPtr` and `Idx`.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct SAPtr(pub Idx);

impl fmt::Display for SAPtr {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.0)
    }
}

impl fmt::Debug for SAPtr {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "SAPtr({})", self.0)
    }
}

impl Div<Idx> for SAPtr {
    type Output = SAPtr;

    #[inline(always)]
    fn div(self, rhs: Idx) -> Self::Output {
        SAPtr(self.0 / rhs)
    }
}

impl Add<Idx> for SAPtr {
    type Output = SAPtr;

    #[inline(always)]
    fn add(self, rhs: Idx) -> Self::Output {
        SAPtr(self.0 + rhs)
    }
}

impl Add for SAPtr {
    type Output = SAPtr;

    #[inline(always)]
    fn add(self, rhs: Self) -> Self::Output {
        SAPtr(self.0 + rhs.0)
    }
}

impl AddAssign<Idx> for SAPtr {
    fn add_assign(&mut self, rhs: Idx) {
        self.0 += rhs
    }
}

impl AddAssign for SAPtr {
    fn add_assign(&mut self, rhs: Self) {
        self.0 += rhs.0
    }
}

impl Sub<Idx> for SAPtr {
    type Output = SAPtr;

    #[inline(always)]
    fn sub(self, rhs: Idx) -> Self::Output {
        SAPtr(self.0 - rhs)
    }
}

impl Sub for SAPtr {
    type Output = SAPtr;

    #[inline(always)]
    fn sub(self, rhs: Self) -> Self::Output {
        SAPtr(self.0 - rhs.0)
    }
}

impl SubAssign<Idx> for SAPtr {
    #[inline(always)]
    fn sub_assign(&mut self, rhs: Idx) {
        self.0 -= rhs
    }
}

impl SubAssign for SAPtr {
    #[inline(always)]
    fn sub_assign(&mut self, rhs: Self) {
        self.0 -= rhs.0
    }
}

// `From` rather than a hand-written `Into`: the blanket impl in `std` still
// provides `SAPtr: Into<Idx>`, so existing `.into()` calls and `Into<Idx>`
// bounds keep working.
impl From<SAPtr> for Idx {
    #[inline(always)]
    fn from(ptr: SAPtr) -> Self {
        ptr.0
    }
}

impl From<Idx> for SAPtr {
    #[inline(always)]
    fn from(idx: Idx) -> Self {
        SAPtr(idx)
    }
}

impl PartialEq<Idx> for SAPtr {
    #[inline(always)]
    fn eq(&self, other: &Idx) -> bool {
        self.0 == *other
    }
}

impl PartialOrd<Idx> for SAPtr {
    #[inline(always)]
    fn partial_cmp(&self, other: &Idx) -> Option<Ordering> {
        self.0.partial_cmp(other)
    }
}

impl PartialEq<SAPtr> for Idx {
    #[inline(always)]
    fn eq(&self, other: &SAPtr) -> bool {
        *self == other.0
    }
}

impl PartialOrd<SAPtr> for Idx {
    #[inline(always)]
    fn partial_cmp(&self, other: &SAPtr) -> Option<Ordering> {
        self.partial_cmp(&other.0)
    }
}

//----------------------------------------------
// Bucket types
//----------------------------------------------

/// Backing storage shared by the B and B* bucket views.
///
/// Both views index the same vector; they differ only in which character of
/// the pair forms the high byte of the slot number.
pub struct BMixBucket(pub Vec<Idx>);

impl BMixBucket {
    /// Borrows the storage as a B-bucket view (`(c1 << 8) | c0`).
    #[inline(always)]
    pub fn b(&mut self) -> BBucket<'_> {
        BBucket(&mut self.0)
    }

    /// Borrows the storage as a B*-bucket view (`(c0 << 8) | c1`).
    #[inline(always)]
    pub fn bstar(&mut self) -> BStarBucket<'_> {
        BStarBucket(&mut self.0)
    }
}

/// Bucket table indexed by a single character.
pub struct ABucket(pub Vec<Idx>);

impl Index<Idx> for ABucket {
    type Output = Idx;

    fn index(&self, index: Idx) -> &Self::Output {
        &self.0[index as usize]
    }
}

impl IndexMut<Idx> for ABucket {
    fn index_mut(&mut self, index: Idx) -> &mut Self::Output {
        &mut self.0[index as usize]
    }
}

/// B-bucket view: slot for the pair `(c0, c1)` is `(c1 << 8) | c0`.
pub struct BBucket<'a>(pub &'a mut [Idx]);

impl<'a> Index<(Idx, Idx)> for BBucket<'a> {
    type Output = Idx;

    fn index(&self, index: (Idx, Idx)) -> &Self::Output {
        let (c0, c1) = index;
        &self.0[((c1 << 8) | c0) as usize]
    }
}

impl<'a> IndexMut<(Idx, Idx)> for BBucket<'a> {
    fn index_mut(&mut self, index: (Idx, Idx)) -> &mut Self::Output {
        let (c0, c1) = index;
        &mut self.0[((c1 << 8) | c0) as usize]
    }
}

/// B*-bucket view: slot for the pair `(c0, c1)` is `(c0 << 8) | c1`.
pub struct BStarBucket<'a>(&'a mut [Idx]);

impl<'a> Index<(Idx, Idx)> for BStarBucket<'a> {
    type Output = Idx;

    fn index(&self, index: (Idx, Idx)) -> &Self::Output {
        let (c0, c1) = index;
        &self.0[((c0 << 8) | c1) as usize]
    }
}

impl<'a> IndexMut<(Idx, Idx)> for BStarBucket<'a> {
    fn index_mut(&mut self, index: (Idx, Idx)) -> &mut Self::Output {
        let (c0, c1) = index;
        &mut self.0[((c0 << 8) | c1) as usize]
    }
}
4 | * 5 | * Permission is hereby granted, free of charge, to any person 6 | * obtaining a copy of this software and associated documentation 7 | * files (the "Software"), to deal in the Software without 8 | * restriction, including without limitation the rights to use, 9 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following 12 | * conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be 15 | * included in all copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | * OTHER DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | #include "divsufsort_private.h" 28 | 29 | 30 | /*- Private Function -*/ 31 | 32 | /* Binary search for inverse bwt. */ 33 | static 34 | saidx_t 35 | binarysearch_lower(const saidx_t *A, saidx_t size, saidx_t value) { 36 | saidx_t half, i; 37 | for(i = 0, half = size >> 1; 38 | 0 < size; 39 | size = half, half >>= 1) { 40 | if(A[i + half] < value) { 41 | i += half + 1; 42 | half -= (size & 1) ^ 1; 43 | } 44 | } 45 | return i; 46 | } 47 | 48 | 49 | /*- Functions -*/ 50 | 51 | /* Burrows-Wheeler transform. */ 52 | saint_t 53 | bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *SA, 54 | saidx_t n, saidx_t *idx) { 55 | saidx_t *A, i, j, p, t; 56 | saint_t c; 57 | 58 | /* Check arguments. 
*/ 59 | if((T == NULL) || (U == NULL) || (n < 0) || (idx == NULL)) { return -1; } 60 | if(n <= 1) { 61 | if(n == 1) { U[0] = T[0]; } 62 | *idx = n; 63 | return 0; 64 | } 65 | 66 | if((A = SA) == NULL) { 67 | i = divbwt(T, U, NULL, n); 68 | if(0 <= i) { *idx = i; i = 0; } 69 | return (saint_t)i; 70 | } 71 | 72 | /* BW transform. */ 73 | if(T == U) { 74 | t = n; 75 | for(i = 0, j = 0; i < n; ++i) { 76 | p = t - 1; 77 | t = A[i]; 78 | if(0 <= p) { 79 | c = T[j]; 80 | U[j] = (j <= p) ? T[p] : (sauchar_t)A[p]; 81 | A[j] = c; 82 | j++; 83 | } else { 84 | *idx = i; 85 | } 86 | } 87 | p = t - 1; 88 | if(0 <= p) { 89 | c = T[j]; 90 | U[j] = (j <= p) ? T[p] : (sauchar_t)A[p]; 91 | A[j] = c; 92 | } else { 93 | *idx = i; 94 | } 95 | } else { 96 | U[0] = T[n - 1]; 97 | for(i = 0; A[i] != 0; ++i) { U[i + 1] = T[A[i] - 1]; } 98 | *idx = i + 1; 99 | for(++i; i < n; ++i) { U[i] = T[A[i] - 1]; } 100 | } 101 | 102 | if(SA == NULL) { 103 | /* Deallocate memory. */ 104 | free(A); 105 | } 106 | 107 | return 0; 108 | } 109 | 110 | /* Inverse Burrows-Wheeler transform. */ 111 | saint_t 112 | inverse_bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *A, 113 | saidx_t n, saidx_t idx) { 114 | saidx_t C[ALPHABET_SIZE]; 115 | sauchar_t D[ALPHABET_SIZE]; 116 | saidx_t *B; 117 | saidx_t i, p; 118 | saint_t c, d; 119 | 120 | /* Check arguments. */ 121 | if((T == NULL) || (U == NULL) || (n < 0) || (idx < 0) || 122 | (n < idx) || ((0 < n) && (idx == 0))) { 123 | return -1; 124 | } 125 | if(n <= 1) { return 0; } 126 | 127 | if((B = A) == NULL) { 128 | /* Allocate n*sizeof(saidx_t) bytes of memory. */ 129 | if((B = (saidx_t *)malloc((size_t)n * sizeof(saidx_t))) == NULL) { return -2; } 130 | } 131 | 132 | /* Inverse BW transform. 
*/ 133 | for(c = 0; c < ALPHABET_SIZE; ++c) { C[c] = 0; } 134 | for(i = 0; i < n; ++i) { ++C[T[i]]; } 135 | for(c = 0, d = 0, i = 0; c < ALPHABET_SIZE; ++c) { 136 | p = C[c]; 137 | if(0 < p) { 138 | C[c] = i; 139 | D[d++] = (sauchar_t)c; 140 | i += p; 141 | } 142 | } 143 | for(i = 0; i < idx; ++i) { B[C[T[i]]++] = i; } 144 | for( ; i < n; ++i) { B[C[T[i]]++] = i + 1; } 145 | for(c = 0; c < d; ++c) { C[c] = C[D[c]]; } 146 | for(i = 0, p = idx; i < n; ++i) { 147 | U[i] = D[binarysearch_lower(C, d, p)]; 148 | p = B[p - 1]; 149 | } 150 | 151 | if(A == NULL) { 152 | /* Deallocate memory. */ 153 | free(B); 154 | } 155 | 156 | return 0; 157 | } 158 | 159 | /* Checks the suffix array SA of the string T. */ 160 | saint_t 161 | sufcheck(const sauchar_t *T, const saidx_t *SA, 162 | saidx_t n, saint_t verbose) { 163 | saidx_t C[ALPHABET_SIZE]; 164 | saidx_t i, p, q, t; 165 | saint_t c; 166 | 167 | if(verbose) { fprintf(stderr, "sufcheck: "); } 168 | 169 | /* Check arguments. */ 170 | if((T == NULL) || (SA == NULL) || (n < 0)) { 171 | if(verbose) { fprintf(stderr, "Invalid arguments.\n"); } 172 | return -1; 173 | } 174 | if(n == 0) { 175 | if(verbose) { fprintf(stderr, "Done.\n"); } 176 | return 0; 177 | } 178 | 179 | /* check range: [0..n-1] */ 180 | for(i = 0; i < n; ++i) { 181 | if((SA[i] < 0) || (n <= SA[i])) { 182 | if(verbose) { 183 | fprintf(stderr, "Out of the range [0,%" PRIdSAIDX_T "].\n" 184 | " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n", 185 | n - 1, i, SA[i]); 186 | } 187 | return -2; 188 | } 189 | } 190 | 191 | /* check first characters. */ 192 | for(i = 1; i < n; ++i) { 193 | if(T[SA[i - 1]] > T[SA[i]]) { 194 | if(verbose) { 195 | fprintf(stderr, "Suffixes in wrong order.\n" 196 | " T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d" 197 | " > T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d\n", 198 | i - 1, SA[i - 1], T[SA[i - 1]], i, SA[i], T[SA[i]]); 199 | } 200 | return -3; 201 | } 202 | } 203 | 204 | /* check suffixes. 
*/ 205 | for(i = 0; i < ALPHABET_SIZE; ++i) { C[i] = 0; } 206 | for(i = 0; i < n; ++i) { ++C[T[i]]; } 207 | for(i = 0, p = 0; i < ALPHABET_SIZE; ++i) { 208 | t = C[i]; 209 | C[i] = p; 210 | p += t; 211 | } 212 | 213 | q = C[T[n - 1]]; 214 | C[T[n - 1]] += 1; 215 | for(i = 0; i < n; ++i) { 216 | p = SA[i]; 217 | if(0 < p) { 218 | c = T[--p]; 219 | t = C[c]; 220 | } else { 221 | c = T[p = n - 1]; 222 | t = q; 223 | } 224 | if((t < 0) || (p != SA[t])) { 225 | if(verbose) { 226 | fprintf(stderr, "Suffix in wrong position.\n" 227 | " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T " or\n" 228 | " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n", 229 | t, (0 <= t) ? SA[t] : -1, i, SA[i]); 230 | } 231 | return -4; 232 | } 233 | if(t != q) { 234 | ++C[c]; 235 | if((n <= C[c]) || (T[SA[C[c]]] != c)) { C[c] = -1; } 236 | } 237 | } 238 | 239 | if(1 <= verbose) { fprintf(stderr, "Done.\n"); } 240 | return 0; 241 | } 242 | 243 | 244 | static 245 | int 246 | _compare(const sauchar_t *T, saidx_t Tsize, 247 | const sauchar_t *P, saidx_t Psize, 248 | saidx_t suf, saidx_t *match) { 249 | saidx_t i, j; 250 | saint_t r; 251 | for(i = suf + *match, j = *match, r = 0; 252 | (i < Tsize) && (j < Psize) && ((r = T[i] - P[j]) == 0); ++i, ++j) { } 253 | *match = j; 254 | return (r == 0) ? -(j != Psize) : r; 255 | } 256 | 257 | /* Search for the pattern P in the string T. 
*/ 258 | saidx_t 259 | sa_search(const sauchar_t *T, saidx_t Tsize, 260 | const sauchar_t *P, saidx_t Psize, 261 | const saidx_t *SA, saidx_t SAsize, 262 | saidx_t *idx) { 263 | saidx_t size, lsize, rsize, half; 264 | saidx_t match, lmatch, rmatch; 265 | saidx_t llmatch, lrmatch, rlmatch, rrmatch; 266 | saidx_t i, j, k; 267 | saint_t r; 268 | 269 | if(idx != NULL) { *idx = -1; } 270 | if((T == NULL) || (P == NULL) || (SA == NULL) || 271 | (Tsize < 0) || (Psize < 0) || (SAsize < 0)) { return -1; } 272 | if((Tsize == 0) || (SAsize == 0)) { return 0; } 273 | if(Psize == 0) { if(idx != NULL) { *idx = 0; } return SAsize; } 274 | 275 | for(i = j = k = 0, lmatch = rmatch = 0, size = SAsize, half = size >> 1; 276 | 0 < size; 277 | size = half, half >>= 1) { 278 | match = MIN(lmatch, rmatch); 279 | r = _compare(T, Tsize, P, Psize, SA[i + half], &match); 280 | if(r < 0) { 281 | i += half + 1; 282 | half -= (size & 1) ^ 1; 283 | lmatch = match; 284 | } else if(r > 0) { 285 | rmatch = match; 286 | } else { 287 | lsize = half, j = i, rsize = size - half - 1, k = i + half + 1; 288 | 289 | /* left part */ 290 | for(llmatch = lmatch, lrmatch = match, half = lsize >> 1; 291 | 0 < lsize; 292 | lsize = half, half >>= 1) { 293 | lmatch = MIN(llmatch, lrmatch); 294 | r = _compare(T, Tsize, P, Psize, SA[j + half], &lmatch); 295 | if(r < 0) { 296 | j += half + 1; 297 | half -= (lsize & 1) ^ 1; 298 | llmatch = lmatch; 299 | } else { 300 | lrmatch = lmatch; 301 | } 302 | } 303 | 304 | /* right part */ 305 | for(rlmatch = match, rrmatch = rmatch, half = rsize >> 1; 306 | 0 < rsize; 307 | rsize = half, half >>= 1) { 308 | rmatch = MIN(rlmatch, rrmatch); 309 | r = _compare(T, Tsize, P, Psize, SA[k + half], &rmatch); 310 | if(r <= 0) { 311 | k += half + 1; 312 | half -= (rsize & 1) ^ 1; 313 | rlmatch = rmatch; 314 | } else { 315 | rrmatch = rmatch; 316 | } 317 | } 318 | 319 | break; 320 | } 321 | } 322 | 323 | if(idx != NULL) { *idx = (0 < (k - j)) ? 
j : i; } 324 | return k - j; 325 | } 326 | 327 | /* Search for the character c in the string T. */ 328 | saidx_t 329 | sa_simplesearch(const sauchar_t *T, saidx_t Tsize, 330 | const saidx_t *SA, saidx_t SAsize, 331 | saint_t c, saidx_t *idx) { 332 | saidx_t size, lsize, rsize, half; 333 | saidx_t i, j, k, p; 334 | saint_t r; 335 | 336 | if(idx != NULL) { *idx = -1; } 337 | if((T == NULL) || (SA == NULL) || (Tsize < 0) || (SAsize < 0)) { return -1; } 338 | if((Tsize == 0) || (SAsize == 0)) { return 0; } 339 | 340 | for(i = j = k = 0, size = SAsize, half = size >> 1; 341 | 0 < size; 342 | size = half, half >>= 1) { 343 | p = SA[i + half]; 344 | r = (p < Tsize) ? T[p] - c : -1; 345 | if(r < 0) { 346 | i += half + 1; 347 | half -= (size & 1) ^ 1; 348 | } else if(r == 0) { 349 | lsize = half, j = i, rsize = size - half - 1, k = i + half + 1; 350 | 351 | /* left part */ 352 | for(half = lsize >> 1; 353 | 0 < lsize; 354 | lsize = half, half >>= 1) { 355 | p = SA[j + half]; 356 | r = (p < Tsize) ? T[p] - c : -1; 357 | if(r < 0) { 358 | j += half + 1; 359 | half -= (lsize & 1) ^ 1; 360 | } 361 | } 362 | 363 | /* right part */ 364 | for(half = rsize >> 1; 365 | 0 < rsize; 366 | rsize = half, half >>= 1) { 367 | p = SA[k + half]; 368 | r = (p < Tsize) ? T[p] - c : -1; 369 | if(r <= 0) { 370 | k += half + 1; 371 | half -= (rsize & 1) ^ 1; 372 | } 373 | } 374 | 375 | break; 376 | } 377 | } 378 | 379 | if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; } 380 | return k - j; 381 | } 382 | -------------------------------------------------------------------------------- /crates/cdivsufsort/c-sources/divsufsort.c: -------------------------------------------------------------------------------- 1 | /* 2 | * divsufsort.c for libdivsufsort 3 | * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. 
4 | * 5 | * Permission is hereby granted, free of charge, to any person 6 | * obtaining a copy of this software and associated documentation 7 | * files (the "Software"), to deal in the Software without 8 | * restriction, including without limitation the rights to use, 9 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following 12 | * conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be 15 | * included in all copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | * OTHER DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | #include "divsufsort_private.h" 28 | #ifdef _OPENMP 29 | # include 30 | #endif 31 | 32 | /*- Cross-checking -*/ 33 | FILE *CROSSCHECK_FILE; 34 | 35 | /*- Private Functions -*/ 36 | 37 | /* Sorts suffixes of type B*. */ 38 | static 39 | saidx_t 40 | sort_typeBstar(const sauchar_t *T, saidx_t *SA, 41 | saidx_t *bucket_A, saidx_t *bucket_B, 42 | saidx_t n) { 43 | saidx_t *PAb, *ISAb, *buf; 44 | #ifdef _OPENMP 45 | saidx_t *curbuf; 46 | saidx_t l; 47 | #endif 48 | saidx_t i, j, k, t, m, bufsize; 49 | saint_t c0, c1; 50 | #ifdef _OPENMP 51 | saint_t d0, d1; 52 | int tmp; 53 | #endif 54 | 55 | /* Initialize bucket arrays. 
*/ 56 | for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; } 57 | for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; } 58 | 59 | /* Count the number of occurrences of the first one or two characters of each 60 | type A, B and B* suffix. Moreover, store the beginning position of all 61 | type B* suffixes into the array SA. */ 62 | for( 63 | /* init */ i = n - 1, 64 | m = n, 65 | c0 = T[n - 1]; 66 | /* cond */ 0 <= i; 67 | /* iter */ 68 | ) { 69 | /* type A suffix. */ 70 | do { 71 | c1 = c0; 72 | ++BUCKET_A(c1); 73 | } while( 74 | (0 <= --i) && 75 | ((c0 = T[i]) >= c1) 76 | ); 77 | if(0 <= i) { 78 | /* type B* suffix. */ 79 | ++BUCKET_BSTAR(c0, c1); 80 | SA[--m] = i; 81 | /* type B suffix. */ 82 | for( 83 | /* init */ --i, 84 | c1 = c0; 85 | /* cond */ (0 <= i) && ((c0 = T[i]) <= c1); 86 | /* iter */ --i, 87 | c1 = c0) { 88 | 89 | ++BUCKET_B(c0, c1); 90 | } 91 | } 92 | } 93 | m = n - m; 94 | 95 | /* 96 | note: 97 | A type B* suffix is lexicographically smaller than a type B suffix that 98 | begins with the same first two characters. 99 | */ 100 | 101 | /* Calculate the index of start/end point of each bucket. */ 102 | for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) { 103 | t = i + BUCKET_A(c0); 104 | BUCKET_A(c0) = i + j; /* start point */ 105 | i = t + BUCKET_B(c0, c0); 106 | for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) { 107 | j += BUCKET_BSTAR(c0, c1); 108 | BUCKET_BSTAR(c0, c1) = j; /* end point */ 109 | i += BUCKET_B(c0, c1); 110 | } 111 | } 112 | 113 | 114 | if(0 < m) { 115 | 116 | /* Sort the type B* suffixes by their first two characters. */ 117 | PAb = SA + n - m; ISAb = SA + m; 118 | for(i = m - 2; 0 <= i; --i) { 119 | t = PAb[i], c0 = T[t], c1 = T[t + 1]; 120 | SA[--BUCKET_BSTAR(c0, c1)] = i; 121 | } 122 | t = PAb[m - 1], c0 = T[t], c1 = T[t + 1]; 123 | SA[--BUCKET_BSTAR(c0, c1)] = m - 1; 124 | 125 | 126 | /* Sort the type B* substrings using sssort. 
*/ 127 | buf = SA + m, bufsize = n - (2 * m); 128 | for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) { 129 | for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) { 130 | i = BUCKET_BSTAR(c0, c1); 131 | if(1 < (j - i)) { 132 | SA_dump(SA, i, j-i, "sssort(A)"); 133 | sssort(T, PAb, SA + i, SA + j, 134 | buf, bufsize, 2, n, *(SA + i) == (m - 1)); 135 | SA_dump(SA, i, j-i, "sssort(B)"); 136 | } 137 | } 138 | } 139 | 140 | /* Compute ranks of type B* substrings. */ 141 | for(i = m - 1; 0 <= i; --i) { 142 | if(0 <= SA[i]) { 143 | j = i; 144 | do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i])); 145 | SA[i + 1] = i - j; 146 | if(i <= 0) { break; } 147 | } 148 | j = i; 149 | do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0); 150 | ISAb[SA[i]] = j; 151 | } 152 | 153 | /* Construct the inverse suffix array of type B* suffixes using trsort. */ 154 | trsort(ISAb, SA, m, 1); 155 | 156 | /* Set the sorted order of tyoe B* suffixes. */ 157 | for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) { 158 | for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { 159 | } 160 | if(0 <= i) { 161 | t = i; 162 | for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { 163 | } 164 | SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t; 165 | } 166 | } 167 | 168 | /* Calculate the index of start/end point of each bucket. */ 169 | BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */ 170 | for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) { 171 | i = BUCKET_A(c0 + 1) - 1; 172 | for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) { 173 | t = i - BUCKET_B(c0, c1); 174 | BUCKET_B(c0, c1) = i; /* end point */ 175 | 176 | /* Move all type B* suffixes to the correct position. 
*/ 177 | for(i = t, j = BUCKET_BSTAR(c0, c1); 178 | j <= k; 179 | --i, --k) { SA[i] = SA[k]; } 180 | } 181 | BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */ 182 | BUCKET_B(c0, c0) = i; /* end point */ 183 | } 184 | 185 | } 186 | 187 | return m; 188 | } 189 | 190 | /* Constructs the suffix array by using the sorted order of type B* suffixes. */ 191 | static 192 | void 193 | construct_SA(const sauchar_t *T, saidx_t *SA, 194 | saidx_t *bucket_A, saidx_t *bucket_B, 195 | saidx_t n, saidx_t m) { 196 | saidx_t *i, *j, *k; 197 | saidx_t s; 198 | saint_t c0, c1, c2; 199 | 200 | if(0 < m) { 201 | /* Construct the sorted order of type B suffixes by using 202 | the sorted order of type B* suffixes. */ 203 | for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { 204 | 205 | /* Scan the suffix array from right to left. */ 206 | for(i = SA + BUCKET_BSTAR(c1, c1 + 1), 207 | j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1; 208 | i <= j; 209 | --j) { 210 | if(0 < (s = *j)) { 211 | assert(T[s] == c1); 212 | assert(((s + 1) < n) && (T[s] <= T[s + 1])); 213 | assert(T[s - 1] <= T[s]); 214 | *j = ~s; 215 | c0 = T[--s]; 216 | if((0 < s) && (T[s - 1] > c0)) { s = ~s; } 217 | if(c0 != c2) { 218 | if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } 219 | k = SA + BUCKET_B(c2 = c0, c1); 220 | } 221 | assert(k < j); 222 | *k-- = s; 223 | } else { 224 | assert(((s == 0) && (T[s] == c1)) || (s < 0)); 225 | *j = ~s; 226 | } 227 | } 228 | } 229 | } 230 | 231 | /* Construct the suffix array by using 232 | the sorted order of type B suffixes. */ 233 | k = SA + BUCKET_A(c2 = T[n - 1]); 234 | *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1); 235 | /* Scan the suffix array from left to right. 
*/ 236 | for(i = SA, j = SA + n; i < j; ++i) { 237 | if(0 < (s = *i)) { 238 | assert(T[s - 1] >= T[s]); 239 | c0 = T[--s]; 240 | if((s == 0) || (T[s - 1] < c0)) { s = ~s; } 241 | if(c0 != c2) { 242 | BUCKET_A(c2) = k - SA; 243 | k = SA + BUCKET_A(c2 = c0); 244 | } 245 | assert(i < k); 246 | *k++ = s; 247 | } else { 248 | assert(s < 0); 249 | *i = ~s; 250 | } 251 | } 252 | 253 | } 254 | 255 | /* Constructs the burrows-wheeler transformed string directly 256 | by using the sorted order of type B* suffixes. */ 257 | static 258 | saidx_t 259 | construct_BWT(const sauchar_t *T, saidx_t *SA, 260 | saidx_t *bucket_A, saidx_t *bucket_B, 261 | saidx_t n, saidx_t m) { 262 | saidx_t *i, *j, *k, *orig; 263 | saidx_t s; 264 | saint_t c0, c1, c2; 265 | 266 | if(0 < m) { 267 | /* Construct the sorted order of type B suffixes by using 268 | the sorted order of type B* suffixes. */ 269 | for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { 270 | /* Scan the suffix array from right to left. */ 271 | for(i = SA + BUCKET_BSTAR(c1, c1 + 1), 272 | j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1; 273 | i <= j; 274 | --j) { 275 | if(0 < (s = *j)) { 276 | assert(T[s] == c1); 277 | assert(((s + 1) < n) && (T[s] <= T[s + 1])); 278 | assert(T[s - 1] <= T[s]); 279 | c0 = T[--s]; 280 | *j = ~((saidx_t)c0); 281 | if((0 < s) && (T[s - 1] > c0)) { s = ~s; } 282 | if(c0 != c2) { 283 | if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } 284 | k = SA + BUCKET_B(c2 = c0, c1); 285 | } 286 | assert(k < j); 287 | *k-- = s; 288 | } else if(s != 0) { 289 | *j = ~s; 290 | #ifndef NDEBUG 291 | } else { 292 | assert(T[s] == c1); 293 | #endif 294 | } 295 | } 296 | } 297 | } 298 | 299 | /* Construct the BWTed string by using 300 | the sorted order of type B suffixes. */ 301 | k = SA + BUCKET_A(c2 = T[n - 1]); 302 | *k++ = (T[n - 2] < c2) ? ~((saidx_t)T[n - 2]) : (n - 1); 303 | /* Scan the suffix array from left to right. 
*/ 304 | for(i = SA, j = SA + n, orig = SA; i < j; ++i) { 305 | if(0 < (s = *i)) { 306 | assert(T[s - 1] >= T[s]); 307 | c0 = T[--s]; 308 | *i = c0; 309 | if((0 < s) && (T[s - 1] < c0)) { s = ~((saidx_t)T[s - 1]); } 310 | if(c0 != c2) { 311 | BUCKET_A(c2) = k - SA; 312 | k = SA + BUCKET_A(c2 = c0); 313 | } 314 | assert(i < k); 315 | *k++ = s; 316 | } else if(s != 0) { 317 | *i = ~s; 318 | } else { 319 | orig = i; 320 | } 321 | } 322 | 323 | return orig - SA; 324 | } 325 | 326 | 327 | /*---------------------------------------------------------------------------*/ 328 | 329 | /*- Function -*/ 330 | 331 | saint_t 332 | divsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n) { 333 | #ifdef ENABLE_CROSSCHECK 334 | CROSSCHECK_FILE = fopen("crosscheck/c", "wb"); 335 | if (!CROSSCHECK_FILE) { 336 | fprintf(stderr, "Could not open crosscheck file"); 337 | return -2; 338 | } 339 | #endif 340 | 341 | saidx_t *bucket_A, *bucket_B; 342 | saidx_t m; 343 | saint_t err = 0; 344 | 345 | /* Check arguments. */ 346 | if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; } 347 | else if(n == 0) { return 0; } 348 | else if(n == 1) { SA[0] = 0; return 0; } 349 | else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; } 350 | 351 | bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t)); 352 | bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t)); 353 | 354 | /* Suffixsort. */ 355 | if((bucket_A != NULL) && (bucket_B != NULL)) { 356 | m = sort_typeBstar(T, SA, bucket_A, bucket_B, n); 357 | construct_SA(T, SA, bucket_A, bucket_B, n, m); 358 | } else { 359 | err = -2; 360 | } 361 | 362 | free(bucket_B); 363 | free(bucket_A); 364 | 365 | #ifdef ENABLE_CROSSCHECK 366 | fclose(CROSSCHECK_FILE); 367 | #endif 368 | 369 | return err; 370 | } 371 | 372 | saidx_t 373 | divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n) { 374 | saidx_t *B; 375 | saidx_t *bucket_A, *bucket_B; 376 | saidx_t m, pidx, i; 377 | 378 | /* Check arguments. 
*/ 379 | if((T == NULL) || (U == NULL) || (n < 0)) { return -1; } 380 | else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; } 381 | 382 | if((B = A) == NULL) { B = (saidx_t *)malloc((size_t)(n + 1) * sizeof(saidx_t)); } 383 | bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t)); 384 | bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t)); 385 | 386 | /* Burrows-Wheeler Transform. */ 387 | if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) { 388 | m = sort_typeBstar(T, B, bucket_A, bucket_B, n); 389 | pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m); 390 | 391 | /* Copy to output string. */ 392 | U[0] = T[n - 1]; 393 | for(i = 0; i < pidx; ++i) { U[i + 1] = (sauchar_t)B[i]; } 394 | for(i += 1; i < n; ++i) { U[i] = (sauchar_t)B[i]; } 395 | pidx += 1; 396 | } else { 397 | pidx = -2; 398 | } 399 | 400 | free(bucket_B); 401 | free(bucket_A); 402 | if(A == NULL) { free(B); } 403 | 404 | return pidx; 405 | } 406 | 407 | const char * 408 | divsufsort_version(void) { 409 | return PROJECT_VERSION_FULL; 410 | } 411 | 412 | void dss_flush() { 413 | fflush(stdout); 414 | } -------------------------------------------------------------------------------- /crates/divsufsort/src/divsufsort.rs: -------------------------------------------------------------------------------- 1 | use crate::{common::*, crosscheck, crosscheck::*, sssort, trsort, SA_dump}; 2 | 3 | pub fn divsufsort(T: &[Char], SA: &mut [Idx]) { 4 | assert_eq!( 5 | T.len(), 6 | SA.len(), 7 | "text and suffix array should have same len" 8 | ); 9 | assert!( 10 | T.len() < i32::max_value() as usize, 11 | "text too large, should not exceed {} bytes", 12 | i32::max_value() - 1 13 | ); 14 | 15 | let n = T.len(); 16 | 17 | // short T cases 18 | match n { 19 | 0 => return, 20 | 1 => { 21 | SA[0] = 0; 22 | return; 23 | } 24 | 2 => { 25 | SA.copy_from_slice(if T[0] < T[1] { &[0, 1] } else { &[1, 0] }); 26 | return; 27 | } 28 | _ => { /* continue */ } 29 | } 30 | 31 | let T = Text(T); 32 | let mut 
SA = SuffixArray(SA); 33 | 34 | // Suffixsort. 35 | let res = sort_typeBstar(&T, &mut SA); 36 | construct_SA(&T, &mut SA, res.A, res.B, res.m); 37 | } 38 | 39 | struct SortTypeBstarResult { 40 | A: ABucket, 41 | B: BMixBucket, 42 | m: Idx, 43 | } 44 | 45 | fn sort_typeBstar(T: &Text, SA: &mut SuffixArray) -> SortTypeBstarResult { 46 | let n = T.len(); 47 | 48 | // Initialize bucket arrays 49 | let A: Vec = vec![0; BUCKET_A_SIZE]; 50 | let mut A = ABucket(A); 51 | 52 | let B: Vec = vec![0; BUCKET_B_SIZE]; 53 | let mut B = BMixBucket(B); 54 | 55 | // temps 56 | let mut c0: Idx; 57 | let mut c1: Idx; 58 | let mut i: Idx; 59 | let mut j: Idx; 60 | let mut k: Idx; 61 | let mut t: Idx; 62 | let mut m: Idx; 63 | 64 | // Count the number of occurences of the first one or two characters of each 65 | // type A, B and B* suffix. Moreover, store the beginning position of all 66 | // type B* suffixes into the array SA. 67 | i = n - 1; 68 | m = n; 69 | c0 = T.get(n - 1); 70 | 71 | while 0 <= i { 72 | // type A suffix (originally do..while) 73 | loop { 74 | c1 = c0; 75 | A[c1] += 1; 76 | 77 | // original loop condition 78 | i -= 1; 79 | if !(0 <= i) { 80 | break; 81 | } 82 | 83 | c0 = T.get(i); 84 | if !(c0 >= c1) { 85 | break; 86 | } 87 | } 88 | 89 | if 0 <= i { 90 | // type B* suffix 91 | B.bstar()[(c0, c1)] += 1; 92 | 93 | m -= 1; 94 | SA[m] = i; 95 | 96 | // type B suffix 97 | 98 | // init 99 | i -= 1; 100 | c1 = c0; 101 | 102 | loop { 103 | // cond 104 | if !(0 <= i) { 105 | break; 106 | } 107 | c0 = T.get(i); 108 | if !(c0 <= c1) { 109 | break; 110 | } 111 | 112 | // body 113 | B.b()[(c0, c1)] += 1; 114 | 115 | // iter 116 | i -= 1; 117 | c1 = c0; 118 | } 119 | } 120 | } 121 | m = n - m; 122 | 123 | // Note: A type B* suffix is lexicographically smaller than a type B suffix 124 | // that beings with the same first two characters. 125 | 126 | // Calculate the index of start/end point of each bucket. 
127 | { 128 | i = 0; 129 | j = 0; 130 | for c0 in 0..(ALPHABET_SIZE as Idx) { 131 | // body 132 | t = i + A[c0]; 133 | A[c0] = i + j; // start point 134 | i = t + B.b()[(c0, c0)]; 135 | 136 | for c1 in (c0 + 1)..(ALPHABET_SIZE as Idx) { 137 | j += B.bstar()[(c0, c1)]; 138 | B.bstar()[(c0, c1)] = j; // end point 139 | i += B.b()[(c0, c1)]; 140 | } 141 | } 142 | } 143 | 144 | if (0 < m) { 145 | // Sort the type B* suffixes by their first two characters 146 | let PAb = SAPtr(n - m); 147 | let ISAb = SAPtr(m); 148 | 149 | for i in (0..=(m - 2)).rev() { 150 | t = SA[PAb + i]; 151 | c0 = T.get(t); 152 | c1 = T.get(t + 1); 153 | B.bstar()[(c0, c1)] -= 1; 154 | SA[B.bstar()[(c0, c1)]] = i; 155 | } 156 | t = SA[PAb + m - 1]; 157 | c0 = T.get(t); 158 | c1 = T.get(t + 1); 159 | B.bstar()[(c0, c1)] -= 1; 160 | SA[B.bstar()[(c0, c1)]] = m - 1; 161 | 162 | // Sort the type B* substrings using sssort. 163 | let buf = SAPtr(m); 164 | let bufsize = n - (2 * m); 165 | 166 | // init (outer) 167 | c0 = ALPHABET_SIZE as Idx - 2; 168 | j = m; 169 | while 0 < j { 170 | // init (inner) 171 | c1 = ALPHABET_SIZE as Idx - 1; 172 | while c0 < c1 { 173 | // body (inner) 174 | i = B.bstar()[(c0, c1)]; 175 | 176 | if (1 < (j - i)) { 177 | SA_dump!(&SA.range(i..j), "sssort(A)"); 178 | sssort::sssort( 179 | T, 180 | SA, 181 | PAb, 182 | SAPtr(i), 183 | SAPtr(j), 184 | buf, 185 | bufsize, 186 | 2, 187 | n, 188 | SA[i] == (m - 1), 189 | ); 190 | SA_dump!(&SA.range(i..j), "sssort(B)"); 191 | } 192 | 193 | // iter (inner) 194 | j = i; 195 | c1 -= 1; 196 | } 197 | 198 | // iter (outer) 199 | c0 -= 1; 200 | } 201 | 202 | // Compute ranks of type B* substrings 203 | i = m - 1; 204 | while 0 <= i { 205 | if (0 <= SA[i]) { 206 | j = i; 207 | loop { 208 | { 209 | let SAi = SA[i]; 210 | SA[ISAb + SAi] = i; 211 | } 212 | 213 | i -= 1; 214 | if !((0 <= i) && (0 <= SA[i])) { 215 | break; 216 | } 217 | } 218 | 219 | SA[i + 1] = i - j; 220 | if (i <= 0) { 221 | break; 222 | } 223 | } 224 | j = i; 225 | loop { 226 
| SA[i] = !SA[i]; 227 | { 228 | let idx = ISAb + SA[i]; 229 | SA[idx] = j; 230 | } 231 | 232 | i -= 1; 233 | if !(SA[i] < 0) { 234 | break; 235 | } 236 | } 237 | { 238 | let idx = ISAb + SA[i]; 239 | SA[idx] = j; 240 | } 241 | 242 | i -= 1; 243 | } 244 | 245 | // Construct the inverse suffix array of type B* suffixes using trsort. 246 | trsort::trsort(ISAb, SA, m, 1); 247 | 248 | // Set the sorted order of type B* suffixes 249 | { 250 | // init 251 | i = n - 1; 252 | j = m; 253 | c0 = T.get(n - 1); 254 | while 0 <= i { 255 | // init 256 | i -= 1; 257 | c1 = c0; 258 | 259 | loop { 260 | // cond 261 | if !(0 <= i) { 262 | break; 263 | } 264 | c0 = T.get(i); 265 | if !(c0 >= c1) { 266 | break; 267 | } 268 | 269 | // body (empty) 270 | 271 | // iter 272 | i -= 1; 273 | c1 = c0; 274 | } 275 | 276 | if 0 <= i { 277 | t = i; 278 | 279 | // init 280 | i -= 1; 281 | c1 = c0; 282 | 283 | loop { 284 | // cond 285 | if !(0 <= i) { 286 | break; 287 | } 288 | c0 = T.get(i); 289 | if !(c0 <= c1) { 290 | break; 291 | } 292 | 293 | // body (empty) 294 | 295 | // iter 296 | i -= 1; 297 | c1 = c0; 298 | } 299 | 300 | j -= 1; 301 | { 302 | let pos = SA[ISAb + j]; 303 | SA[pos] = if (t == 0) || (1 < (t - i)) { t } else { !t }; 304 | } 305 | } 306 | } 307 | } // End: Set the sorted order of type B* suffixes 308 | 309 | // Calculate the index of start/end point of each bucket 310 | { 311 | B.b()[(ALPHABET_SIZE as Idx - 1, ALPHABET_SIZE as Idx - 1)] = n; // end point 312 | 313 | // init 314 | c0 = ALPHABET_SIZE as Idx - 2; 315 | k = m - 1; 316 | 317 | while 0 <= c0 { 318 | i = A[c0 + 1] - 1; 319 | 320 | // init 321 | c1 = ALPHABET_SIZE as Idx - 1; 322 | while c0 < c1 { 323 | t = i - B.b()[(c0, c1)]; 324 | B.b()[(c0, c1)] = i; // end point 325 | 326 | // Move all type B* suffixes to the correct position 327 | { 328 | // init 329 | i = t; 330 | j = B.bstar()[(c0, c1)]; 331 | 332 | while j <= k { 333 | SA[i] = SA[k]; 334 | 335 | // iter 336 | i -= 1; 337 | k -= 1; 338 | } 339 | } // End: 
Move all type B* suffixes to the correct position 340 | 341 | // iter 342 | c1 -= 1; 343 | } 344 | B.bstar()[(c0, c0 + 1)] = i - B.b()[(c0, c0)] + 1; 345 | B.b()[(c0, c0)] = i; // end point 346 | 347 | // iter 348 | c0 -= 1; 349 | } 350 | } // End: Calculate the index of start/end point of each bucket 351 | } 352 | 353 | SortTypeBstarResult { A, B, m } 354 | } 355 | 356 | fn construct_SA(T: &Text, SA: &mut SuffixArray, mut A: ABucket, mut B: BMixBucket, m: Idx) { 357 | let n = T.len() as Idx; 358 | let mut i: SAPtr; 359 | let mut j: SAPtr; 360 | let mut k: Idx; 361 | let mut s: Idx; 362 | let mut c0: Idx; 363 | let mut c1: Idx; 364 | let mut c2: Idx; 365 | 366 | if 0 < m { 367 | // Construct the sorted order of type B suffixes by using the 368 | // sorted order of type B* suffixes 369 | c1 = ALPHABET_SIZE as Idx - 2; 370 | while 0 <= c1 { 371 | // Scan the suffix array from right to left 372 | i = SAPtr(B.bstar()[(c1, c1 + 1)]); 373 | j = SAPtr(A[c1 + 1] - 1); 374 | k = 0; 375 | c2 = -1; 376 | 377 | while i <= j { 378 | s = SA[j]; 379 | if (0 < s) { 380 | assert_eq!(T.get(s), c1); 381 | assert!((s + 1) < n); 382 | assert!(T[s] <= T[s + 1]); 383 | 384 | SA[j] = !s; 385 | s -= 1; 386 | c0 = T.get(s); 387 | if (0 < s) && (T.get(s - 1) > c0) { 388 | s = !s; 389 | } 390 | if (c0 != c2) { 391 | if (0 <= c2) { 392 | B.b()[(c2, c1)] = k; 393 | } 394 | c2 = c0; 395 | k = B.b()[(c2, c1)]; 396 | } 397 | assert!(k < j); 398 | SA[k] = s; 399 | k -= 1; 400 | } else { 401 | assert!(((s == 0) && (T.get(s) == c1)) || (s < 0)); 402 | SA[j] = !s; 403 | } 404 | 405 | // iter 406 | j -= 1; 407 | } 408 | 409 | // iter 410 | c1 -= 1; 411 | } 412 | } 413 | 414 | // Construct the suffix array by using the sorted order of type B suffixes 415 | c2 = T.get(n - 1); 416 | k = A[c2]; 417 | SA[k] = if T.get(n - 2) < c2 { !(n - 1) } else { n - 1 }; 418 | k += 1; 419 | // Scan the suffix array from left to right 420 | { 421 | // init 422 | i = SAPtr(0); 423 | j = SAPtr(n); 424 | 425 | while i < j { 
426 | s = SA[i]; 427 | if 0 < s { 428 | assert!(T[s - 1] >= T[s]); 429 | s -= 1; 430 | c0 = T.get(s); 431 | if (s == 0) || (T.get(s - 1) < c0) { 432 | s = !s; 433 | } 434 | if (c0 != c2) { 435 | A[c2] = k; 436 | c2 = c0; 437 | k = A[c2]; 438 | } 439 | assert!(i < k); 440 | SA[k] = s; 441 | k += 1; 442 | } else { 443 | assert!(s < 0); 444 | SA[i] = !s; 445 | } 446 | 447 | // iter 448 | i += 1; 449 | } 450 | } 451 | } 452 | -------------------------------------------------------------------------------- /crates/cdivsufsort/c-sources/trsort.c: -------------------------------------------------------------------------------- 1 | /* 2 | * trsort.c for libdivsufsort 3 | * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. 4 | * 5 | * Permission is hereby granted, free of charge, to any person 6 | * obtaining a copy of this software and associated documentation 7 | * files (the "Software"), to deal in the Software without 8 | * restriction, including without limitation the rights to use, 9 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following 12 | * conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be 15 | * included in all copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | * OTHER DEALINGS IN THE SOFTWARE. 
*/

#include "divsufsort_private.h"


/*- Private Functions -*/

/* lg_table[b] holds floor(log2(b)) for b = 1..255; lg_table[0] is -1.
   tr_ilg() below indexes it with a single byte of its argument. */
static const saint_t lg_table[256]= {
 -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
  5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
};

/* Integer base-2 logarithm of n.
   The nested conditionals find the most significant non-zero byte of n and
   add that byte's bit offset (0, 8, 16, ...) to its lg_table entry, which
   yields floor(log2(n)) for n > 0.  For n == 0 every test is false and the
   result is 0 + lg_table[0], i.e. -1.
   NOTE(review): behaviour for negative n is not obvious from this function;
   presumably callers only pass non-negative sizes -- confirm. */
static INLINE
saint_t
tr_ilg(saidx_t n) {
#if defined(BUILD_DIVSUFSORT64)
  /* BUILD_DIVSUFSORT64: bits 32..63 are examined before the low 32 bits. */
  return (n >> 32) ?
          ((n >> 48) ?
            ((n >> 56) ?
              56 + lg_table[(n >> 56) & 0xff] :
              48 + lg_table[(n >> 48) & 0xff]) :
            ((n >> 40) ?
              40 + lg_table[(n >> 40) & 0xff] :
              32 + lg_table[(n >> 32) & 0xff])) :
          ((n & 0xffff0000) ?
            ((n & 0xff000000) ?
              24 + lg_table[(n >> 24) & 0xff] :
              16 + lg_table[(n >> 16) & 0xff]) :
            ((n & 0x0000ff00) ?
               8 + lg_table[(n >>  8) & 0xff] :
               0 + lg_table[(n >>  0) & 0xff]));
#else
  /* Otherwise only the low 32 bits are examined, one byte at a time. */
  return (n & 0xffff0000) ?
          ((n & 0xff000000) ?
            24 + lg_table[(n >> 24) & 0xff] :
            16 + lg_table[(n >> 16) & 0xff]) :
          ((n & 0x0000ff00) ?
             8 + lg_table[(n >>  8) & 0xff] :
             0 + lg_table[(n >>  0) & 0xff]);
#endif
}


/*---------------------------------------------------------------------------*/

/* Simple insertionsort for small size groups.
*/
/* Sorts the entries of [first, last) into ascending order of their key
   ISAd[entry], one insertion at a time.
   When the entry being inserted has the same key as the entry it ends up
   directly behind (r == 0), that preceding entry is overwritten with its
   bitwise complement, i.e. stored as a negative value.
   Negative entries are never used as comparison keys: the shift loop moves a
   non-negative entry together with the run of negative entries immediately
   in front of it.
   NOTE(review): assumes every entry is non-negative on entry -- confirm. */
static void tr_insertionsort(const saidx_t *ISAd, saidx_t *first,
                             saidx_t *last) {
  saidx_t *a, *b;
  saidx_t t, r;

  // KAREN  (outer loop: a is the next entry to insert)
  for (a = first + 1; a < last; ++a) {
    // JEZEBEL  (walk b leftwards while t's key is smaller than *b's key)
    for (t = *a, b = a - 1; 0 > (r = ISAd[t] - ISAd[*b]);) {
      // LILITH  (shift *b right, then keep shifting any negative entries)
      do {
        *(b + 1) = *b;
      } while ((first <= --b) && (*b < 0));
      if (b < first) {
        /* ran off the front: t is the smallest so far (r is still < 0) */
        break;
      }
    }
    if (r == 0) {
      /* equal keys: mark the left neighbour by complementing it */
      *b = ~*b;
    }
    *(b + 1) = t;
  }
}

/*---------------------------------------------------------------------------*/

/* Sift-down step for a max-heap stored in SA[0..size) and ordered by
   ISAd[SA[x]]: the entry at index i is moved down until neither child has a
   larger key, and SA is updated in place.
   NOTE(review): the right child SA[j] is read without checking j < size;
   tr_heapsort appears to rely on an odd heap size while building the heap
   and on SA[size] being a live slot while extracting -- confirm before
   calling this from anywhere else. */
static INLINE void tr_fixdown(const saidx_t *ISAd, saidx_t *SA, saidx_t i,
                              saidx_t size) {
  saidx_t j, k;
  saidx_t v;
  saidx_t c, d, e;

  /* crosscheck is declared outside this file; presumably a trace hook. */
  crosscheck("fixdown i=%d size=%d", i, size);

  // WILMOT  (v is the entry being sunk, c its key; j/k index the children)
  for (v = SA[i], c = ISAd[v]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) {
    d = ISAd[SA[k = j++]];          /* left child key; j now = right child */
    if (d < (e = ISAd[SA[j]])) {    /* right child has the larger key */
      k = j;
      d = e;
    }
    if (d <= c) {                   /* larger child does not exceed v: done */
      break;
    }
  }
  SA[i] = v;
}

/* Simple top-down heapsort.
   Sorts SA[0..size) into ascending order of ISAd[SA[x]].
   The heap is built over an odd number of slots m: when size is even, the
   last slot is set aside first (after swapping it with SA[m / 2] if it has
   the larger key) and folded back in once the heap over the first m slots
   has been built. */
static void tr_heapsort(const saidx_t *ISAd, saidx_t *SA, saidx_t size) {
  saidx_t i, m;
  saidx_t t;

  m = size;
  if ((size % 2) == 0) {
    m--;
    if (ISAd[SA[m / 2]] < ISAd[SA[m]]) {
      SWAP(SA[m], SA[m / 2]);
    }
  }

  // LISA  (heapify: sift down every internal node, last one first)
  for (i = m / 2 - 1; 0 <= i; --i) {
    crosscheck("LISA i=%d", i);
    tr_fixdown(ISAd, SA, i, m);
  }
  if ((size % 2) == 0) {
    /* exchange the heap root with the set-aside slot, then restore the heap */
    SWAP(SA[0], SA[m]);
    tr_fixdown(ISAd, SA, 0, m);
  }
  // MARK  (extraction: move the current maximum to slot i, shrink the heap)
  for (i = m - 1; 0 < i; --i) {
    crosscheck("MARK i=%d", i);
    t = SA[0], SA[0] = SA[i];
    tr_fixdown(ISAd, SA, 0, i);
    SA[i] = t;
  }
}

/*---------------------------------------------------------------------------*/

/* Returns the median of three elements.
*/
/* Returns whichever of the pointers v1, v2, v3 refers to the entry whose key
   ISAd[*v] is the median of the three.  Only the local pointer copies are
   exchanged; the pointed-to entries are not modified.
   `t` is not referenced directly; presumably it is the scratch variable the
   SWAP macro (defined outside this view) expands to -- confirm. */
static INLINE saidx_t *tr_median3(const saidx_t *ISAd, saidx_t *v1, saidx_t *v2,
                                  saidx_t *v3) {
  saidx_t *t;
  if (ISAd[*v1] > ISAd[*v2]) {
    SWAP(v1, v2);
  }
  /* from here on key(v1) <= key(v2) */
  if (ISAd[*v2] > ISAd[*v3]) {
    if (ISAd[*v1] > ISAd[*v3]) {
      return v1;
    } else {
      return v3;
    }
  }
  return v2;
}

/* Returns the median of five elements.
   Same contract as tr_median3 with five candidates: a fixed sequence of
   compare-and-swap steps on the local pointer copies, after which the median
   is either v3 or v4.  The order of the steps matters; do not reorder. */
static INLINE saidx_t *tr_median5(const saidx_t *ISAd, saidx_t *v1, saidx_t *v2,
                                  saidx_t *v3, saidx_t *v4, saidx_t *v5) {
  saidx_t *t;
  if (ISAd[*v2] > ISAd[*v3]) {
    SWAP(v2, v3);
  }
  if (ISAd[*v4] > ISAd[*v5]) {
    SWAP(v4, v5);
  }
  if (ISAd[*v2] > ISAd[*v4]) {
    /* exchange the two ordered pairs as a whole */
    SWAP(v2, v4);
    SWAP(v3, v5);
  }
  if (ISAd[*v1] > ISAd[*v3]) {
    SWAP(v1, v3);
  }
  if (ISAd[*v1] > ISAd[*v4]) {
    SWAP(v1, v4);
    SWAP(v3, v5);
  }
  if (ISAd[*v3] > ISAd[*v4]) {
    return v4;
  }
  return v3;
}

/* Returns the pivot element.
*/ 203 | static INLINE saidx_t *tr_pivot(const saidx_t *ISAd, saidx_t *first, 204 | saidx_t *last) { 205 | saidx_t *middle; 206 | saidx_t t; 207 | 208 | t = last - first; 209 | middle = first + t / 2; 210 | 211 | if (t <= 512) { 212 | if (t <= 32) { 213 | return tr_median3(ISAd, first, middle, last - 1); 214 | } else { 215 | t >>= 2; 216 | return tr_median5(ISAd, first, first + t, middle, last - 1 - t, last - 1); 217 | } 218 | } 219 | t >>= 3; 220 | first = tr_median3(ISAd, first, first + t, first + (t << 1)); 221 | middle = tr_median3(ISAd, middle - t, middle, middle + t); 222 | last = tr_median3(ISAd, last - 1 - (t << 1), last - 1 - t, last - 1); 223 | return tr_median3(ISAd, first, middle, last); 224 | } 225 | 226 | /*---------------------------------------------------------------------------*/ 227 | 228 | typedef struct _trbudget_t trbudget_t; 229 | struct _trbudget_t { 230 | saidx_t chance; 231 | saidx_t remain; 232 | saidx_t incval; 233 | saidx_t count; 234 | }; 235 | 236 | static INLINE 237 | void 238 | trbudget_init(trbudget_t *budget, saidx_t chance, saidx_t incval) { 239 | budget->chance = chance; 240 | budget->remain = budget->incval = incval; 241 | } 242 | 243 | static INLINE 244 | saint_t 245 | trbudget_check(trbudget_t *budget, saidx_t size) { 246 | if(size <= budget->remain) { budget->remain -= size; return 1; } 247 | if(budget->chance == 0) { budget->count += size; return 0; } 248 | budget->remain += budget->incval - size; 249 | budget->chance -= 1; 250 | return 1; 251 | } 252 | 253 | 254 | /*---------------------------------------------------------------------------*/ 255 | 256 | static INLINE void tr_partition(const saidx_t *ISAd, saidx_t *first, 257 | saidx_t *middle, saidx_t *last, saidx_t **pa, 258 | saidx_t **pb, saidx_t v) { 259 | saidx_t *a, *b, *c, *d, *e, *f; 260 | saidx_t t, s; 261 | saidx_t x = 0; 262 | 263 | // JOSEPH 264 | for (b = middle - 1; (++b < last) && ((x = ISAd[*b]) == v);) { 265 | } 266 | if (((a = b) < last) && (x < v)) { 
267 | // MARY 268 | for (; (++b < last) && ((x = ISAd[*b]) <= v);) { 269 | if (x == v) { 270 | SWAP(*b, *a); 271 | ++a; 272 | } 273 | } 274 | } 275 | // JEREMIAH 276 | for (c = last; (b < --c) && ((x = ISAd[*c]) == v);) { 277 | } 278 | if ((b < (d = c)) && (x > v)) { 279 | // BEDELIA 280 | for (; (b < --c) && ((x = ISAd[*c]) >= v);) { 281 | if (x == v) { 282 | SWAP(*c, *d); 283 | --d; 284 | } 285 | } 286 | } 287 | // ALEX 288 | for (; b < c;) { 289 | SWAP(*b, *c); 290 | // SIMON 291 | for (; (++b < c) && ((x = ISAd[*b]) <= v);) { 292 | if (x == v) { 293 | SWAP(*b, *a); 294 | ++a; 295 | } 296 | } 297 | // GREGORY 298 | for (; (b < --c) && ((x = ISAd[*c]) >= v);) { 299 | if (x == v) { 300 | SWAP(*c, *d); 301 | --d; 302 | } 303 | } 304 | } // end ALEX 305 | 306 | if (a <= d) { 307 | c = b - 1; 308 | if ((s = a - first) > (t = b - a)) { 309 | s = t; 310 | } 311 | // GENEVIEVE 312 | for (e = first, f = b - s; 0 < s; --s, ++e, ++f) { 313 | SWAP(*e, *f); 314 | } 315 | if ((s = d - c) > (t = last - d - 1)) { 316 | s = t; 317 | } 318 | // MARISSA 319 | for (e = b, f = last - s; 0 < s; --s, ++e, ++f) { 320 | SWAP(*e, *f); 321 | } 322 | first += (b - a), last -= (d - c); 323 | } 324 | *pa = first, *pb = last; 325 | } 326 | 327 | static void tr_copy(saidx_t *ISA, const saidx_t *SA, saidx_t *first, saidx_t *a, 328 | saidx_t *b, saidx_t *last, saidx_t depth) { 329 | /* sort suffixes of middle partition 330 | by using sorted order of suffixes of left and right partition. 
*/ 331 | saidx_t *c, *d, *e; 332 | saidx_t s, v; 333 | 334 | crosscheck("tr_copy first=%d a=%d b=%d last=%d", first - SA, a - SA, b - SA, 335 | last - SA); 336 | 337 | v = b - SA - 1; 338 | // JACK 339 | for (c = first, d = a - 1; c <= d; ++c) { 340 | if ((0 <= (s = *c - depth)) && (ISA[s] == v)) { 341 | *++d = s; 342 | ISA[s] = d - SA; 343 | } 344 | } 345 | // JILL 346 | for (c = last - 1, e = d + 1, d = b; e < d; --c) { 347 | if ((0 <= (s = *c - depth)) && (ISA[s] == v)) { 348 | *--d = s; 349 | ISA[s] = d - SA; 350 | } 351 | } 352 | } 353 | 354 | static void tr_partialcopy(saidx_t *ISA, const saidx_t *SA, saidx_t *first, 355 | saidx_t *a, saidx_t *b, saidx_t *last, 356 | saidx_t depth) { 357 | saidx_t *c, *d, *e; 358 | saidx_t s, v; 359 | saidx_t rank, lastrank, newrank = -1; 360 | 361 | v = b - SA - 1; 362 | lastrank = -1; 363 | // JETHRO 364 | for (c = first, d = a - 1; c <= d; ++c) { 365 | if ((0 <= (s = *c - depth)) && (ISA[s] == v)) { 366 | *++d = s; 367 | rank = ISA[s + depth]; 368 | if (lastrank != rank) { 369 | lastrank = rank; 370 | newrank = d - SA; 371 | } 372 | ISA[s] = newrank; 373 | } 374 | } 375 | 376 | lastrank = -1; 377 | // SCROOGE 378 | for (e = d; first <= e; --e) { 379 | rank = ISA[*e]; 380 | if (lastrank != rank) { 381 | lastrank = rank; 382 | newrank = e - SA; 383 | } 384 | if (newrank != rank) { 385 | ISA[*e] = newrank; 386 | } 387 | } 388 | 389 | lastrank = -1; 390 | // DEWEY 391 | for (c = last - 1, e = d + 1, d = b; e < d; --c) { 392 | if ((0 <= (s = *c - depth)) && (ISA[s] == v)) { 393 | *--d = s; 394 | rank = ISA[s + depth]; 395 | if (lastrank != rank) { 396 | lastrank = rank; 397 | newrank = d - SA; 398 | } 399 | ISA[s] = newrank; 400 | } 401 | } 402 | } 403 | 404 | static void tr_introsort(saidx_t *ISA, const saidx_t *ISAd, saidx_t *SA, 405 | saidx_t *first, saidx_t *last, trbudget_t *budget) { 406 | #define STACK_SIZE TR_STACKSIZE 407 | struct { 408 | const saidx_t *a; 409 | saidx_t *b, *c; 410 | saint_t d, e; 411 | } 
stack[STACK_SIZE]; 412 | saidx_t *a, *b, *c; 413 | saidx_t t; 414 | saidx_t v, x = 0; 415 | saidx_t incr = ISAd - ISA; 416 | saint_t limit, next; 417 | saint_t ssize, trlink = -1; 418 | 419 | { saidx_t n = last - SA; } 420 | 421 | // PASCAL 422 | for (ssize = 0, limit = tr_ilg(last - first);;) { 423 | crosscheck("pascal limit=%d first=%d last=%d", limit, first-SA, last-SA); 424 | if (limit < 0) { 425 | if (limit == -1) { 426 | /* tandem repeat partition */ 427 | tr_partition(ISAd - incr, first, first, last, &a, &b, last - SA - 1); 428 | 429 | /* update ranks */ 430 | if (a < last) { 431 | crosscheck("ranks a> 32) ? 51 | ((n >> 48) ? 52 | ((n >> 56) ? 53 | 56 + lg_table[(n >> 56) & 0xff] : 54 | 48 + lg_table[(n >> 48) & 0xff]) : 55 | ((n >> 40) ? 56 | 40 + lg_table[(n >> 40) & 0xff] : 57 | 32 + lg_table[(n >> 32) & 0xff])) : 58 | ((n & 0xffff0000) ? 59 | ((n & 0xff000000) ? 60 | 24 + lg_table[(n >> 24) & 0xff] : 61 | 16 + lg_table[(n >> 16) & 0xff]) : 62 | ((n & 0x0000ff00) ? 63 | 8 + lg_table[(n >> 8) & 0xff] : 64 | 0 + lg_table[(n >> 0) & 0xff])); 65 | # else 66 | return (n & 0xffff0000) ? 67 | ((n & 0xff000000) ? 68 | 24 + lg_table[(n >> 24) & 0xff] : 69 | 16 + lg_table[(n >> 16) & 0xff]) : 70 | ((n & 0x0000ff00) ? 71 | 8 + lg_table[(n >> 8) & 0xff] : 72 | 0 + lg_table[(n >> 0) & 0xff]); 73 | # endif 74 | #elif SS_BLOCKSIZE < 256 75 | return lg_table[n]; 76 | #else 77 | return (n & 0xff00) ? 
78 | 8 + lg_table[(n >> 8) & 0xff] : 79 | 0 + lg_table[(n >> 0) & 0xff]; 80 | #endif 81 | } 82 | 83 | #endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */ 84 | 85 | #if SS_BLOCKSIZE != 0 86 | 87 | static const saint_t sqq_table[256] = { 88 | 0, 16, 22, 27, 32, 35, 39, 42, 45, 48, 50, 53, 55, 57, 59, 61, 89 | 64, 65, 67, 69, 71, 73, 75, 76, 78, 80, 81, 83, 84, 86, 87, 89, 90 | 90, 91, 93, 94, 96, 97, 98, 99, 101, 102, 103, 104, 106, 107, 108, 109, 91 | 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 92 | 128, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 93 | 143, 144, 144, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155, 94 | 156, 157, 158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168, 95 | 169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178, 179, 180, 96 | 181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188, 189, 189, 190, 191, 97 | 192, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201, 98 | 202, 203, 203, 204, 204, 205, 206, 206, 207, 208, 208, 209, 209, 210, 211, 211, 99 | 212, 212, 213, 214, 214, 215, 215, 216, 217, 217, 218, 218, 219, 219, 220, 221, 100 | 221, 222, 222, 223, 224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230, 101 | 230, 231, 231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238, 102 | 239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 103 | 247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255 104 | }; 105 | 106 | static INLINE 107 | saidx_t 108 | ss_isqrt(saidx_t x) { 109 | saidx_t y, e; 110 | 111 | if(x >= (SS_BLOCKSIZE * SS_BLOCKSIZE)) { return SS_BLOCKSIZE; } 112 | e = (x & 0xffff0000) ? 113 | ((x & 0xff000000) ? 114 | 24 + lg_table[(x >> 24) & 0xff] : 115 | 16 + lg_table[(x >> 16) & 0xff]) : 116 | ((x & 0x0000ff00) ? 
117 | 8 + lg_table[(x >> 8) & 0xff] : 118 | 0 + lg_table[(x >> 0) & 0xff]); 119 | 120 | if(e >= 16) { 121 | y = sqq_table[x >> ((e - 6) - (e & 1))] << ((e >> 1) - 7); 122 | if(e >= 24) { y = (y + 1 + x / y) >> 1; } 123 | y = (y + 1 + x / y) >> 1; 124 | } else if(e >= 8) { 125 | y = (sqq_table[x >> ((e - 6) - (e & 1))] >> (7 - (e >> 1))) + 1; 126 | } else { 127 | return sqq_table[x] >> 4; 128 | } 129 | 130 | return (x < (y * y)) ? y - 1 : y; 131 | } 132 | 133 | #endif /* SS_BLOCKSIZE != 0 */ 134 | 135 | 136 | /*---------------------------------------------------------------------------*/ 137 | 138 | /* Compares two suffixes. */ 139 | static INLINE saint_t ss_compare(const sauchar_t *T, const saidx_t *p1, 140 | const saidx_t *p2, saidx_t depth) { 141 | const sauchar_t *U1, *U2, *U1n, *U2n; 142 | 143 | for (U1 = T + depth + *p1, U2 = T + depth + *p2, U1n = T + *(p1 + 1) + 2, 144 | U2n = T + *(p2 + 1) + 2; 145 | (U1 < U1n) && (U2 < U2n) && (*U1 == *U2); ++U1, ++U2) { 146 | } 147 | 148 | if (U1 < U1n) { 149 | if (U2 < U2n) { 150 | return *U1 - *U2; 151 | } else { 152 | return 1; 153 | } 154 | } else { 155 | if (U2 < U2n) { 156 | return -1; 157 | } else { 158 | return 0; 159 | } 160 | } 161 | } 162 | 163 | /*---------------------------------------------------------------------------*/ 164 | 165 | #if (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) 166 | 167 | /* Insertionsort for small size groups */ 168 | static void ss_insertionsort(const sauchar_t *T, const saidx_t *PA, 169 | saidx_t *first, saidx_t *last, saidx_t depth) { 170 | saidx_t *i, *j; 171 | saidx_t t; 172 | saint_t r; 173 | 174 | for (i = last - 2; first <= i; --i) { 175 | for (t = *i, j = i + 1; 0 < (r = ss_compare(T, PA + t, PA + *j, depth));) { 176 | do { 177 | *(j - 1) = *j; 178 | } while ((++j < last) && (*j < 0)); 179 | if (last <= j) { 180 | break; 181 | } 182 | } 183 | if (r == 0) { 184 | *j = ~*j; 185 | } 186 | *(j - 1) = t; 187 | } 188 | } 189 | 190 | #endif /* (SS_BLOCKSIZE != 1) && 
(SS_INSERTIONSORT_THRESHOLD != 1) */ 191 | 192 | 193 | /*---------------------------------------------------------------------------*/ 194 | 195 | #if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) 196 | 197 | static INLINE void ss_fixdown(const sauchar_t *Td, const saidx_t *PA, 198 | saidx_t *SA, saidx_t i, saidx_t size) { 199 | saidx_t j, k; 200 | saidx_t v; 201 | saint_t c, d, e; 202 | 203 | // BEAST 204 | for (v = SA[i], c = Td[PA[v]]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) { 205 | d = Td[PA[SA[k = j++]]]; 206 | if (d < (e = Td[PA[SA[j]]])) { 207 | k = j; 208 | d = e; 209 | } 210 | if (d <= c) { 211 | break; 212 | } 213 | } 214 | SA[i] = v; 215 | } 216 | 217 | /* Simple top-down heapsort. */ 218 | static void ss_heapsort(const sauchar_t *Td, const saidx_t *PA, saidx_t *SA, 219 | saidx_t size) { 220 | saidx_t i, m; 221 | saidx_t t; 222 | 223 | m = size; 224 | if ((size % 2) == 0) { 225 | m--; 226 | if (Td[PA[SA[m / 2]]] < Td[PA[SA[m]]]) { 227 | SWAP(SA[m], SA[m / 2]); 228 | } 229 | } 230 | 231 | // LADY 232 | for (i = m / 2 - 1; 0 <= i; --i) { 233 | ss_fixdown(Td, PA, SA, i, m); 234 | } 235 | 236 | if ((size % 2) == 0) { 237 | SWAP(SA[0], SA[m]); 238 | ss_fixdown(Td, PA, SA, 0, m); 239 | } 240 | 241 | // TRUMPET 242 | for (i = m - 1; 0 < i; --i) { 243 | t = SA[0], SA[0] = SA[i]; 244 | ss_fixdown(Td, PA, SA, 0, i); 245 | SA[i] = t; 246 | } 247 | } 248 | 249 | /*---------------------------------------------------------------------------*/ 250 | 251 | /* Returns the median of three elements. */ 252 | static INLINE saidx_t *ss_median3(const sauchar_t *Td, const saidx_t *PA, 253 | saidx_t *v1, saidx_t *v2, saidx_t *v3) { 254 | saidx_t *t; 255 | if (Td[PA[*v1]] > Td[PA[*v2]]) { 256 | SWAP(v1, v2); 257 | } 258 | if (Td[PA[*v2]] > Td[PA[*v3]]) { 259 | if (Td[PA[*v1]] > Td[PA[*v3]]) { 260 | return v1; 261 | } else { 262 | return v3; 263 | } 264 | } 265 | return v2; 266 | } 267 | 268 | /* Returns the median of five elements. 
*/ 269 | static INLINE saidx_t *ss_median5(const sauchar_t *Td, const saidx_t *PA, 270 | saidx_t *v1, saidx_t *v2, saidx_t *v3, 271 | saidx_t *v4, saidx_t *v5) { 272 | saidx_t *t; 273 | if (Td[PA[*v2]] > Td[PA[*v3]]) { 274 | SWAP(v2, v3); 275 | } 276 | if (Td[PA[*v4]] > Td[PA[*v5]]) { 277 | SWAP(v4, v5); 278 | } 279 | if (Td[PA[*v2]] > Td[PA[*v4]]) { 280 | SWAP(v2, v4); 281 | SWAP(v3, v5); 282 | } 283 | if (Td[PA[*v1]] > Td[PA[*v3]]) { 284 | SWAP(v1, v3); 285 | } 286 | if (Td[PA[*v1]] > Td[PA[*v4]]) { 287 | SWAP(v1, v4); 288 | SWAP(v3, v5); 289 | } 290 | if (Td[PA[*v3]] > Td[PA[*v4]]) { 291 | return v4; 292 | } 293 | return v3; 294 | } 295 | 296 | /* Returns the pivot element. */ 297 | static INLINE saidx_t *ss_pivot(const sauchar_t *Td, const saidx_t *PA, 298 | saidx_t *first, saidx_t *last) { 299 | saidx_t *middle; 300 | saidx_t t; 301 | 302 | t = last - first; 303 | middle = first + t / 2; 304 | 305 | if (t <= 512) { 306 | if (t <= 32) { 307 | return ss_median3(Td, PA, first, middle, last - 1); 308 | } else { 309 | t >>= 2; 310 | return ss_median5(Td, PA, first, first + t, middle, last - 1 - t, 311 | last - 1); 312 | } 313 | } 314 | t >>= 3; 315 | first = ss_median3(Td, PA, first, first + t, first + (t << 1)); 316 | middle = ss_median3(Td, PA, middle - t, middle, middle + t); 317 | last = ss_median3(Td, PA, last - 1 - (t << 1), last - 1 - t, last - 1); 318 | return ss_median3(Td, PA, first, middle, last); 319 | } 320 | 321 | /*---------------------------------------------------------------------------*/ 322 | 323 | /* Binary partition for substrings. 
*/
/*
 * Splits [first, last) in two by the test
 *   PA[x] + depth >= PA[x + 1] + 1
 * Entries passing the test are gathered at the front and stored
 * bit-complemented (~x); the others are gathered at the back. The first
 * entry of the front group is complemented back before returning.
 * Returns a pointer to the first entry of the back group.
 * NOTE(review): the test presumably means "the substring has no character
 * left at `depth`" -- confirm against the definition of PA.
 */
static INLINE saidx_t *ss_partition(const saidx_t *PA, saidx_t *first,
                                    saidx_t *last, saidx_t depth) {
  saidx_t *a, *b;
  saidx_t t;
  // JIMMY
  for (a = first - 1, b = last;;) {
    // JANINE
    /* Advance a over entries that pass the test, complementing each. */
    for (; (++a < b) && ((PA[*a] + depth) >= (PA[*a + 1] + 1));) {
      *a = ~*a;
    }
    // GEORGIO
    /* Retreat b over entries that fail the test. */
    for (; (a < --b) && ((PA[*b] + depth) < (PA[*b + 1] + 1));) {
    }
    if (b <= a) {
      break;
    }
    /* Exchange the misplaced pair; the entry moved to the front is stored
       complemented. */
    t = ~*b;
    *b = *a;
    *a = t;
  }
  if (first < a) {
    *first = ~*first;
  }
  return a;
}

/* Multikey introsort for medium size groups. */
/*
 * Sorts [first, last) using the byte at T[depth + PA[x]] as the key for the
 * current level, moving to depth + 1 for groups of equal keys. Recursion is
 * replaced by an explicit stack of (first, last, depth, limit) records.
 * STACK_PUSH / STACK_POP and SWAP are defined outside this chunk; the main
 * loop has no break or return of its own, so presumably STACK_POP returns
 * from the function once the stack is empty -- confirm.
 * `limit` starts at ss_ilg(range size) and is decremented per partitioning
 * round; reaching 0 triggers ss_heapsort, and a negative limit marks a range
 * that is handled by the run scan below instead of being partitioned.
 */
static void ss_mintrosort(const sauchar_t *T, const saidx_t *PA, saidx_t *first,
                          saidx_t *last, saidx_t depth) {

#define STACK_SIZE SS_MISORT_STACKSIZE
  struct {
    saidx_t *a, *b, c;
    saint_t d;
  } stack[STACK_SIZE];
  const sauchar_t *Td;
  saidx_t *a, *b, *c, *d, *e, *f;
  saidx_t s, t;
  saint_t ssize;
  saint_t limit;
  saint_t v, x = 0;

  // RENEE
  for (ssize = 0, limit = ss_ilg(last - first);;) {
    /* Small range: insertion sort, then fetch the next pending range. */
    if ((last - first) <= SS_INSERTIONSORT_THRESHOLD) {
#if 1 < SS_INSERTIONSORT_THRESHOLD
      if (1 < (last - first)) {
        ss_insertionsort(T, PA, first, last, depth);
      }
#endif
      STACK_POP(first, last, depth, limit);
      continue;
    }

    Td = T + depth;
    /* Partitioning budget used up: sort by the current key with heapsort.
       The post-decrement leaves limit at -1, so the next branch runs. */
    if (limit-- == 0) {
      SA_dump(first, 0, last - first, "before heapsort");
      ss_heapsort(Td, PA, first, last - first);
      SA_dump(first, 0, last - first, "after heapsort");
    }

    if (limit < 0) {
      // DAVE
      /* Find the first run [first, a) of equal keys that is longer than one
         entry; single-entry runs are skipped by moving `first` forward. */
      for (a = first + 1, v = Td[PA[*first]]; a < last; ++a) {
        if ((x = Td[PA[*a]]) != v) {
          if (1 < (a - first)) {
            break;
          }
          v = x;
          first = a;
        }
      }
      if (Td[PA[*first] - 1] < v) {
        first = ss_partition(PA, first, a, depth);
      }
      /* Continue with the smaller of run [first, a) and rest [a, last);
         push the other one if it holds more than one entry. The run moves
         on at depth + 1, the rest keeps limit -1. */
      if ((a - first) <= (last - a)) {
        if (1 < (a - first)) {
          STACK_PUSH(a, last, depth, -1);
          last = a;
          depth += 1;
          limit = ss_ilg(a - first);
        } else {
          first = a;
          limit = -1;
        }
      } else {
        if (1 < (last - a)) {
          STACK_PUSH(first, a, depth + 1, ss_ilg(a - first));
          first = a;
          limit = -1;
        } else {
          last = a;
          depth += 1;
          limit = ss_ilg(a - first);
        }
      }
      continue;
    }

    /* choose pivot */
    a = ss_pivot(Td, PA, first, last);
    v = Td[PA[*a]];
    SWAP(*first, *a);

    /* partition */
    /* Three-way partition around key v. Entries equal to v are first parked
       at the two ends ([first, a) and (d, last)) and moved to the middle
       afterwards. */
    // NORA
    for (b = first; (++b < last) && ((x = Td[PA[*b]]) == v);) {
    }
    if (((a = b) < last) && (x < v)) {
      // STAN
      for (; (++b < last) && ((x = Td[PA[*b]]) <= v);) {
        if (x == v) {
          SWAP(*b, *a);
          ++a;
        }
      }
    }
    // NATHAN
    for (c = last; (b < --c) && ((x = Td[PA[*c]]) == v);) {
    }
    if ((b < (d = c)) && (x > v)) {
      // JACOB
      for (; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
        if (x == v) {
          SWAP(*c, *d);
          --d;
        }
      }
    }
    // RITA
    for (; b < c;) {
      SWAP(*b, *c);
      // ROMEO
      for (; (++b < c) && ((x = Td[PA[*b]]) <= v);) {
        if (x == v) {
          SWAP(*b, *a);
          ++a;
        }
      }
      // JULIET
      for (; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
        if (x == v) {
          SWAP(*c, *d);
          --d;
        }
      }
    }

    if (a <= d) {
      /* At least one key differs from v. */
      c = b - 1;

      if ((s = a - first) > (t = b - a)) {
        s = t;
      }

      // JOSHUA
      /* Move the equal keys parked at the front next to the middle. */
      for (e = first, f = b - s; 0 < s; --s, ++e, ++f) {
        SWAP(*e, *f);
      }
      if ((s = d - c) > (t = last - d - 1)) {
        s = t;
      }
      // BERENICE
      /* Same for the equal keys parked at the back. */
      for (e = b, f = last - s; 0 < s; --s, ++e, ++f) {
        SWAP(*e, *f);
      }

      /* Now: [first, a) keys < v, [a, c) keys == v, [c, last) keys > v. */
      a = first + (b - a), c = last - (d - c);

      // b = (v <= Td[PA[*a] - 1]) ? a : ss_partition(PA, a, c, depth);
      if (v <= Td[PA[*a] - 1]) {
        b = a;
      } else {
        b = ss_partition(PA, a, c, depth);
      }

      /* Three sub-ranges remain: [first, a) and [c, last) at the same depth,
         [b, c) at depth + 1. Two are pushed and the loop continues with the
         third; the branches below choose by comparing the sizes. */
      if ((a - first) <= (last - c)) {
        if ((last - c) <= (c - b)) {
          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
          STACK_PUSH(c, last, depth, limit);
          last = a;
        } else if ((a - first) <= (c - b)) {
          STACK_PUSH(c, last, depth, limit);
          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
          last = a;
        } else {
          STACK_PUSH(c, last, depth, limit);
          STACK_PUSH(first, a, depth, limit);
          first = b, last = c, depth += 1, limit = ss_ilg(c - b);
        }
      } else {
        if ((a - first) <= (c - b)) {
          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
          STACK_PUSH(first, a, depth, limit);
          first = c;
        } else if ((last - c) <= (c - b)) {
          STACK_PUSH(first, a, depth, limit);
          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
          first = c;
        } else {
          STACK_PUSH(first, a, depth, limit);
          STACK_PUSH(c, last, depth, limit);
          first = b, last = c, depth += 1, limit = ss_ilg(c - b);
        }
      }
    } else {
      /* Every key equals v: undo the limit decrement and go one byte
         deeper on the same range. */
      limit += 1;
      if (Td[PA[*first] - 1] < v) {
        first = ss_partition(PA, first, last, depth);
        limit = ss_ilg(last - first);
      }
      depth += 1;
    }
  }
#undef STACK_SIZE
}

#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */


/*---------------------------------------------------------------------------*/

#if SS_BLOCKSIZE != 0

/* Exchanges a[k] and b[k] for k in [0, n). Does nothing when n <= 0. */
static INLINE void ss_blockswap(saidx_t *a, saidx_t *b, saidx_t n) {
  saidx_t t;
  for (; 0 < n; --n, ++a, ++b) {
    t = *a, *a = *b, *b = t;
  }
}

/*
 * Exchanges the adjacent ranges [first, middle) and [middle, last) in place
 * (a rotation, per the name). Equal-length halves are handled by a single
 * ss_blockswap; otherwise entries are cycled through the temporary `t`,
 * shrinking the longer side until the loop condition fails or the halves
 * match. The SA_dump calls only receive the original range and a label;
 * SA_dump is defined outside this chunk (presumably a debug trace).
 */
static INLINE void ss_rotate(saidx_t *first, saidx_t *middle, saidx_t *last) {
  saidx_t *original_first = first;
  saidx_t *original_last = last;

  saidx_t *a, *b, t;
  saidx_t l, r;
  /* l / r: remaining length of the left / right part. */
  l = middle - first, r = last - middle;

  SA_dump(original_first, 0, original_last - original_first, "pre-brendan");

  // BRENDAN
  for (; (0 < l) && (0 < r);) {
    if (l == r) {
      ss_blockswap(first, middle, l);
      SA_dump(original_first, 0, original_last - original_first, "post-blockswap");
      break;
    }
    if (l < r) {
      /* Left part is shorter: work backwards from the end of the range. */
      a = last - 1, b = middle - 1;
      t = *a;
      // ALICE
      do {
        /* The comma sequences the two stores: *b receives the value at the
           already-decremented a. */
        *a-- = *b, *b-- = *a;
        if (b < first) {
          *a = t;
          last = a;
          if ((r -= l + 1) <= l) {
            break;
          }
          a -= 1, b = middle - 1;
          t = *a;
        }
      } while (1);
      SA_dump(original_first, 0, original_last - original_first, "post-alice");
    } else {
      /* Right part is shorter (or equal handled above): work forwards. */
      a = first, b = middle;
      t = *a;
      // ROBERT
      do {
        *a++ = *b, *b++ = *a;
        if (last <= b) {
          *a = t;
          first = a + 1;
          if ((l -= r + 1) <= r) {
            break;
          }
          a += 1, b = middle;
          t = *a;
        }
      } while (1);
      SA_dump(original_first, 0, original_last - original_first, "post-robert");
    }
  }
}

/*---------------------------------------------------------------------------*/

/*
 * Merges the runs [first, middle) and [middle, last) without a buffer.
 * Each round takes the last entry of the right run (decoding it with ~ when
 * it is stored negative), binary-searches the left run for its position
 * using ss_compare, and rotates the tail of the left run behind it.
 * An entry of the left run that compares equal is stored complemented.
 */
static void ss_inplacemerge(const sauchar_t *T, const saidx_t *PA,
                            saidx_t *first, saidx_t *middle, saidx_t *last,
                            saidx_t depth) {
  const saidx_t *p;
  saidx_t *a, *b;
  saidx_t len, half;
  saint_t q, r;
  saint_t x;

  saidx_t *original_first = first;
  saidx_t *original_last = last;

  SA_dump(original_first, 0, original_last - original_first, "inplacemerge start");

  // FERRIS
  for (;;) {
    /* x remembers whether the last entry was stored complemented. */
    if (*(last - 1) < 0) {
      x = 1;
      p = PA + ~*(last - 1);
    } else {
      x = 0;
      p = PA + *(last - 1);
    }
    // LOIS
    /* Binary search in [first, middle); r keeps the last non-negative
       comparison result (-1 if there was none). */
    for (a = first, len = middle - first, half = len >> 1, r = -1; 0 < len;
         len = half, half >>= 1) {
      b = a + half;
      q = ss_compare(T, PA + ((0 <= *b) ? *b : ~*b), p, depth);
      if (q < 0) {
        a = b + 1;
        half -= (len & 1) ^ 1;
      } else {
        r = q;
      }
    }
    SA_dump(original_first, 0, original_last - original_first, "post-lois");

    if (a < middle) {
      if (r == 0) {
        *a = ~*a;
      }
      ss_rotate(a, middle, last);
      SA_dump(original_first, 0, original_last - original_first, "post-rotate");
      last -= middle - a;
      middle = a;
      if (first == middle) {
        break;
      }
    }
    --last;
    if (x != 0) {
      // TIMMY
      /* Step back over further negative entries. */
      while (*--last < 0) {
      }
      SA_dump(original_first, 0, original_last - original_first, "post-timmy");
    }
    if (middle == last) {
      break;
    }

    SA_dump(original_first, 0, original_last - original_first, "ferris-wrap");
  }
}

/*---------------------------------------------------------------------------*/

/* Merge-forward with internal buffer. */
/*
 * The left run [first, middle) is swapped into buf, then buf and the right
 * run [middle, last) are merged front to back into [first, last). Every
 * store into the output is paired with a store of the displaced output
 * value into the slot just read, so buf ends up holding its previous
 * contents (possibly reordered); `t` carries the one value displaced at the
 * start. Entries comparing equal get the right-hand one complemented; after
 * a copy, any following negative entries of the same run are copied along.
 * buf must provide at least (middle - first) entries.
 */
static void ss_mergeforward(const sauchar_t *T, const saidx_t *PA,
                            saidx_t *first, saidx_t *middle, saidx_t *last,
                            saidx_t *buf, saidx_t depth) {
  saidx_t *a, *b, *c, *bufend;
  saidx_t t;
  saint_t r;

  SA_dump(first, 0, last-first, "ss_mergeforward start");

  bufend = buf + (middle - first) - 1;
  ss_blockswap(buf, first, middle - first);

  // IGNACE
  /* a: output cursor, b: cursor in buf, c: cursor in the right run. */
  for (t = *(a = first), b = buf, c = middle;;) {
    r = ss_compare(T, PA + *b, PA + *c, depth);
    if (r < 0) {
      // RONALD
      do {
        *a++ = *b;
        if (bufend <= b) {
          *bufend = t;
          return;
        }
        *b++ = *a;
      } while (*b < 0);
    } else if (r > 0) {
      // JEREMY
      do {
        *a++ = *c, *c++ = *a;
        if (last <= c) {
          // TONY
          /* Right run exhausted: flush the rest of buf. */
          while (b < bufend) {
            *a++ = *b, *b++ = *a;
          }
          *a = *b, *b = t;
          return;
        }
      } while (*c < 0);
    } else {
      /* Equal: mark the right-hand entry, emit the buffered one first. */
      *c = ~*c;
      // JENS
      do {
        *a++ = *b;
        if (bufend <= b) {
          *bufend = t;
          return;
        }
        *b++ = *a;
      } while (*b < 0);

      // DIMITER
      do {
        *a++ = *c, *c++ = *a;
        if (last <= c) {
          // MIDORI
          while (b < bufend) {
            *a++ = *b, *b++ = *a;
          }
          *a = *b, *b = t;
          return;
        }
      } while (*c < 0);
    }
  }
}

/* Merge-backward with internal buffer. */
/*
 * Mirror image of ss_mergeforward: the right run [middle, last) is swapped
 * into buf and the merge fills [first, last) from the back.
 * p1 / p2 point at the PA entries of the current buffer / left-run element.
 * Bit 1 of x is set while the current buffer entry is stored complemented,
 * bit 2 likewise for the current left-run entry; a set bit makes the code
 * first copy the whole trailing group of negative entries.
 * buf must provide at least (last - middle) entries.
 */
static void ss_mergebackward(const sauchar_t *T, const saidx_t *PA,
                             saidx_t *first, saidx_t *middle, saidx_t *last,
                             saidx_t *buf, saidx_t depth) {
  const saidx_t *p1, *p2;
  saidx_t *a, *b, *c, *bufend;
  saidx_t t;
  saint_t r;
  saint_t x;

  bufend = buf + (last - middle) - 1;
  ss_blockswap(buf, middle, last - middle);

  x = 0;
  if (*bufend < 0) {
    p1 = PA + ~*bufend;
    x |= 1;
  } else {
    p1 = PA + *bufend;
  }
  if (*(middle - 1) < 0) {
    p2 = PA + ~*(middle - 1);
    x |= 2;
  } else {
    p2 = PA + *(middle - 1);
  }
  // MARTIN
  /* a: output cursor, b: cursor in buf, c: cursor in the left run. */
  for (t = *(a = last - 1), b = bufend, c = middle - 1;;) {
    r = ss_compare(T, p1, p2, depth);
    if (0 < r) {
      /* Buffer entry is larger: emit it. */
      if (x & 1) {
        // BAPTIST
        do {
          *a-- = *b, *b-- = *a;
        } while (*b < 0);
        x ^= 1;
      }
      *a-- = *b;
      if (b <= buf) {
        *buf = t;
        break;
      }
      *b-- = *a;
      if (*b < 0) {
        p1 = PA + ~*b;
        x |= 1;
      } else {
        p1 = PA + *b;
      }
    } else if (r < 0) {
      /* Left-run entry is larger: emit it. */
      if (x & 2) {
        // JULES
        do {
          *a-- = *c, *c-- = *a;
        } while (*c < 0);
        x ^= 2;
      }
      *a-- = *c, *c-- = *a;
      if (c < first) {
        // GARAMOND
        /* Left run exhausted: flush the rest of buf. */
        while (buf < b) {
          *a-- = *b, *b-- = *a;
        }
        *a = *b, *b = t;
        break;
      }
      if (*c < 0) {
        p2 = PA + ~*c;
        x |= 2;
      } else {
        p2 = PA + *c;
      }
    } else {
      /* Equal: emit the buffer entry complemented, then the left-run one. */
      if (x & 1) {
        // XAVIER
        do {
          *a-- = *b, *b-- = *a;
        } while (*b < 0);
        x ^= 1;
      }
      *a-- = ~*b;
      if (b <= buf) {
        *buf = t;
        break;
      }
      *b-- = *a;
      if (x & 2) {
        // WALTER
        do {
          *a-- = *c, *c-- = *a;
        } while (*c < 0);
        x ^= 2;
      }
      *a-- = *c, *c-- = *a;
      if (c < first) {
        // ZENITH
        while (buf < b) {
          *a-- = *b, *b-- = *a;
        }
        *a = *b, *b = t;
        break;
      }
      if (*b < 0) {
        p1 = PA + ~*b;
        x |= 1;
      } else {
        p1 = PA + *b;
      }
      if (*c < 0) {
        p2 = PA + ~*c;
        x |= 2;
      } else {
        p2 = PA + *c;
      }
    }
  }
}

/* D&C based merge. */
static void ss_swapmerge(const sauchar_t *T, const saidx_t *PA, saidx_t *first,
                         saidx_t *middle, saidx_t *last, saidx_t *buf,
                         saidx_t bufsize, saidx_t depth) {
#define STACK_SIZE SS_SMERGE_STACKSIZE
#define GETIDX(a) ((0 <= (a)) ? (a) : (~(a)))
#define MERGE_CHECK(a, b, c)                                                   \
  do {                                                                         \
    crosscheck("mc c=%d", c);                                                  \
    if (((c)&1) || (((c)&2) && (ss_compare(T, PA + GETIDX(*((a)-1)),           \
                                           PA + *(a), depth) == 0))) {         \
      crosscheck("swapping a-first=%d", a - first);                            \
      *(a) = ~*(a);                                                            \
    }                                                                          \
    if (((c)&4) &&                                                             \
        ((ss_compare(T, PA + GETIDX(*((b)-1)), PA + *(b), depth) == 0))) {     \
      crosscheck("swapping b-first=%d", b - first);                            \
      *(b) = ~*(b);                                                            \
    }                                                                          \
  } while (0)
  struct {
    saidx_t *a, *b, *c;
    saint_t d;
  } stack[STACK_SIZE];
  saidx_t *l, *r, *lm, *rm;
  saidx_t m, len, half;
  saint_t ssize;
  saint_t check, next;

  // BARBARIAN
  for (check = 0, ssize = 0;;) {
    crosscheck("barbarian check=%d", check);
    SA_dump(first, 0, last-first, "ss_swapmerge barbarian");
    SA_dump(buf, 0, bufsize, "ss_swapmerge barbarian buf");
    if ((last - middle) <= bufsize) {
      crosscheck("<=bufsize");
      if ((first < middle) && (middle
< last)) { 900 | crosscheck("f> 1; 927 | 0 < len; len = half, half >>= 1) { 928 | crosscheck("in-olanna len=%d half=%d", len, half); 929 | if (ss_compare(T, PA + GETIDX(*(middle + m + half)), 930 | PA + GETIDX(*(middle - m - half - 1)), depth) < 0) { 931 | m += half + 1; 932 | half -= (len & 1) ^ 1; 933 | } 934 | } 935 | 936 | if (0 < m) { 937 | crosscheck("0 < m, m=%d", m); 938 | lm = middle - m, rm = middle + m; 939 | ss_blockswap(lm, middle, m); 940 | l = r = middle, next = 0; 941 | if (rm < last) { 942 | if (*rm < 0) { 943 | *rm = ~*rm; 944 | if (first < lm) { 945 | // KOOPA 946 | for (; *--l < 0;) { 947 | } 948 | crosscheck("post-koopa l-first=%d", l - first); 949 | next |= 4; 950 | crosscheck("post-koopa next=%d", next); 951 | } 952 | next |= 1; 953 | } else if (first < lm) { 954 | // MUNCHER 955 | for (; *r < 0; ++r) { 956 | } 957 | crosscheck("post-muncher r-first=%d", r - first); 958 | next |= 2; 959 | } 960 | } 961 | 962 | if ((l - first) <= (last - r)) { 963 | crosscheck("post-muncher l-f>= 1) { 1033 | crosscheck("ss_swapmerge %d", k); 1034 | ss_swapmerge(T, PA, b - k, b, b + k, curbuf, curbufsize, depth); 1035 | } 1036 | } 1037 | 1038 | crosscheck("ss_mintrosort (pre-mariachi) a=%d depth=%d", a-PA, depth); 1039 | ss_mintrosort(T, PA, a, middle, depth); 1040 | 1041 | SA_dump(first, 0, last-first, "pre-mariachi"); 1042 | 1043 | // MARIACHI 1044 | for (k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) { 1045 | if (i & 1) { 1046 | SA_dump(first, 0, last - first, "in-mariachi pre-swap"); 1047 | crosscheck("a=%d middle=%d bufsize=%d depth=%d", a - first, 1048 | middle - first, bufsize, depth); 1049 | ss_swapmerge(T, PA, a - k, a, middle, buf, bufsize, depth); 1050 | SA_dump(first, 0, last - first, "in-mariachi post-swap"); 1051 | a -= k; 1052 | } 1053 | } 1054 | SA_dump(first, 0, last-first, "post-mariachi"); 1055 | 1056 | if (limit != 0) { 1057 | crosscheck("ss_mintrosort limit!=0"); 1058 | ss_mintrosort(T, PA, middle, last, depth); 1059 | SA_dump(first, 0, 
last-first, "post-mintrosort limit!=0"); 1060 | ss_inplacemerge(T, PA, first, middle, last, depth); 1061 | SA_dump(first, 0, last-first, "post-inplacemerge limit!=0"); 1062 | } 1063 | SA_dump(first, 0, last-first, "post-limit!=0"); 1064 | 1065 | if (lastsuffix != 0) { 1066 | crosscheck("lastsuffix!"); 1067 | 1068 | /* Insert last type B* suffix. */ 1069 | saidx_t PAi[2]; 1070 | PAi[0] = PA[*(first - 1)], PAi[1] = n - 2; 1071 | // CELINE 1072 | for (a = first, i = *(first - 1); 1073 | (a < last) && 1074 | ((*a < 0) || (0 < ss_compare(T, &(PAi[0]), PA + *a, depth))); 1075 | ++a) { 1076 | *(a - 1) = *a; 1077 | } 1078 | *(a - 1) = i; 1079 | } 1080 | } --------------------------------------------------------------------------------