├── .gitignore ├── LICENSE ├── README.md ├── gosaca_test.go ├── common.go ├── level0.go └── level1.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 John Gallagher 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | gosaca 2 | ======== 3 | 4 | Description 5 | ----------- 6 | 7 | Pure Go implementation of [An Optimal Suffix Array Construction 8 | Algorithm](http://ge-nong.googlecode.com/files/tr-osaca-nong.pdf), a paper by 9 | [Ge Nong](http://code.google.com/p/ge-nong/). 10 | 11 | Benchmarks 12 | ---------- 13 | 14 | More extensive tests and benchmarks run on large copora are available as 15 | [gosaca-bigtests](https://github.com/jgallagher/gosaca-bigtest) so as not to 16 | balloon the size of this repo. 17 | 18 | The following table compares 19 | [sa-is](https://sites.google.com/site/yuta256/sais) with gosaca running on a 20 | 2012 Macbook Pro. The comparision is not really fair in many senses (SA-IS is 21 | an earlier, slightly less efficient algorithm, but it's implemented in 22 | optimized C code); here it is nonetheless. Times are in seconds. 23 | 24 | File | Size | sa-is | gosaca 25 | -------------- | --------: | -----: | -----: 26 | chr22dna | 34553758 | 5.893 | 10.807 27 | etext99 | 105277340 | 21.507 | 41.862 28 | gcc30tar | 86630400 | 10.252 | 23.149 29 | howto | 39422105 | 6.050 | 12.123 30 | jdk13c | 69728899 | 6.567 | 19.807 31 | linux245tar | 116254720 | 14.635 | 32.696 32 | rctail96 | 114711151 | 15.143 | 40.225 33 | rfc | 116421901 | 16.191 | 36.505 34 | sprot34dat | 109617186 | 17.485 | 39.164 35 | w3c2 | 104201579 | 10.395 | 30.164 36 | abac | 200000 | 0.005 | 0.016 37 | abba | 10500600 | 0.646 | 2.326 38 | book1x20 | 15375420 | 1.620 | 5.126 39 | fib\_s14930352 | 14930352 | 0.981 | 4.579 40 | fss10 | 12078908 | 0.739 | 3.539 41 | fss9 | 2851443 | 0.145 | 0.518 42 | houston | 3840000 | 0.102 | 0.288 43 | paper5x80 | 981924 | 0.041 | 0.099 44 | test1 | 2097152 | 0.096 | 0.236 45 | test2 | 2097152 | 0.103 | 0.251 46 | test3 | 2097152 | 0.094 | 0.206 47 | 48 | Copyright 49 | --------- 50 | 51 | Copyright © John Gallagher. [MIT 52 | License](http://opensource.org/licenses/MIT); see LICENSE for further details. 53 | -------------------------------------------------------------------------------- /gosaca_test.go: -------------------------------------------------------------------------------- 1 | package gosaca 2 | 3 | import ( 4 | "fmt" 5 | "math/rand" 6 | "testing" 7 | ) 8 | 9 | func checkCorrectSuffixArray(input []byte, SA []int) error { 10 | suffixesSeen := make(map[int]bool) 11 | 12 | for i, s := range SA { 13 | suffixesSeen[s] = true 14 | 15 | // make sure suffix starting at SA[i] precedes suffix starting at SA[i+1] 16 | if i+1 == len(SA) { 17 | break 18 | } 19 | 20 | s1, s2 := SA[i], SA[i+1] 21 | if s1 < 0 || s1 >= len(input) { 22 | return fmt.Errorf("Invalid suffix array: SA[%d] = %d is out of range\n", i, s1) 23 | } 24 | if s2 < 0 || s2 >= len(input) { 25 | return fmt.Errorf("Invalid suffix array: SA[%d] = %d is out of range\n", i+1, s2) 26 | } 27 | if s1 == s2 { 28 | return fmt.Errorf("Invalid suffix array: SA[%d] = SA[%d]\n", i, i+1) 29 | } 30 | for { 31 | if input[s1] < input[s2] { 32 | // success 33 | break 34 | } 35 | if input[s1] > input[s2] { 36 | return fmt.Errorf("Invalid suffix array: suffix starting at SA[%d]=%d is greater than suffix starting at SA[%d]=%d\n", i, SA[i], i+1, SA[i+1]) 37 | } 38 | s1++ 39 | if s1 == len(input) { 40 | // success 41 | break 42 | } 43 | s2++ 44 | if s2 == len(input) { 45 | return fmt.Errorf("Invalid suffix array: suffix starting at SA[%d]=%d is greater than suffix starting at SA[%d]=%d\n", i, SA[i], i+1, SA[i+1]) 46 | } 47 | } 48 | } 49 | 50 | if len(suffixesSeen) != len(input) { 51 | return fmt.Errorf("Invalid suffix array: only saw %d unique suffixes (expected %d)\n", len(suffixesSeen), len(input)) 52 | } 53 | 54 | return nil 55 | } 56 | 57 | func TestBasic(t *testing.T) { 58 | ws := &WorkSpace{} 59 | for _, input := range [][]byte{ 60 | // simple tests 61 | []byte("baa"), 62 | []byte("banana"), 63 | []byte("bananafoobar"), 64 | []byte("mmiissiippii"), 65 | 66 | // random tests that failed during development/debugging 67 | []byte("anzazzdexszakdovkzahyckszpfqqfquuszaongqn"), 68 | []byte("dlvoppoimkrvyktwwxvbmemsvopnexqdftnuamepiu"), 69 | []byte("agidaenivhknajhfgekpmmugaqljpaoerhyyerhzxaehp"), 70 | []byte("slsluwafbeygwtsflijvcfedimtewfybyhzkzjbpewfifl"), 71 | []byte("eckdiyvvlrsemmcpawoirivnockdlbrmaufbehxipsqhyanosoqbp"), 72 | []byte("xbuxnvtzouehxqopupcwfivyhnwvkftcwfhjmjxzncnmvwlisrpxqnvczo"), 73 | []byte("naxkmuquvkhngkcqalbgpdxjkalbvrmbqscyikqdhrvvijkfngfikxtvalsmobje"), 74 | []byte("dxjpmcwlvmuswgfatoqolxcicbbvgbvrhwvibjbliiqcbolxcfnajixxbskjylcfxuhgvrwcfqahe"), 75 | []byte("tqxqbajhitxealorzfbmiulasimxpfxqvzenmdjhththzmyxrsqakcoqunzgopagkoevslbyndymbpiyswsdngoyuipxfxmswvkhu"), 76 | []byte("hngsmqhsuuwwqdwueftduujrxiyygqskpczwoyibotyqdfkdoayaujteffqmavwjkrgpbqwiobmkglovzhgdiuimkffjwpqusynihwkokduowszlhefkskkfkefbwilfzvfyzsamfkavfjoyfekfamqvtamfmniazidowdakbhedhzwzdanjavtmgciicmewqtrbcvmbljuhostzjojqzdtegzquljdhdwzqhfojvctkzekxwurtralnipuicjbol"), 77 | []byte("vxijixkikjtgfxpwdizlmgkslnmtdiaftiexzfkppkqwanbzgvibonysykramrtklvnqbljynrddqyqlbtpugakabinvvuzpqfxvbefhopzvgmeasomnmhnghdatmobksibaipjtvmufpqstnojqjyfmhibltyafjjednaywgpgrglhxonkhibsrlmxvubwecqeddpzpksvjtsimgtyrpvtnqrfsgolsznbdtbuttgcnlvslnmmnjlnhepapdbsbhdrvqbmtxomtty"), 78 | []byte("bbsomwccexbwzqmeqdgcgymeyavyydmmcowprhahcjvloltsrwbkmuvtiwysdkyxygdojbdaubsbvbluicuhrxbqxhtwuphwaytpvjcgespczoreufogbflleubowaklbcttxyjnaufhdwbzniafrghlyszgolkyjumwzwwjzwcpjjrmbwyzoymgbpypreqsngoulmaxygcazmigmpoajmswefwcflrqxhrqytqogypsyslypgvrihlfeqrxhytbvpggbqubinydvcwtnxbmwlvilhuxnsdewuaovnsvozhnrwhmqrmrutrxjgknujcxabaaqedijbrdytqmndkjlfffauohhmyswdafxweyocylutquvtlpqteuyetgaftlngwcuxhylhdkprvgfvnqncywxftnwghlnqplxfmqxehbsisxcytpmliupvwnjzmuysjjntmwezfuxauiwbfggkxceayjdkgvghxhztdjmjisbkqallwtloyepczldrcldktggbzowjtjneziyrvnmecpzkodvxizhgnehjhiwwgnmyzbjtjfcktighrbwoibvjkobxgqhbfvguldejsxvgwbfwyanvtsqgtxhnhjecrpxwtovfhgejdczmbwoifsirjfztdjfupitvaljqqiyqqpxlbhwqpwplmysqdvzriqpjdrnkajkywlxtcetbgtvxorbrtpmfayeadiaoymqaetsgocvkmymrwfhpyzylieghmuegoqqjhdelswfllzuykysjzadutkxwgfhdmihvkcfvalofynroljcncdvblnreguoyhyrzhnoubladowjdjhyuazhyapwioxhrdtvmfljazmvbxjtkrwqrkepfctekohrtvsifxvqbzekddplytdwxgudgzsvyvxlxjdyqqmpsimuwvdmjtjpcyctorbdmffbzwxexygppzsdoczuppxiqxnnwewyjyeohpgkglstinafynsoyqtjybrdwsgvssuwcikhhoyhszglpuzmttmwezfknplhzjnapnxlbepxahcjjreysmzdwroclrylkqwoxwstzridtlraybpcohjuvltzypcwqfakgwxyybqeildjyvuiaakwvdduckmsvkyaqyebtgkrflatnlyqhycbrputyqofjfplxdxprfpbvjyifzsjwmnceiaovnqgfzaofjqqoffbrpfxygxlvyekoifiihzryeagcwwglvwbovtffehxamoznrtolqgyfkxlhjpjaqyfefoxlphficbcndpssiosqhkjmegnvpxynsipougnogroestwxamfprtsxffbhslwrnmjyjdolcekuzqwoauamufvqhzsbbpfsvupjscavgpgybgkzsicpgcxukkhgaiyxqauqienozaufwenctcgcibwyfsejfdrujqutiosvfctqroncnggxdjmmpjajsrbpjjsgqulgbbiauxndntroharhqglkjzgkprcwosychvvpfyedjtrcfpgjdmesbhlyzkeukxiesbtkdjpwikdesrjbfiabtufrkoevscabjmxmkdwekstnujocxtzcwlbmafmskhslsredavkpzjhbsfhwxmoauhixwolumhbqffduilfuecubztsqur"), 79 | } { 80 | SA := make([]int, len(input)) 81 | ws.ComputeSuffixArray(input, SA) 82 | if err := checkCorrectSuffixArray(input, SA); err != nil { 83 | t.Fatalf("input %s failed: %s", string(input), err) 84 | } 85 | } 86 | } 87 | 88 | func TestRandom(t *testing.T) { 89 | ws := &WorkSpace{} 90 | var ( 91 | seed = 12345 92 | nlengths = 5000 93 | testsPerLength = 1 94 | ) 95 | 96 | if testing.Short() { 97 | nlengths = 1000 98 | } 99 | 100 | rand.Seed(int64(seed)) 101 | 102 | input := make([]byte, nlengths) 103 | SA := make([]int, nlengths) 104 | for i := 1; i <= nlengths; i++ { 105 | for j := 0; j < testsPerLength; j++ { 106 | for k := 0; k < i; k++ { 107 | input[k] = 'a' + byte(rand.Intn(26)) 108 | } 109 | ws.ComputeSuffixArray(input[:i], SA[:i]) 110 | if err := checkCorrectSuffixArray(input[:i], SA[:i]); err != nil { 111 | t.Fatalf("input %s failed: %s", string(input), err) 112 | } 113 | } 114 | } 115 | } 116 | 117 | func Benchmark900K(b *testing.B) { 118 | b.StopTimer() 119 | ws := &WorkSpace{} 120 | input := make([]byte, 900*1000) 121 | for i := range input { 122 | input[i] = byte(rand.Intn(256)) 123 | } 124 | SA := make([]int, len(input)) 125 | b.StartTimer() 126 | 127 | for i := 0; i < b.N; i++ { 128 | ws.ComputeSuffixArray(input, SA) 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /common.go: -------------------------------------------------------------------------------- 1 | package gosaca 2 | 3 | func setAllToEmpty(SA []int) { 4 | for i := range SA { 5 | SA[i] = empty 6 | } 7 | } 8 | 9 | // compute the length of the LMS substring at the front of the LMS suffix s[:] 10 | // pre-condition: s[:] is an LMS suffix 11 | // WARNING: if s[:] ends on the sentinel, the returned value will be len(s)+1! 12 | func lmsSubstringLength0(s []byte) int { 13 | n := len(s) 14 | for i := 2; i < n; i++ { 15 | if s[i] < s[i-1] { 16 | // s[i-1] is L-type; move on to step 2 17 | for j := i; j < n; j++ { 18 | if s[j] > s[j-1] { 19 | return i 20 | } else if s[j] < s[j-1] { 21 | i = j + 1 22 | } 23 | } 24 | } 25 | } 26 | return n + 1 // add one to indicate substring ended with the sentinel 27 | } 28 | 29 | // for level 0, rename the LMS substrings sitting in SA1, and return the new alphabet size (k1) 30 | func rename0(S []byte, SA1, work, S1 []int) int { 31 | n := len(S) 32 | n1 := len(SA1) 33 | 34 | if n1 == 0 { 35 | return 0 36 | } 37 | 38 | // currently work only holds positive values; we save the time of clearing 39 | // it out by inserting bitwise-negated (negative) values so we know which 40 | // ones we put in vs which ones just contain old data 41 | 42 | // walk SA1 from left to right, creating Z1 (spread throughout work) 43 | 44 | // first, record the first LMS suffix 45 | k1 := 1 46 | bktHead := 0 // renamed value == head of bucket in SA1 (part of property 4.1) 47 | prev := SA1[0] 48 | work[prev/2] = ^bktHead 49 | SA1[0] = 1 // after we read SA[i], reuse it as a bucket size (needed for post-Z1 step) 50 | 51 | // at each step, we need to see if the LMS substring starting at S[SA1[i]] (S[pos]) 52 | // is the same as the one we just saw starting at S[SA1[i-1]] (S[prev]) 53 | for i := 1; i < n1; i++ { 54 | pos := SA1[i] 55 | SA1[i] = 0 // reused as bucket size 56 | diff := false 57 | 58 | // quick first test - if initial character is different we're done 59 | if S[prev] != S[pos] { 60 | diff = true 61 | } else { 62 | // TODO - this walks both LMS substrings to calculate their lengths; can we combine this to short-circuit earlier if possible? tricky to do correctly! 63 | prevLen := lmsSubstringLength0(S[prev:]) 64 | posLen := lmsSubstringLength0(S[pos:]) 65 | if prev+prevLen == n+1 || // S[prev:] ends with sentinel 66 | pos+posLen == n+1 || // S[pos:] ends with sentinel 67 | prevLen != posLen { // different lengths 68 | diff = true 69 | } else { 70 | // if we get here: 71 | // (a) first character is the same 72 | // (b) both end before the sentinel 73 | // (c) both have the same length 74 | // so we need to check the rest of the characters one-by-one 75 | for j := 1; j < prevLen; j++ { 76 | if S[prev+j] != S[pos+j] { 77 | diff = true 78 | break 79 | } 80 | } 81 | } 82 | } 83 | 84 | if diff { 85 | bktHead = i 86 | k1++ 87 | } 88 | work[pos/2] = ^bktHead 89 | SA1[bktHead]++ // increment bucket size 90 | prev = pos 91 | } 92 | 93 | // Z1 is now sitting (sparsely) in work[] 94 | buildS1FromZ1(S1, SA1, work) 95 | 96 | return k1 97 | } 98 | 99 | // this function is almost the same as rename0; differences: 100 | // (a) S is an []int 101 | // (b) our LMS substring comparision is different (and easier) 102 | func rename1(S, SA1, work, S1 []int) int { 103 | n := len(S) 104 | n1 := len(SA1) 105 | 106 | if n1 == 0 { 107 | return 0 108 | } 109 | 110 | // currently work only holds positive values; we save the time of clearing 111 | // it out by inserting bitwise-negated (negative) values so we know which 112 | // ones we put in vs which ones just contain old data 113 | 114 | // walk SA1 from left to right, creating Z1 (spread throughout work) 115 | 116 | // first, record the first LMS suffix 117 | k1 := 1 118 | bktHead := 0 // renamed value == head of bucket in SA1 (part of property 4.1) 119 | prev := SA1[0] 120 | work[prev/2] = ^bktHead 121 | SA1[0] = 1 // after we read SA[i], reuse it as a bucket size (needed for post-Z1 step) 122 | 123 | // at each step, we need to see if the LMS substring starting at S[SA1[i]] (S[pos]) 124 | // is the same as the one we just saw starting at S[SA1[i-1]] (S[prev]) 125 | for i := 1; i < n1; i++ { 126 | pos := SA1[i] 127 | SA1[i] = 0 // reused as bucket size 128 | diff := false 129 | 130 | // walk both strings character-by-character until (a) we get a 131 | // difference or (b) we begin the L+ sequence 132 | j := 0 133 | for j = 0; j < n; j++ { 134 | if S[prev+j] != S[pos+j] { 135 | diff = true 136 | break 137 | } else if S[prev+j] >= 0 { 138 | break 139 | } 140 | } 141 | 142 | if !diff { 143 | // both strings started L+ at the same place; now walk until: 144 | // (a) either hits the end (=> different) 145 | // (b) we get a different character (=> different) 146 | // (c) both hit the same S-type value (=> same) 147 | for j++; j < n; j++ { 148 | if prev+j == n || pos+j == n || S[prev+j] != S[pos+j] { 149 | diff = true 150 | break 151 | } else if S[prev+j] < 0 { 152 | break 153 | } 154 | } 155 | } 156 | 157 | if diff { 158 | bktHead = i 159 | k1++ 160 | } 161 | work[pos/2] = ^bktHead 162 | SA1[bktHead]++ // increment bucket size 163 | prev = pos 164 | } 165 | 166 | // Z1 is now sitting (sparsely) in work[] 167 | buildS1FromZ1(S1, SA1, work) 168 | 169 | return k1 170 | } 171 | 172 | // Build S1 from Z1 (which is sitting sparsely in work[] - all the negative 173 | // values for work are the bitwise inversions of Z1). 174 | func buildS1FromZ1(S1, SA1, work []int) { 175 | n1 := len(S1) 176 | 177 | // walk work[] from right-to-left and adjust any S-type characters to point to the end of their bucket instead of the beginning 178 | Z1pos := len(work) - 1 179 | for i := 0; i < n1; i++ { 180 | // find next element of Z1 181 | for work[Z1pos] >= 0 { 182 | Z1pos-- 183 | } 184 | 185 | // record character (head of bucket, only correct for L-type) 186 | c := ^work[Z1pos] 187 | S1[n1-1-i] = c 188 | Z1pos-- 189 | 190 | // check and see if c is S-type 191 | if i > 0 && // S1[n-1] is L-type by definition due to sentinel 192 | ((S1[n1-i] < 0 && c <= ^S1[n1-i]) || // S1[n1-i] was S-type and we are <= it 193 | (S1[n1-i] >= 0 && c < S1[n1-i])) { // S1[n1-i] was L-type and we are < it 194 | // Adjust c so it points to the end of its bucket instead of the 195 | // head. Note that in the Z1 construction loop above, we stored 196 | // the width of each bucket in SA1[c]. Also, bitwise negate it so 197 | // the recursive computeSuffixArray1 doesn't have to. 198 | S1[n1-1-i] = ^(S1[n1-1-i] + SA1[c] - 1) 199 | } 200 | } 201 | } 202 | 203 | func sortRecursively(S1, SA1 []int, k1 int) { 204 | if k1 == len(S1) { 205 | for i, s := range S1 { 206 | if s < 0 { 207 | SA1[^s] = i 208 | } else { 209 | SA1[s] = i 210 | } 211 | } 212 | } else { 213 | computeSuffixArray1(S1, SA1, k1) 214 | } 215 | } 216 | -------------------------------------------------------------------------------- /level0.go: -------------------------------------------------------------------------------- 1 | package gosaca 2 | 3 | const ( 4 | maxInt = int(^uint(0) >> 1) 5 | minInt = -(maxInt - 1) 6 | empty = minInt 7 | ) 8 | 9 | // WorkSpace contains the O(1) scratch space used in constructing a suffix array with an alphabet of sisze 256 (any byte value). 10 | type WorkSpace struct { 11 | bkt [256]int // working space buckets 12 | bktHead [256]int // save off bucket heads 13 | bktTail [256]int // save off bucket tails 14 | dirty bool // true if the scratch space is dirty from a previous run 15 | } 16 | 17 | // Compute the suffix array of S, storing it into SA. len(S) and len(SA) must be equal. 18 | func (ws *WorkSpace) ComputeSuffixArray(S []byte, SA []int) { 19 | n := len(S) 20 | bkt := ws.bkt[:] 21 | 22 | // scan S once, computing all bucket heads/tails 23 | ws.computeBuckets(S) 24 | 25 | // ********************************************* 26 | // Stage 1: Induced-sort the LMS-substrings of S 27 | // ********************************************* 28 | 29 | // step 1 - initialize SA as empty 30 | setAllToEmpty(SA) 31 | 32 | // step 2 - put all LMS substrings into buckets based on their first character 33 | // insert from the end to the head of the buckets (bkt currently holds the tails of the buckets from computeBuckets) 34 | for i := n - 2; i >= 0; i-- { 35 | if S[i] >= S[i+1] { 36 | // S[i] is L-type; move on 37 | continue 38 | } 39 | 40 | // S[i] is S-type; keep moving back until S[i-1] is L-type 41 | for i >= 1 && (S[i-1] < S[i] || S[i-1] == S[i]) { 42 | // S[i-1] is also S-type; keep moving back 43 | i-- 44 | } 45 | 46 | // unless we hit S[0] (which is not LMS by definition), S[i] begins an LMS suffix, so insert it into its bucket 47 | if i > 0 { 48 | SA[bkt[S[i]]] = i 49 | bkt[S[i]]-- 50 | } 51 | } 52 | 53 | // step 3 - induced sort the L-type suffixes of S into their buckets 54 | copy(bkt, ws.bktHead[:]) 55 | induceSortL0(S, SA, bkt) 56 | 57 | // step 4 - induced sort the S-type suffixes of S into their buckets 58 | copy(bkt, ws.bktTail[:]) 59 | induceSortS0(S, SA, bkt) 60 | 61 | // NOT DESCRIBED IN PAPER BUT STILL NECESSARY (see SA-IS) 62 | // We need to compact all the now-sorted LMS substrings into the first n1 positions of SA 63 | // To do this, make use of the fact that since we just inserted all the S-type 64 | // suffixes into SA from tail-to-head of the buckets, we can loop over the buckets 65 | // themselves and pull out the S-type suffixes: all the S-type suffixes starting with c 66 | // are contained in SA[bkt[c]+1] to SA[bktTail[c]] 67 | n1 := 0 68 | for c := 0; c < 256; c++ { 69 | for i := bkt[c] + 1; i <= ws.bktTail[c]; i++ { 70 | j := SA[i] 71 | // we know S[j] is S-type; now see if it's LMS (i.e., preceded by an L-type) 72 | if j > 0 && S[j-1] > S[j] { 73 | SA[n1] = j 74 | n1++ 75 | } 76 | } 77 | } 78 | 79 | // ********************************************* 80 | // Stage 2: Rename the LMS substrings 81 | // ********************************************* 82 | 83 | // provably, n1 is at most floor(n/2), so the following overlapping works 84 | SA1 := SA[:n1] // SA1 overlaps the front of SA 85 | work := SA[n1:] // workspace overlaps the rest of SA 86 | S1 := SA[n-n1:] // S1 overlaps the end of SA (including part of "work", but rename deals with that correctly) 87 | k1 := rename0(S, SA1, work, S1) 88 | 89 | // ********************************************* 90 | // Stage 3: Sort recursively 91 | // ********************************************* 92 | sortRecursively(S1, SA1, k1) 93 | 94 | // NOT DESCRIBED IN PAPER BUT STILL NECESSARY (see SA-IS) 95 | // We need to undo the renaming of the LMS suffixes. 96 | // We no longer need S1, so reuse it to hold all the LMS indices. 97 | j := n1 - 1 98 | for i := n - 2; i >= 0; i-- { 99 | if S[i] >= S[i+1] { 100 | // S[i] is L-type 101 | continue 102 | } 103 | // S[i] is S-type; walk backwards to find LMS 104 | for i >= 1 && (S[i-1] < S[i] || S[i-1] == S[i]) { 105 | // S[i-1] is also S-type 106 | i-- 107 | } 108 | // S[0] is not LMS by definition, but otherwise S[i] is LMS 109 | if i > 0 { 110 | S1[j] = i 111 | j-- 112 | } 113 | } 114 | // Now convert SA1 from renamed values to true values. 115 | for i, s := range SA1 { 116 | SA1[i] = S1[s] 117 | } 118 | 119 | // ********************************************* 120 | // Stage 4: Induced-sort SA(S) from SA1(S1) 121 | // ********************************************* 122 | 123 | // step 1 - initialize SA[n1:] as empty 124 | setAllToEmpty(SA[n1:]) 125 | 126 | // step 2 - put all sorted LMS substrings into buckets based on their first character 127 | // insert from the end to the head of the buckets 128 | copy(bkt, ws.bktTail[:]) 129 | for i := n1 - 1; i >= 0; i-- { 130 | j := SA1[i] 131 | SA1[i] = empty // clear it out in preparation for steps 3-4 132 | if j == 0 { 133 | panic("unexpected j == 0") 134 | } 135 | c := S[j] 136 | SA[bkt[c]] = j 137 | bkt[c]-- 138 | } 139 | 140 | // step 3 - induced sort the L-type suffixes of S into their buckets 141 | copy(bkt, ws.bktHead[:]) 142 | induceSortL0(S, SA, bkt) 143 | 144 | // step 4 - induced sort the S-type suffixes of S into their buckets 145 | copy(bkt, ws.bktTail[:]) 146 | induceSortS0(S, SA, bkt) 147 | } 148 | 149 | func (ws *WorkSpace) computeBuckets(S []byte) { 150 | if ws.dirty { 151 | // clear out bucket counters from a previous call to ComputeSuffixArray 152 | for i := 0; i < 256; i++ { 153 | ws.bkt[i] = 0 154 | } 155 | } 156 | 157 | // compute sizes of each bucket 158 | for _, c := range S { 159 | ws.bkt[c]++ 160 | } 161 | 162 | // record head and tail of each bucket (also store tails into bkt, as that's the one we need first) 163 | sum := 0 164 | for i := 0; i < 256; i++ { 165 | ws.bktHead[i] = sum 166 | sum += ws.bkt[i] 167 | ws.bktTail[i] = sum - 1 168 | ws.bkt[i] = sum - 1 169 | } 170 | 171 | // record that our buckets are dirty in case ws is used again 172 | ws.dirty = true 173 | } 174 | 175 | // pre-condition: SA contains properly bucketed LMS substrings 176 | // pre-condition: bkt contains the head of each character's bucket 177 | // post-condition: SA contains properly bucketed L-type and LMS suffixes 178 | func induceSortL0(S []byte, SA, bkt []int) { 179 | n := len(S) 180 | 181 | // special case to deal with the (virtual) sentinel: 182 | // S[n-1] is L-type because of the sentinel, and if we were treating 183 | // the sentinel as a real character, it would be at the front of SA[] 184 | // (it's effectively stored in "SA[-1]") 185 | c := S[n-1] 186 | SA[bkt[c]] = n - 1 187 | bkt[c]++ 188 | 189 | // at each step, look at the character *before* S[SA[i]]; if it's L-type, insert it 190 | for _, SAi := range SA { 191 | if SAi <= 0 { 192 | // if SA[i] is empty or points to S[0], we don't have a preceding character to check 193 | continue 194 | } 195 | 196 | j := SAi - 1 197 | c := S[j] // character we care about 198 | 199 | // check for L-type (described in section 3) 200 | // since SA only holds L-type and LMS suffixes, c must be L-type if it is >= S[j] 201 | if c >= S[SAi] { 202 | SA[bkt[c]] = j 203 | bkt[c]++ 204 | } 205 | } 206 | } 207 | 208 | // pre-condition: SA contains properly bucketed L and LMS suffixes 209 | // pre-condition: bkt contains the tail of each character's bucket 210 | // post-condition: SA contains properly also contains all properly bucketed S-type suffixes 211 | func induceSortS0(S []byte, SA, bkt []int) { 212 | n := len(S) 213 | 214 | // at each step, look at the character *before* S[SA[i]]; if it's S-type, insert it 215 | for i := n - 1; i >= 0; i-- { 216 | SAi := SA[i] 217 | if SAi <= 0 { 218 | continue 219 | } 220 | 221 | j := SAi - 1 222 | c := S[j] // character we care about 223 | 224 | // check for S-type (use Property 3.1) 225 | if c < S[SAi] || (c == S[SAi] && bkt[c] < i) { 226 | SA[bkt[c]] = j 227 | bkt[c]-- 228 | } 229 | } 230 | 231 | // we don't need to do anything special for the sentinel - by definition the character before it is not S-type 232 | } 233 | -------------------------------------------------------------------------------- /level1.go: -------------------------------------------------------------------------------- 1 | package gosaca 2 | 3 | // After filling in LMS suffixes using the "end of bucket is a counter" 4 | // algorithm from section 4.2, we need to loop over SA and fix any bucket 5 | // counters still left. 6 | func fixLMSBucketCounters(SA []int) { 7 | for i := len(SA) - 1; i >= 0; i-- { 8 | if SA[i] == empty || SA[i] >= 0 { 9 | // SA[i] isn't a counter; move on 10 | continue 11 | } 12 | // right shift all the elements of the bucket, filling the vacated 13 | // slot with "empty" 14 | d := SA[i] 15 | pos := i + d - 1 16 | prev := empty 17 | for x := pos + 1; x <= i; x++ { 18 | SA[x], prev = prev, SA[x] 19 | } 20 | } 21 | } 22 | 23 | // This helper function implements the logic described in section 4.2 to 24 | // insert an S-type value into its bucket from the end, reusing the ends of 25 | // buckets as counters. If we have to shift a bucket around, the two returned 26 | // integers are the start and end positions of SA that were modified. If we 27 | // don't have to do any shifting, we return -1, -1. 28 | func insertSTypeUsingCounters(SA []int, index, c int) (int, int) { 29 | x0, x1 := -1, -1 30 | n := len(SA) 31 | switch { 32 | case SA[c] >= 0: 33 | // section 4.2 case 2 34 | prev := SA[c] 35 | x0, x1 = c, c 36 | for x := c + 1; x < n; x++ { 37 | SA[x], prev = prev, SA[x] 38 | x1 = x 39 | if prev < 0 && prev != empty { 40 | break 41 | } 42 | } 43 | fallthrough 44 | 45 | case SA[c] == empty: 46 | // section 4.2 case 1 47 | if c-1 >= 0 && SA[c-1] == empty { 48 | SA[c-1] = index 49 | SA[c] = -1 50 | } else { 51 | SA[c] = index 52 | } 53 | break 54 | 55 | default: 56 | // section 4.2 case 3 57 | d := SA[c] 58 | pos := c + d - 1 59 | if pos >= 0 && SA[pos] == empty { 60 | SA[pos] = index 61 | SA[c]-- 62 | } else { 63 | // right-shift SA[pos+1:c-1], inserting index into SA[pos+1] 64 | x0, x1 = pos+1, c 65 | prev := index 66 | for x := pos + 1; x <= c; x++ { 67 | SA[x], prev = prev, SA[x] 68 | } 69 | } 70 | break 71 | } 72 | 73 | return x0, x1 74 | } 75 | 76 | // Same style of helper function as above, except for section 4.1 (L-type 77 | // into buckets from head to tail). 78 | func insertLTypeUsingCounters(SA []int, index, c int) (int, int) { 79 | x0, x1 := -1, -1 80 | n := len(SA) 81 | switch { 82 | case SA[c] >= 0: 83 | // section 4.1 case 1 84 | prev := SA[c] 85 | x0, x1 = c, c 86 | for x := c - 1; x >= 0; x-- { 87 | SA[x], prev = prev, SA[x] 88 | x0 = x 89 | if prev < 0 && prev != empty { 90 | break 91 | } 92 | } 93 | fallthrough 94 | 95 | case SA[c] == empty: 96 | // section 4.1 case 1 97 | if c+1 < n && SA[c+1] == empty { 98 | SA[c+1] = index 99 | SA[c] = -1 100 | } else { 101 | SA[c] = index 102 | } 103 | break 104 | 105 | default: 106 | // section 4.1 case 3 107 | d := SA[c] 108 | pos := c - d + 1 109 | if pos < n && SA[pos] == empty { 110 | SA[pos] = index 111 | SA[c]-- 112 | } else { 113 | // left-shift SA[c+1:pos-1], inserting index into SA[pos-1] 114 | x0, x1 = c, pos-1 115 | prev := index 116 | for x := pos - 1; x >= c; x-- { 117 | SA[x], prev = prev, SA[x] 118 | } 119 | } 120 | } 121 | 122 | return x0, x1 123 | } 124 | 125 | // recursive version of ComputeSuffixArray for levels 1+ 126 | func computeSuffixArray1(S, SA []int, k int) { 127 | n := len(S) 128 | 129 | // ********************************************* 130 | // Stage 1: Induced-sort the LMS-substrings of S 131 | // ********************************************* 132 | 133 | // step 1 - initialize SA as empty 134 | setAllToEmpty(SA) 135 | 136 | // step 2 - put all LMS substrings into buckets based on their first character 137 | for i := n - 2; i >= 0; i-- { 138 | if S[i] >= 0 { 139 | // S[i] is L-type 140 | continue 141 | } 142 | 143 | // S[i] is S-type; walk back until S[i-1] is L-type or -1 144 | for i >= 1 && S[i-1] < 0 { 145 | // S[i-1] is also S-type 146 | i-- 147 | } 148 | 149 | if i == 0 { 150 | // even if S[0] is S-type, it's not LMS - we're done 151 | break 152 | } 153 | 154 | // Insertion of the LMS strings is identical to insertions of S-type 155 | // strings described in section 4.2, but we don't care about the 156 | // returned values. 157 | insertSTypeUsingCounters(SA, i, ^S[i]) 158 | } 159 | 160 | // Remove any leftover bucket counters. 161 | fixLMSBucketCounters(SA) 162 | 163 | // step 3 - induced sort the L-type suffixes of S into their buckets 164 | induceSortL1(S, SA) 165 | 166 | // step 4 - induced sort the S-type suffixes of S into their buckets 167 | induceSortS1(S, SA) 168 | 169 | // compact all the now-sorted LMS substrings into the first n1 positions of SA 170 | n1 := 0 171 | for _, s := range SA { 172 | if s != 0 && // S[0] is not LMS by definition 173 | S[s] < 0 && // S[s] is S-type 174 | S[s-1] >= 0 { // S[s-1] is L-type 175 | SA[n1] = s 176 | n1++ 177 | } 178 | } 179 | 180 | // ********************************************* 181 | // Stage 2: Rename the LMS substrings 182 | // ********************************************* 183 | 184 | // provably, n1 is at most floor(n/2), so the following overlapping works 185 | SA1 := SA[:n1] // SA1 overlaps the front of SA 186 | work := SA[n1:] // workspace overlaps the rest of SA 187 | S1 := SA[n-n1:] // S1 overlaps the end of SA (including part of "work", but rename deals with that correctly) 188 | k1 := rename1(S, SA1, work, S1) 189 | 190 | // ********************************************* 191 | // Stage 3: Sort recursively 192 | // ********************************************* 193 | sortRecursively(S1, SA1, k1) 194 | 195 | // NOT DESCRIBED IN PAPER BUT STILL NECESSARY (see SA-IS) 196 | // We need to undo the renaming of the LMS suffixes. 197 | // We no longer need S1, so reuse it to hold all the LMS indices. 198 | j := n1 - 1 199 | for i := n - 2; i >= 0; i-- { 200 | if S[i] >= 0 { 201 | // L-type; ignore 202 | continue 203 | } 204 | // S[i] is S-type; walk backwards to find LMS 205 | for i >= 1 && S[i-1] < 0 { 206 | // S[i-1] is also S-type; keep moving back 207 | i-- 208 | } 209 | // S[0] is not LMS by definition, but otherwise S[i] is LMS 210 | if i > 0 { 211 | S1[j] = i 212 | j-- 213 | } 214 | } 215 | if j != -1 { 216 | panic("didn't find all the LMS characters we expected") 217 | } 218 | // Now convert SA1 from renamed values to true values. 219 | for i, s := range SA1 { 220 | SA1[i] = S1[s] 221 | } 222 | 223 | // ********************************************* 224 | // Stage 4: Induced-sort SA(S) from SA1(S1) 225 | // ********************************************* 226 | 227 | // step 1 - initialize SA[n1:] as empty 228 | setAllToEmpty(SA[n1:]) 229 | 230 | // step 2 - put all the sorted LMS suffixes of S into their buckets in SA 231 | for i := n1 - 1; i >= 0; i-- { 232 | j := SA[i] 233 | SA[i] = empty 234 | c := ^S[j] 235 | if j == 0 { 236 | panic("unexpected j == 0") 237 | } 238 | // If we've worked our way back to c == i, then all the remaining 239 | // SA[0,c] values are already correct, and going into the loop below 240 | // with bucket counters will just screw things up. 241 | if c == i { 242 | SA[c] = j // restore it (we just emptied it out above...) 243 | break 244 | } 245 | 246 | // Same explanation for what's going on here as in Stage 1 step 2. 247 | insertSTypeUsingCounters(SA, j, c) 248 | } 249 | 250 | // Remove any leftover bucket counters. 251 | fixLMSBucketCounters(SA) 252 | 253 | // step 3 - induced sort the L-type suffixes of S into their buckets 254 | induceSortL1(S, SA) 255 | 256 | // step 4 - induced sort the S-type suffixes of S into their buckets 257 | induceSortS1(S, SA) 258 | } 259 | 260 | // TODO pre-post 261 | func induceSortL1(S, SA []int) { 262 | n := len(S) 263 | 264 | // special case to deal with the (virtual) sentinel: 265 | // S[n-1] is L-type because of the sentinel, and if we were treating 266 | // the sentinel as a real character, it would be at the front of SA[] 267 | // (it's effectively stored in "SA[-1]"). 268 | // 269 | // Because c is L-type, we know SA[c] is empty, so we're in case 1 of section 4.1 270 | c := S[n-1] 271 | if c+1 < n && SA[c+1] == empty { 272 | SA[c+1] = n - 1 273 | SA[c] = -1 274 | } else { 275 | SA[c] = n - 1 276 | } 277 | 278 | for i := 0; i < n; i++ { 279 | if SA[i] < 0 { 280 | // SA[i] is empty or being used as a counter; nothing to do 281 | continue 282 | } 283 | j := SA[i] - 1 284 | // if we just grabbed the character before an LMS suffix, we need to clear 285 | // out that LMS suffix (induceSortS1 assumes only L-type suffixes are in SA) 286 | if S[SA[i]] < 0 { 287 | SA[i] = empty 288 | } 289 | if j < 0 { 290 | // SA[i] was == 0; there is no preceding character to look at 291 | continue 292 | } 293 | c := S[j] 294 | if c < 0 { 295 | // S[j] is S-type; move on 296 | continue 297 | } 298 | 299 | // insert j into its bucket; if we overwrite SA[i], we need to stay 300 | // here and look at it again in the next pass 301 | x0, x1 := insertLTypeUsingCounters(SA, j, c) 302 | if i >= x0 && i <= x1 { 303 | i-- 304 | } 305 | } 306 | 307 | // NOT MENTIONED IN PAPER: We need to go back over SA and fix 308 | // any leftover counter values via left shifting the buckets appropriately. 309 | // This is the moral equivalent of fixLMSBucketCounters, but we only ever 310 | // do this once, so didn't bother extracting it into its own function. 311 | for i, d := range SA { 312 | if d == empty || d >= 0 { 313 | continue 314 | } 315 | pos := i - d + 1 316 | prev := empty 317 | for x := pos - 1; x >= i; x-- { 318 | SA[x], prev = prev, SA[x] 319 | } 320 | } 321 | } 322 | 323 | // TODO pre-post 324 | func induceSortS1(S, SA []int) { 325 | n := len(S) 326 | 327 | for i := n - 1; i >= 0; i-- { 328 | if SA[i] <= 0 { 329 | // SA[i] is empty or being used as a counter; nothing to do 330 | continue 331 | } 332 | j := SA[i] - 1 333 | c := ^S[j] 334 | if c < 0 { 335 | // S[j]==c is L-type; move on 336 | continue 337 | } 338 | 339 | // insert j into its bucket; if we overwrite SA[i], we need to stay 340 | // here and look at it again in the next pass 341 | x0, x1 := insertSTypeUsingCounters(SA, j, c) 342 | if i >= x0 && i <= x1 { 343 | i++ 344 | } 345 | } 346 | } 347 | --------------------------------------------------------------------------------