├── .gitignore
├── LICENSE
├── README.md
├── gosaca_test.go
├── common.go
├── level0.go
└── level1.go


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects)
 2 | *.o
 3 | *.a
 4 | *.so
 5 | 
 6 | # Folders
 7 | _obj
 8 | _test
 9 | 
10 | # Architecture specific extensions/prefixes
11 | *.[568vq]
12 | [568vq].out
13 | 
14 | *.cgo1.go
15 | *.cgo2.c
16 | _cgo_defun.c
17 | _cgo_gotypes.go
18 | _cgo_export.*
19 | 
20 | _testmain.go
21 | 
22 | *.exe
23 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2013 John Gallagher
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 4 | this software and associated documentation files (the "Software"), to deal in
 5 | the Software without restriction, including without limitation the rights to
 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 7 | of the Software, and to permit persons to whom the Software is furnished to do
 8 | so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | gosaca
 2 | ========
 3 | 
 4 | Description
 5 | -----------
 6 | 
 7 | Pure Go implementation of [An Optimal Suffix Array Construction
 8 | Algorithm](http://ge-nong.googlecode.com/files/tr-osaca-nong.pdf), a paper by
 9 | [Ge Nong](http://code.google.com/p/ge-nong/).
10 | 
11 | Benchmarks
12 | ----------
13 | 
14 | More extensive tests and benchmarks run on large corpora are available as
15 | [gosaca-bigtests](https://github.com/jgallagher/gosaca-bigtest) so as not to
16 | balloon the size of this repo.
17 | 
18 | The following table compares
19 | [sa-is](https://sites.google.com/site/yuta256/sais) with gosaca running on a
20 | 2012 Macbook Pro. The comparison is not really fair in many senses (SA-IS is
21 | an earlier, slightly less efficient algorithm, but it's implemented in
22 | optimized C code); here it is nonetheless. Times are in seconds.
23 | 
24 | File           |      Size |  sa-is | gosaca
25 | -------------- | --------: | -----: | -----:
26 | chr22dna       |  34553758 |  5.893 | 10.807
27 | etext99        | 105277340 | 21.507 | 41.862
28 | gcc30tar       |  86630400 | 10.252 | 23.149
29 | howto          |  39422105 |  6.050 | 12.123
30 | jdk13c         |  69728899 |  6.567 | 19.807
31 | linux245tar    | 116254720 | 14.635 | 32.696
32 | rctail96       | 114711151 | 15.143 | 40.225
33 | rfc            | 116421901 | 16.191 | 36.505
34 | sprot34dat     | 109617186 | 17.485 | 39.164
35 | w3c2           | 104201579 | 10.395 | 30.164
36 | abac           |    200000 |  0.005 |  0.016
37 | abba           |  10500600 |  0.646 |  2.326
38 | book1x20       |  15375420 |  1.620 |  5.126
39 | fib\_s14930352 |  14930352 |  0.981 |  4.579
40 | fss10          |  12078908 |  0.739 |  3.539
41 | fss9           |   2851443 |  0.145 |  0.518
42 | houston        |   3840000 |  0.102 |  0.288
43 | paper5x80      |    981924 |  0.041 |  0.099
44 | test1          |   2097152 |  0.096 |  0.236
45 | test2          |   2097152 |  0.103 |  0.251
46 | test3          |   2097152 |  0.094 |  0.206
47 | 
48 | Copyright
49 | ---------
50 | 
51 | Copyright &copy; John Gallagher. [MIT
52 | License](http://opensource.org/licenses/MIT); see LICENSE for further details.
53 | 


--------------------------------------------------------------------------------
/gosaca_test.go:
--------------------------------------------------------------------------------
  1 | package gosaca
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"math/rand"
  6 | 	"testing"
  7 | )
  8 | 
  9 | func checkCorrectSuffixArray(input []byte, SA []int) error {
 10 | 	suffixesSeen := make(map[int]bool)
 11 | 
 12 | 	for i, s := range SA {
 13 | 		suffixesSeen[s] = true
 14 | 
 15 | 		// make sure suffix starting at SA[i] precedes suffix starting at SA[i+1]
 16 | 		if i+1 == len(SA) {
 17 | 			break
 18 | 		}
 19 | 
 20 | 		s1, s2 := SA[i], SA[i+1]
 21 | 		if s1 < 0 || s1 >= len(input) {
 22 | 			return fmt.Errorf("Invalid suffix array: SA[%d] = %d is out of range\n", i, s1)
 23 | 		}
 24 | 		if s2 < 0 || s2 >= len(input) {
 25 | 			return fmt.Errorf("Invalid suffix array: SA[%d] = %d is out of range\n", i+1, s2)
 26 | 		}
 27 | 		if s1 == s2 {
 28 | 			return fmt.Errorf("Invalid suffix array: SA[%d] = SA[%d]\n", i, i+1)
 29 | 		}
 30 | 		for {
 31 | 			if input[s1] < input[s2] {
 32 | 				// success
 33 | 				break
 34 | 			}
 35 | 			if input[s1] > input[s2] {
 36 | 				return fmt.Errorf("Invalid suffix array: suffix starting at SA[%d]=%d is greater than suffix starting at SA[%d]=%d\n", i, SA[i], i+1, SA[i+1])
 37 | 			}
 38 | 			s1++
 39 | 			if s1 == len(input) {
 40 | 				// success
 41 | 				break
 42 | 			}
 43 | 			s2++
 44 | 			if s2 == len(input) {
 45 | 				return fmt.Errorf("Invalid suffix array: suffix starting at SA[%d]=%d is greater than suffix starting at SA[%d]=%d\n", i, SA[i], i+1, SA[i+1])
 46 | 			}
 47 | 		}
 48 | 	}
 49 | 
 50 | 	if len(suffixesSeen) != len(input) {
 51 | 		return fmt.Errorf("Invalid suffix array: only saw %d unique suffixes (expected %d)\n", len(suffixesSeen), len(input))
 52 | 	}
 53 | 
 54 | 	return nil
 55 | }
 56 | 
 57 | func TestBasic(t *testing.T) {
 58 | 	ws := &WorkSpace{}
 59 | 	for _, input := range [][]byte{
 60 | 		// simple tests
 61 | 		[]byte("baa"),
 62 | 		[]byte("banana"),
 63 | 		[]byte("bananafoobar"),
 64 | 		[]byte("mmiissiippii"),
 65 | 
 66 | 		// random tests that failed during development/debugging
 67 | 		[]byte("anzazzdexszakdovkzahyckszpfqqfquuszaongqn"),
 68 | 		[]byte("dlvoppoimkrvyktwwxvbmemsvopnexqdftnuamepiu"),
 69 | 		[]byte("agidaenivhknajhfgekpmmugaqljpaoerhyyerhzxaehp"),
 70 | 		[]byte("slsluwafbeygwtsflijvcfedimtewfybyhzkzjbpewfifl"),
 71 | 		[]byte("eckdiyvvlrsemmcpawoirivnockdlbrmaufbehxipsqhyanosoqbp"),
 72 | 		[]byte("xbuxnvtzouehxqopupcwfivyhnwvkftcwfhjmjxzncnmvwlisrpxqnvczo"),
 73 | 		[]byte("naxkmuquvkhngkcqalbgpdxjkalbvrmbqscyikqdhrvvijkfngfikxtvalsmobje"),
 74 | 		[]byte("dxjpmcwlvmuswgfatoqolxcicbbvgbvrhwvibjbliiqcbolxcfnajixxbskjylcfxuhgvrwcfqahe"),
 75 | 		[]byte("tqxqbajhitxealorzfbmiulasimxpfxqvzenmdjhththzmyxrsqakcoqunzgopagkoevslbyndymbpiyswsdngoyuipxfxmswvkhu"),
 76 | 		[]byte("hngsmqhsuuwwqdwueftduujrxiyygqskpczwoyibotyqdfkdoayaujteffqmavwjkrgpbqwiobmkglovzhgdiuimkffjwpqusynihwkokduowszlhefkskkfkefbwilfzvfyzsamfkavfjoyfekfamqvtamfmniazidowdakbhedhzwzdanjavtmgciicmewqtrbcvmbljuhostzjojqzdtegzquljdhdwzqhfojvctkzekxwurtralnipuicjbol"),
 77 | 		[]byte("vxijixkikjtgfxpwdizlmgkslnmtdiaftiexzfkppkqwanbzgvibonysykramrtklvnqbljynrddqyqlbtpugakabinvvuzpqfxvbefhopzvgmeasomnmhnghdatmobksibaipjtvmufpqstnojqjyfmhibltyafjjednaywgpgrglhxonkhibsrlmxvubwecqeddpzpksvjtsimgtyrpvtnqrfsgolsznbdtbuttgcnlvslnmmnjlnhepapdbsbhdrvqbmtxomtty"),
 78 | 		[]byte("bbsomwccexbwzqmeqdgcgymeyavyydmmcowprhahcjvloltsrwbkmuvtiwysdkyxygdojbdaubsbvbluicuhrxbqxhtwuphwaytpvjcgespczoreufogbflleubowaklbcttxyjnaufhdwbzniafrghlyszgolkyjumwzwwjzwcpjjrmbwyzoymgbpypreqsngoulmaxygcazmigmpoajmswefwcflrqxhrqytqogypsyslypgvrihlfeqrxhytbvpggbqubinydvcwtnxbmwlvilhuxnsdewuaovnsvozhnrwhmqrmrutrxjgknujcxabaaqedijbrdytqmndkjlfffauohhmyswdafxweyocylutquvtlpqteuyetgaftlngwcuxhylhdkprvgfvnqncywxftnwghlnqplxfmqxehbsisxcytpmliupvwnjzmuysjjntmwezfuxauiwbfggkxceayjdkgvghxhztdjmjisbkqallwtloyepczldrcldktggbzowjtjneziyrvnmecpzkodvxizhgnehjhiwwgnmyzbjtjfcktighrbwoibvjkobxgqhbfvguldejsxvgwbfwyanvtsqgtxhnhjecrpxwtovfhgejdczmbwoifsirjfztdjfupitvaljqqiyqqpxlbhwqpwplmysqdvzriqpjdrnkajkywlxtcetbgtvxorbrtpmfayeadiaoymqaetsgocvkmymrwfhpyzylieghmuegoqqjhdelswfllzuykysjzadutkxwgfhdmihvkcfvalofynroljcncdvblnreguoyhyrzhnoubladowjdjhyuazhyapwioxhrdtvmfljazmvbxjtkrwqrkepfctekohrtvsifxvqbzekddplytdwxgudgzsvyvxlxjdyqqmpsimuwvdmjtjpcyctorbdmffbzwxexygppzsdoczuppxiqxnnwewyjyeohpgkglstinafynsoyqtjybrdwsgvssuwcikhhoyhszglpuzmttmwezfknplhzjnapnxlbepxahcjjreysmzdwroclrylkqwoxwstzridtlraybpcohjuvltzypcwqfakgwxyybqeildjyvuiaakwvdduckmsvkyaqyebtgkrflatnlyqhycbrputyqofjfplxdxprfpbvjyifzsjwmnceiaovnqgfzaofjqqoffbrpfxygxlvyekoifiihzryeagcwwglvwbovtffehxamoznrtolqgyfkxlhjpjaqyfefoxlphficbcndpssiosqhkjmegnvpxynsipougnogroestwxamfprtsxffbhslwrnmjyjdolcekuzqwoauamufvqhzsbbpfsvupjscavgpgybgkzsicpgcxukkhgaiyxqauqienozaufwenctcgcibwyfsejfdrujqutiosvfctqroncnggxdjmmpjajsrbpjjsgqulgbbiauxndntroharhqglkjzgkprcwosychvvpfyedjtrcfpgjdmesbhlyzkeukxiesbtkdjpwikdesrjbfiabtufrkoevscabjmxmkdwekstnujocxtzcwlbmafmskhslsredavkpzjhbsfhwxmoauhixwolumhbqffduilfuecubztsqur"),
 79 | 	} {
 80 | 		SA := make([]int, len(input))
 81 | 		ws.ComputeSuffixArray(input, SA)
 82 | 		if err := checkCorrectSuffixArray(input, SA); err != nil {
 83 | 			t.Fatalf("input %s failed: %s", string(input), err)
 84 | 		}
 85 | 	}
 86 | }
 87 | 
 88 | func TestRandom(t *testing.T) {
 89 | 	ws := &WorkSpace{}
 90 | 	var (
 91 | 		seed           = 12345
 92 | 		nlengths       = 5000
 93 | 		testsPerLength = 1
 94 | 	)
 95 | 
 96 | 	if testing.Short() {
 97 | 		nlengths = 1000
 98 | 	}
 99 | 
100 | 	rand.Seed(int64(seed))
101 | 
102 | 	input := make([]byte, nlengths)
103 | 	SA := make([]int, nlengths)
104 | 	for i := 1; i <= nlengths; i++ {
105 | 		for j := 0; j < testsPerLength; j++ {
106 | 			for k := 0; k < i; k++ {
107 | 				input[k] = 'a' + byte(rand.Intn(26))
108 | 			}
109 | 			ws.ComputeSuffixArray(input[:i], SA[:i])
110 | 			if err := checkCorrectSuffixArray(input[:i], SA[:i]); err != nil {
111 | 				t.Fatalf("input %s failed: %s", string(input), err)
112 | 			}
113 | 		}
114 | 	}
115 | }
116 | 
117 | func Benchmark900K(b *testing.B) {
118 | 	b.StopTimer()
119 | 	ws := &WorkSpace{}
120 | 	input := make([]byte, 900*1000)
121 | 	for i := range input {
122 | 		input[i] = byte(rand.Intn(256))
123 | 	}
124 | 	SA := make([]int, len(input))
125 | 	b.StartTimer()
126 | 
127 | 	for i := 0; i < b.N; i++ {
128 | 		ws.ComputeSuffixArray(input, SA)
129 | 	}
130 | }
131 | 


--------------------------------------------------------------------------------
/common.go:
--------------------------------------------------------------------------------
  1 | package gosaca
  2 | 
  3 | func setAllToEmpty(SA []int) {
  4 | 	for i := range SA {
  5 | 		SA[i] = empty
  6 | 	}
  7 | }
  8 | 
  9 | // compute the length of the LMS substring at the front of the LMS suffix s[:]
 10 | // pre-condition: s[:] is an LMS suffix
 11 | // WARNING: if s[:] ends on the sentinel, the returned value will be len(s)+1!
 12 | func lmsSubstringLength0(s []byte) int {
 13 | 	n := len(s)
 14 | 	for i := 2; i < n; i++ {
 15 | 		if s[i] < s[i-1] {
 16 | 			// s[i-1] is L-type; move on to step 2
 17 | 			for j := i; j < n; j++ {
 18 | 				if s[j] > s[j-1] {
 19 | 					return i
 20 | 				} else if s[j] < s[j-1] {
 21 | 					i = j + 1
 22 | 				}
 23 | 			}
 24 | 		}
 25 | 	}
 26 | 	return n + 1 // add one to indicate substring ended with the sentinel
 27 | }
 28 | 
 29 | // for level 0, rename the LMS substrings sitting in SA1, and return the new alphabet size (k1)
 30 | func rename0(S []byte, SA1, work, S1 []int) int {
 31 | 	n := len(S)
 32 | 	n1 := len(SA1)
 33 | 
 34 | 	if n1 == 0 {
 35 | 		return 0
 36 | 	}
 37 | 
 38 | 	// currently work only holds positive values; we save the time of clearing
 39 | 	// it out by inserting bitwise-negated (negative) values so we know which
 40 | 	// ones we put in vs which ones just contain old data
 41 | 
 42 | 	// walk SA1 from left to right, creating Z1 (spread throughout work)
 43 | 
 44 | 	// first, record the first LMS suffix
 45 | 	k1 := 1
 46 | 	bktHead := 0 // renamed value == head of bucket in SA1 (part of property 4.1)
 47 | 	prev := SA1[0]
 48 | 	work[prev/2] = ^bktHead
 49 | 	SA1[0] = 1 // after we read SA[i], reuse it as a bucket size (needed for post-Z1 step)
 50 | 
 51 | 	// at each step, we need to see if the LMS substring starting at S[SA1[i]] (S[pos])
 52 | 	// is the same as the one we just saw starting at S[SA1[i-1]] (S[prev])
 53 | 	for i := 1; i < n1; i++ {
 54 | 		pos := SA1[i]
 55 | 		SA1[i] = 0 // reused as bucket size
 56 | 		diff := false
 57 | 
 58 | 		// quick first test - if initial character is different we're done
 59 | 		if S[prev] != S[pos] {
 60 | 			diff = true
 61 | 		} else {
 62 | 			// TODO - this walks both LMS substrings to calculate their lengths; can we combine this to short-circuit earlier if possible? tricky to do correctly!
 63 | 			prevLen := lmsSubstringLength0(S[prev:])
 64 | 			posLen := lmsSubstringLength0(S[pos:])
 65 | 			if prev+prevLen == n+1 || // S[prev:] ends with sentinel
 66 | 				pos+posLen == n+1 || // S[pos:] ends with sentinel
 67 | 				prevLen != posLen { // different lengths
 68 | 				diff = true
 69 | 			} else {
 70 | 				// if we get here:
 71 | 				//   (a) first character is the same
 72 | 				//   (b) both end before the sentinel
 73 | 				//   (c) both have the same length
 74 | 				// so we need to check the rest of the characters one-by-one
 75 | 				for j := 1; j < prevLen; j++ {
 76 | 					if S[prev+j] != S[pos+j] {
 77 | 						diff = true
 78 | 						break
 79 | 					}
 80 | 				}
 81 | 			}
 82 | 		}
 83 | 
 84 | 		if diff {
 85 | 			bktHead = i
 86 | 			k1++
 87 | 		}
 88 | 		work[pos/2] = ^bktHead
 89 | 		SA1[bktHead]++ // increment bucket size
 90 | 		prev = pos
 91 | 	}
 92 | 
 93 | 	// Z1 is now sitting (sparsely) in work[]
 94 | 	buildS1FromZ1(S1, SA1, work)
 95 | 
 96 | 	return k1
 97 | }
 98 | 
 99 | // this function is almost the same as rename0; differences:
100 | //   (a) S is an []int
101 | //   (b) our LMS substring comparision is different (and easier)
102 | func rename1(S, SA1, work, S1 []int) int {
103 | 	n := len(S)
104 | 	n1 := len(SA1)
105 | 
106 | 	if n1 == 0 {
107 | 		return 0
108 | 	}
109 | 
110 | 	// currently work only holds positive values; we save the time of clearing
111 | 	// it out by inserting bitwise-negated (negative) values so we know which
112 | 	// ones we put in vs which ones just contain old data
113 | 
114 | 	// walk SA1 from left to right, creating Z1 (spread throughout work)
115 | 
116 | 	// first, record the first LMS suffix
117 | 	k1 := 1
118 | 	bktHead := 0 // renamed value == head of bucket in SA1 (part of property 4.1)
119 | 	prev := SA1[0]
120 | 	work[prev/2] = ^bktHead
121 | 	SA1[0] = 1 // after we read SA[i], reuse it as a bucket size (needed for post-Z1 step)
122 | 
123 | 	// at each step, we need to see if the LMS substring starting at S[SA1[i]] (S[pos])
124 | 	// is the same as the one we just saw starting at S[SA1[i-1]] (S[prev])
125 | 	for i := 1; i < n1; i++ {
126 | 		pos := SA1[i]
127 | 		SA1[i] = 0 // reused as bucket size
128 | 		diff := false
129 | 
130 | 		// walk both strings character-by-character until (a) we get a
131 | 		// difference or (b) we begin the L+ sequence
132 | 		j := 0
133 | 		for j = 0; j < n; j++ {
134 | 			if S[prev+j] != S[pos+j] {
135 | 				diff = true
136 | 				break
137 | 			} else if S[prev+j] >= 0 {
138 | 				break
139 | 			}
140 | 		}
141 | 
142 | 		if !diff {
143 | 			// both strings started L+ at the same place; now walk until:
144 | 			//  (a) either hits the end (=> different)
145 | 			//  (b) we get a different character (=> different)
146 | 			//  (c) both hit the same S-type value (=> same)
147 | 			for j++; j < n; j++ {
148 | 				if prev+j == n || pos+j == n || S[prev+j] != S[pos+j] {
149 | 					diff = true
150 | 					break
151 | 				} else if S[prev+j] < 0 {
152 | 					break
153 | 				}
154 | 			}
155 | 		}
156 | 
157 | 		if diff {
158 | 			bktHead = i
159 | 			k1++
160 | 		}
161 | 		work[pos/2] = ^bktHead
162 | 		SA1[bktHead]++ // increment bucket size
163 | 		prev = pos
164 | 	}
165 | 
166 | 	// Z1 is now sitting (sparsely) in work[]
167 | 	buildS1FromZ1(S1, SA1, work)
168 | 
169 | 	return k1
170 | }
171 | 
// buildS1FromZ1 packs Z1 into S1. Z1 sits sparsely in work[]: every negative
// value in work is the bitwise inversion of one Z1 character, and
// non-negative values are ignored. The len(S1) rightmost Z1 characters are
// consumed from right to left and written into S1 from its last slot to its
// first.
//
// Each character is initially the head index of its bucket. A character
// judged S-type (relative to the character to its right) is replaced by the
// bitwise inversion of its bucket's tail index, using the bucket width stored
// in SA1[head]. The last character of S1 is always left as L-type.
func buildS1FromZ1(S1, SA1, work []int) {
	n1 := len(S1)

	src := len(work) - 1
	for dst := n1 - 1; dst >= 0; dst-- {
		// find next element of Z1
		for work[src] >= 0 {
			src--
		}

		// record character (head of bucket, only correct for L-type)
		c := ^work[src]
		src--
		S1[dst] = c

		if dst == n1-1 {
			// S1[n1-1] is L-type by definition due to sentinel
			continue
		}

		// c is S-type if its right neighbour is S-type and c <= it, or if
		// its right neighbour is L-type and c < it. An S-type neighbour is
		// stored inverted, so undo that before comparing.
		right := S1[dst+1]
		var sType bool
		if right < 0 {
			sType = c <= ^right
		} else {
			sType = c < right
		}

		if sType {
			// Point c at the end of its bucket instead of the head; SA1[c]
			// holds the width of the bucket. Store it bitwise negated so the
			// recursive computeSuffixArray1 doesn't have to.
			S1[dst] = ^(c + SA1[c] - 1)
		}
	}
}
202 | 
203 | func sortRecursively(S1, SA1 []int, k1 int) {
204 | 	if k1 == len(S1) {
205 | 		for i, s := range S1 {
206 | 			if s < 0 {
207 | 				SA1[^s] = i
208 | 			} else {
209 | 				SA1[s] = i
210 | 			}
211 | 		}
212 | 	} else {
213 | 		computeSuffixArray1(S1, SA1, k1)
214 | 	}
215 | }
216 | 


--------------------------------------------------------------------------------
/level0.go:
--------------------------------------------------------------------------------
  1 | package gosaca
  2 | 
// maxInt is the largest value of the platform's int type (every bit set except
// the sign bit).
//
// minInt is defined as -(maxInt - 1). NOTE(review): that is two greater than
// the most negative int (-maxInt - 1), so the name is slightly misleading.
//
// empty is the marker stored in suffix-array slots that hold no value yet.
const (
	maxInt = int(^uint(0) >> 1)
	minInt = -(maxInt - 1)
	empty  = minInt
)
  8 | 
// WorkSpace contains the O(1) scratch space used in constructing a suffix
// array with an alphabet of size 256 (any byte value).
//
// A WorkSpace may be reused across calls to ComputeSuffixArray: the dirty
// flag makes computeBuckets clear the bucket counters left by a previous run.
type WorkSpace struct {
	bkt     [256]int // working space buckets
	bktHead [256]int // save off bucket heads
	bktTail [256]int // save off bucket tails
	dirty   bool     // true if the scratch space is dirty from a previous run
}
 16 | 
 17 | // Compute the suffix array of S, storing it into SA. len(S) and len(SA) must be equal.
 18 | func (ws *WorkSpace) ComputeSuffixArray(S []byte, SA []int) {
 19 | 	n := len(S)
 20 | 	bkt := ws.bkt[:]
 21 | 
 22 | 	// scan S once, computing all bucket heads/tails
 23 | 	ws.computeBuckets(S)
 24 | 
 25 | 	// *********************************************
 26 | 	// Stage 1: Induced-sort the LMS-substrings of S
 27 | 	// *********************************************
 28 | 
 29 | 	// step 1 - initialize SA as empty
 30 | 	setAllToEmpty(SA)
 31 | 
 32 | 	// step 2 - put all LMS substrings into buckets based on their first character
 33 | 	// insert from the end to the head of the buckets (bkt currently holds the tails of the buckets from computeBuckets)
 34 | 	for i := n - 2; i >= 0; i-- {
 35 | 		if S[i] >= S[i+1] {
 36 | 			// S[i] is L-type; move on
 37 | 			continue
 38 | 		}
 39 | 
 40 | 		// S[i] is S-type; keep moving back until S[i-1] is L-type
 41 | 		for i >= 1 && (S[i-1] < S[i] || S[i-1] == S[i]) {
 42 | 			// S[i-1] is also S-type; keep moving back
 43 | 			i--
 44 | 		}
 45 | 
 46 | 		// unless we hit S[0] (which is not LMS by definition), S[i] begins an LMS suffix, so insert it into its bucket
 47 | 		if i > 0 {
 48 | 			SA[bkt[S[i]]] = i
 49 | 			bkt[S[i]]--
 50 | 		}
 51 | 	}
 52 | 
 53 | 	// step 3 - induced sort the L-type suffixes of S into their buckets
 54 | 	copy(bkt, ws.bktHead[:])
 55 | 	induceSortL0(S, SA, bkt)
 56 | 
 57 | 	// step 4 - induced sort the S-type suffixes of S into their buckets
 58 | 	copy(bkt, ws.bktTail[:])
 59 | 	induceSortS0(S, SA, bkt)
 60 | 
 61 | 	// NOT DESCRIBED IN PAPER BUT STILL NECESSARY (see SA-IS)
 62 | 	// We need to compact all the now-sorted LMS substrings into the first n1 positions of SA
 63 | 	// To do this, make use of the fact that since we just inserted all the S-type
 64 | 	// suffixes into SA from tail-to-head of the buckets, we can loop over the buckets
 65 | 	// themselves and pull out the S-type suffixes: all the S-type suffixes starting with c
 66 | 	// are contained in SA[bkt[c]+1] to SA[bktTail[c]]
 67 | 	n1 := 0
 68 | 	for c := 0; c < 256; c++ {
 69 | 		for i := bkt[c] + 1; i <= ws.bktTail[c]; i++ {
 70 | 			j := SA[i]
 71 | 			// we know S[j] is S-type; now see if it's LMS (i.e., preceded by an L-type)
 72 | 			if j > 0 && S[j-1] > S[j] {
 73 | 				SA[n1] = j
 74 | 				n1++
 75 | 			}
 76 | 		}
 77 | 	}
 78 | 
 79 | 	// *********************************************
 80 | 	// Stage 2: Rename the LMS substrings
 81 | 	// *********************************************
 82 | 
 83 | 	// provably, n1 is at most floor(n/2), so the following overlapping works
 84 | 	SA1 := SA[:n1] // SA1 overlaps the front of SA
 85 | 	work := SA[n1:] // workspace overlaps the rest of SA
 86 | 	S1 := SA[n-n1:] // S1 overlaps the end of SA (including part of "work", but rename deals with that correctly)
 87 | 	k1 := rename0(S, SA1, work, S1)
 88 | 
 89 | 	// *********************************************
 90 | 	// Stage 3: Sort recursively
 91 | 	// *********************************************
 92 | 	sortRecursively(S1, SA1, k1)
 93 | 
 94 | 	// NOT DESCRIBED IN PAPER BUT STILL NECESSARY (see SA-IS)
 95 | 	// We need to undo the renaming of the LMS suffixes.
 96 | 	// We no longer need S1, so reuse it to hold all the LMS indices.
 97 | 	j := n1 - 1
 98 | 	for i := n - 2; i >= 0; i-- {
 99 | 		if S[i] >= S[i+1] {
100 | 			// S[i] is L-type
101 | 			continue
102 | 		}
103 | 		// S[i] is S-type; walk backwards to find LMS
104 | 		for i >= 1 && (S[i-1] < S[i] || S[i-1] == S[i]) {
105 | 			// S[i-1] is also S-type
106 | 			i--
107 | 		}
108 | 		// S[0] is not LMS by definition, but otherwise S[i] is LMS
109 | 		if i > 0 {
110 | 			S1[j] = i
111 | 			j--
112 | 		}
113 | 	}
114 | 	// Now convert SA1 from renamed values to true values.
115 | 	for i, s := range SA1 {
116 | 		SA1[i] = S1[s]
117 | 	}
118 | 
119 | 	// *********************************************
120 | 	// Stage 4: Induced-sort SA(S) from SA1(S1)
121 | 	// *********************************************
122 | 
123 | 	// step 1 - initialize SA[n1:] as empty
124 | 	setAllToEmpty(SA[n1:])
125 | 
126 | 	// step 2 - put all sorted LMS substrings into buckets based on their first character
127 | 	// insert from the end to the head of the buckets
128 | 	copy(bkt, ws.bktTail[:])
129 | 	for i := n1 - 1; i >= 0; i-- {
130 | 		j := SA1[i]
131 | 		SA1[i] = empty // clear it out in preparation for steps 3-4
132 | 		if j == 0 {
133 | 			panic("unexpected j == 0")
134 | 		}
135 | 		c := S[j]
136 | 		SA[bkt[c]] = j
137 | 		bkt[c]--
138 | 	}
139 | 
140 | 	// step 3 - induced sort the L-type suffixes of S into their buckets
141 | 	copy(bkt, ws.bktHead[:])
142 | 	induceSortL0(S, SA, bkt)
143 | 
144 | 	// step 4 - induced sort the S-type suffixes of S into their buckets
145 | 	copy(bkt, ws.bktTail[:])
146 | 	induceSortS0(S, SA, bkt)
147 | }
148 | 
149 | func (ws *WorkSpace) computeBuckets(S []byte) {
150 | 	if ws.dirty {
151 | 		// clear out bucket counters from a previous call to ComputeSuffixArray
152 | 		for i := 0; i < 256; i++ {
153 | 			ws.bkt[i] = 0
154 | 		}
155 | 	}
156 | 
157 | 	// compute sizes of each bucket
158 | 	for _, c := range S {
159 | 		ws.bkt[c]++
160 | 	}
161 | 
162 | 	// record head and tail of each bucket (also store tails into bkt, as that's the one we need first)
163 | 	sum := 0
164 | 	for i := 0; i < 256; i++ {
165 | 		ws.bktHead[i] = sum
166 | 		sum += ws.bkt[i]
167 | 		ws.bktTail[i] = sum - 1
168 | 		ws.bkt[i] = sum - 1
169 | 	}
170 | 
171 | 	// record that our buckets are dirty in case ws is used again
172 | 	ws.dirty = true
173 | }
174 | 
// induceSortL0 induces the L-type suffixes of S into SA by scanning SA from
// left to right. Each inserted suffix goes into the next free slot at the head
// of its character's bucket.
//
// pre-condition: SA contains properly bucketed LMS substrings
// pre-condition: bkt contains the head of each character's bucket
// pre-condition: S is non-empty
// post-condition: SA contains properly bucketed L-type and LMS suffixes
// post-condition: bkt[c] has advanced past every slot filled for character c
func induceSortL0(S []byte, SA, bkt []int) {
	// The (virtual) sentinel makes S[n-1] L-type. If the sentinel were a real
	// character it would sit in front of SA (effectively in "SA[-1]"), so the
	// suffix preceding it is inserted before the scan starts.
	last := len(S) - 1
	SA[bkt[S[last]]] = last
	bkt[S[last]]++

	// For every suffix already placed, look at the character just before it
	// and insert that preceding suffix if it is L-type.
	for i := 0; i < len(SA); i++ {
		pos := SA[i]
		if pos > 0 { // empty slots and S[0] have no preceding character to check
			prev := pos - 1

			// check for L-type (described in section 3)
			// SA only holds L-type and LMS suffixes at this point, so the
			// preceding character is L-type whenever it is >= S[pos]
			if ch := S[prev]; ch >= S[pos] {
				SA[bkt[ch]] = prev
				bkt[ch]++
			}
		}
	}
}
207 | 
// induceSortS0 induces the S-type suffixes of S into SA by scanning SA from
// right to left. Each inserted suffix goes into the next free slot at the tail
// of its character's bucket.
//
// pre-condition: SA contains properly bucketed L and LMS suffixes
// pre-condition: bkt contains the tail of each character's bucket
// post-condition: SA also contains all properly bucketed S-type suffixes
// post-condition: bkt[c] has moved below every slot filled for character c
func induceSortS0(S []byte, SA, bkt []int) {
	// For every suffix already placed, look at the character just before it
	// and insert that preceding suffix if it is S-type.
	for i := len(S) - 1; i >= 0; i-- {
		pos := SA[i]
		if pos <= 0 {
			// empty slot, or S[0], which has no preceding character
			continue
		}

		prev := pos - 1
		ch, next := S[prev], S[pos]

		// check for S-type (use Property 3.1): either the preceding character
		// is smaller, or it is equal and its bucket's insertion point lies to
		// the left of the slot being scanned
		isSType := ch < next || (ch == next && bkt[ch] < i)
		if !isSType {
			continue
		}

		SA[bkt[ch]] = prev
		bkt[ch]--
	}

	// Nothing special is needed for the sentinel - by definition the character before it is not S-type
}
233 | 


--------------------------------------------------------------------------------
/level1.go:
--------------------------------------------------------------------------------
  1 | package gosaca
  2 | 
  3 | // After filling in LMS suffixes using the "end of bucket is a counter"
  4 | // algorithm from section 4.2, we need to loop over SA and fix any bucket
  5 | // counters still left.
  6 | func fixLMSBucketCounters(SA []int) {
  7 | 	for i := len(SA) - 1; i >= 0; i-- {
  8 | 		if SA[i] == empty || SA[i] >= 0 {
  9 | 			// SA[i] isn't a counter; move on
 10 | 			continue
 11 | 		}
 12 | 		// right shift all the elements of the bucket, filling the vacated
 13 | 		// slot with "empty"
 14 | 		d := SA[i]
 15 | 		pos := i + d - 1
 16 | 		prev := empty
 17 | 		for x := pos + 1; x <= i; x++ {
 18 | 			SA[x], prev = prev, SA[x]
 19 | 		}
 20 | 	}
 21 | }
 22 | 
 23 | // This helper function implements the logic described in section 4.2 to
 24 | // insert an S-type value into its bucket from the end, reusing the ends of
 25 | // buckets as counters. If we have to shift a bucket around, the two returned
 26 | // integers are the start and end positions of SA that were modified.  If we
 27 | // don't have to do any shifting, we return -1, -1.
 28 | func insertSTypeUsingCounters(SA []int, index, c int) (int, int) {
 29 | 	x0, x1 := -1, -1
 30 | 	n := len(SA)
 31 | 	switch {
 32 | 	case SA[c] >= 0:
 33 | 		// section 4.2 case 2
 34 | 		prev := SA[c]
 35 | 		x0, x1 = c, c
 36 | 		for x := c + 1; x < n; x++ {
 37 | 			SA[x], prev = prev, SA[x]
 38 | 			x1 = x
 39 | 			if prev < 0 && prev != empty {
 40 | 				break
 41 | 			}
 42 | 		}
 43 | 		fallthrough
 44 | 
 45 | 	case SA[c] == empty:
 46 | 		// section 4.2 case 1
 47 | 		if c-1 >= 0 && SA[c-1] == empty {
 48 | 			SA[c-1] = index
 49 | 			SA[c] = -1
 50 | 		} else {
 51 | 			SA[c] = index
 52 | 		}
 53 | 		break
 54 | 
 55 | 	default:
 56 | 		// section 4.2 case 3
 57 | 		d := SA[c]
 58 | 		pos := c + d - 1
 59 | 		if pos >= 0 && SA[pos] == empty {
 60 | 			SA[pos] = index
 61 | 			SA[c]--
 62 | 		} else {
 63 | 			// right-shift SA[pos+1:c-1], inserting index into SA[pos+1]
 64 | 			x0, x1 = pos+1, c
 65 | 			prev := index
 66 | 			for x := pos + 1; x <= c; x++ {
 67 | 				SA[x], prev = prev, SA[x]
 68 | 			}
 69 | 		}
 70 | 		break
 71 | 	}
 72 | 
 73 | 	return x0, x1
 74 | }
 75 | 
 76 | // Same style of helper function as above, except for section 4.1 (L-type
 77 | // into buckets from head to tail).
 78 | func insertLTypeUsingCounters(SA []int, index, c int) (int, int) {
 79 | 	x0, x1 := -1, -1
 80 | 	n := len(SA)
 81 | 	switch {
 82 | 	case SA[c] >= 0:
 83 | 		// section 4.1 case 1
 84 | 		prev := SA[c]
 85 | 		x0, x1 = c, c
 86 | 		for x := c - 1; x >= 0; x-- {
 87 | 			SA[x], prev = prev, SA[x]
 88 | 			x0 = x
 89 | 			if prev < 0 && prev != empty {
 90 | 				break
 91 | 			}
 92 | 		}
 93 | 		fallthrough
 94 | 
 95 | 	case SA[c] == empty:
 96 | 		// section 4.1 case 1
 97 | 		if c+1 < n && SA[c+1] == empty {
 98 | 			SA[c+1] = index
 99 | 			SA[c] = -1
100 | 		} else {
101 | 			SA[c] = index
102 | 		}
103 | 		break
104 | 
105 | 	default:
106 | 		// section 4.1 case 3
107 | 		d := SA[c]
108 | 		pos := c - d + 1
109 | 		if pos < n && SA[pos] == empty {
110 | 			SA[pos] = index
111 | 			SA[c]--
112 | 		} else {
113 | 			// left-shift SA[c+1:pos-1], inserting index into SA[pos-1]
114 | 			x0, x1 = c, pos-1
115 | 			prev := index
116 | 			for x := pos - 1; x >= c; x-- {
117 | 				SA[x], prev = prev, SA[x]
118 | 			}
119 | 		}
120 | 	}
121 | 
122 | 	return x0, x1
123 | }
124 | 
// computeSuffixArray1 is the recursive version of ComputeSuffixArray for
// levels 1+.
//
// S is the input string for this level and SA is the array that is filled in
// place. SA also serves as the only working space: the reduced problem's SA1,
// the rename workspace and the reduced string S1 are all sub-slices of SA.
// k is not referenced anywhere in this function body.
//
// As read by the code below, the type of each character is carried in the
// sign of its entry in S: S[i] >= 0 is treated as L-type and S[i] < 0 as
// S-type, with ^S[i] used as the bucket position for an S-type entry.
//
// Panics if the LMS positions found while undoing the renaming do not exactly
// fill S1[0:n1], or if a suffix index of 0 turns up among the sorted LMS
// suffixes in Stage 4.
func computeSuffixArray1(S, SA []int, k int) {
	n := len(S)

	// *********************************************
	// Stage 1: Induced-sort the LMS-substrings of S
	// *********************************************

	// step 1 - initialize SA as empty
	setAllToEmpty(SA)

	// step 2 - put all LMS substrings into buckets based on their first character
	// (the scan starts at n-2; S[n-1] is never considered as an LMS position here)
	for i := n - 2; i >= 0; i-- {
		if S[i] >= 0 {
			// S[i] is L-type
			continue
		}

		// S[i] is S-type; walk back until S[i-1] is L-type or we reach the
		// start of S. This moves the outer loop variable too, so the whole
		// run of S-type characters is consumed in one outer iteration.
		for i >= 1 && S[i-1] < 0 {
			// S[i-1] is also S-type
			i--
		}

		if i == 0 {
			// even if S[0] is S-type, it's not LMS - we're done
			break
		}

		// Insertion of the LMS strings is identical to insertions of S-type
		// strings described in section 4.2, but we don't care about the
		// returned values.
		insertSTypeUsingCounters(SA, i, ^S[i])
	}

	// Remove any leftover bucket counters.
	fixLMSBucketCounters(SA)

	// step 3 - induced sort the L-type suffixes of S into their buckets
	induceSortL1(S, SA)

	// step 4 - induced sort the S-type suffixes of S into their buckets
	induceSortS1(S, SA)

	// compact all the now-sorted LMS substrings into the first n1 positions of SA
	// (n1 grows by at most one per iteration, so the write position SA[n1]
	// never gets ahead of the element currently being read)
	n1 := 0
	for _, s := range SA {
		if s != 0 && // S[0] is not LMS by definition
		S[s] < 0 && // S[s] is S-type
		S[s-1] >= 0 { // S[s-1] is L-type
			SA[n1] = s
			n1++
		}
	}

	// *********************************************
	// Stage 2: Rename the LMS substrings
	// *********************************************

	// provably, n1 is at most floor(n/2), so the following overlapping works
	SA1 := SA[:n1]  // SA1 overlaps the front of SA
	work := SA[n1:] // workspace overlaps the rest of SA
	S1 := SA[n-n1:] // S1 overlaps the end of SA (including part of "work", but rename deals with that correctly)
	// k1 is passed on to the recursive sort together with S1 and SA1
	// (presumably the alphabet size of the renamed string - confirm in rename1).
	k1 := rename1(S, SA1, work, S1)

	// *********************************************
	// Stage 3: Sort recursively
	// *********************************************
	sortRecursively(S1, SA1, k1)

	// NOT DESCRIBED IN PAPER BUT STILL NECESSARY (see SA-IS)
	// We need to undo the renaming of the LMS suffixes.
	// We no longer need S1, so reuse it to hold all the LMS indices.
	// S is scanned right to left (same walk as Stage 1 step 2) and S1 is
	// filled from its last slot backwards, so S1 ends up holding the LMS
	// positions in increasing text order.
	j := n1 - 1
	for i := n - 2; i >= 0; i-- {
		if S[i] >= 0 {
			// L-type; ignore
			continue
		}
		// S[i] is S-type; walk backwards to find LMS
		for i >= 1 && S[i-1] < 0 {
			// S[i-1] is also S-type; keep moving back
			i--
		}
		// S[0] is not LMS by definition, but otherwise S[i] is LMS
		if i > 0 {
			S1[j] = i
			j--
		}
	}
	// Sanity check: exactly n1 LMS positions must have been written.
	if j != -1 {
		panic("didn't find all the LMS characters we expected")
	}
	// Now convert SA1 from renamed values to true values.
	// (each entry s of SA1 is used as an index into S1)
	for i, s := range SA1 {
		SA1[i] = S1[s]
	}

	// *********************************************
	// Stage 4: Induced-sort SA(S) from SA1(S1)
	// *********************************************

	// step 1 - initialize SA[n1:] as empty
	setAllToEmpty(SA[n1:])

	// step 2 - put all the sorted LMS suffixes of S into their buckets in SA
	// (walks SA[0:n1] from the back; each slot is emptied before its value is
	// re-inserted at bucket position c)
	for i := n1 - 1; i >= 0; i-- {
		// NOTE: this j is local to the loop body and shadows the j used above.
		j := SA[i]
		SA[i] = empty
		c := ^S[j]
		if j == 0 {
			panic("unexpected j == 0")
		}
		// If we've worked our way back to c == i, then all the remaining
		// SA[0,c] values are already correct, and going into the loop below
		// with bucket counters will just screw things up.
		if c == i {
			SA[c] = j // restore it (we just emptied it out above...)
			break
		}

		// Same explanation for what's going on here as in Stage 1 step 2.
		insertSTypeUsingCounters(SA, j, c)
	}

	// Remove any leftover bucket counters.
	fixLMSBucketCounters(SA)

	// step 3 - induced sort the L-type suffixes of S into their buckets
	induceSortL1(S, SA)

	// step 4 - induced sort the S-type suffixes of S into their buckets
	induceSortS1(S, SA)
}
259 | 
260 | // TODO pre-post
261 | func induceSortL1(S, SA []int) {
262 | 	n := len(S)
263 | 
264 | 	// special case to deal with the (virtual) sentinel:
265 | 	// S[n-1] is L-type because of the sentinel, and if we were treating
266 | 	// the sentinel as a real character, it would be at the front of SA[]
267 | 	// (it's effectively stored in "SA[-1]").
268 | 	//
269 | 	// Because c is L-type, we know SA[c] is empty, so we're in case 1 of section 4.1
270 | 	c := S[n-1]
271 | 	if c+1 < n && SA[c+1] == empty {
272 | 		SA[c+1] = n - 1
273 | 		SA[c] = -1
274 | 	} else {
275 | 		SA[c] = n - 1
276 | 	}
277 | 
278 | 	for i := 0; i < n; i++ {
279 | 		if SA[i] < 0 {
280 | 			// SA[i] is empty or being used as a counter; nothing to do
281 | 			continue
282 | 		}
283 | 		j := SA[i] - 1
284 | 		// if we just grabbed the character before an LMS suffix, we need to clear
285 | 		// out that LMS suffix (induceSortS1 assumes only L-type suffixes are in SA)
286 | 		if S[SA[i]] < 0 {
287 | 			SA[i] = empty
288 | 		}
289 | 		if j < 0 {
290 | 			// SA[i] was == 0; there is no preceding character to look at
291 | 			continue
292 | 		}
293 | 		c := S[j]
294 | 		if c < 0 {
295 | 			// S[j] is S-type; move on
296 | 			continue
297 | 		}
298 | 
299 | 		// insert j into its bucket; if we overwrite SA[i], we need to stay
300 | 		// here and look at it again in the next pass
301 | 		x0, x1 := insertLTypeUsingCounters(SA, j, c)
302 | 		if i >= x0 && i <= x1 {
303 | 			i--
304 | 		}
305 | 	}
306 | 
307 | 	// NOT MENTIONED IN PAPER: We need to go back over SA and fix
308 | 	// any leftover counter values via left shifting the buckets appropriately.
309 | 	// This is the moral equivalent of fixLMSBucketCounters, but we only ever
310 | 	// do this once, so didn't bother extracting it into its own function.
311 | 	for i, d := range SA {
312 | 		if d == empty || d >= 0 {
313 | 			continue
314 | 		}
315 | 		pos := i - d + 1
316 | 		prev := empty
317 | 		for x := pos - 1; x >= i; x-- {
318 | 			SA[x], prev = prev, SA[x]
319 | 		}
320 | 	}
321 | }
322 | 
323 | // TODO pre-post
324 | func induceSortS1(S, SA []int) {
325 | 	n := len(S)
326 | 
327 | 	for i := n - 1; i >= 0; i-- {
328 | 		if SA[i] <= 0 {
329 | 			// SA[i] is empty or being used as a counter; nothing to do
330 | 			continue
331 | 		}
332 | 		j := SA[i] - 1
333 | 		c := ^S[j]
334 | 		if c < 0 {
335 | 			// S[j]==c is L-type; move on
336 | 			continue
337 | 		}
338 | 
339 | 		// insert j into its bucket; if we overwrite SA[i], we need to stay
340 | 		// here and look at it again in the next pass
341 | 		x0, x1 := insertSTypeUsingCounters(SA, j, c)
342 | 		if i >= x0 && i <= x1 {
343 | 			i++
344 | 		}
345 | 	}
346 | }
347 | 


--------------------------------------------------------------------------------