├── unbwst.go ├── bwst_test.go ├── bwst.go └── README.markdown /unbwst.go: -------------------------------------------------------------------------------- 1 | package bwst 2 | 3 | import ( 4 | "bytes" 5 | "math/big" // bitset 6 | "sort" // TODO: don't use sort to unbwst; interfaces are suboptimal 7 | ) 8 | 9 | // Compute the inverse of the Burrows-Wheeler-Scott transform. This is done 10 | // out-of-place. 11 | func UnBWST(b []byte) []byte { 12 | sorted := make([]byte, len(b)) 13 | copy(sorted, b) 14 | sort.Sort(bytesorter(sorted)) 15 | used := new(big.Int) 16 | used.SetBit(used, len(b), 1) // reserve capacity 17 | links := make([]int, len(b)) 18 | // TODO: use O(lg(N)) search in sorted instead of O(N) search in b 19 | for i, c := range sorted { 20 | // find the first unused index in b of c 21 | for j, c2 := range b { 22 | if c == c2 && used.Bit(j) == 0 { 23 | links[i] = j 24 | used.SetBit(used, j, 1) 25 | break 26 | } 27 | } 28 | } 29 | // We need to know once again whether each byte is used, so instead of 30 | // resetting the bitset or using more memory, we can just ask whether it's 31 | // unused. 
32 | unused := used 33 | words := multibytesorter{} 34 | for i := range sorted { 35 | if unused.Bit(i) == 1 { 36 | word := []byte{} 37 | x := i 38 | for unused.Bit(x) == 1 { 39 | word = append(word, sorted[x]) 40 | unused.SetBit(unused, x, 0) 41 | x = links[x] 42 | } 43 | words = append(words, nil) 44 | copy(words[1:], words) 45 | words[0] = word 46 | } 47 | } 48 | if !sort.IsSorted(words) { 49 | sort.Sort(words) 50 | } 51 | x := len(b) 52 | s := make([]byte, len(b)) 53 | for _, word := range words { 54 | x -= len(word) 55 | copy(s[x:], word) 56 | } 57 | return s 58 | } 59 | 60 | type bytesorter []byte 61 | 62 | func (b bytesorter) Len() int { return len(b) } 63 | func (b bytesorter) Less(i, j int) bool { return b[i] < b[j] } 64 | func (b bytesorter) Swap(i, j int) { b[i], b[j] = b[j], b[i] } 65 | 66 | type multibytesorter [][]byte 67 | 68 | func (b multibytesorter) Len() int { return len(b) } 69 | func (b multibytesorter) Less(i, j int) bool { return bytes.Compare(b[i], b[j]) < 0 } 70 | func (b multibytesorter) Swap(i, j int) { b[i], b[j] = b[j], b[i] } 71 | -------------------------------------------------------------------------------- /bwst_test.go: -------------------------------------------------------------------------------- 1 | package bwst 2 | 3 | import ( 4 | "bytes" 5 | "crypto/aes" 6 | "crypto/cipher" 7 | "crypto/rand" 8 | "io" 9 | "testing" 10 | ) 11 | 12 | const test_string = `Wherever you go, I follow, 13 | hands held across sidewalks 14 | frozen 15 | or slopes more slippery 16 | called love, 17 | life, and 1685 miles 18 | of "I can't wait 19 | to be there when you fall."` 20 | 21 | func TestAbsorption(t *testing.T) { 22 | s := []byte(test_string) 23 | s = UnBWST(BWST(s)) 24 | if string(s) != test_string { 25 | t.Fatal("UnBWST(BWST(s)) failed: expected a sweet poem, got gibberish") 26 | } 27 | s = BWST(UnBWST(s)) 28 | if string(s) != test_string { 29 | t.Fatal("BWST(UnBWST(s)) failed: expected a sweet poem, got gibberish") 30 | } 31 | } 32 | 33 | 
func TestAbsorptionRandom(t *testing.T) { 34 | s := makerandombytes(1 << 15) 35 | c := s 36 | s = UnBWST(BWST(s)) 37 | if !bytes.Equal(c, s) { 38 | t.Fatal("UnBWST(BWST(s)) failed: expected randomness, got gibberish") 39 | } 40 | s = BWST(UnBWST(s)) 41 | if !bytes.Equal(c, s) { 42 | t.Fatal("BWST(UnBWST(s)) failed: expected randomness, got gibberish") 43 | } 44 | } 45 | 46 | func BenchmarkBWST(b *testing.B) { 47 | s := []byte(test_string) 48 | b.ResetTimer() 49 | for i := 0; i < b.N; i++ { 50 | s = BWST(s) 51 | } 52 | } 53 | 54 | func BenchmarkBWSTRandom(b *testing.B) { 55 | s := makerandombytes(1 << 15) 56 | b.ResetTimer() 57 | for i := 0; i < b.N; i++ { 58 | s = BWST(s) 59 | } 60 | } 61 | 62 | func BenchmarkUnBWST(b *testing.B) { 63 | s := []byte(test_string) 64 | b.ResetTimer() 65 | for i := 0; i < b.N; i++ { 66 | s = UnBWST(s) 67 | } 68 | } 69 | 70 | func BenchmarkUnBWSTRandom(b *testing.B) { 71 | s := makerandombytes(1 << 15) 72 | b.ResetTimer() 73 | for i := 0; i < b.N; i++ { 74 | s = UnBWST(s) 75 | } 76 | } 77 | 78 | func makerandombytes(n int) (b []byte) { 79 | iv := make([]byte, aes.BlockSize) 80 | if _, err := io.ReadFull(rand.Reader, iv); err != nil { 81 | panic(err) 82 | } 83 | c, _ := aes.NewCipher(iv) // I do insist. 84 | if _, err := io.ReadFull(rand.Reader, iv); err != nil { 85 | panic(err) 86 | } 87 | s := cipher.NewCTR(c, iv) 88 | b = make([]byte, n) 89 | s.XORKeyStream(b, b) 90 | return b 91 | } 92 | -------------------------------------------------------------------------------- /bwst.go: -------------------------------------------------------------------------------- 1 | package bwst 2 | 3 | import ( 4 | "sort" 5 | "sync" 6 | ) 7 | 8 | // Compute the Burrows-Wheeler-Scott transform of s. This is done 9 | // out-of-place. 
10 | func BWST(s []byte) []byte { 11 | words := factorize(s) 12 | // Sorting all rotations of all Lyndon words and then choosing the last 13 | // character of each is the same as choosing the character to the left of 14 | // each character in its Lyndon word in sorted order. Therefore, we find 15 | // all locations of each character, sort them all by their rotations, and 16 | // proceed therein. 17 | locs := locate(s, words) 18 | b := make([]byte, 0, len(s)) 19 | var wg sync.WaitGroup 20 | for _, charLocs := range locs { 21 | wg.Add(1) 22 | go func(charLocs []loc) { defer wg.Done(); sortrots(s, words, charLocs) }(charLocs) 23 | } 24 | wg.Wait() 25 | for _, charLocs := range locs { 26 | for _, l := range charLocs { 27 | word := s[words[l.word]:words[l.word+1]] 28 | i := l.idx - 1 29 | if i < 0 { 30 | i = len(word) - 1 31 | } 32 | b = append(b, word[i]) 33 | } 34 | } 35 | return b 36 | } 37 | 38 | // Better than actually storing all rotations of all words. Probably. 39 | type loc struct { 40 | word, idx int 41 | } 42 | 43 | func locate(s []byte, words []int) (locs [256][]loc) { 44 | w := 0 45 | for i, c := range s { 46 | if i >= words[w+1] { 47 | w++ 48 | } 49 | locs[int(c)] = append(locs[int(c)], loc{w, i - words[w]}) 50 | } 51 | return locs 52 | } 53 | 54 | // Compute the Lyndon factorization of s. Includes both endpoints. 55 | func factorize(s []byte) (bounds []int) { 56 | // Do an initial pass to count the number of words. Hopefully this avoids 57 | // enough copying to be faster. 58 | ch := make(chan int) 59 | go findLyndon(s, ch) 60 | n := 1 61 | for _ = range ch { 62 | n++ 63 | } 64 | ch = make(chan int) 65 | go findLyndon(s, ch) 66 | bounds = make([]int, 1, n+1) 67 | for i := range ch { 68 | bounds = append(bounds, i) 69 | } 70 | return bounds 71 | } 72 | 73 | // Duval's algorithm. This is done concurrently under factorize() to enable 74 | // word counting without doing extra work. 
// findLyndon streams the Lyndon factorization of s: the end index of each
// word is sent on ch, and ch is closed once the whole string is factored.
// This is Duval's algorithm.
func findLyndon(s []byte, ch chan<- int) {
	// Thanks to Jonathan on golang-nuts for simplifying the inner loop.
	n := len(s)
	start := 0 // position where the current run of words begins
	for start < n {
		i, j := start, start+1
		for j < n && s[i] <= s[j] {
			if s[i] < s[j] {
				// Whenever a character is less than the first character of a
				// Lyndon word, it is not in that word: restart the comparison
				// from the beginning of the current word.
				i = start
			} else {
				// The characters are equal; whether s[j] belongs to the word
				// starting at start depends on the remainder of the string,
				// so keep comparing the two aligned suffixes.
				i++
			}
			j++
		}
		// The scan determined words of length j-i; emit each boundary.
		for start <= i {
			start += j - i
			ch <- start
		}
	}
	close(ch)
}

// Each instance of a character is considered to be at the beginning of a
// rotation of its word, so the locations can be sorted. Because each char is
// in order already, we only need to sort the occurrences of each char
// separately to sort the entire thing.
106 | 107 | func sortrots(s []byte, words []int, locs []loc) { 108 | l := locsorter{locs, s, words} 109 | sort.Sort(l) 110 | } 111 | 112 | type locsorter struct { 113 | locs []loc 114 | s []byte 115 | words []int 116 | } 117 | 118 | func (l locsorter) Len() int { return len(l.locs) } 119 | func (l locsorter) Swap(i, j int) { l.locs[i], l.locs[j] = l.locs[j], l.locs[i] } 120 | 121 | // Cyclic order - AXYA < AXY here because AXYAAXYA < AXYAXY 122 | func (l locsorter) Less(i, j int) bool { 123 | loc1, loc2 := l.locs[i], l.locs[j] 124 | // get the actual sequences 125 | w1 := l.s[l.words[loc1.word]:l.words[loc1.word+1]] 126 | w2 := l.s[l.words[loc2.word]:l.words[loc2.word+1]] 127 | x, y := loc1.idx, loc2.idx 128 | n := lcm(len(w1), len(w2)) 129 | for i := 0; i < n; i++ { 130 | if a, b := w1[x], w2[y]; a < b { 131 | return true 132 | } else if a > b { 133 | return false 134 | } 135 | x++ 136 | if x >= len(w1) { 137 | x = 0 138 | } 139 | y++ 140 | if y >= len(w2) { 141 | y = 0 142 | } 143 | } 144 | // words are equal 145 | return false 146 | } 147 | 148 | func gcd(m, n int) int { 149 | var tmp int 150 | for m != 0 { 151 | tmp = m 152 | m = n % m 153 | n = tmp 154 | } 155 | return n 156 | } 157 | 158 | func lcm(m, n int) int { 159 | return m / gcd(m, n) * n 160 | } 161 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | The Burrows-Wheeler-Scott transform (called the Burrows-Wheeler transform 2 | "Scottified" in existing literature, but that sounds silly) sorts together all 3 | infinitely repeated cycles of each Lyndon word of the input, then takes the 4 | last character of each rotation of each Lyndon word in the overall sorted 5 | order. Of course, this description is about as intelligible as any of the 6 | existing literature on it for someone not intimately familiar with the 7 | concepts involved, and incomplete for someone who is. 
8 | 9 | Lyndon words are sequences which are less than any of the rotations of that 10 | sequence. "Less than", that is, the order, is defined in "the usual 11 | lexicographical way": 'a' < 'b' by definition; 'aa' < 'ab' because 12 | the first positions are the same and the second position is lesser in 'aa'; 13 | 'ab' < 'ba' because 'ab' is lesser in the first position. For now, the order 14 | of sequences whose lengths are not equal is left undefined. 15 | 16 | A Lyndon word is a sequence such that no matter how many times you take the 17 | rightmost (or leftmost) element and attach it to the left (or right, 18 | respectively), the result is never less than the word. By the Chen-Fox-Lyndon 19 | theorem, every ordered sequence has a unique "Lyndon factorization" of Lyndon 20 | words, such that each word in the factorization is never greater than its 21 | predecessor, where order is _here_ (not for the BWST algorithm) defined for 22 | words of unequal length such that if a is a prefix of b, then a < b. 23 | 24 | The concept is best illustrated by example. The Lyndon factorization of the 25 | sequence 'FOOBAR2000' (assume ASCII - digits precede letters) is the words 26 | 'FOO', 'B', 'AR', '2', '0', '0', and '0'. 'F' is a Lyndon word because it has 27 | no rotations, but it's not part of the factorization because 'FO' is also a 28 | Lyndon word: 'FO' < 'OF'. 'FOO' is a Lyndon word because 29 | 'FOO' < 'OFO' < 'OOF'. 'FOOB' is _not_ a Lyndon word; 'FOOB' is not less than 30 | its rotation 'BFOO'. 31 | 32 | Duval (1983) gives an algorithm for finding the Lyndon factorization of a 33 | sequence in linear time. Wikipedia's article on Lyndon words now thankfully 34 | has a description of the algorithm, but it does a poor job of explaining what 35 | it actually does, so I'll describe it myself. It is key to realize that all 36 | Lyndon words of length greater than 1 end with a character greater than the 37 | one with which it starts. 
Knowing this, it's easy to realize that if, while 38 | scanning the string for the Lyndon words, a character is encountered that is 39 | less than the character at the start of the current word, then the word has 40 | ended. Note, however, that this is not the only case; an illustrative example 41 | here is 'ABCA', which factorizes to 'ABC' and 'A'. When the algorithm 42 | encounters a character equal to the first, it has to start comparing to the 43 | second character. Or, more generally, while only one of the two indices into 44 | the string the algorithm holds is incremented on each step, in the case of the 45 | compared characters being equal, both indices are incremented. Since this 46 | causes the algorithm to treat repeated strings as equal, word boundaries are 47 | determined according to the difference of the indices, and the lower index is 48 | reset to the start of the word each time the comparison finds the earlier character to be the lesser. 49 | 50 | Now that I think the Lyndon factorization is satisfactorily explained, the 51 | BWST itself can be introduced. Recall that the BWST sorts the infinitely 52 | repeated rotations of all Lyndon words of the input. Let's take a word that 53 | David Scott, the person who developed BWST, actually used with respect to the 54 | algorithm, which illustrates not only this concept but also one of the 55 | problems involved in learning about the transform: 'SCOTTIFACATION'. Its 56 | Lyndon factorization produces 'S', 'COTTIF', and 'ACATION'. All rotations of 57 | these words are: 58 | 59 | S 60 | COTTIF 61 | FCOTTI 62 | IFCOTT 63 | TIFCOT 64 | TTIFCO 65 | OTTIFC 66 | ACATION 67 | NACATIO 68 | ONACATI 69 | IONACAT 70 | TIONACA 71 | ATIONAC 72 | CATIONA 73 | 74 | These rotations are not sorted according to the usual lexicographical order. 75 | In particular, strings of different lengths are compared as if both are 76 | repeated infinitely. A shorter length to compare is each word repeated as many 77 | times as the other has characters. 
Even shorter is to compute the LCM of the 78 | lengths of those words and to make up to that many comparisons. 79 | 80 | So, if we sort the rotations, we get: 81 | 82 | S ACATION 83 | COTTIF ATIONAC 84 | FCOTTI CATIONA 85 | IFCOTT COTTIF 86 | TIFCOT FCOTTI 87 | TTIFCO IFCOTT 88 | OTTIFC IONACAT 89 | ACATION NACATIO 90 | NACATIO ONACATI 91 | ONACATI OTTIFC 92 | IONACAT S 93 | TIONACA TIFCOT 94 | ATIONAC TIONACA 95 | CATIONA TTIFCO 96 | 97 | The BWST is now the last character of each rotation in the sorted output: 98 | 'NCAFITTOICSTAO'. 99 | 100 | This was perhaps a bad example; entropy was not reduced, and the special 101 | cyclic order never came into play. The basic concepts have been explained, 102 | however, and that is my aim. 103 | 104 | Now, the entire point of the BWST is that it has an inverse - you can get 105 | 'SCOTTIFACATION' back out of 'NCAFITTOICSTAO'. To do this, we need to compare 106 | the BWST output with its sorted order. Sorting gives 'AACCFIINOOSTTT'. Now we 107 | build a table thus: 108 | 109 | Index Sorted BWST Start + Count = Sum Map 110 | 0 A N 7 0 7 2 111 | 1 A C 2 0 2 12 112 | 2 C A 0 0 0 1 113 | 3 C F 4 0 4 9 114 | 4 F I 5 0 5 3 115 | 5 I T 11 0 11 4 116 | 6 I T 11 1 12 8 117 | 7 N O 8 0 8 0 118 | 8 O I 5 1 6 7 119 | 9 O C 2 1 3 13 120 | 10 S S 10 0 10 10 121 | 11 T T 11 2 13 5 122 | 12 T A 0 1 1 6 123 | 13 T O 8 1 9 11 124 | 125 | - Index is the zero-based index into the sorted sequence for each character. 126 | - Sorted is the sorted sequence. 127 | - BWST is the input into the inverse function. 128 | - Start is the first index in the sorted string at which the corresponding 129 | BWST character is found. 130 | - Count is the number of times the corresponding BWST character already has 131 | been found in the sequence. 132 | - Sum is the sum of the starts and counts. 133 | - Map is the line number whose sum equals the current index. 
134 | 135 | We start at index 0 and follow the map, outputting from the sorted sequence as 136 | we go: 137 | 138 | Index Sorted Map Output 139 | 0 A 2 A 140 | 2 C 1 AC 141 | 1 A 12 ACA 142 | 12 T 6 ACAT 143 | 6 I 8 ACATI 144 | 8 O 7 ACATIO 145 | 7 N 0 ACATION 146 | 147 | But the map at line 7 points to an index we've already visited. We've now 148 | retrieved the lexicographically least Lyndon word from the input. Next, we 149 | move to the lowest index we have not yet visited, which is 3. 150 | 151 | Index Sorted Map Output 152 | 3 C 9 C 153 | 9 O 13 CO 154 | 13 T 11 COT 155 | 11 T 5 COTT 156 | 5 I 4 COTTI 157 | 4 F 3 COTTIF 158 | 159 | We know the next greatest Lyndon word. The only unvisited index so far is 10, 160 | so S at 10 is the greatest Lyndon word in the original input. Concatenating 161 | the words in nonincreasing order yields SCOTTIFACATION. BWST inverted. 162 | 163 | Resources: 164 | - http://groups.google.com/group/comp.compression/msg/a0236d754e869212 - This 165 | is an old post, so it misses some connections which now are known, but it 166 | is the only plain-English description of the BWST and UNBWST I could find. 167 | - http://bijective.dogma.net/00yyy.pdf - This paper is a fairly intelligible 168 | description and implementation of the algorithms, but it contains several 169 | significant errors. Its examples are more optimized than the ones I gave; 170 | if you want to learn more, check it out, but not until you understand the 171 | algorithms enough to be able to recognize the errors. 172 | - http://arxiv.org/abs/0908.0239 - This paper is for those with advanced 173 | degrees. It is correct, but almost impossible to understand: "Let k ∈ ℕ. 174 | Let ⋃_{i=1}^s[v_i] = {w_1, ..., w_n} ⊆ ∑^+ be a multiset built from 175 | conjugacy classes [v_i]. Let M = (w_1, ..., w_n) satisfy context_k(w_1) ≤ 176 | ··· ≤ context_k(w_n) and let L = last(w_1) ··· last(w_n) be the sequence of 177 | the last symbols. 
Then context_k(w_i) = λ_Lπ_L(i)·λ_Lπ_L^2(i)···λ_Lπ_L^k(i) 178 | where π_L^t denotes the t-fold application of π_L and λ_Lπ_L(i) = 179 | λ_L(π_L(i))." 180 | --------------------------------------------------------------------------------