├── unbwst.go ├── bwst_test.go ├── bwst.go └── README.markdown /unbwst.go: -------------------------------------------------------------------------------- 1 | package bwst 2 | 3 | import ( 4 | "bytes" 5 | "math/big" // bitset 6 | "sort" // TODO: don't use sort to unbwst; interfaces are suboptimal 7 | ) 8 | 9 | // Compute the inverse of the Burrows-Wheeler-Scott transform. This is done 10 | // out-of-place. 11 | func UnBWST(b []byte) []byte { 12 | sorted := make([]byte, len(b)) 13 | copy(sorted, b) 14 | sort.Sort(bytesorter(sorted)) 15 | used := new(big.Int) 16 | used.SetBit(used, len(b), 1) // reserve capacity 17 | links := make([]int, len(b)) 18 | // TODO: use O(lg(N)) search in sorted instead of O(N) search in b 19 | for i, c := range sorted { 20 | // find the first unused index in b of c 21 | for j, c2 := range b { 22 | if c == c2 && used.Bit(j) == 0 { 23 | links[i] = j 24 | used.SetBit(used, j, 1) 25 | break 26 | } 27 | } 28 | } 29 | // We need to know once again whether each byte is used, so instead of 30 | // resetting the bitset or using more memory, we can just ask whether it's 31 | // unused. 
32 | unused := used 33 | words := multibytesorter{} 34 | for i := range sorted { 35 | if unused.Bit(i) == 1 { 36 | word := []byte{} 37 | x := i 38 | for unused.Bit(x) == 1 { 39 | word = append(word, sorted[x]) 40 | unused.SetBit(unused, x, 0) 41 | x = links[x] 42 | } 43 | words = append(words, nil) 44 | copy(words[1:], words) 45 | words[0] = word 46 | } 47 | } 48 | if !sort.IsSorted(words) { 49 | sort.Sort(words) 50 | } 51 | x := len(b) 52 | s := make([]byte, len(b)) 53 | for _, word := range words { 54 | x -= len(word) 55 | copy(s[x:], word) 56 | } 57 | return s 58 | } 59 | 60 | type bytesorter []byte 61 | 62 | func (b bytesorter) Len() int { return len(b) } 63 | func (b bytesorter) Less(i, j int) bool { return b[i] < b[j] } 64 | func (b bytesorter) Swap(i, j int) { b[i], b[j] = b[j], b[i] } 65 | 66 | type multibytesorter [][]byte 67 | 68 | func (b multibytesorter) Len() int { return len(b) } 69 | func (b multibytesorter) Less(i, j int) bool { return bytes.Compare(b[i], b[j]) < 0 } 70 | func (b multibytesorter) Swap(i, j int) { b[i], b[j] = b[j], b[i] } 71 | -------------------------------------------------------------------------------- /bwst_test.go: -------------------------------------------------------------------------------- 1 | package bwst 2 | 3 | import ( 4 | "bytes" 5 | "crypto/aes" 6 | "crypto/cipher" 7 | "crypto/rand" 8 | "io" 9 | "testing" 10 | ) 11 | 12 | const test_string = `Wherever you go, I follow, 13 | hands held across sidewalks 14 | frozen 15 | or slopes more slippery 16 | called love, 17 | life, and 1685 miles 18 | of "I can't wait 19 | to be there when you fall."` 20 | 21 | func TestAbsorption(t *testing.T) { 22 | s := []byte(test_string) 23 | s = UnBWST(BWST(s)) 24 | if string(s) != test_string { 25 | t.Fatal("UnBWST(BWST(s)) failed: expected a sweet poem, got gibberish") 26 | } 27 | s = BWST(UnBWST(s)) 28 | if string(s) != test_string { 29 | t.Fatal("BWST(UnBWST(s)) failed: expected a sweet poem, got gibberish") 30 | } 31 | } 32 | 33 | 
func TestAbsorptionRandom(t *testing.T) { 34 | s := makerandombytes(1 << 15) 35 | c := s 36 | s = UnBWST(BWST(s)) 37 | if !bytes.Equal(c, s) { 38 | t.Fatal("UnBWST(BWST(s)) failed: expected randomness, got gibberish") 39 | } 40 | s = BWST(UnBWST(s)) 41 | if !bytes.Equal(c, s) { 42 | t.Fatal("BWST(UnBWST(s)) failed: expected randomness, got gibberish") 43 | } 44 | } 45 | 46 | func BenchmarkBWST(b *testing.B) { 47 | s := []byte(test_string) 48 | b.ResetTimer() 49 | for i := 0; i < b.N; i++ { 50 | s = BWST(s) 51 | } 52 | } 53 | 54 | func BenchmarkBWSTRandom(b *testing.B) { 55 | s := makerandombytes(1 << 15) 56 | b.ResetTimer() 57 | for i := 0; i < b.N; i++ { 58 | s = BWST(s) 59 | } 60 | } 61 | 62 | func BenchmarkUnBWST(b *testing.B) { 63 | s := []byte(test_string) 64 | b.ResetTimer() 65 | for i := 0; i < b.N; i++ { 66 | s = UnBWST(s) 67 | } 68 | } 69 | 70 | func BenchmarkUnBWSTRandom(b *testing.B) { 71 | s := makerandombytes(1 << 15) 72 | b.ResetTimer() 73 | for i := 0; i < b.N; i++ { 74 | s = UnBWST(s) 75 | } 76 | } 77 | 78 | func makerandombytes(n int) (b []byte) { 79 | iv := make([]byte, aes.BlockSize) 80 | if _, err := io.ReadFull(rand.Reader, iv); err != nil { 81 | panic(err) 82 | } 83 | c, _ := aes.NewCipher(iv) // I do insist. 84 | if _, err := io.ReadFull(rand.Reader, iv); err != nil { 85 | panic(err) 86 | } 87 | s := cipher.NewCTR(c, iv) 88 | b = make([]byte, n) 89 | s.XORKeyStream(b, b) 90 | return b 91 | } 92 | -------------------------------------------------------------------------------- /bwst.go: -------------------------------------------------------------------------------- 1 | package bwst 2 | 3 | import ( 4 | "sort" 5 | "sync" 6 | ) 7 | 8 | // Compute the Burrows-Wheeler-Scott transform of s. This is done 9 | // out-of-place. 
10 | func BWST(s []byte) []byte { 11 | words := factorize(s) 12 | // Sorting all rotations of all Lyndon words and then choosing the last 13 | // character of each is the same as choosing the character to the left of 14 | // each character in its Lyndon word in sorted order. Therefore, we find 15 | // all locations of each character, sort them all by their rotations, and 16 | // proceed therein. 17 | locs := locate(s, words) 18 | b := make([]byte, 0, len(s)) 19 | var wg sync.WaitGroup 20 | for _, charLocs := range locs { 21 | wg.Add(1) 22 | go func(charLocs []loc) { defer wg.Done(); sortrots(s, words, charLocs) }(charLocs) 23 | } 24 | wg.Wait() 25 | for _, charLocs := range locs { 26 | for _, l := range charLocs { 27 | word := s[words[l.word]:words[l.word+1]] 28 | i := l.idx - 1 29 | if i < 0 { 30 | i = len(word) - 1 31 | } 32 | b = append(b, word[i]) 33 | } 34 | } 35 | return b 36 | } 37 | 38 | // Better than actually storing all rotations of all words. Probably. 39 | type loc struct { 40 | word, idx int 41 | } 42 | 43 | func locate(s []byte, words []int) (locs [256][]loc) { 44 | w := 0 45 | for i, c := range s { 46 | if i >= words[w+1] { 47 | w++ 48 | } 49 | locs[int(c)] = append(locs[int(c)], loc{w, i - words[w]}) 50 | } 51 | return locs 52 | } 53 | 54 | // Compute the Lyndon factorization of s. Includes both endpoints. 55 | func factorize(s []byte) (bounds []int) { 56 | // Do an initial pass to count the number of words. Hopefully this avoids 57 | // enough copying to be faster. 58 | ch := make(chan int) 59 | go findLyndon(s, ch) 60 | n := 1 61 | for _ = range ch { 62 | n++ 63 | } 64 | ch = make(chan int) 65 | go findLyndon(s, ch) 66 | bounds = make([]int, 1, n+1) 67 | for i := range ch { 68 | bounds = append(bounds, i) 69 | } 70 | return bounds 71 | } 72 | 73 | // Duval's algorithm. This is done concurrently under factorize() to enable 74 | // word counting without doing extra work. 
// findLyndon streams the Lyndon factorization of s: the end index of each
// word is sent on ch, and ch is closed once the whole string is factored.
// This is Duval's algorithm.
func findLyndon(s []byte, ch chan<- int) {
	// Thanks to Jonathan on golang-nuts for simplifying the inner loop.
	n := len(s)
	start := 0 // position where the current run of words begins
	for start < n {
		i, j := start, start+1
		for j < n && s[i] <= s[j] {
			if s[i] < s[j] {
				// Whenever a character is less than the first character of a
				// Lyndon word, it is not in that word: restart the comparison
				// from the beginning of the current word.
				i = start
			} else {
				// The characters are equal; whether s[j] belongs to the word
				// starting at start depends on the remainder of the string,
				// so keep comparing the two aligned suffixes.
				i++
			}
			j++
		}
		// The scan determined words of length j-i; emit each boundary.
		for start <= i {
			start += j - i
			ch <- start
		}
	}
	close(ch)
}

// Each instance of a character is considered to be at the beginning of a
// rotation of its word, so the locations can be sorted. Because each char is
// in order already, we only need to sort the occurrences of each char
// separately to sort the entire thing.
106 | 107 | func sortrots(s []byte, words []int, locs []loc) { 108 | l := locsorter{locs, s, words} 109 | sort.Sort(l) 110 | } 111 | 112 | type locsorter struct { 113 | locs []loc 114 | s []byte 115 | words []int 116 | } 117 | 118 | func (l locsorter) Len() int { return len(l.locs) } 119 | func (l locsorter) Swap(i, j int) { l.locs[i], l.locs[j] = l.locs[j], l.locs[i] } 120 | 121 | // Cyclic order - AXYA < AXY here because AXYAAXYA < AXYAXY 122 | func (l locsorter) Less(i, j int) bool { 123 | loc1, loc2 := l.locs[i], l.locs[j] 124 | // get the actual sequences 125 | w1 := l.s[l.words[loc1.word]:l.words[loc1.word+1]] 126 | w2 := l.s[l.words[loc2.word]:l.words[loc2.word+1]] 127 | x, y := loc1.idx, loc2.idx 128 | n := lcm(len(w1), len(w2)) 129 | for i := 0; i < n; i++ { 130 | if a, b := w1[x], w2[y]; a < b { 131 | return true 132 | } else if a > b { 133 | return false 134 | } 135 | x++ 136 | if x >= len(w1) { 137 | x = 0 138 | } 139 | y++ 140 | if y >= len(w2) { 141 | y = 0 142 | } 143 | } 144 | // words are equal 145 | return false 146 | } 147 | 148 | func gcd(m, n int) int { 149 | var tmp int 150 | for m != 0 { 151 | tmp = m 152 | m = n % m 153 | n = tmp 154 | } 155 | return n 156 | } 157 | 158 | func lcm(m, n int) int { 159 | return m / gcd(m, n) * n 160 | } 161 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | The Burrows-Wheeler-Scott transform (called the Burrows-Wheeler transform 2 | "Scottified" in existing literature, but that sounds silly) sorts together all 3 | infinitely repeated cycles of each Lyndon word of the input, then takes the 4 | last character of each rotation of each Lyndon word in the overall sorted 5 | order. Of course, this description is about as intelligible as any of the 6 | existing literature on it for someone not intimately familiar with the 7 | concepts involved, and incomplete for someone who is. 
8 | 9 | Lyndon words are sequences which are less than any of the rotations of that 10 | sequence. "Less than", that is, the order, is defined in "the usual 11 | lexicographical way": 'a' < 'b' by definition; 'aa' < 'ab' because 12 | the first positions are the same and the second position is lesser in 'aa'; 13 | 'ab' < 'ba' because 'ab' is lesser in the first position. For now, the order 14 | of sequences whose lengths are not equal is left undefined. 15 | 16 | A Lyndon word is a sequence such that no matter how many times you take the 17 | rightmost (or leftmost) element and attach it to the left (or right, 18 | respectively), the result is never less than the word. By the Chen-Fox-Lyndon 19 | theorem, every ordered sequence has a unique "Lyndon factorization" of Lyndon 20 | words, such that each word in the factorization is never greater than its 21 | predecessor, where order is _here_ (not for the BWST algorithm) defined for 22 | words of unequal length such that if a is a prefix of b, then a < b. 23 | 24 | The concept is best illustrated by example. The Lyndon factorization of the 25 | sequence 'FOOBAR2000' (assume ASCII - digits precede letters) is the words 26 | 'FOO', 'B', 'AR', '2', '0', '0', and '0'. 'F' is a Lyndon word because it has 27 | no rotations, but it's not part of the factorization because 'FO' is also a 28 | Lyndon word: 'FO' < 'OF'. 'FOO' is a Lyndon word because 29 | 'FOO' < 'OFO' < 'OOF'. 'FOOB' is _not_ a Lyndon word; 'FOOB' is not less than 30 | its rotation 'BFOO'. 31 | 32 | Duval (1983) gives an algorithm for finding the Lyndon factorization of a 33 | sequence in linear time. Wikipedia's article on Lyndon words now thankfully 34 | has a description of the algorithm, but it does a poor job of explaining what 35 | it actually does, so I'll describe it myself. It is key to realize that all 36 | Lyndon words of length greater than 1 end with a character greater than the 37 | one with which it starts. 
Knowing this, it's easy to realize that if, while 38 | scanning the string for the Lyndon words, a character is encountered that is 39 | less than the character at the start of the current word, then the word has 40 | ended. Note, however, that this is not the only case; an illustrative example 41 | here is 'ABCA', which factorizes to 'ABC' and 'A'. When the algorithm 42 | encounters a character equal to the first, it has to start comparing to the 43 | second character. Or, more generally, while only one of the two indices into 44 | the string the algorithm holds is incremented on each step, in the case of the 45 | compared characters being equal, both indices are incremented. Since this 46 | causes the algorithm to treat repeated strings as equal, word boundaries are 47 | determined according to the difference of the indices, and the lower index is 48 | reset to the start of the word each time the comparison finds the earlier character to be the lesser. 49 | 50 | Now that I think the Lyndon factorization is satisfactorily explained, the 51 | BWST itself can be introduced. Recall that the BWST sorts the infinitely 52 | repeated rotations of all Lyndon words of the input. Let's take a word that 53 | David Scott, the person who developed BWST, actually used with respect to the 54 | algorithm, which illustrates not only this concept but also one of the 55 | problems involved in learning about the transform: 'SCOTTIFACATION'. Its 56 | Lyndon factorization produces 'S', 'COTTIF', and 'ACATION'. All rotations of 57 | these words are: 58 | 59 | S 60 | COTTIF 61 | FCOTTI 62 | IFCOTT 63 | TIFCOT 64 | TTIFCO 65 | OTTIFC 66 | ACATION 67 | NACATIO 68 | ONACATI 69 | IONACAT 70 | TIONACA 71 | ATIONAC 72 | CATIONA 73 | 74 | These rotations are not sorted according to the usual lexicographical order. 75 | In particular, strings of different lengths are compared as if both are 76 | repeated infinitely. A shorter length to compare is each word repeated as many 77 | times as the other has characters. 
Even shorter is to compute the LCM of the 78 | lengths of those words and to make up to that many comparisons. 79 | 80 | So, if we sort the rotations, we get: 81 | 82 | S ACATION 83 | COTTIF ATIONAC 84 | FCOTTI CATIONA 85 | IFCOTT COTTIF 86 | TIFCOT FCOTTI 87 | TTIFCO IFCOTT 88 | OTTIFC IONACAT 89 | ACATION NACATIO 90 | NACATIO ONACATI 91 | ONACATI OTTIFC 92 | IONACAT S 93 | TIONACA TIFCOT 94 | ATIONAC TIONACA 95 | CATIONA TTIFCO 96 | 97 | The BWST is now the last character of each rotation in the sorted output: 98 | 'NCAFITTOICSTAO'. 99 | 100 | This was perhaps a bad example; entropy was not reduced, and the special 101 | cyclic order never came into play. The basic concepts have been explained, 102 | however, and that is my aim. 103 | 104 | Now, the entire point of the BWST is that it has an inverse - you can get 105 | 'SCOTTIFACATION' back out of 'NCAFITTOICSTAO'. To do this, we need to compare 106 | the BWST output with its sorted order. Sorting gives 'AACCFIINOOSTTT'. Now we 107 | build a table thus: 108 | 109 | Index Sorted BWST Start + Count = Sum Map 110 | 0 A N 7 0 7 2 111 | 1 A C 2 0 2 12 112 | 2 C A 0 0 0 1 113 | 3 C F 4 0 4 9 114 | 4 F I 5 0 5 3 115 | 5 I T 11 0 11 4 116 | 6 I T 11 1 12 8 117 | 7 N O 8 0 8 0 118 | 8 O I 5 1 6 7 119 | 9 O C 2 1 3 13 120 | 10 S S 10 0 10 10 121 | 11 T T 11 2 13 5 122 | 12 T A 0 1 1 6 123 | 13 T O 8 1 9 11 124 | 125 | - Index is the zero-based index into the sorted sequence for each character. 126 | - Sorted is the sorted sequence. 127 | - BWST is the input into the inverse function. 128 | - Start is the first index in the sorted string at which the corresponding 129 | BWST character is found. 130 | - Count is the number of times the corresponding BWST character already has 131 | been found in the sequence. 132 | - Sum is the sum of the starts and counts. 133 | - Map is the line number whose sum equals the current index. 
134 | 135 | We start at index 0 and follow the map, outputting from the sorted sequence as 136 | we go: 137 | 138 | Index Sorted Map Output 139 | 0 A 2 A 140 | 2 C 1 AC 141 | 1 A 12 ACA 142 | 12 T 6 ACAT 143 | 6 I 8 ACATI 144 | 8 O 7 ACATIO 145 | 7 N 0 ACATION 146 | 147 | But the map at line 7 points to an index we've already visited. We've now 148 | retrieved the lexicographically least Lyndon word from the input. Next, we 149 | move to the lowest index we have not yet visited, which is 3. 150 | 151 | Index Sorted Map Output 152 | 3 C 9 C 153 | 9 O 13 CO 154 | 13 T 11 COT 155 | 11 T 5 COTT 156 | 5 I 4 COTTI 157 | 4 F 3 COTTIF 158 | 159 | We know the next greatest Lyndon word. The only unvisited index so far is 10, 160 | so S at 10 is the greatest Lyndon word in the original input. Concatenating 161 | the words in nonincreasing order yields SCOTTIFACATION. BWST inverted. 162 | 163 | Resources: 164 | - http://groups.google.com/group/comp.compression/msg/a0236d754e869212 - This 165 | is an old post, so it misses some connections which now are known, but it 166 | is the only plain-English description of the BWST and UNBWST I could find. 167 | - http://bijective.dogma.net/00yyy.pdf - This paper is a fairly intelligible 168 | description and implementation of the algorithms, but it contains several 169 | significant errors. Its examples are more optimized than the ones I gave; 170 | if you want to learn more, check it out, but not until you understand the 171 | algorithms enough to be able to recognize the errors. 172 | - http://arxiv.org/abs/0908.0239 - This paper is for those with advanced 173 | degrees. It is correct, but almost impossible to understand: "Let k ∈ ℕ. 174 | Let ⋃_{i=1}^s[v_i] = {w_1, ..., w_n} ⊆ ∑^+ be a multiset built from 175 | conjugacy classes [v_i]. Let M = (w_1, ..., w_n) satisfy context_k(w_1) ≤ 176 | ··· ≤ context_k(w_n) and let L = last(w_1) ··· last(w_n) be the sequence of 177 | the last symbols. 
Then context_k(w_i) = λ_Lπ_L(i)·λ_Lπ_L^2(i)···λ_Lπ_L^k(i) 178 | where π_L^t denotes the t-fold application of π_L and λ_Lπ_L(i) = 179 | λ_L(π_L(i))." 180 | --------------------------------------------------------------------------------