├── LICENSE
├── README.md
├── cmd
    ├── cdsplit
    │   └── cdsplit.go
    └── internal
    │   └── cdflags
    │       └── bytes.go
└── rabin
    ├── chunker.go
    ├── chunker_test.go
    ├── poly.go
    ├── poly_test.go
    ├── rabin.go
    └── rabin_test.go


/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2017 The Go Authors. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions are
 5 | met:
 6 | 
 7 |    * Redistributions of source code must retain the above copyright
 8 | notice, this list of conditions and the following disclaimer.
 9 |    * Redistributions in binary form must reproduce the above
10 | copyright notice, this list of conditions and the following disclaimer
11 | in the documentation and/or other materials provided with the
12 | distribution.
13 |    * Neither the name of Google Inc. nor the names of its
14 | contributors may be used to endorse or promote products derived from
15 | this software without specific prior written permission.
16 | 
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 
29 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # rabin [![GoDoc](https://godoc.org/github.com/aclements/go-rabin/rabin?status.svg)](https://godoc.org/github.com/aclements/go-rabin/rabin) [![Go Report Card](https://goreportcard.com/badge/github.com/aclements/go-rabin)](https://goreportcard.com/report/github.com/aclements/go-rabin)
 2 | 
 3 | The rabin package implements Rabin hashing (aka fingerprinting) and
 4 | content-defined chunking based on Rabin hashing.
 5 | 
 6 | Rabin hashing has the unusual property that it can efficiently compute
 7 | a "rolling hash" of a stream of data, where the hash value reflects
 8 | only the most recent w bytes of the stream, for some window size w.
 9 | This property makes it ideal for "content-defined chunking", which
10 | sub-divides sequential data on boundaries that are robust to
11 | insertions and deletions.
12 | 
13 | The details of Rabin fingerprinting are described in Rabin, Michael
14 | (1981). "Fingerprinting by Random Polynomials." Center for Research in
15 | Computing Technology, Harvard University. Tech Report TR-CSE-03-01.
16 | 
17 | Installation
18 | ------------
19 | 
20 | To download go-rabin, run
21 | 
22 | ```sh
23 | go get -d -u github.com/aclements/go-rabin/rabin
24 | ```
25 | 
26 | You can then import this package into your projects with
27 | 
28 | ```go
29 | import "github.com/aclements/go-rabin/rabin"
30 | ```
31 | 
32 | Demos
33 | -----
34 | 
35 | There is a small program in `cmd/cdsplit` that divides an input file
36 | into content-defined chunks. It's only intended as a demo, but can be
37 | installed using
38 | 
39 | ```sh
40 | go get github.com/aclements/go-rabin/cmd/cdsplit
41 | ```
42 | 


--------------------------------------------------------------------------------
/cmd/cdsplit/cdsplit.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2017 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | // Command cdsplit divides a file into variable-sized, content-defined
 6 | // chunks that are robust to insertions, deletions, and changes to the
 7 | // input file.
 8 | //
 9 | // It is a demo for the Go rabin package.
10 | package main
11 | 
12 | import (
13 | 	"bytes"
14 | 	"flag"
15 | 	"fmt"
16 | 	"io"
17 | 	"log"
18 | 	"os"
19 | 
20 | 	"github.com/aclements/go-rabin/cmd/internal/cdflags"
21 | 	"github.com/aclements/go-rabin/rabin"
22 | )
23 | 
24 | func main() {
25 | 	// Parse and validate flags
26 | 	log.SetPrefix(os.Args[0] + ": ")
27 | 	log.SetFlags(0)
28 | 	flag.Usage = func() {
29 | 		fmt.Fprintf(os.Stderr, "usage: %s [flags] in-file\n\n", os.Args[0])
30 | 		fmt.Fprintf(os.Stderr, "Divide in-file into variable-sized, content-defined chunks that are robust to\n")
31 | 		fmt.Fprintf(os.Stderr, "insertions, deletions, and changes to in-file.\n\n")
32 | 		flag.PrintDefaults()
33 | 		os.Exit(2)
34 | 	}
35 | 	window := cdflags.FlagBytes("window", 64, "use a rolling hash with window size `w`")
36 | 	avg := cdflags.FlagBytes("avg", 4<<10, "average chunk `size`; must be a power of 2")
37 | 	min := cdflags.FlagBytes("min", 512, "minimum chunk `size`")
38 | 	max := cdflags.FlagBytes("max", 32<<10, "maximum chunk `size`")
39 | 	outBase := flag.String("out", "", "write output to `base`.NNNNNN")
40 | 	flag.Parse()
41 | 	if flag.NArg() != 1 {
42 | 		flag.Usage()
43 | 	}
44 | 	if *min > *max {
45 | 		log.Fatal("-min must be <= -max")
46 | 	}
47 | 	if *avg&(*avg-1) != 0 {
48 | 		log.Fatal("-avg must be a power of two")
49 | 	}
50 | 	if *min < *window {
51 | 		log.Fatal("-min must be >= -window")
52 | 	}
53 | 	inFile := flag.Arg(0)
54 | 	if *outBase == "" {
55 | 		*outBase = inFile
56 | 	}
57 | 
58 | 	// Open input file
59 | 	f, err := os.Open(inFile)
60 | 	if err != nil {
61 | 		log.Fatal(err)
62 | 	}
63 | 	defer f.Close()
64 | 
65 | 	// Chunk and write output files.
66 | 	copy := new(bytes.Buffer)
67 | 	r := io.TeeReader(f, copy)
68 | 	c := rabin.NewChunker(rabin.NewTable(rabin.Poly64, int(*window)), r, int(*min), int(*avg), int(*max))
69 | 	for i := 0; ; i++ {
70 | 		clen, err := c.Next()
71 | 		if err == io.EOF {
72 | 			break
73 | 		} else if err != nil {
74 | 			log.Fatal(err)
75 | 		}
76 | 
77 | 		name := fmt.Sprintf("%s.%06d", *outBase, i)
78 | 		fOut, err := os.Create(name)
79 | 		if err != nil {
80 | 			log.Fatal(err)
81 | 		}
82 | 		_, err = io.CopyN(fOut, copy, int64(clen))
83 | 		if err == nil {
84 | 			err = fOut.Close()
85 | 		}
86 | 		if err != nil {
87 | 			log.Fatalf("error writing %s: %s", name, err)
88 | 		}
89 | 	}
90 | }
91 | 


--------------------------------------------------------------------------------
/cmd/internal/cdflags/bytes.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2017 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package cdflags
 6 | 
 7 | import (
 8 | 	"flag"
 9 | 	"fmt"
10 | )
11 | 
// Bytes is a byte size.
//
// Bytes implements flag.Value.
type Bytes int64

// si lists the unit prefixes accepted by Set and produced by String,
// in increasing order. Each prefix is 1024 times the previous one.
var si = []string{"", "k", "M", "G", "T", "P", "E", "Z", "Y"}

// String pretty-prints b using an SI prefix.
//
// The value is divided by 1024 until it is below 1024 (or the largest
// prefix is reached) and printed with %g followed by that prefix, so
// 4096 prints as "4k" and 1536 as "1.5k".
func (b Bytes) String() string {
	f := float64(b)
	for i, s := range si {
		if f < 1024 || i == len(si)-1 {
			return fmt.Sprintf("%g%s", f, s)
		}
		f /= 1024
	}
	panic("not reached")
}

// Set parses s into bytes, accepting SI prefixes.
//
// s is a number optionally followed by a unit: one of the prefixes in
// si, optionally followed by "B" or "iB" (for example "64", "4k",
// "4kB" or "4kiB"). It returns an error if s does not have this form.
func (b *Bytes) Set(s string) error {
	var num float64
	var unit string
	n, err := fmt.Sscanf(s, "%g%s", &num, &unit)
	if n == 1 {
		// The number was parsed but nothing followed it, so %s
		// reported running out of input. That is the plain <num>
		// form: treat it as a number with an empty unit rather
		// than as a parse failure.
		err = nil
	}
	if err == nil {
		for _, prefix := range si {
			if unit == prefix || unit == prefix+"B" || unit == prefix+"iB" {
				*b = Bytes(num)
				return nil
			}
			// Scale up for the next, 1024-times-larger prefix.
			num *= 1024
		}
	}
	return fmt.Errorf("expected <num> or <num>[%s]", si)
}
47 | 
48 | // FlagBytes defines a Bytes flag and returns a pointer to the
49 | // variable where the value of the flag will be stored.
50 | func FlagBytes(name string, value Bytes, usage string) *Bytes {
51 | 	flag.Var(&value, name, usage)
52 | 	return &value
53 | }
54 | 
55 | // FlagBytesVar defines a Bytes flag. The argument p points to a Bytes
56 | // variable in which to sore the value of the flag.
57 | func FlagBytesVar(p *Bytes, name string, value Bytes, usage string) {
58 | 	flag.Var(p, name, usage)
59 | 	*p = value
60 | }
61 | 


--------------------------------------------------------------------------------
/rabin/chunker.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2017 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package rabin
  6 | 
  7 | import (
  8 | 	"io"
  9 | )
 10 | 
// A Chunker performs content-defined chunking. It divides a sequence
// of bytes into chunks such that insertions and deletions in the
// sequence will only affect chunk boundaries near those
// modifications.
//
// A Chunker is created with NewChunker; each call to Next returns the
// length of the next chunk.
type Chunker struct {
	// tab is the Rabin hash table (including the window size) used
	// to find chunk boundaries. r is the source of the data.
	tab *Table
	r   io.Reader

	// buf is a buffer of data read from r. Its length is a power
	// of two, so a stream position is turned into an index by
	// masking it with len(buf)-1.
	buf []byte

	// head is the number of bytes consumed from buf.
	// tail is the number of bytes read into buf.
	// Both are running counts that are only set back to zero by
	// Reset; they are masked when used to index buf.
	head, tail uint64

	// minBytes and maxBytes are the minimum and maximum chunk
	// size.
	minBytes, maxBytes uint64

	// hashMask is the average chunk size minus one. Chunk
	// boundaries occur where hash&hashMask == hashMask.
	hashMask uint64

	// ioErr is the sticky error returned from r.Read. Once set,
	// Next, discard and more return it without reading again.
	ioErr error
}
 38 | 
// A Discarder supports discarding bytes from an input stream.
//
// If the Reader passed to NewChunker also implements Discarder, the
// Chunker calls Discard instead of reading and throwing away bytes it
// does not need to hash.
type Discarder interface {
	// Discard skips the next n bytes, returning the number of
	// bytes discarded.
	//
	// If Discard skips fewer than n bytes, it also returns an
	// error. Discard must not skip beyond the end of the file.
	Discard(n int) (discarded int, err error)
}
 48 | 
 49 | // NewChunker returns a content-defined chunker for data read from r
 50 | // using the Rabin hash defined by table. The chunks produced by this
 51 | // Chunker will be at least minBytes and at most maxBytes large and
 52 | // will, on average, be avgBytes large.
 53 | //
 54 | // The Chunker buffers data from the Reader internally, so the Reader
 55 | // need not buffer itself. The caller may seek the reader, but if it
 56 | // does, it must only seek to a known chunk boundary and it must call
 57 | // Reset on the Chunker.
 58 | //
 59 | // If the Reader additionally implements Discarder, the Chunker will
 60 | // use this to skip over bytes more efficiently.
 61 | //
 62 | // The hash function defined by table must have a non-zero window
 63 | // size.
 64 | //
 65 | // minBytes must be >= the window size. This ensures that chunk
 66 | // boundary n+1 does not depend on data from before chunk boundary n.
 67 | //
 68 | // avgBytes must be a power of two.
 69 | func NewChunker(table *Table, r io.Reader, minBytes, avgBytes, maxBytes int) *Chunker {
 70 | 	if table.window <= 0 {
 71 | 		panic("Chunker requires a windowed hash function")
 72 | 	}
 73 | 	if table.window > minBytes {
 74 | 		panic("minimum block size must be >= window size")
 75 | 	}
 76 | 	if maxBytes < minBytes {
 77 | 		panic("maximum block size must be >= minimum block size")
 78 | 	}
 79 | 	if avgBytes&(avgBytes-1) != 0 {
 80 | 		panic("average block size must be a power of two")
 81 | 	}
 82 | 
 83 | 	logBufSize := uint(10)
 84 | 	for 1<<logBufSize < table.window*2 {
 85 | 		// We use the buffer to store the window, so we need
 86 | 		// at least enough space for that and for reading more
 87 | 		// data.
 88 | 		logBufSize++
 89 | 	}
 90 | 	buf := make([]byte, 1<<logBufSize)
 91 | 
 92 | 	return &Chunker{
 93 | 		tab: table, r: r, buf: buf,
 94 | 		minBytes: uint64(minBytes), maxBytes: uint64(maxBytes),
 95 | 		hashMask: uint64(avgBytes - 1),
 96 | 	}
 97 | }
 98 | 
 99 | // Reset resets c and clears its internal buffer. The caller must
100 | // ensure that the underlying Reader is at a chunk boundary when
101 | // calling Reset.
102 | //
103 | // This is useful if the caller has knowledge of where an
104 | // already-chunked stream is being modified. It can start at the chunk
105 | // boundary before the modified point and re-chunk the stream until a
106 | // new chunk boundary lines up with a boundary in the previous version
107 | // of the stream.
108 | func (c *Chunker) Reset() {
109 | 	c.head, c.tail = 0, 0
110 | 	c.ioErr = nil
111 | }
112 | 
// Next returns the length in bytes of the next chunk. If there are no
// more chunks, it returns 0, io.EOF. If the underlying Reader returns
// some other error, it passes that error on to the caller.
//
// Next works in three phases: it skips ahead to one window before the
// minimum chunk size, primes the hash over that window, and then
// rolls the window forward one byte at a time until the hash matches
// hashMask, the chunk reaches maxBytes, or the input ends.
func (c *Chunker) Next() (int, error) {
	// Errors from the Reader are sticky, including io.EOF.
	if c.ioErr != nil {
		return 0, c.ioErr
	}

	// The buffer head is at the first byte of this chunk. The
	// reader may be ahead of this.
	start := c.head
	tab := c.tab
	bufMask := uint64(len(c.buf) - 1)

	// Skip forward until we're one window short of the minimum
	// chunk size.
	window := uint64(tab.window)
	c.head += uint64(c.minBytes - window)
	if c.head > c.tail {
		// The skip extends past what has been read, so the
		// missing bytes are discarded from the Reader.
		//
		// NOTE(review): unlike the priming loop below, this path
		// does not check c.tail != start, so when the input ends
		// exactly at the previous chunk boundary it returns
		// (0, nil) rather than (0, io.EOF).
		if err := c.discard(int(c.head - c.tail)); err != nil {
			if err == io.EOF {
				// Return this chunk.
				return int(c.tail - start), nil
			}
			return 0, err
		}
	}

	// Prime the hash on the window leading up to the minimum
	// chunk size. Until we've covered the whole window, these
	// intermediate hash values don't mean anything, so we ignore
	// chunk boundaries.
	for c.tail < c.head+window {
		if err := c.more(); err != nil {
			if err == io.EOF && c.tail != start {
				// Return this chunk.
				return int(c.tail - start), nil
			}
			return 0, err
		}
	}
	// The window may wrap around the end of the ring buffer: b1 is
	// the part up to the end of buf and b2 is the wrapped remainder
	// at the start of buf (nil if the window does not wrap).
	b1, b2 := c.buf[c.head&bufMask:], []byte(nil)
	if uint64(len(b1)) >= window {
		b1 = b1[:window]
	} else {
		b2 = c.buf[:window-uint64(len(b1))]
	}
	hash := tab.update(tab.update(0, b1), b2)

	// At this point, c.head points to the *beginning* of the
	// window, so our hashing position is actually c.head+window.

	// Process bytes and roll the window looking for a hash
	// boundary.
	buf, head, hashMask := c.buf, c.head, c.hashMask
	shift := tab.shift % 64
	// refill is the window start at which the window ends exactly
	// at the last byte read, so the next byte to push is not yet
	// in buf. limit is the window start at which the chunk is
	// maxBytes long.
	refill := c.tail - window
	limit := start + c.maxBytes - window
	for hash&hashMask != hashMask && head < limit {
		// TODO: This could figure out how many bytes it can
		// process without refilling or wrapping and process
		// those without checks.
		if head == refill {
			// more uses c.head to decide how much of buf is
			// free, so flush the local copy first.
			c.head = head
			if err := c.more(); err != nil {
				if err == io.EOF {
					// Return this chunk.
					break
				}
				return 0, err
			}
			refill = c.tail - window
		}
		// pop is the byte leaving the window and push is the
		// byte entering it.
		pop := buf[head&bufMask]
		push := buf[(head+window)&bufMask]
		head++

		// Update the hash.
		hash ^= tab.pop[pop]
		top := uint8(hash >> shift)
		hash = (hash<<8 | uint64(push)) ^ tab.push[top]
	}
	// We found a chunk boundary. Shift c.head forward so it
	// points to the chunk boundary for the next call to Next.
	head += window
	// Flush state back.
	c.head = head

	// Return the size of the chunk.
	return int(head - start), nil
}
204 | 
205 | // discard discards the next n bytes from the Reader and updates
206 | // c.tail. It may use any of c.buf as scratch space.
207 | func (c *Chunker) discard(n int) error {
208 | 	if c.ioErr != nil {
209 | 		return c.ioErr
210 | 	}
211 | 
212 | 	// If the Reader natively supports discarding, use it.
213 | 	// Unfortunately, io.Seeker isn't sufficient because it can
214 | 	// seek past the end of file and then we don't know how much
215 | 	// was actually available.
216 | 	//
217 | 	// TODO: Alternatively, we could take a Seeker and use SeekEnd
218 | 	// to figure this out (and compare against the return value
219 | 	// from the SeekCurrent to figure out how much we overshot).
220 | 	if d, ok := c.r.(Discarder); ok {
221 | 		m, err := d.Discard(n)
222 | 		c.tail += uint64(m)
223 | 		c.ioErr = err
224 | 		return err
225 | 	}
226 | 
227 | 	for n > 0 {
228 | 		scratch := c.buf
229 | 		if len(scratch) > n {
230 | 			scratch = scratch[:n]
231 | 		}
232 | 		m, err := c.r.Read(scratch)
233 | 		if m > 0 {
234 | 			n -= m
235 | 			c.tail += uint64(m)
236 | 		}
237 | 		if err != nil {
238 | 			c.ioErr = err
239 | 			return err
240 | 		}
241 | 	}
242 | 	return nil
243 | }
244 | 
245 | // more retrieves more data into c.buf. It retrieves the minimum that
246 | // is convenient, rather than attempting to fill c.buf.
247 | func (c *Chunker) more() error {
248 | 	if c.ioErr != nil {
249 | 		return c.ioErr
250 | 	}
251 | 
252 | 	var buf []byte
253 | 	bufMask := uint64(len(c.buf) - 1)
254 | 	if wtail, whead := c.tail&bufMask, c.head&bufMask; whead <= wtail {
255 | 		buf = c.buf[wtail:]
256 | 	} else {
257 | 		buf = c.buf[wtail:whead]
258 | 	}
259 | 	n, err := c.r.Read(buf)
260 | 	if n > 0 {
261 | 		c.tail += uint64(n)
262 | 		// If there was an error, return it on the next
263 | 		// invocation.
264 | 		c.ioErr = err
265 | 		return nil
266 | 	}
267 | 	if err == nil {
268 | 		// This could lead to infinite loops, so bail out
269 | 		// instead.
270 | 		err = &errReadZero{}
271 | 	}
272 | 	// Make the error sticky.
273 | 	c.ioErr = err
274 | 	return err
275 | }
276 | 
// errReadZero is the error recorded by Chunker.more when the
// underlying Reader returns zero bytes together with a nil error.
type errReadZero struct{}

// Error implements the error interface.
func (e *errReadZero) Error() string {
	return "io.Reader returned 0 bytes and no error"
}
282 | 


--------------------------------------------------------------------------------
/rabin/chunker_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2017 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package rabin
  6 | 
  7 | import (
  8 | 	"bytes"
  9 | 	"io"
 10 | 	"math"
 11 | 	"math/rand"
 12 | 	"reflect"
 13 | 	"testing"
 14 | )
 15 | 
 16 | func TestChunker(t *testing.T) {
 17 | 	const (
 18 | 		min = 128
 19 | 		avg = 1 << 10
 20 | 		max = 4 << 10
 21 | 	)
 22 | 
 23 | 	nTests := 100
 24 | 	if testing.Short() {
 25 | 		nTests = 5
 26 | 	}
 27 | 
 28 | 	totalLen, numLen := 0, 0
 29 | 	for nTest := 0; nTest < nTests; nTest++ {
 30 | 		var l1, l2 []int
 31 | 		rg := rand.New(rand.NewSource(int64(nTest)))
 32 | 		data := make([]byte, 128<<10)
 33 | 		rg.Read(data)
 34 | 		tab := NewTable(Poly64, 64)
 35 | 
 36 | 		// Chunk data using the Chunker.
 37 | 		c := NewChunker(tab, bytes.NewReader(data), min, avg, max)
 38 | 		for {
 39 | 			length, err := c.Next()
 40 | 			if err == io.EOF {
 41 | 				break
 42 | 			} else if err != nil {
 43 | 				t.Fatal("unexpected error", err)
 44 | 			}
 45 | 			l1 = append(l1, length)
 46 | 		}
 47 | 
 48 | 		// Chunk data using the obvious, slow, non-streaming
 49 | 		// implementation.
 50 | 		h := New(tab)
 51 | 		clen := 0
 52 | 		for _, b := range data {
 53 | 			h.Write([]byte{b})
 54 | 			clen++
 55 | 			if (clen >= min && h.Sum64()&(avg-1) == (avg-1)) ||
 56 | 				clen == max {
 57 | 				l2, clen = append(l2, clen), 0
 58 | 			}
 59 | 		}
 60 | 		l2 = append(l2, clen)
 61 | 
 62 | 		// Compare the results.
 63 | 		if !reflect.DeepEqual(l1, l2) {
 64 | 			t.Errorf("bad chunk lengths:\n want: %v\n got:  %v", l2, l1)
 65 | 			continue
 66 | 		}
 67 | 
 68 | 		for _, l := range l1[:len(l1)-1] {
 69 | 			totalLen += l
 70 | 			numLen++
 71 | 		}
 72 | 	}
 73 | 
 74 | 	// Check that the average block length is about right.
 75 | 	avgLen := float64(totalLen) / float64(numLen)
 76 | 	if math.Abs(avgLen-avg) > 0.1*avg {
 77 | 		t.Errorf("want average block length approx %d, got %g", avg, avgLen)
 78 | 	}
 79 | }
 80 | 
 81 | func BenchmarkChunker(b *testing.B) {
 82 | 	const (
 83 | 		min = 128
 84 | 		avg = 1 << 10
 85 | 		max = 4 << 10
 86 | 	)
 87 | 
 88 | 	rg := rand.New(rand.NewSource(42))
 89 | 	data := make([]byte, 1<<20)
 90 | 	rg.Read(data)
 91 | 	b.SetBytes(int64(len(data)))
 92 | 	tab := NewTable(Poly64, 64)
 93 | 	b.ResetTimer()
 94 | 
 95 | 	for i := 0; i < b.N; i++ {
 96 | 		c := NewChunker(tab, bytes.NewReader(data), min, avg, max)
 97 | 		for {
 98 | 			_, err := c.Next()
 99 | 			if err == io.EOF {
100 | 				break
101 | 			} else if err != nil {
102 | 				b.Fatal("unexpected error", err)
103 | 			}
104 | 		}
105 | 	}
106 | }
107 | 


--------------------------------------------------------------------------------
/rabin/poly.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2017 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package rabin
  6 | 
  7 | import (
  8 | 	"bytes"
  9 | 	"fmt"
 10 | 	"math/big"
 11 | )
 12 | 
 13 | // polyGF2 is a polynomial over GF(2).
 14 | type polyGF2 struct {
 15 | 	coeff big.Int
 16 | }
 17 | 
 18 | // newPolyGF2 constructs a polyGF2 where the i'th coefficient is the
 19 | // i'th bit of coeffs.
 20 | func newPolyGF2(coeffs uint64) *polyGF2 {
 21 | 	var p polyGF2
 22 | 	p.coeff.SetUint64(coeffs)
 23 | 	return &p
 24 | }
 25 | 
 26 | // Degree returns the degree of polynomial p. If p is 0, it returns
 27 | // -1.
 28 | func (z *polyGF2) Degree() int {
 29 | 	return z.coeff.BitLen() - 1
 30 | }
 31 | 
 32 | // MulX sets z to a(x) * x^n and returns z.
 33 | func (z *polyGF2) MulX(a *polyGF2, n int) *polyGF2 {
 34 | 	if n < 0 {
 35 | 		panic("power must be >= 0")
 36 | 	}
 37 | 	z.coeff.Lsh(&a.coeff, uint(n))
 38 | 	return z
 39 | }
 40 | 
 41 | // Add sets z to a(x) + b(x) and returns z.
 42 | func (z *polyGF2) Add(a, b *polyGF2) *polyGF2 {
 43 | 	z.coeff.Xor(&a.coeff, &b.coeff)
 44 | 	return z
 45 | }
 46 | 
 47 | // Sub sets z to a(x) - b(x) and returns z.
 48 | func (z *polyGF2) Sub(a, b *polyGF2) *polyGF2 {
 49 | 	return z.Add(a, b)
 50 | }
 51 | 
 52 | // Mul sets z to a(x) * b(x) and returns z.
 53 | func (z *polyGF2) Mul(a, b *polyGF2) *polyGF2 {
 54 | 	var out *polyGF2
 55 | 	if z != a && z != b {
 56 | 		out = z
 57 | 	} else {
 58 | 		out = &polyGF2{}
 59 | 	}
 60 | 
 61 | 	dx := a.Degree()
 62 | 	var bs big.Int
 63 | 	for i := 0; i <= dx; i++ {
 64 | 		if a.coeff.Bit(i) != 0 {
 65 | 			bs.Lsh(&b.coeff, uint(i))
 66 | 			out.coeff.Xor(&out.coeff, &bs)
 67 | 		}
 68 | 	}
 69 | 
 70 | 	if z != out {
 71 | 		z.coeff.Set(&out.coeff)
 72 | 	}
 73 | 	return z
 74 | }
 75 | 
 76 | // Mod sets z to the remainder of a(x) / b(x) and returns z.
 77 | func (z *polyGF2) Mod(a, b *polyGF2) *polyGF2 {
 78 | 	var out *polyGF2
 79 | 	if z != a && z != b {
 80 | 		out = z
 81 | 	} else {
 82 | 		out = &polyGF2{}
 83 | 	}
 84 | 
 85 | 	// Compute the remainder using synthetic division.
 86 | 	da, db := a.Degree(), b.Degree()
 87 | 	if db < 0 {
 88 | 		panic("divide by zero")
 89 | 	}
 90 | 	out.coeff.Set(&a.coeff)
 91 | 	var tmp polyGF2
 92 | 	for i := da - db; i >= 0; i-- {
 93 | 		if out.coeff.Bit(i+db) != 0 {
 94 | 			tmp.MulX(b, i)
 95 | 			out.Sub(out, &tmp)
 96 | 		}
 97 | 	}
 98 | 
 99 | 	if z != out {
100 | 		z.coeff.Set(&out.coeff)
101 | 	}
102 | 	return z
103 | }
104 | 
105 | // String returns p represented in mathematical notation.
106 | func (z *polyGF2) String() string {
107 | 	if z.coeff.Sign() == 0 {
108 | 		return "0"
109 | 	}
110 | 	var s bytes.Buffer
111 | 	for i := z.Degree(); i >= 0; i-- {
112 | 		if z.coeff.Bit(i) == 0 {
113 | 			continue
114 | 		}
115 | 		if s.Len() > 0 {
116 | 			s.WriteByte('+')
117 | 		}
118 | 		switch {
119 | 		case i == 0:
120 | 			s.WriteByte('1')
121 | 		case i == 1:
122 | 			s.WriteByte('x')
123 | 		default:
124 | 			fmt.Fprintf(&s, "x^%d", i)
125 | 		}
126 | 	}
127 | 	return s.String()
128 | }
129 | 


--------------------------------------------------------------------------------
/rabin/poly_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2017 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package rabin
 6 | 
 7 | import "testing"
 8 | 
 9 | func TestPolyMod(t *testing.T) {
10 | 	var p polyGF2
11 | 	for _, test := range [][3]uint64{
12 | 		{0x3e75, 0x3e75, 0x0},      // a(x) mod a(x) = 0
13 | 		{0x3e75 << 1, 0x3e75, 0x0}, // a(x)*x mod a(x) = 0
14 | 		{0x3e74, 0x3e75, 0x1},      // a(x) + 1 mod a(x) = 1
15 | 		{0x7337, 0xe39b, 0x7337},   // degree(a) < degree(b)
16 | 		// Random polynomials, checked with Wolfram Alpha.
17 | 		{0x3e75, 0x201b, 0x1e6e},
18 | 		{0xd10b, 0x35f7, 0x6d7},
19 | 		{0xe5a2, 0x8c83, 0x6921},
20 | 		{0x9a4a, 0xa8c7, 0x328d},
21 | 	} {
22 | 		a, b := newPolyGF2(test[0]), newPolyGF2(test[1])
23 | 		p.Mod(a, b)
24 | 		if p.coeff.Uint64() != test[2] {
25 | 			t.Errorf("%s mod %s = %s (%#x), want %s", a, b, &p, p.coeff.Uint64(), newPolyGF2(test[2]))
26 | 		}
27 | 	}
28 | }
29 | 


--------------------------------------------------------------------------------
/rabin/rabin.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2017 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | // Package rabin implements Rabin hashing (fingerprinting).
  6 | //
  7 | // A given Rabin hash function is defined by a polynomial over GF(2):
  8 | //
  9 | //   p(x) = ... + p₂x² + p₁x + p₀   where pₙ ∈ GF(2)
 10 | //
 11 | // The message to be hashed is likewise interpreted as a polynomial
 12 | // over GF(2), where the coefficients are the bits of the message in
 13 | // left-to-right most-significant-bit-first order. Given a message
 14 | // polynomial m(x) and a hashing polynomial p(x), the Rabin hash is
 15 | // simply the coefficients of m(x) mod p(x).
 16 | //
 17 | // Rabin hashing has the unusual property that it can efficiently
 18 | // compute a "rolling hash" of a stream of data, where the hash value
 19 | // reflects only the most recent w bytes of the stream, for some
 20 | // window size w. This property makes it ideal for "content-defined
 21 | // chunking", which sub-divides sequential data on boundaries that are
 22 | // robust to insertions and deletions.
 23 | //
 24 | // The details of Rabin hashing are described in Rabin, Michael
 25 | // (1981). "Fingerprinting by Random Polynomials." Center for Research
 26 | // in Computing Technology, Harvard University. Tech Report
 27 | // TR-CSE-03-01.
 28 | package rabin
 29 | 
 30 | // This implementation is based on ideas and techniques from several
 31 | // sources:
 32 | //
 33 | // - The original Rabin paper, which lays out the fundamentals, but
 34 | // doesn't discuss the byte-wise or table-driven optimizations that
 35 | // are critical in a real implementation.
 36 | //
 37 | // - Broder, Andrei Z. "Some applications of Rabin’s fingerprinting
 38 | // method." Sequences II. Springer, New York, NY, 1993. 143–152. This
 39 | // describes the math behind multi-bit updates.
 40 | //
 41 | // - librabinpoly (github.com/stevegt/librabinpoly), a C
 42 | // implementation which itself has a long and fascinating lineage
 43 | // going back to David Mazieres.
 44 | //
 45 | // - rabinfingerprint (github.com/themadcreator/rabinfingerprint), a
 46 | // well-commented Java implementation by Bill Dwyer.
 47 | 
// Poly64 is a 64-bit (degree 63) irreducible polynomial over GF(2).
//
// This is a convenient polynomial to use for computing 64-bit Rabin
// hashes. It can be passed as the polynomial argument of NewTable.
const Poly64 = 0xbfe6b8a5bf378d83
 53 | 
// Table is a set of pre-computed tables for computing Rabin
// fingerprints for a given polynomial and window size.
//
// A Table is created by NewTable, which fills in the fields as
// follows. push is indexed by the top 8 bits of the hash and holds
// the value to xor in after shifting the hash left by one byte. pop
// is indexed by the byte leaving the window and holds the value to
// xor in to remove that byte's contribution; it is only filled in
// when window > 0. degree is the degree of the polynomial, shift is
// degree-8 (the right shift that brings the top 8 bits of the hash
// down to a table index), and window is the window size in bytes.
type Table struct {
	push   [256]uint64
	pop    [256]uint64
	degree int
	shift  uint
	window int
}
 63 | 
 64 | // NewTable returns a Table for constructing Rabin hashes using the
 65 | // polynomial
 66 | //
 67 | //   p(x) = ... + p₂x² + p₁x + p₀   where pₙ ∈ GF(2)
 68 | //
 69 | // where pₙ = (polynomial >> n) & 1. This polynomial must be
 70 | // irreducible and must have degree >= 8. The number of bits in the
 71 | // resulting hash values will be the same as the number of bits in
 72 | // polynomial.
 73 | //
 74 | // This package defines Poly64 as a convenient 64-bit irreducible
 75 | // polynomial that can be used with this function.
 76 | //
 77 | // If window > 0, hashes constructed from this Table will be rolling
 78 | // hash over only the most recently written window bytes of data.
 79 | func NewTable(polynomial uint64, window int) *Table {
 80 | 	tab := &Table{}
 81 | 	p := newPolyGF2(polynomial)
 82 | 	tab.degree = p.Degree()
 83 | 	if tab.degree < 8 {
 84 | 		panic("polynomial must have degree >= 8")
 85 | 	}
 86 | 	tab.shift = uint(tab.degree - 8)
 87 | 	tab.window = window
 88 | 
 89 | 	// Pre-compute the push table.
 90 | 	var f, f2 polyGF2
 91 | 	for i := 0; i < 256; i++ {
 92 | 		// We shift out 8 bits of the hash at a time, so
 93 | 		// pre-compute the update (i(x) * xⁿ mod p(x)) for all
 94 | 		// possible top 8 bits of the hash.
 95 | 		f.coeff.SetInt64(int64(i))
 96 | 		f.MulX(&f, p.Degree())
 97 | 		f2.Mod(&f, p)
 98 | 		// To avoid explicitly masking away the bits that we
 99 | 		// want to shift out of the hash, we add in (i(x) *
100 | 		// x^n). This is exactly equal to the bits we want to
101 | 		// mask out, so when we xor with this, it will take
102 | 		// care of zeroing out these bits.
103 | 		f.Add(&f, &f2)
104 | 		tab.push[i] = f.coeff.Uint64()
105 | 	}
106 | 
107 | 	// Pre-compute the pop table.
108 | 	if window > 0 {
109 | 		for i := 0; i < 256; i++ {
110 | 			f.coeff.SetInt64(int64(i))
111 | 			// TODO: We could combine the multiply and mod
112 | 			// to make this faster. See, e.g., librabinpoly.
113 | 			f.MulX(&f, (window-1)*8)
114 | 			f2.Mod(&f, p)
115 | 			tab.pop[i] = f2.coeff.Uint64()
116 | 		}
117 | 	}
118 | 
119 | 	return tab
120 | }
121 | 
122 | // update updates the hash as if p had been appended to the currently
123 | // hashed message.
124 | func (tab *Table) update(hash uint64, p []byte) uint64 {
125 | 	// Given the current message
126 | 	//
127 | 	//   m(x) = ... + m₂x² + m₁x + m₀
128 | 	//
129 | 	// and hash
130 | 	//
131 | 	//   h(x) = m(x) mod p(x)
132 | 	//
133 | 	// we can extend the message by one bit b:
134 | 	//
135 | 	//   m'(x) = ... + m₂x³ + m₁x² + m₀x + b = m(x)*x + b
136 | 	//
137 | 	// This yields the hash update:
138 | 	//
139 | 	//   h'(x) = m'(x) mod p(x)
140 | 	//         = (m(x)*x + b) mod p(x)
141 | 	//         = ((m(x) mod p(x)) * x + b) mod p(x)
142 | 	//         = (h(x)*x + b) mod p(x)
143 | 	//         = hₙ₋₂xⁿ⁻¹ + ... + h₀x + b + hₙ₋₁(pₙ₋₁xⁿ⁻¹ + ... + p₀)
144 | 	//
145 | 	// where n is the degree of p(x).
146 | 	//
147 | 	// In general, we can extend the hash with any i bit message
148 | 	// m2 using the fact that
149 | 	//
150 | 	//   r(concat(m1, m2)) = r(r(m1) * r(xⁱ)) + r(m2)
151 | 	//
152 | 	// where r(M) = M(x) mod p(x). Below, we update it 8 bits at a
153 | 	// time and, since we require p(x) to have degree >= 8, this
154 | 	// simplifies to
155 | 	//
156 | 	//   r(concat(m1, m2)) = r(r(m1) * x⁸) + m2
157 | 	//
158 | 	// r(m1) is the current hash value. Multiplication by x⁸ is a
159 | 	// shift. We can compute r(r(m1) * x⁸) using the lookup table
160 | 	// we constructed in New.
161 | 	shift := tab.shift % 64 // shift%64 eliminates checks below
162 | 	for _, b := range p {
163 | 		top := uint8(hash >> shift)
164 | 		hash = (hash<<8 | uint64(b)) ^ tab.push[top]
165 | 	}
166 | 	return hash
167 | }
168 | 
// Hash computes Rabin hashes (often called fingerprints).
//
// A Hash is created by New from a Table. When the table has a
// positive window size the hash is a rolling hash over the most
// recently written window bytes; otherwise it covers every byte
// written since the last Reset.
//
// Hash implements hash.Hash64.
type Hash struct {
	// tab is the table this hash was created from; it supplies
	// the lookup tables, shift and degree used by the methods.
	tab  *Table
	// hash is the current hash value, as returned by Sum64.
	hash uint64
	// msg is a circular buffer holding the bytes currently in
	// the window. It is nil when the hash is not windowed.
	msg  []byte
	// pos is the index in msg of the next byte to be replaced,
	// i.e. the oldest byte in the window. It is only used when
	// msg is non-nil.
	pos  int
}
178 | 
179 | // New returns a new Rabin hash using the polynomial and window size
180 | // represented by table.
181 | func New(table *Table) *Hash {
182 | 	hash := &Hash{tab: table}
183 | 	if table.window > 0 {
184 | 		// Leading zeros don't affect the hash, so we can
185 | 		// start with a full window of zeros and keep the
186 | 		// later logic simpler.
187 | 		hash.msg = make([]byte, table.window)
188 | 	}
189 | 	return hash
190 | }
191 | 
// Sum64 returns the current hash value of h without modifying h.
//
// For a non-windowed hash this is the hash of all bytes written
// since the last Reset. For a windowed hash, Write expires older
// bytes, so this is the hash of only the most recent window bytes.
func (h *Hash) Sum64() uint64 {
	return h.hash
}
196 | 
197 | // Reset resets h to its initial state.
198 | func (h *Hash) Reset() {
199 | 	h.hash = 0
200 | 	if h.msg != nil {
201 | 		for i := range h.msg {
202 | 			h.msg[i] = 0
203 | 		}
204 | 		h.pos = 0
205 | 	}
206 | }
207 | 
208 | // Write adds p to the running hash h.
209 | //
210 | // If h is windowed, this may also expire previously written bytes
211 | // from the running hash so that h represents the hash of only the
212 | // most recently written window bytes.
213 | //
214 | // It always returns len(p), nil.
215 | func (h *Hash) Write(p []byte) (n int, err error) {
216 | 	n = len(p)
217 | 
218 | 	if h.msg == nil {
219 | 		h.hash = h.tab.update(h.hash, p)
220 | 		return
221 | 	}
222 | 
223 | 	window := len(h.msg)
224 | 	if len(p) >= window {
225 | 		// p covers the entire window. Discard our entire
226 | 		// state and just hash the last window bytes of p.
227 | 		p = p[len(p)-window:]
228 | 		copy(h.msg, p)
229 | 		h.pos, h.hash = 0, 0
230 | 		h.hash = h.tab.update(h.hash, p)
231 | 		return
232 | 	}
233 | 	// Add and remove bytes as we overwrite them in the window.
234 | 	//
235 | 	// TODO: If we made h.win a little bigger, we could copy and
236 | 	// process bigger chunks at a time. Not sure it would actually
237 | 	// help, but we could abstract this more nicely.
238 | 	tab := h.tab
239 | 	pos, hash, shift := h.pos, h.hash, tab.shift%64
240 | 	for _, b := range p {
241 | 		pop := h.msg[pos]
242 | 		h.msg[pos] = b
243 | 		if pos++; pos == window {
244 | 			pos = 0
245 | 		}
246 | 
247 | 		hash ^= tab.pop[pop]
248 | 		top := uint8(hash >> shift)
249 | 		hash = (hash<<8 | uint64(b)) ^ tab.push[top]
250 | 	}
251 | 	h.pos, h.hash = int(pos), hash
252 | 	return
253 | }
254 | 
255 | // Size returns the number of bytes Sum will append. This is the
256 | // minimum number of bytes necessary to represent the hash.
257 | func (h *Hash) Size() int {
258 | 	bits := h.tab.degree + 1
259 | 	return (bits + 7) / 8
260 | }
261 | 
262 | // Sum appends the least-significant byte first representation of the
263 | // current hash to b and returns the resulting slice.
264 | func (h *Hash) Sum(b []byte) []byte {
265 | 	var hbytes [8]byte
266 | 	for i := range hbytes {
267 | 		hbytes[i] = byte(h.hash >> uint(i*8))
268 | 	}
269 | 	return append(b, hbytes[:h.Size()]...)
270 | }
271 | 
272 | // BlockSize returns the window size if a window is configured, and
273 | // otherwise returns 1.
274 | //
275 | // This satisfies the hash.Hash interface and indicates that Write is
276 | // most efficient if writes are a multiple of the returned size.
277 | func (h *Hash) BlockSize() int {
278 | 	if h.msg != nil {
279 | 		return len(h.msg)
280 | 	}
281 | 	return 1
282 | }
283 | 


--------------------------------------------------------------------------------
/rabin/rabin_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2017 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package rabin
  6 | 
  7 | import (
  8 | 	"fmt"
  9 | 	"math/rand"
 10 | 	"testing"
 11 | )
 12 | 
// testPolys lists the polynomials the tests in this file run
// against: one at each end of the supported degree range.
var testPolys = []uint64{
	0x11d,              // Degree 8 (smallest supported)
	0xbfe6b8a5bf378d83, // Degree 63 (largest supported)
}
 17 | 
 18 | func TestRabin(t *testing.T) {
 19 | 	nTests := 100
 20 | 	if testing.Short() {
 21 | 		nTests = 5
 22 | 	}
 23 | 
 24 | 	for _, poly := range testPolys {
 25 | 		tab := NewTable(poly, 0)
 26 | 		t.Run("poly="+newPolyGF2(poly).String(), func(t *testing.T) {
 27 | 			rg := rand.New(rand.NewSource(42))
 28 | 			data := make([]byte, 64)
 29 | 			for i := 0; i < nTests; i++ {
 30 | 				rg.Read(data)
 31 | 				h1 := rabinSlow(poly, data)
 32 | 				h2 := New(tab)
 33 | 				h2.Write(data)
 34 | 				if h1 != h2.Sum64() {
 35 | 					t.Errorf("want hash %#x (%s), got %#x (%s) for %x", h1, newPolyGF2(h1), h2.Sum64(), newPolyGF2(h2.Sum64()), data)
 36 | 				}
 37 | 			}
 38 | 		})
 39 | 	}
 40 | }
 41 | 
 42 | // rabinSlow is a slow, very literal implementation of Rabin
 43 | // fingerprinting that doesn't support streaming.
 44 | func rabinSlow(poly uint64, data []byte) uint64 {
 45 | 	var mpoly polyGF2
 46 | 	mpoly.coeff.SetBytes(data)
 47 | 	return mpoly.Mod(&mpoly, newPolyGF2(poly)).coeff.Uint64()
 48 | }
 49 | 
 50 | func TestWindow(t *testing.T) {
 51 | 	rg := rand.New(rand.NewSource(42))
 52 | 	data := make([]byte, 1024)
 53 | 	rg.Read(data)
 54 | 
 55 | 	for _, poly := range testPolys {
 56 | 		hNoWin := New(NewTable(poly, 0))
 57 | 		for _, window := range []int{1, 4, 64, 65} {
 58 | 			hWin := New(NewTable(poly, window))
 59 | 			t.Run(fmt.Sprintf("poly=%s/window=%d", newPolyGF2(poly), window), func(t *testing.T) {
 60 | 				for _, blockSize := range []int{1, 2, 5, 100} {
 61 | 					hWin.Reset()
 62 | 					for i := 0; i < len(data); i += blockSize {
 63 | 						block := data[i:]
 64 | 						if len(block) > blockSize {
 65 | 							block = block[:blockSize]
 66 | 						}
 67 | 
 68 | 						hWin.Write(block)
 69 | 
 70 | 						dataWin := data[:i+len(block)]
 71 | 						if len(dataWin) > window {
 72 | 							dataWin = dataWin[len(dataWin)-window:]
 73 | 						}
 74 | 						hNoWin.Reset()
 75 | 						hNoWin.Write(dataWin)
 76 | 
 77 | 						// Check the hash.
 78 | 						if hNoWin.Sum64() != hWin.Sum64() {
 79 | 							t.Errorf("want hash %#x, got %#x at byte %d with %d byte blocks", hNoWin.Sum64(), hWin.Sum64(), i, blockSize)
 80 | 						}
 81 | 					}
 82 | 				}
 83 | 			})
 84 | 		}
 85 | 	}
 86 | }
 87 | 
 88 | func BenchmarkRabin(b *testing.B) {
 89 | 	rg := rand.New(rand.NewSource(42))
 90 | 	data := make([]byte, 1<<20)
 91 | 	rg.Read(data)
 92 | 	b.SetBytes(int64(len(data)))
 93 | 	h := New(NewTable(Poly64, 0))
 94 | 	b.ResetTimer()
 95 | 
 96 | 	for i := 0; i < b.N; i++ {
 97 | 		h.Reset()
 98 | 		h.Write(data)
 99 | 	}
100 | }
101 | 
102 | func BenchmarkRabinWindowed64(b *testing.B) {
103 | 	const window = 64
104 | 	rg := rand.New(rand.NewSource(42))
105 | 	data := make([]byte, 1<<20)
106 | 	rg.Read(data)
107 | 	b.SetBytes(int64(len(data)))
108 | 	h := New(NewTable(Poly64, window))
109 | 	b.ResetTimer()
110 | 
111 | 	for i := 0; i < b.N; i++ {
112 | 		h.Reset()
113 | 		// Feed it smaller blocks or it will just reset and
114 | 		// hash the end.
115 | 		for j := 0; j < len(data); j += window / 2 {
116 | 			h.Write(data[j : j+window/2])
117 | 		}
118 | 	}
119 | }
120 | 


--------------------------------------------------------------------------------