├── LICENSE ├── README.md ├── cmd ├── cdsplit │ └── cdsplit.go └── internal │ └── cdflags │ └── bytes.go └── rabin ├── chunker.go ├── chunker_test.go ├── poly.go ├── poly_test.go ├── rabin.go └── rabin_test.go /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 The Go Authors. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of Google Inc. nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rabin [![GoDoc](https://godoc.org/github.com/aclements/go-rabin/rabin?status.svg)](https://godoc.org/github.com/aclements/go-rabin/rabin) [![Go Report Card](https://goreportcard.com/badge/github.com/aclements/go-rabin)](https://goreportcard.com/report/github.com/aclements/go-rabin) 2 | 3 | The rabin package implements Rabin hashing (aka fingerprinting) and 4 | content-defined chunking based on Rabin hashing. 5 | 6 | Rabin hashing has the unusual property that it can efficiently compute 7 | a "rolling hash" of a stream of data, where the hash value reflects 8 | only the most recent w bytes of the stream, for some window size w. 9 | This property makes it ideal for "content-defined chunking", which 10 | sub-divides sequential data on boundaries that are robust to 11 | insertions and deletions. 12 | 13 | The details of Rabin fingerprinting are described in Rabin, Michael 14 | (1981). "Fingerprinting by Random Polynomials." Center for Research in 15 | Computing Technology, Harvard University. Tech Report TR-CSE-03-01. 16 | 17 | Installation 18 | ------------ 19 | 20 | To download go-rabin, run 21 | 22 | ```sh 23 | go get -d -u github.com/aclements/go-rabin/rabin 24 | ``` 25 | 26 | You can then import this package into your projects with 27 | 28 | ```go 29 | import "github.com/aclements/go-rabin/rabin" 30 | ``` 31 | 32 | Demos 33 | ----- 34 | 35 | There is a small program in `cmd/cdsplit` that divides an input file 36 | into content-defined chunks. 
It's only intended as a demo, but can be 37 | installed using 38 | 39 | ```sh 40 | go get github.com/aclements/go-rabin/cmd/cdsplit 41 | ``` 42 | -------------------------------------------------------------------------------- /cmd/cdsplit/cdsplit.go: -------------------------------------------------------------------------------- 1 | // Copyright 2017 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Command cdsplit divides a file into variable-sized, content-defined 6 | // chunks that are robust to insertions, deletions, and changes to the 7 | // input file. 8 | // 9 | // It is a demo for the Go rabin package. 10 | package main 11 | 12 | import ( 13 | "bytes" 14 | "flag" 15 | "fmt" 16 | "io" 17 | "log" 18 | "os" 19 | 20 | "github.com/aclements/go-rabin/cmd/internal/cdflags" 21 | "github.com/aclements/go-rabin/rabin" 22 | ) 23 | 24 | func main() { 25 | // Parse and validate flags 26 | log.SetPrefix(os.Args[0] + ": ") 27 | log.SetFlags(0) 28 | flag.Usage = func() { 29 | fmt.Fprintf(os.Stderr, "usage: %s [flags] in-file\n\n", os.Args[0]) 30 | fmt.Fprintf(os.Stderr, "Divide in-file into variable-sized, content-defined chunks that are robust to\n") 31 | fmt.Fprintf(os.Stderr, "insertions, deletions, and changes to in-file.\n\n") 32 | flag.PrintDefaults() 33 | os.Exit(2) 34 | } 35 | window := cdflags.FlagBytes("window", 64, "use a rolling hash with window size `w`") 36 | avg := cdflags.FlagBytes("avg", 4<<10, "average chunk `size`; must be a power of 2") 37 | min := cdflags.FlagBytes("min", 512, "minimum chunk `size`") 38 | max := cdflags.FlagBytes("max", 32<<10, "maximum chunk `size`") 39 | outBase := flag.String("out", "", "write output to `base`.NNNNNN") 40 | flag.Parse() 41 | if flag.NArg() != 1 { 42 | flag.Usage() 43 | } 44 | if *min > *max { 45 | log.Fatal("-min must be <= -max") 46 | } 47 | if *avg&(*avg-1) != 0 { 48 | log.Fatal("-avg must be a power of 
// Bytes is a byte size.
//
// Bytes implements flag.Value.
type Bytes int64

// si lists the unit prefixes understood by Bytes in increasing order
// of magnitude. Each entry is 1024 times larger than the one before
// it; the empty first entry stands for plain bytes.
var si = []string{"", "k", "M", "G", "T", "P", "E", "Z", "Y"}

// String pretty-prints b using an SI prefix.
//
// The value is repeatedly divided by 1024 until it drops below 1024
// (or the largest known prefix is reached) and is then printed with
// %g followed by the matching prefix, e.g. Bytes(4096) prints as "4k".
func (b Bytes) String() string {
	f := float64(b)
	for i, s := range si {
		if f < 1024 || i == len(si)-1 {
			return fmt.Sprintf("%g%s", f, s)
		}
		f /= 1024
	}
	panic("not reached")
}

// Set parses s into b, accepting SI prefixes.
//
// s must be a number optionally followed by one of the prefixes in si
// and an optional "B" or "iB" suffix, for example "512", "4k", "4kB"
// or "1.5MiB". Every prefix is interpreted as a power of 1024. On
// error b is left unchanged.
func (b *Bytes) Set(s string) error {
	var num float64
	var unit string
	n, err := fmt.Sscanf(s, "%g%s", &num, &unit)
	if n == 1 {
		// The number scanned but %s found nothing after it. That is
		// a plain byte count, not an error.
		unit, err = "", nil
	}
	if err == nil {
		for _, prefix := range si {
			if unit == prefix || unit == prefix+"B" || unit == prefix+"iB" {
				*b = Bytes(num)
				return nil
			}
			num *= 1024
		}
	}
	return fmt.Errorf("invalid byte size %q: expected <number>[<prefix>][B|iB] with prefix one of %q", s, si[1:])
}

// FlagBytes defines a Bytes flag and returns a pointer to the
// variable where the value of the flag will be stored.
func FlagBytes(name string, value Bytes, usage string) *Bytes {
	flag.Var(&value, name, usage)
	return &value
}

// FlagBytesVar defines a Bytes flag. The argument p points to a Bytes
// variable in which to store the value of the flag.
func FlagBytesVar(p *Bytes, name string, value Bytes, usage string) {
	// Assign the default before registering: flag.Var snapshots
	// p.String() as the flag's DefValue, which is what -help prints.
	*p = value
	flag.Var(p, name, usage)
}
// A Discarder supports discarding bytes from an input stream.
//
// The Chunker's discard method type-asserts its Reader against this
// interface and, when the assertion succeeds, calls Discard instead of
// reading the skipped bytes into its buffer. The method set has the
// same signature as (*bufio.Reader).Discard.
type Discarder interface {
	// Discard skips the next n bytes, returning the number of
	// bytes discarded.
	//
	// If Discard skips fewer than n bytes, it also returns an
	// error. Discard must not skip beyond the end of the file.
	Discard(n int) (discarded int, err error)
}
69 | func NewChunker(table *Table, r io.Reader, minBytes, avgBytes, maxBytes int) *Chunker { 70 | if table.window <= 0 { 71 | panic("Chunker requires a windowed hash function") 72 | } 73 | if table.window > minBytes { 74 | panic("minimum block size must be >= window size") 75 | } 76 | if maxBytes < minBytes { 77 | panic("maximum block size must be >= minimum block size") 78 | } 79 | if avgBytes&(avgBytes-1) != 0 { 80 | panic("average block size must be a power of two") 81 | } 82 | 83 | logBufSize := uint(10) 84 | for 1< c.tail { 132 | if err := c.discard(int(c.head - c.tail)); err != nil { 133 | if err == io.EOF { 134 | // Return this chunk. 135 | return int(c.tail - start), nil 136 | } 137 | return 0, err 138 | } 139 | } 140 | 141 | // Prime the hash on the window leading up to the minimum 142 | // chunk size. Until we've covered the whole window, these 143 | // intermediate hash values don't mean anything, so we ignore 144 | // chunk boundaries. 145 | for c.tail < c.head+window { 146 | if err := c.more(); err != nil { 147 | if err == io.EOF && c.tail != start { 148 | // Return this chunk. 149 | return int(c.tail - start), nil 150 | } 151 | return 0, err 152 | } 153 | } 154 | b1, b2 := c.buf[c.head&bufMask:], []byte(nil) 155 | if uint64(len(b1)) >= window { 156 | b1 = b1[:window] 157 | } else { 158 | b2 = c.buf[:window-uint64(len(b1))] 159 | } 160 | hash := tab.update(tab.update(0, b1), b2) 161 | 162 | // At this point, c.head points to the *beginning* of the 163 | // window, so our hashing position is actually c.head+window. 164 | 165 | // Process bytes and roll the window looking for a hash 166 | // boundary. 
167 | buf, head, hashMask := c.buf, c.head, c.hashMask 168 | shift := tab.shift % 64 169 | refill := c.tail - window 170 | limit := start + c.maxBytes - window 171 | for hash&hashMask != hashMask && head < limit { 172 | // TODO: This could figure out how many bytes it can 173 | // process without refilling or wrapping and process 174 | // those without checks. 175 | if head == refill { 176 | c.head = head 177 | if err := c.more(); err != nil { 178 | if err == io.EOF { 179 | // Return this chunk. 180 | break 181 | } 182 | return 0, err 183 | } 184 | refill = c.tail - window 185 | } 186 | pop := buf[head&bufMask] 187 | push := buf[(head+window)&bufMask] 188 | head++ 189 | 190 | // Update the hash. 191 | hash ^= tab.pop[pop] 192 | top := uint8(hash >> shift) 193 | hash = (hash<<8 | uint64(push)) ^ tab.push[top] 194 | } 195 | // We found a chunk boundary. Shift c.head forward so it 196 | // points to the chunk boundary for the next call to Next. 197 | head += window 198 | // Flush state back. 199 | c.head = head 200 | 201 | // Return the size of the chunk. 202 | return int(head - start), nil 203 | } 204 | 205 | // discard discards the next n bytes from the Reader and updates 206 | // c.tail. It may use any of c.buf as scratch space. 207 | func (c *Chunker) discard(n int) error { 208 | if c.ioErr != nil { 209 | return c.ioErr 210 | } 211 | 212 | // If the Reader natively supports discarding, use it. 213 | // Unfortunately, io.Seeker isn't sufficient because it can 214 | // seek past the end of file and then we don't know how much 215 | // was actually available. 216 | // 217 | // TODO: Alternatively, we could take a Seeker and use SeekEnd 218 | // to figure this out (and compare against the return value 219 | // from the SeekCurrent to figure out how much we overshot). 
220 | if d, ok := c.r.(Discarder); ok { 221 | m, err := d.Discard(n) 222 | c.tail += uint64(m) 223 | c.ioErr = err 224 | return err 225 | } 226 | 227 | for n > 0 { 228 | scratch := c.buf 229 | if len(scratch) > n { 230 | scratch = scratch[:n] 231 | } 232 | m, err := c.r.Read(scratch) 233 | if m > 0 { 234 | n -= m 235 | c.tail += uint64(m) 236 | } 237 | if err != nil { 238 | c.ioErr = err 239 | return err 240 | } 241 | } 242 | return nil 243 | } 244 | 245 | // more retrieves more data into c.buf. It retrieves the minimum that 246 | // is convenient, rather than attempting to fill c.buf. 247 | func (c *Chunker) more() error { 248 | if c.ioErr != nil { 249 | return c.ioErr 250 | } 251 | 252 | var buf []byte 253 | bufMask := uint64(len(c.buf) - 1) 254 | if wtail, whead := c.tail&bufMask, c.head&bufMask; whead <= wtail { 255 | buf = c.buf[wtail:] 256 | } else { 257 | buf = c.buf[wtail:whead] 258 | } 259 | n, err := c.r.Read(buf) 260 | if n > 0 { 261 | c.tail += uint64(n) 262 | // If there was an error, return it on the next 263 | // invocation. 264 | c.ioErr = err 265 | return nil 266 | } 267 | if err == nil { 268 | // This could lead to infinite loops, so bail out 269 | // instead. 270 | err = &errReadZero{} 271 | } 272 | // Make the error sticky. 273 | c.ioErr = err 274 | return err 275 | } 276 | 277 | type errReadZero struct{} 278 | 279 | func (e *errReadZero) Error() string { 280 | return "io.Reader returned 0 bytes and no error" 281 | } 282 | -------------------------------------------------------------------------------- /rabin/chunker_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2017 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package rabin 6 | 7 | import ( 8 | "bytes" 9 | "io" 10 | "math" 11 | "math/rand" 12 | "reflect" 13 | "testing" 14 | ) 15 | 16 | func TestChunker(t *testing.T) { 17 | const ( 18 | min = 128 19 | avg = 1 << 10 20 | max = 4 << 10 21 | ) 22 | 23 | nTests := 100 24 | if testing.Short() { 25 | nTests = 5 26 | } 27 | 28 | totalLen, numLen := 0, 0 29 | for nTest := 0; nTest < nTests; nTest++ { 30 | var l1, l2 []int 31 | rg := rand.New(rand.NewSource(int64(nTest))) 32 | data := make([]byte, 128<<10) 33 | rg.Read(data) 34 | tab := NewTable(Poly64, 64) 35 | 36 | // Chunk data using the Chunker. 37 | c := NewChunker(tab, bytes.NewReader(data), min, avg, max) 38 | for { 39 | length, err := c.Next() 40 | if err == io.EOF { 41 | break 42 | } else if err != nil { 43 | t.Fatal("unexpected error", err) 44 | } 45 | l1 = append(l1, length) 46 | } 47 | 48 | // Chunk data using the obvious, slow, non-streaming 49 | // implementation. 50 | h := New(tab) 51 | clen := 0 52 | for _, b := range data { 53 | h.Write([]byte{b}) 54 | clen++ 55 | if (clen >= min && h.Sum64()&(avg-1) == (avg-1)) || 56 | clen == max { 57 | l2, clen = append(l2, clen), 0 58 | } 59 | } 60 | l2 = append(l2, clen) 61 | 62 | // Compare the results. 63 | if !reflect.DeepEqual(l1, l2) { 64 | t.Errorf("bad chunk lengths:\n want: %v\n got: %v", l2, l1) 65 | continue 66 | } 67 | 68 | for _, l := range l1[:len(l1)-1] { 69 | totalLen += l 70 | numLen++ 71 | } 72 | } 73 | 74 | // Check that the average block length is about right. 
// polyGF2 is a polynomial over GF(2).
//
// Bit i of coeff is the coefficient of x^i, so addition is XOR and
// multiplication by x^n is a left shift by n.
type polyGF2 struct {
	coeff big.Int
}

// newPolyGF2 constructs a polyGF2 where the i'th coefficient is the
// i'th bit of coeffs.
func newPolyGF2(coeffs uint64) *polyGF2 {
	var p polyGF2
	p.coeff.SetUint64(coeffs)
	return &p
}

// Degree returns the degree of polynomial z. If z is 0, it returns
// -1.
func (z *polyGF2) Degree() int {
	return z.coeff.BitLen() - 1
}

// MulX sets z to a(x) * x^n and returns z. z may alias a.
//
// It panics if n is negative.
func (z *polyGF2) MulX(a *polyGF2, n int) *polyGF2 {
	if n < 0 {
		panic("power must be >= 0")
	}
	z.coeff.Lsh(&a.coeff, uint(n))
	return z
}

// Add sets z to a(x) + b(x) and returns z. z may alias a or b.
func (z *polyGF2) Add(a, b *polyGF2) *polyGF2 {
	z.coeff.Xor(&a.coeff, &b.coeff)
	return z
}

// Sub sets z to a(x) - b(x) and returns z. Over GF(2) subtraction is
// the same operation as addition.
func (z *polyGF2) Sub(a, b *polyGF2) *polyGF2 {
	return z.Add(a, b)
}

// Mul sets z to a(x) * b(x) and returns z. z may alias a or b, and
// any previous value of z is discarded.
func (z *polyGF2) Mul(a, b *polyGF2) *polyGF2 {
	// Accumulate into a fresh value. z may be one of the operands,
	// and it may hold an unrelated value from an earlier use, so it
	// must not take part in the accumulation.
	var product, shifted big.Int
	for i, deg := 0, a.Degree(); i <= deg; i++ {
		if a.coeff.Bit(i) != 0 {
			// Term x^i of a contributes b(x) * x^i.
			shifted.Lsh(&b.coeff, uint(i))
			product.Xor(&product, &shifted)
		}
	}
	z.coeff.Set(&product)
	return z
}

// Mod sets z to the remainder of a(x) / b(x) and returns z. z may
// alias a or b.
//
// It panics if b is the zero polynomial.
func (z *polyGF2) Mod(a, b *polyGF2) *polyGF2 {
	da, db := a.Degree(), b.Degree()
	if db < 0 {
		panic("divide by zero")
	}

	// Long division: walk from the highest term of a down to the
	// degree of b, cancelling each set term with a shifted copy of b.
	// The work is done in a local so that aliasing of z is harmless.
	var rem, shifted big.Int
	rem.Set(&a.coeff)
	for i := da - db; i >= 0; i-- {
		if rem.Bit(i+db) != 0 {
			shifted.Lsh(&b.coeff, uint(i))
			rem.Xor(&rem, &shifted)
		}
	}
	z.coeff.Set(&rem)
	return z
}

// String returns z represented in mathematical notation, highest
// power first, for example "x^3+x+1". The zero polynomial is "0".
func (z *polyGF2) String() string {
	if z.coeff.Sign() == 0 {
		return "0"
	}
	var s bytes.Buffer
	for i := z.Degree(); i >= 0; i-- {
		if z.coeff.Bit(i) == 0 {
			continue
		}
		if s.Len() > 0 {
			s.WriteByte('+')
		}
		switch i {
		case 0:
			s.WriteByte('1')
		case 1:
			s.WriteByte('x')
		default:
			fmt.Fprintf(&s, "x^%d", i)
		}
	}
	return s.String()
}
4 | 5 | // Package rabin implements Rabin hashing (fingerprinting). 6 | // 7 | // A given Rabin hash function is defined by a polynomial over GF(2): 8 | // 9 | // p(x) = ... + p₂x² + p₁x + p₀ where pₙ ∈ GF(2) 10 | // 11 | // The message to be hashed is likewise interpreted as a polynomial 12 | // over GF(2), where the coefficients are the bits of the message in 13 | // left-to-right most-significant-bit-first order. Given a message 14 | // polynomial m(x) and a hashing polynomial p(x), the Rabin hash is 15 | // simply the coefficients of m(x) mod p(x). 16 | // 17 | // Rabin hashing has the unusual property that it can efficiently 18 | // compute a "rolling hash" of a stream of data, where the hash value 19 | // reflects only the most recent w bytes of the stream, for some 20 | // window size w. This property makes it ideal for "content-defined 21 | // chunking", which sub-divides sequential data on boundaries that are 22 | // robust to insertions and deletions. 23 | // 24 | // The details of Rabin hashing are described in Rabin, Michael 25 | // (1981). "Fingerprinting by Random Polynomials." Center for Research 26 | // in Computing Technology, Harvard University. Tech Report 27 | // TR-CSE-03-01. 28 | package rabin 29 | 30 | // This implementation is based on ideas and techniques from several 31 | // sources: 32 | // 33 | // - The original Rabin paper, which lays out the fundamentals, but 34 | // doesn't discuss the byte-wise or table-driven optimizations that 35 | // are critical in a real implementation. 36 | // 37 | // - Broder, Andrei Z. "Some applications of Rabin’s fingerprinting 38 | // method." Sequences II. Springer, New York, NY, 1993. 143–152. This 39 | // describes the math behind multi-bit updates. 40 | // 41 | // - librabinpoly (github.com/stevegt/librabinpoly), a C 42 | // implementation which itself has a long and fascinating lineage 43 | // going back to David Mazieres. 
44 | // 45 | // - rabinfingerprint (github.com/themadcreator/rabinfingerprint), a 46 | // well-commented Java implementation by Bill Dwyer. 47 | 48 | // Poly64 is an 64-bit (degree 63) irreducible polynomial over GF(2). 49 | // 50 | // This is a convenient polynomial to use for computing 64-bit Rabin 51 | // hashes. 52 | const Poly64 = 0xbfe6b8a5bf378d83 53 | 54 | // Table is a set of pre-computed tables for computing Rabin 55 | // fingerprints for a given polynomial and window size. 56 | type Table struct { 57 | push [256]uint64 58 | pop [256]uint64 59 | degree int 60 | shift uint 61 | window int 62 | } 63 | 64 | // NewTable returns a Table for constructing Rabin hashes using the 65 | // polynomial 66 | // 67 | // p(x) = ... + p₂x² + p₁x + p₀ where pₙ ∈ GF(2) 68 | // 69 | // where pₙ = (polynomial >> n) & 1. This polynomial must be 70 | // irreducible and must have degree >= 8. The number of bits in the 71 | // resulting hash values will be the same as the number of bits in 72 | // polynomial. 73 | // 74 | // This package defines Poly64 as a convenient 64-bit irreducible 75 | // polynomial that can be used with this function. 76 | // 77 | // If window > 0, hashes constructed from this Table will be rolling 78 | // hash over only the most recently written window bytes of data. 79 | func NewTable(polynomial uint64, window int) *Table { 80 | tab := &Table{} 81 | p := newPolyGF2(polynomial) 82 | tab.degree = p.Degree() 83 | if tab.degree < 8 { 84 | panic("polynomial must have degree >= 8") 85 | } 86 | tab.shift = uint(tab.degree - 8) 87 | tab.window = window 88 | 89 | // Pre-compute the push table. 90 | var f, f2 polyGF2 91 | for i := 0; i < 256; i++ { 92 | // We shift out 8 bits of the hash at a time, so 93 | // pre-compute the update (i(x) * xⁿ mod p(x)) for all 94 | // possible top 8 bits of the hash. 
95 | f.coeff.SetInt64(int64(i)) 96 | f.MulX(&f, p.Degree()) 97 | f2.Mod(&f, p) 98 | // To avoid explicitly masking away the bits that we 99 | // want to shift out of the hash, we add in (i(x) * 100 | // x^n). This is exactly equal to the bits we want to 101 | // mask out, so when we xor with this, it will take 102 | // care of zeroing out these bits. 103 | f.Add(&f, &f2) 104 | tab.push[i] = f.coeff.Uint64() 105 | } 106 | 107 | // Pre-compute the pop table. 108 | if window > 0 { 109 | for i := 0; i < 256; i++ { 110 | f.coeff.SetInt64(int64(i)) 111 | // TODO: We could combine the multiply and mod 112 | // to make this faster. See, e.g., librabinpoly. 113 | f.MulX(&f, (window-1)*8) 114 | f2.Mod(&f, p) 115 | tab.pop[i] = f2.coeff.Uint64() 116 | } 117 | } 118 | 119 | return tab 120 | } 121 | 122 | // update updates the hash as if p had been appended to the currently 123 | // hashed message. 124 | func (tab *Table) update(hash uint64, p []byte) uint64 { 125 | // Given the current message 126 | // 127 | // m(x) = ... + m₂x² + m₁x + m₀ 128 | // 129 | // and hash 130 | // 131 | // h(x) = m(x) mod p(x) 132 | // 133 | // we can extend the message by one bit b: 134 | // 135 | // m'(x) = ... + m₂x³ + m₁x² + m₀x + b = m(x)*x + b 136 | // 137 | // This yields the hash update: 138 | // 139 | // h'(x) = m'(x) mod p(x) 140 | // = (m(x)*x + b) mod p(x) 141 | // = ((m(x) mod p(x)) * x + b) mod p(x) 142 | // = (h(x)*x + b) mod p(x) 143 | // = hₙ₋₂xⁿ⁻¹ + ... + h₀x + b + hₙ₋₁(pₙ₋₁xⁿ⁻¹ + ... + p₀) 144 | // 145 | // where n is the degree of p(x). 146 | // 147 | // In general, we can extend the hash with any i bit message 148 | // m2 using the fact that 149 | // 150 | // r(concat(m1, m2)) = r(r(m1) * r(xⁱ)) + r(m2) 151 | // 152 | // where r(M) = M(x) mod p(x). Below, we update it 8 bits at a 153 | // time and, since we require p(x) to have degree >= 8, this 154 | // simplifies to 155 | // 156 | // r(concat(m1, m2)) = r(r(m1) * x⁸) + m2 157 | // 158 | // r(m1) is the current hash value. 
// update updates the hash as if p had been appended to the currently
// hashed message. It returns the new hash and never expires old
// bytes; windowing is handled by Hash.Write.
func (tab *Table) update(hash uint64, p []byte) uint64 {
	// Given the current message
	//
	//   m(x) = ... + m₂x² + m₁x + m₀
	//
	// and hash
	//
	//   h(x) = m(x) mod p(x)
	//
	// we can extend the message by one bit b:
	//
	//   m'(x) = ... + m₂x³ + m₁x² + m₀x + b = m(x)*x + b
	//
	// This yields the hash update:
	//
	//   h'(x) = m'(x) mod p(x)
	//         = (m(x)*x + b) mod p(x)
	//         = ((m(x) mod p(x)) * x + b) mod p(x)
	//         = (h(x)*x + b) mod p(x)
	//         = hₙ₋₂xⁿ⁻¹ + ... + h₀x + b + hₙ₋₁(pₙ₋₁xⁿ⁻¹ + ... + p₀)
	//
	// where n is the degree of p(x).
	//
	// In general, we can extend the hash with any i bit message
	// m2 using the fact that
	//
	//   r(concat(m1, m2)) = r(r(m1) * r(xⁱ)) + r(m2)
	//
	// where r(M) = M(x) mod p(x). Below, we update it 8 bits at a
	// time and, since we require p(x) to have degree >= 8, this
	// simplifies to
	//
	//   r(concat(m1, m2)) = r(r(m1) * x⁸) + m2
	//
	// r(m1) is the current hash value. Multiplication by x⁸ is a
	// shift. We can compute r(r(m1) * x⁸) using the push lookup
	// table we constructed in NewTable.
	shift := tab.shift % 64 // shift%64 eliminates checks below
	for _, b := range p {
		// top is the byte about to be shifted out of the hash.
		top := uint8(hash >> shift)
		hash = (hash<<8 | uint64(b)) ^ tab.push[top]
	}
	return hash
}

// Hash computes Rabin hashes (often called fingerprints).
//
// Hash implements hash.Hash64.
type Hash struct {
	// tab holds the pre-computed tables for the polynomial and
	// window size.
	tab *Table
	// hash is the current hash value.
	hash uint64
	// msg is a circular buffer of the last window bytes written. It
	// is nil when the hash is not windowed.
	msg []byte
	// pos is the index in msg of the oldest byte, which is the next
	// one to be overwritten.
	pos int
}

// New returns a new Rabin hash using the polynomial and window size
// represented by table.
func New(table *Table) *Hash {
	hash := &Hash{tab: table}
	if table.window > 0 {
		// Leading zeros don't affect the hash, so we can
		// start with a full window of zeros and keep the
		// later logic simpler.
		hash.msg = make([]byte, table.window)
	}
	return hash
}

// Sum64 returns the hash of all bytes written to h or, if h is
// windowed, of the most recently written window bytes.
func (h *Hash) Sum64() uint64 {
	return h.hash
}

// Reset resets h to its initial state: a zero hash and, if h is
// windowed, a window full of zero bytes.
func (h *Hash) Reset() {
	h.hash = 0
	if h.msg != nil {
		for i := range h.msg {
			h.msg[i] = 0
		}
		h.pos = 0
	}
}

// Write adds p to the running hash h.
//
// If h is windowed, this may also expire previously written bytes
// from the running hash so that h represents the hash of only the
// most recently written window bytes.
//
// It always returns len(p), nil.
func (h *Hash) Write(p []byte) (n int, err error) {
	n = len(p)

	if h.msg == nil {
		// Not windowed: simply extend the hash.
		h.hash = h.tab.update(h.hash, p)
		return
	}

	window := len(h.msg)
	if len(p) >= window {
		// p covers the entire window. Discard our entire
		// state and just hash the last window bytes of p.
		p = p[len(p)-window:]
		copy(h.msg, p)
		h.pos, h.hash = 0, 0
		h.hash = h.tab.update(h.hash, p)
		return
	}
	// Add and remove bytes as we overwrite them in the window.
	//
	// TODO: If we made h.msg a little bigger, we could copy and
	// process bigger chunks at a time. Not sure it would actually
	// help, but we could abstract this more nicely.
	tab := h.tab
	pos, hash, shift := h.pos, h.hash, tab.shift%64
	for _, b := range p {
		// Replace the oldest byte in the circular buffer with b.
		pop := h.msg[pos]
		h.msg[pos] = b
		if pos++; pos == window {
			pos = 0
		}

		// Cancel the expired byte's contribution, then shift in b
		// exactly as Table.update does.
		hash ^= tab.pop[pop]
		top := uint8(hash >> shift)
		hash = (hash<<8 | uint64(b)) ^ tab.push[top]
	}
	h.pos, h.hash = int(pos), hash
	return
}

// Size returns the number of bytes Sum will append. This is the
// minimum number of bytes necessary to represent the hash.
func (h *Hash) Size() int {
	bits := h.tab.degree + 1
	return (bits + 7) / 8
}

// Sum appends the least-significant byte first representation of the
// current hash to b and returns the resulting slice.
func (h *Hash) Sum(b []byte) []byte {
	var hbytes [8]byte
	for i := range hbytes {
		hbytes[i] = byte(h.hash >> uint(i*8))
	}
	// Only the low Size() bytes are meaningful for this polynomial.
	return append(b, hbytes[:h.Size()]...)
}

// BlockSize returns the window size if a window is configured, and
// otherwise returns 1.
//
// This satisfies the hash.Hash interface and indicates that Write is
// most efficient if writes are a multiple of the returned size.
func (h *Hash) BlockSize() int {
	if h.msg != nil {
		return len(h.msg)
	}
	return 1
}
All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package rabin 6 | 7 | import ( 8 | "fmt" 9 | "math/rand" 10 | "testing" 11 | ) 12 | 13 | var testPolys = []uint64{ 14 | 0x11d, // Degree 8 (smallest supported) 15 | 0xbfe6b8a5bf378d83, // Degree 63 (largest supported) 16 | } 17 | 18 | func TestRabin(t *testing.T) { 19 | nTests := 100 20 | if testing.Short() { 21 | nTests = 5 22 | } 23 | 24 | for _, poly := range testPolys { 25 | tab := NewTable(poly, 0) 26 | t.Run("poly="+newPolyGF2(poly).String(), func(t *testing.T) { 27 | rg := rand.New(rand.NewSource(42)) 28 | data := make([]byte, 64) 29 | for i := 0; i < nTests; i++ { 30 | rg.Read(data) 31 | h1 := rabinSlow(poly, data) 32 | h2 := New(tab) 33 | h2.Write(data) 34 | if h1 != h2.Sum64() { 35 | t.Errorf("want hash %#x (%s), got %#x (%s) for %x", h1, newPolyGF2(h1), h2.Sum64(), newPolyGF2(h2.Sum64()), data) 36 | } 37 | } 38 | }) 39 | } 40 | } 41 | 42 | // rabinSlow is a slow, very literal implementation of Rabin 43 | // fingerprinting that doesn't support streaming. 
44 | func rabinSlow(poly uint64, data []byte) uint64 { 45 | var mpoly polyGF2 46 | mpoly.coeff.SetBytes(data) 47 | return mpoly.Mod(&mpoly, newPolyGF2(poly)).coeff.Uint64() 48 | } 49 | 50 | func TestWindow(t *testing.T) { 51 | rg := rand.New(rand.NewSource(42)) 52 | data := make([]byte, 1024) 53 | rg.Read(data) 54 | 55 | for _, poly := range testPolys { 56 | hNoWin := New(NewTable(poly, 0)) 57 | for _, window := range []int{1, 4, 64, 65} { 58 | hWin := New(NewTable(poly, window)) 59 | t.Run(fmt.Sprintf("poly=%s/window=%d", newPolyGF2(poly), window), func(t *testing.T) { 60 | for _, blockSize := range []int{1, 2, 5, 100} { 61 | hWin.Reset() 62 | for i := 0; i < len(data); i += blockSize { 63 | block := data[i:] 64 | if len(block) > blockSize { 65 | block = block[:blockSize] 66 | } 67 | 68 | hWin.Write(block) 69 | 70 | dataWin := data[:i+len(block)] 71 | if len(dataWin) > window { 72 | dataWin = dataWin[len(dataWin)-window:] 73 | } 74 | hNoWin.Reset() 75 | hNoWin.Write(dataWin) 76 | 77 | // Check the hash. 78 | if hNoWin.Sum64() != hWin.Sum64() { 79 | t.Errorf("want hash %#x, got %#x at byte %d with %d byte blocks", hNoWin.Sum64(), hWin.Sum64(), i, blockSize) 80 | } 81 | } 82 | } 83 | }) 84 | } 85 | } 86 | } 87 | 88 | func BenchmarkRabin(b *testing.B) { 89 | rg := rand.New(rand.NewSource(42)) 90 | data := make([]byte, 1<<20) 91 | rg.Read(data) 92 | b.SetBytes(int64(len(data))) 93 | h := New(NewTable(Poly64, 0)) 94 | b.ResetTimer() 95 | 96 | for i := 0; i < b.N; i++ { 97 | h.Reset() 98 | h.Write(data) 99 | } 100 | } 101 | 102 | func BenchmarkRabinWindowed64(b *testing.B) { 103 | const window = 64 104 | rg := rand.New(rand.NewSource(42)) 105 | data := make([]byte, 1<<20) 106 | rg.Read(data) 107 | b.SetBytes(int64(len(data))) 108 | h := New(NewTable(Poly64, window)) 109 | b.ResetTimer() 110 | 111 | for i := 0; i < b.N; i++ { 112 | h.Reset() 113 | // Feed it smaller blocks or it will just reset and 114 | // hash the end. 
115 | for j := 0; j < len(data); j += window / 2 { 116 | h.Write(data[j : j+window/2]) 117 | } 118 | } 119 | } 120 | --------------------------------------------------------------------------------