├── go.mod ├── README.md ├── example_test.go ├── LICENSE ├── options.go ├── .github └── workflows │ └── tests.yml ├── doc.go ├── polynomials.go ├── polynomials_test.go ├── chunker.go └── chunker_test.go /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/restic/chunker 2 | 3 | go 1.9 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![GoDoc](https://godoc.org/github.com/restic/chunker?status.svg)](http://godoc.org/github.com/restic/chunker) 2 | [![Build Status](https://github.com/restic/chunker/workflows/test/badge.svg)](https://github.com/restic/chunker/actions?query=workflow%3Atest) 3 | 4 | The package `chunker` implements content-defined-chunking (CDC) based on a 5 | rolling Rabin Hash. The library is part of the [restic backup 6 | program](https://github.com/restic/restic). 7 | 8 | An introduction to Content Defined Chunking can be found in the restic blog 9 | post [Foundation - Introducing Content Defined Chunking (CDC)](https://restic.github.io/blog/2015-09-12/restic-foundation1-cdc). 10 | 11 | You can find the API documentation at 12 | https://godoc.org/github.com/restic/chunker 13 | -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | package chunker_test 2 | 3 | import ( 4 | "bytes" 5 | "crypto/sha256" 6 | "fmt" 7 | "io" 8 | "math/rand" 9 | 10 | "github.com/restic/chunker" 11 | ) 12 | 13 | func ExampleChunker() { 14 | // generate 32MiB of deterministic pseudo-random data 15 | rng := rand.New(rand.NewSource(23)) 16 | data := make([]byte, 32*1024*1024) 17 | 18 | _, err := rng.Read(data) 19 | if err != nil { 20 | panic(err) 21 | } 22 | 23 | // create a chunker 24 | chnkr := chunker.New(bytes.NewReader(data), chunker.Pol(0x3DA3358B4DC173)) 25 | 26 | // reuse this buffer 27 | buf := make([]byte, 8*1024*1024) 28 | 29 | for i := 0; i < 5; i++ { 30 | chunk, err := chnkr.Next(buf) 31 | if err == io.EOF { 32 | break 33 | } 34 | 35 | if err != nil { 36 | panic(err) 37 | } 38 | 39 | fmt.Printf("%d %02x\n", chunk.Length, sha256.Sum256(chunk.Data)) 40 | } 41 | 42 | // Output: 43 | // 1015370 615e8851030f318751f3c8baf8fbfa9958e2dd7f25dc1a87dcf6d6f79d1f1a9f 44 | // 1276199 f1cb038c558d3a2093049815cc45f80cd367712634a28f6dd36642f905d35c37 45 | // 1124437 a8e19dcd4224b58eb2b480ae42bb1a4a3b0c91c074f4745dbe3f8e4ec1a926e7 46 | // 3580969 2b3a3fe65ce9d689599c3b26375c40c22955bf92b170b24258e54dee91e3c2af 47 | // 3709129 47672502d75db244cb3dc3098eed87ffd537c9f0d66fb82a0198b6f6994409f2 48 | } 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Alexander Neumann 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 | 
--------------------------------------------------------------------------------
/options.go:
--------------------------------------------------------------------------------
1 | package chunker
2 | 
3 | type option func(*Chunker)
4 | type baseOption func(*BaseChunker)
5 | 
6 | // WithBaseAverageBits controls the frequency of chunk discovery:
7 | // the lower averageBits, the more chunks will be identified.
8 | // The default value is 20 bits, so chunks will be about 1MiB in size on average.
9 | func WithBaseAverageBits(averageBits int) baseOption {
10 | 	return func(c *BaseChunker) { c.splitmask = (1 << uint64(averageBits)) - 1 }
11 | }
12 | 
13 | // WithBaseBoundaries sets custom min and max chunk size boundaries.
14 | func WithBaseBoundaries(min, max uint) baseOption {
15 | 	return func(c *BaseChunker) {
16 | 		c.MinSize = min
17 | 		c.MaxSize = max
18 | 	}
19 | }
20 | 
21 | // WithAverageBits controls the frequency of chunk discovery:
22 | // the lower averageBits, the more chunks will be identified.
23 | // The default value is 20 bits, so chunks will be about 1MiB in size on average.
24 | func WithAverageBits(averageBits int) option {
25 | 	return func(c *Chunker) { c.splitmask = (1 << uint64(averageBits)) - 1 }
26 | }
27 | 
28 | // WithBoundaries sets custom min and max chunk size boundaries.
29 | func WithBoundaries(min, max uint) option {
30 | 	return func(c *Chunker) {
31 | 		c.MinSize = min
32 | 		c.MaxSize = max
33 | 	}
34 | }
35 | 
36 | // WithBuffer sets a custom internal read buffer for the chunker.
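// The buffer is the chunker's internal read buffer (512 kiB by default). An
// illustrative sketch, assuming rd is an existing io.Reader and pol an
// irreducible polynomial (neither is defined in this file):
//
//	buf := make([]byte, 512*1024)
//	c := chunker.New(rd, pol, chunker.WithBuffer(buf))
//
// Passing a preallocated slice avoids allocating a fresh buffer for every
// chunker; Reset reuses the same buffer automatically.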
37 | func WithBuffer(buf []byte) option { 38 | return func(c *Chunker) { c.buf = buf } 39 | } 40 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | on: 3 | # run tests on push to master, but not when other branches are pushed to 4 | push: 5 | branches: 6 | - master 7 | 8 | # run tests for all pull requests 9 | pull_request: 10 | 11 | env: 12 | latest_go: "1.19.x" 13 | GO111MODULE: on 14 | 15 | jobs: 16 | test: 17 | strategy: 18 | matrix: 19 | # list of jobs to run: 20 | include: 21 | - job_name: Windows 22 | go: 1.19.x 23 | os: windows-latest 24 | install_verb: install 25 | 26 | - job_name: macOS 27 | go: 1.19.x 28 | os: macOS-latest 29 | install_verb: install 30 | 31 | - job_name: Linux 32 | go: 1.19.x 33 | os: ubuntu-latest 34 | install_verb: install 35 | 36 | - job_name: Linux 37 | go: 1.18.x 38 | os: ubuntu-latest 39 | install_verb: install 40 | 41 | - job_name: Linux 42 | go: 1.17.x 43 | os: ubuntu-latest 44 | install_verb: install 45 | 46 | - job_name: Linux 47 | go: 1.16.x 48 | os: ubuntu-latest 49 | install_verb: get 50 | 51 | - job_name: Linux 52 | go: 1.15.x 53 | os: ubuntu-latest 54 | install_verb: get 55 | 56 | name: ${{ matrix.job_name }} Go ${{ matrix.go }} 57 | runs-on: ${{ matrix.os }} 58 | 59 | env: 60 | GOPROXY: https://proxy.golang.org 61 | 62 | steps: 63 | - name: Set up Go ${{ matrix.go }} 64 | uses: actions/setup-go@v2 65 | with: 66 | go-version: ${{ matrix.go }} 67 | 68 | - name: Check out code 69 | uses: actions/checkout@v2 70 | 71 | - name: Run tests 72 | run: | 73 | go test -cover ./... 74 | 75 | lint: 76 | name: lint 77 | runs-on: ubuntu-latest 78 | steps: 79 | - name: Set up Go ${{ env.latest_go }} 80 | uses: actions/setup-go@v2 81 | with: 82 | go-version: ${{ env.latest_go }} 83 | 84 | - name: Check out code 85 | uses: actions/checkout@v2 86 | 87 | - name: golangci-lint 88 | uses: golangci/golangci-lint-action@v2 89 | with: 90 | # Required: the version of golangci-lint is required and must be specified without patch version: we always use the latest patch version. 91 | version: v1.48 92 | # Optional: show only new issues if it's a pull request. The default value is `false`. 93 | only-new-issues: true 94 | args: --verbose --timeout 5m 95 | skip-go-installation: true 96 | 97 | # only run golangci-lint for pull requests, otherwise ALL hints get 98 | # reported. We need to slowly address all issues until we can enable 99 | # linting the master branch :) 100 | if: github.event_name == 'pull_request' 101 | 102 | - name: Check go.mod 103 | run: | 104 | echo "check if go.mod is up to date" 105 | go mod tidy 106 | git diff --exit-code go.mod 107 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Alexander Neumann. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | /* 6 | Package chunker implements Content Defined Chunking (CDC) based on a rolling 7 | Rabin Checksum. 8 | 9 | Choosing a Random Irreducible Polynomial 10 | 11 | The function RandomPolynomial() returns a new random polynomial of degree 53 12 | for use with the chunker. 
The degree 53 is chosen because it is the largest
13 | prime below 64-8 = 56, so that the top 8 bits of a uint64 can be used for
14 | optimising calculations in the chunker.
15 | 
16 | A random polynomial is chosen by selecting 64 random bits, masking away bits
17 | 63..54 and setting bit 53 to one (otherwise the polynomial is not of the
18 | desired degree) and bit 0 to one (otherwise the polynomial is trivially
19 | reducible), so that 52 bits are chosen at random.
20 | 
21 | This process is repeated until Irreducible() returns true, then this
22 | polynomial is returned. If this doesn't happen after 1 million tries, the
23 | function returns an error. The probability of selecting an irreducible
24 | polynomial at random is about 3.8% ((2^53-2)/53 / 2^52), so the probability
25 | that no irreducible polynomial has been found after 100 tries is about 2%,
26 | and it is negligible after the full one million tries.
27 | 
28 | Verifying Irreducible Polynomials
29 | 
30 | During development the results have been verified using the computational
31 | discrete algebra system GAP, which can be obtained from the website at
32 | http://www.gap-system.org/.
33 | 
34 | For filtering a given list of polynomials in hexadecimal coefficient notation,
35 | the following script can be used:
36 | 
37 | 	# create x over F_2 = GF(2)
38 | 	x := Indeterminate(GF(2), "x");
39 | 
40 | 	# test if polynomial is irreducible, i.e. the number of factors is one
41 | 	IrredPoly := function (poly)
42 | 		return (Length(Factors(poly)) = 1);
43 | 	end;;
44 | 
45 | 	# create a polynomial in x from the hexadecimal representation of the
46 | 	# coefficients
47 | 	Hex2Poly := function (s)
48 | 		return ValuePol(CoefficientsQadic(IntHexString(s), 2), x);
49 | 	end;;
50 | 
51 | 	# list of candidates, in hex
52 | 	candidates := [ "3DA3358B4DC173" ];
53 | 
54 | 	# create real polynomials
55 | 	L := List(candidates, Hex2Poly);
56 | 
57 | 	# filter and display the list of irreducible polynomials contained in L
58 | 	Display(Filtered(L, x -> (IrredPoly(x))));
59 | 
60 | All irreducible polynomials from the list are written to the output.
61 | 
62 | Background Literature
63 | 
64 | An introduction to Rabin Fingerprints/Checksums can be found in the following articles:
65 | 
66 | Michael O. Rabin (1981): "Fingerprinting by Random Polynomials"
67 | http://www.xmailserver.org/rabin.pdf
68 | 
69 | Ross N. Williams (1993): "A Painless Guide to CRC Error Detection Algorithms"
70 | http://www.zlib.net/crc_v3.txt
71 | 
72 | Andrei Z. Broder (1993): "Some Applications of Rabin's Fingerprinting Method"
73 | http://www.xmailserver.org/rabin_apps.pdf
74 | 
75 | Shuhong Gao and Daniel Panario (1997): "Tests and Constructions of Irreducible Polynomials over Finite Fields"
76 | http://www.math.clemson.edu/~sgao/papers/GP97a.pdf
77 | 
78 | Andrew Kadatch, Bob Jenkins (2007): "Everything we know about CRC but afraid to forget"
79 | http://crcutil.googlecode.com/files/crc-doc.1.0.pdf
80 | 
81 | */
82 | package chunker
83 | 
--------------------------------------------------------------------------------
/polynomials.go:
--------------------------------------------------------------------------------
1 | package chunker
2 | 
3 | import (
4 | 	"crypto/rand"
5 | 	"encoding/binary"
6 | 	"errors"
7 | 	"fmt"
8 | 	"io"
9 | 	"math/bits"
10 | 	"strconv"
11 | )
12 | 
13 | // Pol is a polynomial from F_2[X].
14 | type Pol uint64
15 | 
16 | // Add returns x+y.
17 | func (x Pol) Add(y Pol) Pol {
18 | 	r := Pol(uint64(x) ^ uint64(y))
19 | 	return r
20 | }
21 | 
22 | // Mul returns x*y. When an overflow occurs, Mul panics.
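// A small worked example, mirroring one of the cases in polynomials_test.go:
// coefficients live in F_2, so the additions inside the multiplication are
// carry-less XORs:
//
//	Pol(0xd).Mul(Pol(0x3)) == Pol(0x17) // 1101 * 11 = 10111, i.e. (x^3+x^2+1)*(x+1) = x^4+x^2+x+1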
23 | func (x Pol) Mul(y Pol) Pol {
24 | 	switch {
25 | 	case x == 0 || y == 0:
26 | 		return 0
27 | 	case x == 1:
28 | 		return y
29 | 	case y == 1:
30 | 		return x
31 | 	case y == 2:
32 | 		return x.mul2()
33 | 	}
34 | 
35 | 	var res Pol
36 | 	for i := 0; i <= y.Deg(); i++ {
37 | 		if (y & (1 << uint(i))) > 0 {
38 | 			res = res.Add(x << uint(i))
39 | 		}
40 | 	}
41 | 
42 | 	if res.Div(y) != x {
43 | 		panic("multiplication would overflow uint64")
44 | 	}
45 | 
46 | 	return res
47 | }
48 | 
49 | // 2*x.
50 | func (x Pol) mul2() Pol {
51 | 	if x&(1<<63) != 0 {
52 | 		panic("multiplication would overflow uint64")
53 | 	}
54 | 	return x << 1
55 | }
56 | 
57 | // Deg returns the degree of the polynomial x. If x is zero, -1 is returned.
58 | func (x Pol) Deg() int {
59 | 	return bits.Len64(uint64(x)) - 1
60 | }
61 | 
62 | // String returns the coefficients in hex.
63 | func (x Pol) String() string {
64 | 	return "0x" + strconv.FormatUint(uint64(x), 16)
65 | }
66 | 
67 | // Expand returns the string representation of the polynomial x.
68 | func (x Pol) Expand() string {
69 | 	if x == 0 {
70 | 		return "0"
71 | 	}
72 | 
73 | 	s := ""
74 | 	for i := x.Deg(); i > 1; i-- {
75 | 		if x&(1<<uint(i)) > 0 {
76 | 			s += fmt.Sprintf("+x^%d", i)
77 | 		}
78 | 	}
79 | 
80 | 	if x&2 > 0 {
81 | 		s += "+x"
82 | 	}
83 | 
84 | 	if x&1 > 0 {
85 | 		s += "+1"
86 | 	}
87 | 
88 | 	return s[1:]
89 | }
90 | 
91 | // DivMod returns the quotient q = x / d and the remainder r,
92 | // see https://en.wikipedia.org/wiki/Division_algorithm
93 | func (x Pol) DivMod(d Pol) (Pol, Pol) {
94 | 	if x == 0 {
95 | 		return 0, 0
96 | 	}
97 | 
98 | 	if d == 0 {
99 | 		panic("division by zero")
100 | 	}
101 | 
102 | 	D := d.Deg()
103 | 	diff := x.Deg() - D
104 | 	if diff < 0 {
105 | 		return 0, x
106 | 	}
107 | 
108 | 	var q Pol
109 | 	for diff >= 0 {
110 | 		m := d << uint(diff)
111 | 		q |= (1 << uint(diff))
112 | 		x = x.Add(m)
113 | 
114 | 		diff = x.Deg() - D
115 | 	}
116 | 
117 | 	return q, x
118 | }
119 | 
120 | // Div returns the integer division result x / d.
121 | func (x Pol) Div(d Pol) Pol {
122 | 	q, _ := x.DivMod(d)
123 | 	return q
124 | }
125 | 
126 | // Mod returns the remainder of x / d.
127 | func (x Pol) Mod(d Pol) Pol {
128 | 	_, r := x.DivMod(d)
129 | 	return r
130 | }
131 | 
132 | // I really dislike having a function that does not terminate, so specify a
133 | // really large upper bound for finding a new irreducible polynomial, and
134 | // return an error when no irreducible polynomial has been found within
135 | // randPolMaxTries.
136 | const randPolMaxTries = 1e6
137 | 
138 | // RandomPolynomial returns a new random irreducible polynomial
139 | // of degree 53 using the default system CSPRNG as source.
140 | // It is equivalent to calling DerivePolynomial(rand.Reader).
141 | func RandomPolynomial() (Pol, error) {
142 | 	return DerivePolynomial(rand.Reader)
143 | }
144 | 
145 | // DerivePolynomial returns an irreducible polynomial of degree 53
146 | // (largest prime number below 64-8) by reading bytes from source.
147 | // There are (2^53-2)/53 irreducible polynomials of degree 53 in
148 | // F_2[X], c.f. Michael O. Rabin (1981): "Fingerprinting by Random
149 | // Polynomials", page 4. If no polynomial could be found in one
150 | // million tries, an error is returned.
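// Illustrative sketch (not from the original sources): any io.Reader works as
// the source, so a seeded math/rand generator gives a reproducible polynomial,
// while RandomPolynomial above uses crypto/rand for a fresh one. The seed 23 is
// arbitrary:
//
//	rng := rand.New(rand.NewSource(23)) // math/rand; *rand.Rand implements io.Reader
//	pol, err := chunker.DerivePolynomial(rng)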
151 | func DerivePolynomial(source io.Reader) (Pol, error) { 152 | for i := 0; i < randPolMaxTries; i++ { 153 | var f Pol 154 | 155 | // choose polynomial at (pseudo)random 156 | err := binary.Read(source, binary.LittleEndian, &f) 157 | if err != nil { 158 | return 0, err 159 | } 160 | 161 | // mask away bits above bit 53 162 | f &= Pol((1 << 54) - 1) 163 | 164 | // set highest and lowest bit so that the degree is 53 and the 165 | // polynomial is not trivially reducible 166 | f |= (1 << 53) | 1 167 | 168 | // test if f is irreducible 169 | if f.Irreducible() { 170 | return f, nil 171 | } 172 | } 173 | 174 | // If this is reached, we haven't found an irreducible polynomial in 175 | // randPolMaxTries. This error is very unlikely to occur. 176 | return 0, errors.New("unable to find new random irreducible polynomial") 177 | } 178 | 179 | // GCD computes the Greatest Common Divisor x and f. 180 | func (x Pol) GCD(f Pol) Pol { 181 | if f == 0 { 182 | return x 183 | } 184 | 185 | if x == 0 { 186 | return f 187 | } 188 | 189 | if x.Deg() < f.Deg() { 190 | x, f = f, x 191 | } 192 | 193 | return f.GCD(x.Mod(f)) 194 | } 195 | 196 | // Irreducible returns true iff x is irreducible over F_2. This function 197 | // uses Ben Or's reducibility test. 198 | // 199 | // For details see "Tests and Constructions of Irreducible Polynomials over 200 | // Finite Fields". 201 | func (x Pol) Irreducible() bool { 202 | for i := 1; i <= x.Deg()/2; i++ { 203 | if x.GCD(qp(uint(i), x)) != 1 { 204 | return false 205 | } 206 | } 207 | 208 | return true 209 | } 210 | 211 | // MulMod computes x*f mod g 212 | func (x Pol) MulMod(f, g Pol) Pol { 213 | if x == 0 || f == 0 { 214 | return 0 215 | } 216 | 217 | var res Pol 218 | for i := 0; i <= f.Deg(); i++ { 219 | if (f & (1 << uint(i))) > 0 { 220 | a := x 221 | for j := 0; j < i; j++ { 222 | a = a.Mul(2).Mod(g) 223 | } 224 | res = res.Add(a).Mod(g) 225 | } 226 | } 227 | 228 | return res 229 | } 230 | 231 | // qp computes the polynomial (x^(2^p)-x) mod g. This is needed for the 232 | // reducibility test. 233 | func qp(p uint, g Pol) Pol { 234 | num := (1 << p) 235 | i := 1 236 | 237 | // start with x 238 | res := Pol(2) 239 | 240 | for i < num { 241 | // repeatedly square res 242 | res = res.MulMod(res, g) 243 | i *= 2 244 | } 245 | 246 | // add x 247 | return res.Add(2).Mod(g) 248 | } 249 | 250 | // MarshalJSON returns the JSON representation of the Pol. 251 | func (x Pol) MarshalJSON() ([]byte, error) { 252 | buf := strconv.AppendUint([]byte{'"'}, uint64(x), 16) 253 | buf = append(buf, '"') 254 | return buf, nil 255 | } 256 | 257 | // UnmarshalJSON parses a Pol from the JSON data. 
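// Illustrative round trip (assumed caller-side usage, not from the original
// sources): MarshalJSON above encodes the polynomial as a quoted lower-case hex
// string without the 0x prefix, and UnmarshalJSON below reverses it:
//
//	b, _ := json.Marshal(chunker.Pol(0x3DA3358B4DC173)) // `"3da3358b4dc173"`
//	var p chunker.Pol
//	_ = json.Unmarshal(b, &p)                            // p == 0x3DA3358B4DC173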
258 | func (x *Pol) UnmarshalJSON(data []byte) error { 259 | if len(data) < 2 { 260 | return errors.New("invalid string for polynomial") 261 | } 262 | n, err := strconv.ParseUint(string(data[1:len(data)-1]), 16, 64) 263 | if err != nil { 264 | return err 265 | } 266 | *x = Pol(n) 267 | 268 | return nil 269 | } 270 | -------------------------------------------------------------------------------- /polynomials_test.go: -------------------------------------------------------------------------------- 1 | package chunker 2 | 3 | import ( 4 | "strconv" 5 | "testing" 6 | ) 7 | 8 | var polAddTests = []struct { 9 | x, y Pol 10 | sum Pol 11 | }{ 12 | {23, 16, 23 ^ 16}, 13 | {0x9a7e30d1e855e0a0, 0x670102a1f4bcd414, 0xfd7f32701ce934b4}, 14 | {0x9a7e30d1e855e0a0, 0x9a7e30d1e855e0a0, 0}, 15 | } 16 | 17 | func TestPolAdd(t *testing.T) { 18 | for i, test := range polAddTests { 19 | if test.sum != test.x.Add(test.y) { 20 | t.Errorf("test %d failed: sum != x+y", i) 21 | } 22 | 23 | if test.sum != test.y.Add(test.x) { 24 | t.Errorf("test %d failed: sum != y+x", i) 25 | } 26 | } 27 | } 28 | 29 | func parseBin(s string) Pol { 30 | i, err := strconv.ParseUint(s, 2, 64) 31 | if err != nil { 32 | panic(err) 33 | } 34 | 35 | return Pol(i) 36 | } 37 | 38 | var polMulTests = []struct { 39 | x, y Pol 40 | res Pol 41 | }{ 42 | {1, 2, 2}, 43 | { 44 | parseBin("1101"), 45 | parseBin("10"), 46 | parseBin("11010"), 47 | }, 48 | { 49 | parseBin("1101"), 50 | parseBin("11"), 51 | parseBin("10111"), 52 | }, 53 | { 54 | 0x40000000, 55 | 0x40000000, 56 | 0x1000000000000000, 57 | }, 58 | { 59 | parseBin("1010"), 60 | parseBin("100100"), 61 | parseBin("101101000"), 62 | }, 63 | { 64 | parseBin("100"), 65 | parseBin("11"), 66 | parseBin("1100"), 67 | }, 68 | { 69 | parseBin("11"), 70 | parseBin("110101"), 71 | parseBin("1011111"), 72 | }, 73 | { 74 | parseBin("10011"), 75 | parseBin("110101"), 76 | parseBin("1100001111"), 77 | }, 78 | } 79 | 80 | func TestPolMul(t *testing.T) { 81 | for i, test := range polMulTests { 82 | m := test.x.Mul(test.y) 83 | if test.res != m { 84 | t.Errorf("TestPolMul failed for test %d: %v * %v: want %v, got %v", 85 | i, test.x, test.y, test.res, m) 86 | } 87 | m = test.y.Mul(test.x) 88 | if test.res != test.y.Mul(test.x) { 89 | t.Errorf("TestPolMul failed for %d: %v * %v: want %v, got %v", 90 | i, test.x, test.y, test.res, m) 91 | } 92 | } 93 | } 94 | 95 | func TestPolMulOverflow(t *testing.T) { 96 | defer func() { 97 | // try to recover overflow error 98 | err := recover() 99 | 100 | if e, ok := err.(string); ok && e == "multiplication would overflow uint64" { 101 | return 102 | } 103 | 104 | t.Logf("invalid error raised: %v", err) 105 | // re-raise error if not overflow 106 | panic(err) 107 | }() 108 | 109 | x := Pol(1 << 63) 110 | x.Mul(2) 111 | t.Fatal("overflow test did not panic") 112 | } 113 | 114 | var polDivTests = []struct { 115 | x, y Pol 116 | res Pol 117 | }{ 118 | {10, 50, 0}, 119 | {0, 1, 0}, 120 | { 121 | parseBin("101101000"), // 0x168 122 | parseBin("1010"), // 0xa 123 | parseBin("100100"), // 0x24 124 | }, 125 | {2, 2, 1}, 126 | { 127 | 0x8000000000000000, 128 | 0x8000000000000000, 129 | 1, 130 | }, 131 | { 132 | parseBin("1100"), 133 | parseBin("100"), 134 | parseBin("11"), 135 | }, 136 | { 137 | parseBin("1100001111"), 138 | parseBin("10011"), 139 | parseBin("110101"), 140 | }, 141 | } 142 | 143 | func TestPolDiv(t *testing.T) { 144 | for i, test := range polDivTests { 145 | m := test.x.Div(test.y) 146 | if test.res != m { 147 | t.Errorf("TestPolDiv failed for test %d: %v * %v: 
want %v, got %v", 148 | i, test.x, test.y, test.res, m) 149 | } 150 | } 151 | } 152 | 153 | func TestPolDeg(t *testing.T) { 154 | var x Pol 155 | if x.Deg() != -1 { 156 | t.Errorf("deg(0) is not -1: %v", x.Deg()) 157 | } 158 | 159 | x = 1 160 | if x.Deg() != 0 { 161 | t.Errorf("deg(1) is not 0: %v", x.Deg()) 162 | } 163 | 164 | for i := 0; i < 64; i++ { 165 | x = 1 << uint(i) 166 | if x.Deg() != i { 167 | t.Errorf("deg(1<<%d) is not %d: %v", i, i, x.Deg()) 168 | } 169 | } 170 | } 171 | 172 | var polModTests = []struct { 173 | x, y Pol 174 | res Pol 175 | }{ 176 | {10, 50, 10}, 177 | {0, 1, 0}, 178 | { 179 | parseBin("101101001"), 180 | parseBin("1010"), 181 | parseBin("1"), 182 | }, 183 | {2, 2, 0}, 184 | { 185 | 0x8000000000000000, 186 | 0x8000000000000000, 187 | 0, 188 | }, 189 | { 190 | parseBin("1100"), 191 | parseBin("100"), 192 | parseBin("0"), 193 | }, 194 | { 195 | parseBin("1100001111"), 196 | parseBin("10011"), 197 | parseBin("0"), 198 | }, 199 | } 200 | 201 | func TestPolModt(t *testing.T) { 202 | for i, test := range polModTests { 203 | res := test.x.Mod(test.y) 204 | if test.res != res { 205 | t.Errorf("test %d failed: want %v, got %v", i, test.res, res) 206 | } 207 | } 208 | } 209 | 210 | func BenchmarkPolDivMod(t *testing.B) { 211 | f := Pol(0x2482734cacca49) 212 | g := Pol(0x3af4b284899) 213 | 214 | for i := 0; i < t.N; i++ { 215 | g.DivMod(f) 216 | } 217 | } 218 | 219 | func BenchmarkPolDiv(t *testing.B) { 220 | f := Pol(0x2482734cacca49) 221 | g := Pol(0x3af4b284899) 222 | 223 | for i := 0; i < t.N; i++ { 224 | g.Div(f) 225 | } 226 | } 227 | 228 | func BenchmarkPolMod(t *testing.B) { 229 | f := Pol(0x2482734cacca49) 230 | g := Pol(0x3af4b284899) 231 | 232 | for i := 0; i < t.N; i++ { 233 | g.Mod(f) 234 | } 235 | } 236 | 237 | func BenchmarkPolDeg(t *testing.B) { 238 | f := Pol(0x3af4b284899) 239 | d := f.Deg() 240 | if d != 41 { 241 | t.Fatalf("BenchmalPolDeg: Wrong degree %d returned, expected %d", 242 | d, 41) 243 | } 244 | 245 | var sum int 246 | for i := 0; i < t.N; i++ { 247 | sum += f.Deg() 248 | } 249 | // Make sure Deg call isn't optimized away. 
250 | t.Log("sum of Deg:", sum) 251 | } 252 | 253 | func TestRandomPolynomial(t *testing.T) { 254 | _, err := RandomPolynomial() 255 | if err != nil { 256 | t.Fatal(err) 257 | } 258 | } 259 | 260 | func BenchmarkRandomPolynomial(t *testing.B) { 261 | for i := 0; i < t.N; i++ { 262 | _, err := RandomPolynomial() 263 | if err != nil { 264 | t.Fatal(err) 265 | } 266 | } 267 | } 268 | 269 | func TestExpandPolynomial(t *testing.T) { 270 | pol := Pol(0x3DA3358B4DC173) 271 | s := pol.Expand() 272 | if s != "x^53+x^52+x^51+x^50+x^48+x^47+x^45+x^41+x^40+x^37+x^36+x^34+x^32+x^31+x^27+x^25+x^24+x^22+x^19+x^18+x^16+x^15+x^14+x^8+x^6+x^5+x^4+x+1" { 273 | t.Fatal("wrong result") 274 | } 275 | } 276 | 277 | var polIrredTests = []struct { 278 | f Pol 279 | irred bool 280 | }{ 281 | {0x38f1e565e288df, false}, 282 | {0x3DA3358B4DC173, true}, 283 | {0x30a8295b9d5c91, false}, 284 | {0x255f4350b962cb, false}, 285 | {0x267f776110a235, false}, 286 | {0x2f4dae10d41227, false}, 287 | {0x2482734cacca49, true}, 288 | {0x312daf4b284899, false}, 289 | {0x29dfb6553d01d1, false}, 290 | {0x3548245eb26257, false}, 291 | {0x3199e7ef4211b3, false}, 292 | {0x362f39017dae8b, false}, 293 | {0x200d57aa6fdacb, false}, 294 | {0x35e0a4efa1d275, false}, 295 | {0x2ced55b026577f, false}, 296 | {0x260b012010893d, false}, 297 | {0x2df29cbcd59e9d, false}, 298 | {0x3f2ac7488bd429, false}, 299 | {0x3e5cb1711669fb, false}, 300 | {0x226d8de57a9959, false}, 301 | {0x3c8de80aaf5835, false}, 302 | {0x2026a59efb219b, false}, 303 | {0x39dfa4d13fb231, false}, 304 | {0x3143d0464b3299, false}, 305 | } 306 | 307 | func TestPolIrreducible(t *testing.T) { 308 | for _, test := range polIrredTests { 309 | if test.f.Irreducible() != test.irred { 310 | t.Errorf("Irreducibility test for Polynomial %v failed: got %v, wanted %v", 311 | test.f, test.f.Irreducible(), test.irred) 312 | } 313 | } 314 | } 315 | 316 | func BenchmarkPolIrreducible(b *testing.B) { 317 | // find first irreducible polynomial 318 | var pol Pol 319 | for _, test := range polIrredTests { 320 | if test.irred { 321 | pol = test.f 322 | break 323 | } 324 | } 325 | 326 | for i := 0; i < b.N; i++ { 327 | if !pol.Irreducible() { 328 | b.Errorf("Irreducibility test for Polynomial %v failed", pol) 329 | } 330 | } 331 | } 332 | 333 | var polGCDTests = []struct { 334 | f1 Pol 335 | f2 Pol 336 | gcd Pol 337 | }{ 338 | {10, 50, 2}, 339 | {0, 1, 1}, 340 | { 341 | parseBin("101101001"), 342 | parseBin("1010"), 343 | parseBin("1"), 344 | }, 345 | {2, 2, 2}, 346 | { 347 | parseBin("1010"), 348 | parseBin("11"), 349 | parseBin("11"), 350 | }, 351 | { 352 | 0x8000000000000000, 353 | 0x8000000000000000, 354 | 0x8000000000000000, 355 | }, 356 | { 357 | parseBin("1100"), 358 | parseBin("101"), 359 | parseBin("11"), 360 | }, 361 | { 362 | parseBin("1100001111"), 363 | parseBin("10011"), 364 | parseBin("10011"), 365 | }, 366 | { 367 | 0x3DA3358B4DC173, 368 | 0x3DA3358B4DC173, 369 | 0x3DA3358B4DC173, 370 | }, 371 | { 372 | 0x3DA3358B4DC173, 373 | 0x230d2259defd, 374 | 1, 375 | }, 376 | { 377 | 0x230d2259defd, 378 | 0x51b492b3eff2, 379 | parseBin("10011"), 380 | }, 381 | } 382 | 383 | func TestPolGCD(t *testing.T) { 384 | for i, test := range polGCDTests { 385 | gcd := test.f1.GCD(test.f2) 386 | if test.gcd != gcd { 387 | t.Errorf("GCD test %d (%+v) failed: got %v, wanted %v", 388 | i, test, gcd, test.gcd) 389 | } 390 | 391 | gcd = test.f2.GCD(test.f1) 392 | if test.gcd != gcd { 393 | t.Errorf("GCD test %d (%+v) failed: got %v, wanted %v", 394 | i, test, gcd, test.gcd) 395 | } 396 | } 397 | } 398 | 399 | var 
polMulModTests = []struct {
400 | 	f1  Pol
401 | 	f2  Pol
402 | 	g   Pol
403 | 	mod Pol
404 | }{
405 | 	{
406 | 		0x1230,
407 | 		0x230,
408 | 		0x55,
409 | 		0x22,
410 | 	},
411 | 	{
412 | 		0x0eae8c07dbbb3026,
413 | 		0xd5d6db9de04771de,
414 | 		0xdd2bda3b77c9,
415 | 		0x425ae8595b7a,
416 | 	},
417 | }
418 | 
419 | func TestPolMulMod(t *testing.T) {
420 | 	for i, test := range polMulModTests {
421 | 		mod := test.f1.MulMod(test.f2, test.g)
422 | 		if mod != test.mod {
423 | 			t.Errorf("MulMod test %d (%+v) failed: got %v, wanted %v",
424 | 				i, test, mod, test.mod)
425 | 		}
426 | 	}
427 | }
428 | 
--------------------------------------------------------------------------------
/chunker.go:
--------------------------------------------------------------------------------
1 | package chunker
2 | 
3 | import (
4 | 	"io"
5 | 	"sync"
6 | )
7 | 
8 | const (
9 | 	kiB = 1024
10 | 	miB = 1024 * kiB
11 | 
12 | 	// windowSize is the size of the sliding window.
13 | 	windowSize = 64
14 | 
15 | 	// MinSize is the default minimal size of a chunk.
16 | 	MinSize = 512 * kiB
17 | 	// MaxSize is the default maximal size of a chunk.
18 | 	MaxSize = 8 * miB
19 | 
20 | 	chunkerBufSize = 512 * kiB
21 | )
22 | 
23 | type tables struct {
24 | 	out [256]Pol
25 | 	mod [256]Pol
26 | }
27 | 
28 | // cache precomputed tables; these are read-only anyway
29 | var cache struct {
30 | 	entries map[Pol]tables
31 | 	sync.Mutex
32 | }
33 | 
34 | func init() {
35 | 	cache.entries = make(map[Pol]tables)
36 | }
37 | 
38 | type chunkerState struct {
39 | 	window [windowSize]byte
40 | 	wpos   uint
41 | 	digest uint64
42 | 
43 | 	pre   uint // wait for this many bytes before starting to calculate a new chunk
44 | 	count uint // used for max chunk size tracking
45 | }
46 | 
47 | type chunkerConfig struct {
48 | 	MinSize, MaxSize uint
49 | 
50 | 	pol               Pol
51 | 	polShift          uint
52 | 	tables            tables
53 | 	tablesInitialized bool
54 | 	splitmask         uint64
55 | }
56 | 
57 | // BaseChunker splits content with Rabin Fingerprints.
58 | type BaseChunker struct {
59 | 	chunkerConfig
60 | 	chunkerState
61 | }
62 | 
// NewBase returns a new BaseChunker based on polynomial pol. Its behavior can
// be customized by passing options, see the With* functions.
63 | func NewBase(pol Pol, opts ...baseOption) *BaseChunker {
64 | 	c := &BaseChunker{
65 | 		chunkerState: chunkerState{},
66 | 		chunkerConfig: chunkerConfig{
67 | 			pol:       pol,
68 | 			MinSize:   MinSize,
69 | 			MaxSize:   MaxSize,
70 | 			splitmask: (1 << 20) - 1, // aim for chunks of about 1MiB (2^20 bytes) on average
71 | 		},
72 | 	}
73 | 
74 | 	for _, opt := range opts {
75 | 		opt(c)
76 | 	}
77 | 
78 | 	c.reset()
79 | 	return c
80 | }
81 | 
82 | // Reset reinitializes the chunker with a new polynomial and options.
83 | func (c *BaseChunker) Reset(pol Pol, opts ...baseOption) {
84 | 	*c = *NewBase(pol, opts...)
85 | }
86 | 
87 | func (c *BaseChunker) reset() {
88 | 	c.polShift = uint(c.pol.Deg() - 8)
89 | 	c.fillTables()
90 | 
91 | 	for i := 0; i < windowSize; i++ {
92 | 		c.window[i] = 0
93 | 	}
94 | 
95 | 	c.digest = 0
96 | 	c.wpos = 0
97 | 	c.count = 0
98 | 	c.digest = c.slide(c.digest, 1)
99 | 
100 | 	// do not start a new chunk unless at least MinSize bytes have been read
101 | 	c.pre = c.MinSize - windowSize
102 | }
103 | 
104 | // fillTables calculates out_table and mod_table for optimization. This
105 | // implementation uses a cache in the global variable cache.
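// Building the two 256-entry tables is comparatively expensive (the out table
// alone performs one polynomial reduction per window position for each byte
// value), so the result is stored in the mutex-protected package-level cache
// above and shared by all chunkers that use the same polynomial.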
106 | func (c *BaseChunker) fillTables() { 107 | // if polynomial hasn't been specified, do not compute anything for now 108 | if c.pol == 0 { 109 | return 110 | } 111 | 112 | c.tablesInitialized = true 113 | 114 | // test if the tables are cached for this polynomial 115 | cache.Lock() 116 | defer cache.Unlock() 117 | if t, ok := cache.entries[c.pol]; ok { 118 | c.tables = t 119 | return 120 | } 121 | 122 | // calculate table for sliding out bytes. The byte to slide out is used as 123 | // the index for the table, the value contains the following: 124 | // out_table[b] = Hash(b || 0 || ... || 0) 125 | // \ windowsize-1 zero bytes / 126 | // To slide out byte b_0 for window size w with known hash 127 | // H := H(b_0 || ... || b_w), it is sufficient to add out_table[b_0]: 128 | // H(b_0 || ... || b_w) + H(b_0 || 0 || ... || 0) 129 | // = H(b_0 + b_0 || b_1 + 0 || ... || b_w + 0) 130 | // = H( 0 || b_1 || ... || b_w) 131 | // 132 | // Afterwards a new byte can be shifted in. 133 | for b := 0; b < 256; b++ { 134 | var h Pol 135 | 136 | h = appendByte(h, byte(b), c.pol) 137 | for i := 0; i < windowSize-1; i++ { 138 | h = appendByte(h, 0, c.pol) 139 | } 140 | c.tables.out[b] = h 141 | } 142 | 143 | // calculate table for reduction mod Polynomial 144 | k := c.pol.Deg() 145 | for b := 0; b < 256; b++ { 146 | // mod_table[b] = A | B, where A = (b(x) * x^k mod pol) and B = b(x) * x^k 147 | // 148 | // The 8 bits above deg(Polynomial) determine what happens next and so 149 | // these bits are used as a lookup to this table. The value is split in 150 | // two parts: Part A contains the result of the modulus operation, part 151 | // B is used to cancel out the 8 top bits so that one XOR operation is 152 | // enough to reduce modulo Polynomial 153 | c.tables.mod[b] = Pol(uint64(b)<> polShift' 171 | if polShift > 53-8 { 172 | panic("the polynomial must have a degree less than or equal 53") 173 | } 174 | minSize := c.MinSize 175 | maxSize := c.MaxSize 176 | 177 | idx := 0 178 | // check if bytes have to be dismissed before starting a new chunk 179 | if c.pre > 0 { 180 | if c.pre >= uint(len(buf)) { 181 | c.pre -= uint(len(buf)) 182 | c.count += uint(len(buf)) 183 | return -1, 0 184 | } 185 | 186 | buf = buf[c.pre:] 187 | idx = int(c.pre) 188 | c.count += c.pre 189 | c.pre = 0 190 | } 191 | 192 | add := c.count 193 | digest := c.digest 194 | win := c.window 195 | wpos := c.wpos 196 | for i, b := range buf { 197 | // limit wpos to elide array bound checks 198 | out := win[wpos%windowSize] 199 | win[wpos%windowSize] = b 200 | digest ^= uint64(tab.out[out]) 201 | wpos++ 202 | 203 | digest = updateDigest(digest, polShift, tab, b) 204 | // end manual inline 205 | 206 | add++ 207 | 208 | if (digest&c.splitmask) == 0 || add >= maxSize { 209 | if add < minSize { 210 | continue 211 | } 212 | c.reset() 213 | return idx + i + 1, digest 214 | } 215 | } 216 | c.digest = digest 217 | c.window = win 218 | c.wpos = wpos % windowSize 219 | c.count += uint(len(buf)) 220 | return -1, 0 221 | } 222 | 223 | func updateDigest(digest uint64, polShift uint, tab *tables, b byte) (newDigest uint64) { 224 | index := digest >> polShift 225 | digest <<= 8 226 | digest |= uint64(b) 227 | 228 | digest ^= uint64(tab.mod[index]) 229 | return digest 230 | } 231 | 232 | func (c *BaseChunker) slide(digest uint64, b byte) (newDigest uint64) { 233 | out := c.window[c.wpos] 234 | c.window[c.wpos] = b 235 | digest ^= uint64(c.tables.out[out]) 236 | c.wpos = (c.wpos + 1) % windowSize 237 | 238 | digest = updateDigest(digest, c.polShift, 
&c.tables, b) 239 | return digest 240 | } 241 | 242 | func appendByte(hash Pol, b byte, pol Pol) Pol { 243 | hash <<= 8 244 | hash |= Pol(b) 245 | 246 | return hash.Mod(pol) 247 | } 248 | 249 | // Chunk is one content-dependent chunk of bytes whose end was cut when the 250 | // Rabin Fingerprint had the value stored in Cut. 251 | type Chunk struct { 252 | Start uint 253 | Length uint 254 | Cut uint64 255 | Data []byte 256 | } 257 | 258 | type chunkerBuffer struct { 259 | buf []byte 260 | bpos uint 261 | bmax uint 262 | pos uint 263 | 264 | rd io.Reader 265 | closed bool 266 | } 267 | 268 | // Chunker splits content with Rabin Fingerprints. 269 | type Chunker struct { 270 | BaseChunker 271 | chunkerBuffer 272 | } 273 | 274 | // New returns a new Chunker based on polynomial p that reads from rd. 275 | // Chunker behavior can be customized by passing options, see With* functions. 276 | func New(rd io.Reader, pol Pol, opts ...option) *Chunker { 277 | c := &Chunker{ 278 | BaseChunker: *NewBase(pol), 279 | chunkerBuffer: chunkerBuffer{ 280 | buf: make([]byte, chunkerBufSize), 281 | rd: rd, 282 | }, 283 | } 284 | 285 | for _, opt := range opts { 286 | opt(c) 287 | } 288 | 289 | if c.buf == nil { 290 | c.buf = make([]byte, chunkerBufSize) 291 | } 292 | 293 | c.reset() 294 | return c 295 | } 296 | 297 | // NewWithBoundaries returns a new Chunker based on polynomial p that reads from 298 | // rd and custom min and max size boundaries. 299 | // 300 | // Deprecated: NewWithBoundaries uses should be replaced by New(rd, pol, WithBoundaries(min, max)). 301 | func NewWithBoundaries(rd io.Reader, pol Pol, min, max uint) *Chunker { 302 | return New(rd, pol, WithBoundaries(min, max)) 303 | } 304 | 305 | // SetAverageBits allows to control the frequency of chunk discovery: 306 | // the lower averageBits, the higher amount of chunks will be identified. 307 | // The default value is 20 bits, so chunks will be of 1MiB size on average. 308 | // 309 | // Deprecated: SetAverageBits uses should be replaced by NewBase(rd, pol, WithAverageBits(averageBits)). 310 | func (c *Chunker) SetAverageBits(averageBits int) { 311 | c.splitmask = (1 << uint64(averageBits)) - 1 312 | } 313 | 314 | // Reset reinitializes the chunker with a new reader, polynomial, and options. 315 | func (c *Chunker) Reset(rd io.Reader, pol Pol, opts ...option) { 316 | opts = append([]option{WithBuffer(c.buf)}, opts...) 317 | *c = *New(rd, pol, opts...) 318 | } 319 | 320 | // Deprecated: ResetWithBoundaries uses should be replaced by Reset(rd, pol, WithBoundaries(min, max)). 321 | func (c *Chunker) ResetWithBoundaries(rd io.Reader, pol Pol, min, max uint) { 322 | c.Reset(rd, pol, WithBoundaries(min, max)) 323 | } 324 | 325 | // Next returns the position and length of the next chunk of data. If an error 326 | // occurs while reading, the error is returned. Afterwards, the state of the 327 | // current chunk is undefined. When the last chunk has been returned, all 328 | // subsequent calls yield an io.EOF error. 329 | func (c *Chunker) Next(data []byte) (Chunk, error) { 330 | data = data[:0] 331 | start := c.pos 332 | for { 333 | if c.bpos >= c.bmax { 334 | n, err := io.ReadFull(c.rd, c.buf) 335 | 336 | if err == io.ErrUnexpectedEOF { 337 | err = nil 338 | } 339 | 340 | // io.ReadFull only returns io.EOF when no bytes could be read. If 341 | // this is the case and we're in this branch, there are no more 342 | // bytes to buffer, so this was the last chunk. 
If a different 343 | // error has occurred, return that error and abandon the current 344 | // chunk. 345 | if err == io.EOF && !c.closed { 346 | c.closed = true 347 | 348 | // return current chunk, if any bytes have been processed 349 | if len(data) > 0 { 350 | return Chunk{ 351 | Start: start, 352 | Length: uint(len(data)), 353 | // somewhat meaningless as this is not a split point 354 | Cut: c.digest, 355 | Data: data, 356 | }, nil 357 | } 358 | } 359 | 360 | if err != nil { 361 | return Chunk{}, err 362 | } 363 | 364 | c.bpos = 0 365 | c.bmax = uint(n) 366 | } 367 | 368 | split, cut := c.NextSplitPoint(c.buf[c.bpos:c.bmax]) 369 | if split == -1 { 370 | data = append(data, c.buf[c.bpos:c.bmax]...) 371 | c.pos += c.bmax - c.bpos 372 | c.bpos = c.bmax 373 | } else { 374 | data = append(data, c.buf[c.bpos:c.bpos+uint(split)]...) 375 | c.bpos += uint(split) 376 | c.pos += uint(split) 377 | 378 | return Chunk{ 379 | Start: start, 380 | Length: uint(len(data)), 381 | Cut: cut, 382 | Data: data, 383 | }, nil 384 | } 385 | } 386 | } 387 | -------------------------------------------------------------------------------- /chunker_test.go: -------------------------------------------------------------------------------- 1 | package chunker 2 | 3 | import ( 4 | "bytes" 5 | "crypto/sha256" 6 | "encoding/hex" 7 | "io" 8 | "math/rand" 9 | "reflect" 10 | "testing" 11 | "time" 12 | ) 13 | 14 | func parseDigest(s string) []byte { 15 | d, err := hex.DecodeString(s) 16 | if err != nil { 17 | panic(err) 18 | } 19 | 20 | return d 21 | } 22 | 23 | type chunk struct { 24 | Length uint 25 | CutFP uint64 26 | Digest []byte 27 | } 28 | 29 | // polynomial used for all the tests below 30 | const testPol = Pol(0x3DA3358B4DC173) 31 | 32 | // created for 32MB of random data out of math/rand's Uint32() seeded by 33 | // constant 23 34 | // 35 | // chunking configuration: 36 | // window size 64, avg chunksize 1<<20, min chunksize 1<<19, max chunksize 1<<23 37 | // polynom 0x3DA3358B4DC173 38 | var chunks1 = []chunk{ 39 | {2163460, 0x000b98d4cdf00000, parseDigest("4b94cb2cf293855ea43bf766731c74969b91aa6bf3c078719aabdd19860d590d")}, 40 | {643703, 0x000d4e8364d00000, parseDigest("5727a63c0964f365ab8ed2ccf604912f2ea7be29759a2b53ede4d6841e397407")}, 41 | {1528956, 0x0015a25c2ef00000, parseDigest("a73759636a1e7a2758767791c69e81b69fb49236c6929e5d1b654e06e37674ba")}, 42 | {1955808, 0x00102a8242e00000, parseDigest("c955fb059409b25f07e5ae09defbbc2aadf117c97a3724e06ad4abd2787e6824")}, 43 | {2222372, 0x00045da878000000, parseDigest("6ba5e9f7e1b310722be3627716cf469be941f7f3e39a4c3bcefea492ec31ee56")}, 44 | {2538687, 0x00198a8179900000, parseDigest("8687937412f654b5cfe4a82b08f28393a0c040f77c6f95e26742c2fc4254bfde")}, 45 | {609606, 0x001d4e8d17100000, parseDigest("5da820742ff5feb3369112938d3095785487456f65a8efc4b96dac4be7ebb259")}, 46 | {1205738, 0x000a7204dd600000, parseDigest("cc70d8fad5472beb031b1aca356bcab86c7368f40faa24fe5f8922c6c268c299")}, 47 | {959742, 0x00183e71e1400000, parseDigest("4065bdd778f95676c92b38ac265d361f81bff17d76e5d9452cf985a2ea5a4e39")}, 48 | {4036109, 0x001fec043c700000, parseDigest("b9cf166e75200eb4993fc9b6e22300a6790c75e6b0fc8f3f29b68a752d42f275")}, 49 | {1525894, 0x000b1574b1500000, parseDigest("2f238180e4ca1f7520a05f3d6059233926341090f9236ce677690c1823eccab3")}, 50 | {1352720, 0x00018965f2e00000, parseDigest("afd12f13286a3901430de816e62b85cc62468c059295ce5888b76b3af9028d84")}, 51 | {811884, 0x00155628aa100000, parseDigest("42d0cdb1ee7c48e552705d18e061abb70ae7957027db8ae8db37ec756472a70a")}, 52 | {1282314, 
0x001909a0a1400000, parseDigest("819721c2457426eb4f4c7565050c44c32076a56fa9b4515a1c7796441730eb58")}, 53 | {1318021, 0x001cceb980000000, parseDigest("842eb53543db55bacac5e25cb91e43cc2e310fe5f9acc1aee86bdf5e91389374")}, 54 | {948640, 0x0011f7a470a00000, parseDigest("b8e36bf7019bb96ac3fb7867659d2167d9d3b3148c09fe0de45850b8fe577185")}, 55 | {645464, 0x00030ce2d9400000, parseDigest("5584bd27982191c3329f01ed846bfd266e96548dfa87018f745c33cfc240211d")}, 56 | {533758, 0x0004435c53c00000, parseDigest("4da778a25b72a9a0d53529eccfe2e5865a789116cb1800f470d8df685a8ab05d")}, 57 | {1128303, 0x0000c48517800000, parseDigest("08c6b0b38095b348d80300f0be4c5184d2744a17147c2cba5cc4315abf4c048f")}, 58 | {800374, 0x000968473f900000, parseDigest("820284d2c8fd243429674c996d8eb8d3450cbc32421f43113e980f516282c7bf")}, 59 | {2453512, 0x001e197c92600000, parseDigest("5fa870ed107c67704258e5e50abe67509fb73562caf77caa843b5f243425d853")}, 60 | {2651975, 0x000ae6c868000000, parseDigest("181347d2bbec32bef77ad5e9001e6af80f6abcf3576549384d334ee00c1988d8")}, 61 | {237392, 0x0000000000000001, parseDigest("fcd567f5d866357a8e299fd5b2359bb2c8157c30395229c4e9b0a353944a7978")}, 62 | } 63 | 64 | // test if nullbytes are correctly split, even if length is a multiple of MinSize. 65 | var chunks2 = []chunk{ 66 | {MinSize, 0, parseDigest("07854d2fef297a06ba81685e660c332de36d5d18d546927d30daad6d7fda1541")}, 67 | {MinSize, 0, parseDigest("07854d2fef297a06ba81685e660c332de36d5d18d546927d30daad6d7fda1541")}, 68 | {MinSize, 0, parseDigest("07854d2fef297a06ba81685e660c332de36d5d18d546927d30daad6d7fda1541")}, 69 | {MinSize, 0, parseDigest("07854d2fef297a06ba81685e660c332de36d5d18d546927d30daad6d7fda1541")}, 70 | } 71 | 72 | // the same as chunks1, but avg chunksize is 1<<19 73 | var chunks3 = []chunk{ 74 | {1491586, 0x00023e586ea80000, parseDigest("4c008237df602048039287427171cef568a6cb965d1b5ca28dc80504a24bb061")}, 75 | {671874, 0x000b98d4cdf00000, parseDigest("fa8a42321b90c3d4ce9dd850562b2fd0c0fe4bdd26cf01a24f22046a224225d3")}, 76 | {643703, 0x000d4e8364d00000, parseDigest("5727a63c0964f365ab8ed2ccf604912f2ea7be29759a2b53ede4d6841e397407")}, 77 | {1284146, 0x0012b527e4780000, parseDigest("16d04cafecbeae9eaedd49da14c7ad7cdc2b1cc8569e5c16c32c9fb045aa899a")}, 78 | {823366, 0x000d1d6752180000, parseDigest("48662c118514817825ad4761e8e2e5f28f9bd8281b07e95dcafc6d02e0aa45c3")}, 79 | {810134, 0x0016071b6e180000, parseDigest("f629581aa05562f97f2c359890734c8574c5575da32f9289c5ba70bfd05f3f46")}, 80 | {567118, 0x00102a8242e00000, parseDigest("d4f0797c56c60d01bac33bfd49957a4816b6c067fc155b026de8a214cab4d70a")}, 81 | {821315, 0x001b3e42c8180000, parseDigest("8ebd0fd5db0293bd19140da936eb8b1bbd3cd6ffbec487385b956790014751ca")}, 82 | {1401057, 0x00045da878000000, parseDigest("001360af59adf4871ef138cfa2bb49007e86edaf5ac2d6f0b3d3014510991848")}, 83 | {2311122, 0x0005cbd885380000, parseDigest("8276d489b566086d9da95dc5c5fe6fc7d72646dd3308ced6b5b6ddb8595f0aa1")}, 84 | {608723, 0x001cfcd86f280000, parseDigest("518db33ba6a79d4f3720946f3785c05b9611082586d47ea58390fc2f6de9449e")}, 85 | {980456, 0x0013edb7a7f80000, parseDigest("0121b1690738395e15fecba1410cd0bf13fde02225160cad148829f77e7b6c99")}, 86 | {1140278, 0x0001f9f017e80000, parseDigest("28ca7c74804b5075d4f5eeb11f0845d99f62e8ea3a42b9a05c7bd5f2fca619dd")}, 87 | {2015542, 0x00097bf5d8180000, parseDigest("6fe8291f427d48650a5f0f944305d3a2dbc649bd401d2655fc0bdd42e890ca5a")}, 88 | {904752, 0x000e1863eff80000, parseDigest("62af1f1eb3f588d18aff28473303cc4731fc3cafcc52ce818fee3c4c2820854d")}, 89 | {713072, 0x001f3bb1b9b80000, 
parseDigest("4bda9dc2e3031d004d87a5cc93fe5207c4b0843186481b8f31597dc6ffa1496c")}, 90 | {675937, 0x001fec043c700000, parseDigest("5299c8c5acec1b90bb020cd75718aab5e12abb9bf66291465fd10e6a823a8b4a")}, 91 | {1525894, 0x000b1574b1500000, parseDigest("2f238180e4ca1f7520a05f3d6059233926341090f9236ce677690c1823eccab3")}, 92 | {1352720, 0x00018965f2e00000, parseDigest("afd12f13286a3901430de816e62b85cc62468c059295ce5888b76b3af9028d84")}, 93 | {811884, 0x00155628aa100000, parseDigest("42d0cdb1ee7c48e552705d18e061abb70ae7957027db8ae8db37ec756472a70a")}, 94 | {1282314, 0x001909a0a1400000, parseDigest("819721c2457426eb4f4c7565050c44c32076a56fa9b4515a1c7796441730eb58")}, 95 | {1093738, 0x0017f5d048880000, parseDigest("5dddfa7a241b68f65d267744bdb082ee865f3c2f0d8b946ea0ee47868a01bbff")}, 96 | {962003, 0x000b921f7ef80000, parseDigest("0cb5c9ebba196b441c715c8d805f6e7143a81cd5b0d2c65c6aacf59ca9124af9")}, 97 | {856384, 0x00030ce2d9400000, parseDigest("7734b206d46f3f387e8661e81edf5b1a91ea681867beb5831c18aaa86632d7fb")}, 98 | {533758, 0x0004435c53c00000, parseDigest("4da778a25b72a9a0d53529eccfe2e5865a789116cb1800f470d8df685a8ab05d")}, 99 | {1128303, 0x0000c48517800000, parseDigest("08c6b0b38095b348d80300f0be4c5184d2744a17147c2cba5cc4315abf4c048f")}, 100 | {800374, 0x000968473f900000, parseDigest("820284d2c8fd243429674c996d8eb8d3450cbc32421f43113e980f516282c7bf")}, 101 | {2453512, 0x001e197c92600000, parseDigest("5fa870ed107c67704258e5e50abe67509fb73562caf77caa843b5f243425d853")}, 102 | {665901, 0x00118c842cb80000, parseDigest("deceec26163842fdef6560311c69bf8a9871a56e16d719e2c4b7e4d668ceb61f")}, 103 | {1986074, 0x000ae6c868000000, parseDigest("64cd64bf3c3bc389eb20df8310f0427d1c36ab2eaaf09e346bfa7f0453fc1a18")}, 104 | {237392, 0x0000000000000001, parseDigest("fcd567f5d866357a8e299fd5b2359bb2c8157c30395229c4e9b0a353944a7978")}, 105 | } 106 | 107 | // the same as chunks1, but with boundaries (16*1024*1024, 32*1024*1024) 108 | var chunks4 = []chunk{ 109 | {17864181, 0x001fec043c700000, parseDigest("4a43d4eccaa3b88514f54d3becddc6ef5e06fbb2de8161b0129888dbae4430a7")}, 110 | {15690251, 0x0000000000000001, parseDigest("4ca32142da6f9130dc9a5d0df82a3c9e359cc4f1f480f4e52042ec0e8ecd3cc0")}, 111 | } 112 | 113 | func testWithData(t *testing.T, chnker *Chunker, testChunks []chunk, checkDigest bool) []Chunk { 114 | chunks := []Chunk{} 115 | 116 | pos := uint(0) 117 | for i, chunk := range testChunks { 118 | c, err := chnker.Next(nil) 119 | 120 | if err != nil { 121 | t.Fatalf("Error returned with chunk %d: %v", i, err) 122 | } 123 | 124 | if c.Start != pos { 125 | t.Fatalf("Start for chunk %d does not match: expected %d, got %d", 126 | i, pos, c.Start) 127 | } 128 | 129 | if c.Length != chunk.Length { 130 | t.Fatalf("Length for chunk %d does not match: expected %d, got %d", 131 | i, chunk.Length, c.Length) 132 | } 133 | 134 | if c.Cut != chunk.CutFP { 135 | t.Fatalf("Cut fingerprint for chunk %d/%d does not match: expected %016x, got %016x", 136 | i, len(chunks)-1, chunk.CutFP, c.Cut) 137 | } 138 | 139 | if checkDigest { 140 | digest := hashData(c.Data) 141 | if !bytes.Equal(chunk.Digest, digest) { 142 | t.Fatalf("Digest fingerprint for chunk %d/%d does not match: expected %02x, got %02x", 143 | i, len(chunks)-1, chunk.Digest, digest) 144 | } 145 | } 146 | 147 | pos += c.Length 148 | chunks = append(chunks, c) 149 | } 150 | 151 | _, err := chnker.Next(nil) 152 | if err != io.EOF { 153 | t.Fatal("Wrong error returned after last chunk") 154 | } 155 | 156 | if len(chunks) != len(testChunks) { 157 | t.Fatal("Amounts of test and 
resulting chunks do not match") 158 | } 159 | 160 | return chunks 161 | } 162 | 163 | func getRandom(seed int64, count int) []byte { 164 | buf := make([]byte, count) 165 | 166 | rnd := rand.New(rand.NewSource(seed)) 167 | for i := 0; i < count; i += 4 { 168 | r := rnd.Uint32() 169 | buf[i] = byte(r) 170 | buf[i+1] = byte(r >> 8) 171 | buf[i+2] = byte(r >> 16) 172 | buf[i+3] = byte(r >> 24) 173 | } 174 | 175 | return buf 176 | } 177 | 178 | func hashData(d []byte) []byte { 179 | h := sha256.New() 180 | 181 | _, err := h.Write(d) 182 | if err != nil { 183 | panic(err) 184 | } 185 | 186 | return h.Sum(nil) 187 | } 188 | 189 | func TestChunker(t *testing.T) { 190 | // setup data source 191 | buf := getRandom(23, 32*1024*1024) 192 | ch := New(bytes.NewReader(buf), testPol) 193 | testWithData(t, ch, chunks1, true) 194 | 195 | // setup nullbyte data source 196 | buf = bytes.Repeat([]byte{0}, len(chunks2)*MinSize) 197 | ch = New(bytes.NewReader(buf), testPol) 198 | 199 | testWithData(t, ch, chunks2, true) 200 | } 201 | 202 | func TestChunkerWithCustomAverageBits(t *testing.T) { 203 | buf := getRandom(23, 32*1024*1024) 204 | 205 | // sligthly decrease averageBits to get more chunks 206 | ch := New(bytes.NewReader(buf), testPol, WithAverageBits(19)) 207 | 208 | testWithData(t, ch, chunks3, true) 209 | } 210 | 211 | func TestChunkerReset(t *testing.T) { 212 | buf := getRandom(23, 32*1024*1024) 213 | ch := New(bytes.NewReader(buf), testPol) 214 | testWithData(t, ch, chunks1, true) 215 | 216 | ch.Reset(bytes.NewReader(buf), testPol) 217 | testWithData(t, ch, chunks1, true) 218 | 219 | // test Reset with Options 220 | tmpBuf := make([]byte, 1024*1024) 221 | ch.Reset(bytes.NewReader(buf), testPol, WithAverageBits(19), WithBuffer(tmpBuf)) 222 | testWithData(t, ch, chunks3, true) 223 | if reflect.DeepEqual(tmpBuf, make([]byte, 1024*1024)) { 224 | t.Fatalf("Buffer was not used") 225 | } 226 | } 227 | 228 | // TestChunkerWithOptions tests the chunker with boundaries 229 | func TestChunkerWithOptions(t *testing.T) { 230 | buf := getRandom(23, 32*1024*1024) 231 | 232 | // test New with Options 233 | tmpBuf := make([]byte, 1024*1024) 234 | ch := New(bytes.NewReader(buf), testPol, WithBoundaries(16*1024*1024, 32*1024*1024), WithBuffer(tmpBuf)) 235 | testWithData(t, ch, chunks4, true) 236 | if reflect.DeepEqual(tmpBuf, make([]byte, 1024*1024)) { 237 | t.Fatalf("Buffer was not used") 238 | } 239 | } 240 | 241 | func TestChunkerWithRandomPolynomial(t *testing.T) { 242 | // setup data source 243 | buf := getRandom(23, 32*1024*1024) 244 | 245 | // generate a new random polynomial 246 | start := time.Now() 247 | p, err := RandomPolynomial() 248 | if err != nil { 249 | t.Fatal(err) 250 | } 251 | t.Logf("generating random polynomial took %v", time.Since(start)) 252 | 253 | start = time.Now() 254 | ch := New(bytes.NewReader(buf), p) 255 | t.Logf("creating chunker took %v", time.Since(start)) 256 | 257 | // make sure that first chunk is different 258 | c, err := ch.Next(nil) 259 | if err != nil { 260 | t.Fatal(err.Error()) 261 | } 262 | 263 | if c.Cut == chunks1[0].CutFP { 264 | t.Fatal("Cut point is the same") 265 | } 266 | 267 | if c.Length == chunks1[0].Length { 268 | t.Fatal("Length is the same") 269 | } 270 | 271 | if bytes.Equal(hashData(c.Data), chunks1[0].Digest) { 272 | t.Fatal("Digest is the same") 273 | } 274 | } 275 | 276 | func TestChunkerWithoutHash(t *testing.T) { 277 | // setup data source 278 | buf := getRandom(23, 32*1024*1024) 279 | 280 | ch := New(bytes.NewReader(buf), testPol) 281 | chunks := 
testWithData(t, ch, chunks1, false) 282 | 283 | // test reader 284 | for i, c := range chunks { 285 | if uint(len(c.Data)) != chunks1[i].Length { 286 | t.Fatalf("reader returned wrong number of bytes: expected %d, got %d", 287 | chunks1[i].Length, len(c.Data)) 288 | } 289 | 290 | if !bytes.Equal(buf[c.Start:c.Start+c.Length], c.Data) { 291 | t.Fatalf("invalid data for chunk returned: expected %02x, got %02x", 292 | buf[c.Start:c.Start+c.Length], c.Data) 293 | } 294 | } 295 | 296 | // setup nullbyte data source 297 | buf = bytes.Repeat([]byte{0}, len(chunks2)*MinSize) 298 | ch = New(bytes.NewReader(buf), testPol) 299 | 300 | testWithData(t, ch, chunks2, false) 301 | } 302 | 303 | func benchmarkChunker(b *testing.B, checkDigest bool) { 304 | size := 32 * 1024 * 1024 305 | rd := bytes.NewReader(getRandom(23, size)) 306 | ch := New(rd, testPol) 307 | buf := make([]byte, MaxSize) 308 | 309 | b.ResetTimer() 310 | b.SetBytes(int64(size)) 311 | 312 | var chunks int 313 | for i := 0; i < b.N; i++ { 314 | chunks = 0 315 | 316 | _, err := rd.Seek(0, 0) 317 | if err != nil { 318 | b.Fatalf("Seek() return error %v", err) 319 | } 320 | 321 | ch.Reset(rd, testPol) 322 | 323 | cur := 0 324 | for { 325 | chunk, err := ch.Next(buf) 326 | 327 | if err == io.EOF { 328 | break 329 | } 330 | 331 | if err != nil { 332 | b.Fatalf("Unexpected error occurred: %v", err) 333 | } 334 | 335 | if chunk.Length != chunks1[cur].Length { 336 | b.Errorf("wrong chunk length, want %d, got %d", 337 | chunks1[cur].Length, chunk.Length) 338 | } 339 | 340 | if chunk.Cut != chunks1[cur].CutFP { 341 | b.Errorf("wrong cut fingerprint, want 0x%x, got 0x%x", 342 | chunks1[cur].CutFP, chunk.Cut) 343 | } 344 | 345 | if checkDigest { 346 | h := hashData(chunk.Data) 347 | if !bytes.Equal(h, chunks1[cur].Digest) { 348 | b.Errorf("wrong digest, want %x, got %x", 349 | chunks1[cur].Digest, h) 350 | } 351 | } 352 | 353 | chunks++ 354 | cur++ 355 | } 356 | } 357 | 358 | b.Logf("%d chunks, average chunk size: %d bytes", chunks, size/chunks) 359 | } 360 | 361 | func BenchmarkChunkerWithSHA256(b *testing.B) { 362 | benchmarkChunker(b, true) 363 | } 364 | 365 | func BenchmarkChunker(b *testing.B) { 366 | benchmarkChunker(b, false) 367 | } 368 | 369 | func BenchmarkNewChunker(b *testing.B) { 370 | p, err := RandomPolynomial() 371 | if err != nil { 372 | b.Fatal(err) 373 | } 374 | 375 | b.ResetTimer() 376 | 377 | for i := 0; i < b.N; i++ { 378 | New(bytes.NewBuffer(nil), p) 379 | } 380 | } 381 | --------------------------------------------------------------------------------