├── .gitignore ├── .travis.yml ├── FORMAT.md ├── LICENSE ├── README.md ├── dedup.go ├── filesplitter.go ├── reader.go ├── reader_test.go ├── sort └── hashsort.go ├── testdata └── sampledata.zip ├── writer.go └── writer_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.test 24 | *.prof 25 | /.idea 26 | /dedup.iml 27 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | sudo: false 4 | 5 | os: 6 | - linux 7 | - osx 8 | 9 | go: 10 | - 1.7.x 11 | - 1.8.x 12 | - 1.9.x 13 | - master 14 | 15 | script: 16 | - go test -v -cpu=1,2,4 . 17 | - go test -v -cpu=2 -race -short . 18 | 19 | matrix: 20 | allow_failures: 21 | - go: 'master' 22 | fast_finish: true 23 | -------------------------------------------------------------------------------- /FORMAT.md: -------------------------------------------------------------------------------- 1 | # Stream formats 2 | 3 | The streaming format is designed to be easy and fast to write or parse. 4 | 5 | It only uses 64 bit unsigned variable sized ints to store the information. 6 | 7 | The encoding of Uvarint is: 8 | 9 | * unsigned integers are serialized 7 bits at a time, starting with the least significant bits 10 | * the most significant bit (msb) in each output byte indicates if there is a continuation byte (msb = 1) 11 | 12 | All values in this format can fit in a 64 bit unsigned value. 13 | See [encoding/binary.ReadUvarint](https://golang.org/pkg/encoding/binary/#ReadUvarint) for a reference implementation. 14 | 15 | # Format 1 16 | This format has data and index split in two files, so the index can be quickly read before any decoding starts. 17 | 18 | This index allows to keep track of the last occurence of a block, so it can be deallocated immediately afterwards. 19 | 20 | ## Header 21 | 22 | | Content | Type | Values | 23 | |----------------|---------|--------------| 24 | | Format ID | UvarInt | 0x1 (always) | 25 | | MaxBlockSize | UvarInt | >= 512 | 26 | 27 | ## Repeat Blocks 28 | 29 | This is the index parsing algorithm in overall terms. 30 | 31 | ```Go 32 | for { 33 | offset = ReadVarUint() 34 | 35 | switch offset { 36 | 37 | // NEW BLOCK 38 | case 0: 39 | x = ReadVarUint() 40 | if x > MaxBlockSize { ERROR } 41 | blockSize = MaxBlockSize - x 42 | block = ReadBytesFromDataStream(blockSize) 43 | 44 | // END OF STREAM 45 | case 1<<64 - 1: 46 | x = ReadVarUint() 47 | if x > MaxBlockSize { ERROR } 48 | blockSize = MaxBlockSize - x 49 | block = ReadBytesFromDataStream(blockSize) 50 | 51 | // Stream terminator 52 | x := ReadVarUint() 53 | if x != 0 { ERROR } 54 | break 55 | 56 | // DEDUPLICATED BLOCK 57 | default: 58 | SourceBlockNum = CurrentBlock - offset 59 | if SourceBlockNum < 0 { ERROR } 60 | } 61 | } 62 | ``` 63 | 64 | ### Block sizes 65 | Block sizes are stored as `MaxSize - Size`, so fixed block sizes are all stored as size '0'. 66 | 67 | ### Block Offset 68 | The deduplicated offset is backwards from the the current block, so if the current block is the same 69 | as the previous, it will be encoded as '1'. 
If it is two blocks back, 2, etc. 70 | 71 | # Format 2 72 | 73 | Format 2 has block definitions and data interleaved. It only has a minor difference to Format 1, since it includes a 74 | Maximum backreference Length, helping the decoder to deallocate blocks. 75 | 76 | 77 | ## Header 78 | 79 | | Content | Type | Values | 80 | |----------------|---------|--------------| 81 | | Format ID | UvarInt | 0x2 (always) | 82 | | MaxBlockSize | UvarInt | >= 512 | 83 | | MaxLength | UvarInt | >= 1 | 84 | 85 | In addition to Maximum Block Size, a `MaxLength` is also added, which indicates the maximum backreference distance 86 | of this stream. This means that any offsets will be less or equal to MaxLength. 87 | 88 | ## Repeat Blocks 89 | 90 | This is the decoding loop. `MaxLength` blocks should be kept in memory while the decoding is taking place. 91 | Data is read from in between block definitions, if offset is 0. 92 | 93 | ```Go 94 | for { 95 | offset = ReadVarUint() 96 | 97 | switch offset { 98 | 99 | // NEW BLOCK 100 | case 0: 101 | x = ReadVarUint() 102 | if x > MaxBlockSize { ERROR } 103 | blockSize = MaxBlockSize - x 104 | block = ReadBytes(blockSize) 105 | 106 | // END OF STREAM 107 | case 1<<64 - 1: 108 | x = ReadVarUint() 109 | if x > MaxBlockSize { ERROR } 110 | blockSize = MaxBlockSize - x 111 | block = ReadBytes(blockSize) 112 | 113 | // Stream terminator 114 | x := ReadVarUint() 115 | if x != 0 { ERROR } 116 | break 117 | 118 | // DEDUPLICATED BLOCK 119 | default: 120 | if offset > MaxLength { ERROR } 121 | SourceBlockNum = CurrentBlock - offset 122 | if SourceBlockNum < 0 { ERROR } 123 | } 124 | } 125 | 126 | ``` 127 | 128 | 129 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Klaus Post 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dedup 2 | A Streaming Deduplication package for Go 3 | 4 | This package implements streaming deduplication, allowing you to remove duplicated data in streams. It implements variable block sizes and automatic content block adaptation. It has a fully streaming mode and an indexed mode, that has significantly reduced memory requirements. 
5 | 
6 | For an introduction to deduplication read this blog post [Fast Stream Deduplication in Go](https://blog.klauspost.com/fast-stream-deduplication-in-go/).
7 | 
8 | Package home: https://github.com/klauspost/dedup
9 | 
10 | Godoc: https://godoc.org/github.com/klauspost/dedup
11 | 
12 | [![Build Status](https://travis-ci.org/klauspost/dedup.svg?branch=master)](https://travis-ci.org/klauspost/dedup)
13 | [![GoDoc][1]][2]
14 | 
15 | [1]: https://godoc.org/github.com/klauspost/dedup?status.svg
16 | [2]: https://godoc.org/github.com/klauspost/dedup
17 | 
18 | # Installation
19 | To get the package use the standard:
20 | ```bash
21 | go get -u github.com/klauspost/dedup
22 | ```
23 | 
24 | # Usage
25 | 
26 | If you haven't already, you should read the [Fast Stream Deduplication in Go](https://blog.klauspost.com/fast-stream-deduplication-in-go/) blog post, since it introduces the different aspects of the package and will help you make choices for your setup.
27 | 
28 | There are two symmetric function pairs: [`NewWriter`](https://godoc.org/github.com/klauspost/dedup#NewWriter)/[`NewReader`](https://godoc.org/github.com/klauspost/dedup#NewReader) and [`NewStreamWriter`](https://godoc.org/github.com/klauspost/dedup#NewStreamWriter)/[`NewStreamReader`](https://godoc.org/github.com/klauspost/dedup#NewStreamReader). The first pair creates an *indexed* stream, which writes the index and data to two separate streams. This makes it possible to decode the deduplicated stream with much less memory. The second pair writes all data to a *single stream*. This allows on-the-fly transfers, but requires more memory on the receiving end.
29 | 
30 | When you create a deduplicating stream, you can choose between *fixed* and *dynamic* block sizes. Dynamic blocks adapt block splits to the incoming content, but are slower than fixed-size blocks and have to use more conservative memory estimations.
31 | 
32 | Here is an example of a full roundtrip with indexed streams. For more examples see the [godoc examples](https://godoc.org/github.com/klauspost/dedup#pkg-examples).
33 | 
34 | ```Go
35 | package main
36 | 
37 | import (
38 | 	"bytes"
39 | 	"fmt"
40 | 	"io"
41 | 
42 | 	"github.com/klauspost/dedup"
43 | )
44 | 
45 | // This will deduplicate a buffer of zeros to an indexed stream
46 | func main() {
47 | 	// We will write out deduplicated data to these
48 | 	idx := bytes.Buffer{}
49 | 	data := bytes.Buffer{}
50 | 
51 | 	// This is our input:
52 | 	input := bytes.NewBuffer(make([]byte, 50000))
53 | 
54 | 	// Create a new writer, with each block being 1000 bytes fixed size.
55 | 	w, err := dedup.NewWriter(&idx, &data, dedup.ModeFixed, 1000, 0)
56 | 	if err != nil {
57 | 		panic(err)
58 | 	}
59 | 	// Copy our input to the writer.
60 | 	io.Copy(w, input)
61 | 
62 | 	// Close to flush the remaining buffers
63 | 	err = w.Close()
64 | 	if err != nil {
65 | 		panic(err)
66 | 	}
67 | 
68 | 	// Create a new indexed stream reader:
69 | 	r, err := dedup.NewReader(&idx, &data)
70 | 	if err != nil {
71 | 		panic(err)
72 | 	}
73 | 
74 | 	// Inspect how much memory it will use.
75 | 	fmt.Println("Memory use:", r.MaxMem())
76 | 
77 | 	var dst bytes.Buffer
78 | 
79 | 	// Read everything
80 | 	_, err = io.Copy(&dst, r)
81 | 	if err != nil && err != io.EOF {
82 | 		panic(err)
83 | 	}
84 | 
85 | 	// Let us inspect what was written:
86 | 	fmt.Println("Returned data length:", dst.Len())
87 | 	fmt.Println("Everything zero:", 0 == bytes.Compare(dst.Bytes(), make([]byte, 50000)))
88 | }
89 | ```
90 | 
91 | Note that there is no error resilience built in. If any data is corrupted in any way, it will probably not be detected, and there is no way to recover corrupted data. So if you are in an environment where that could occur, you should add additional checks to ensure that data is recoverable.
92 | 
93 | ## Input Splitting
94 | 
95 | If you want to simply split the input, that functionality is also exposed.
96 | 
97 | This can be useful if you want to deduplicate into your own key-value store.
98 | In this case, you simply feed the input to a *NewSplitter*.
99 | This will return the individual fragments along with a hash.
100 | This allows you to store your files as a stream of hashes, and keep the block data in your own data store.
101 | 
102 | See the examples attached to the [NewSplitter](https://godoc.org/github.com/klauspost/dedup#example-NewSplitter) function on how to use this.
103 | 
104 | ## Hash collisions
105 | 
106 | The encoder uses SHA-1 to identify and "remember" unique blocks. No hash is secure from collisions, but SHA-1 offers 160 bits of entropy.
107 | 
108 | For example, the chance of a random hash collision occurring when encoding 1 TB of data in 1 KB blocks is 3.94×10^-31 : 1, or one in "2.5 thousand billion billion billion". This of course assumes a uniform hash distribution and no deliberate hash collision attacks.
109 | 
110 | If SHA-1 doesn't provide sufficient security, you can easily create a stronger version by simply changing the import:
111 | 
112 | ```Go
113 | import hasher "crypto/sha1"
114 | ```
115 | You can use [sha256](https://golang.org/pkg/crypto/sha256/) or [sha512](https://golang.org/pkg/crypto/sha512/) for stronger hashes, or [md5](https://golang.org/pkg/crypto/md5/) for a faster hash.
116 | 
117 | To help you calculate the birthday problem likelihood with a given number of blocks, I have provided the [BirthdayProblem function](https://godoc.org/github.com/klauspost/dedup#BirthdayProblem).
118 | 
119 | ## Why is this not compression?
120 | 
121 | Deduplication does the same as compression, but on a higher level. Instead of looking for small matches, it attempts to find "bigger" matches: it will match and eliminate whole blocks whose content is identical.
122 | 
123 | This can be useful when backing up disk images or other content where you have duplicated files, etc.
124 | 
125 | Deduplication is a good step *before* compression. You will still be able to compress your data, since unique blocks are passed through as-is, in order and without any modification.
126 | 
127 | # License
128 | 
129 | This code is published under an MIT license. See LICENSE file for more information.
130 | 
--------------------------------------------------------------------------------
/dedup.go:
--------------------------------------------------------------------------------
1 | // dedup: A Streaming Deduplication package
2 | //
3 | // This package implements streaming deduplication, allowing you to remove duplicated data in streams.
4 | // It implements variable block sizes and automatic content block adaptation.
5 | // It has a fully streaming mode and an indexed mode that has significantly reduced memory requirements.
6 | // 7 | // Read for an introduction to deduplication: https://blog.klauspost.com/fast-stream-deduplication-in-go 8 | // 9 | // Package home: https://github.com/klauspost/dedup 10 | // 11 | // Godoc: https://godoc.org/github.com/klauspost/dedup 12 | // 13 | package dedup 14 | 15 | import ( 16 | "fmt" 17 | "math/big" 18 | ) 19 | 20 | // Returns an approximate Birthday probability calculation 21 | // based on the number of blocks given and the hash size. 22 | // 23 | // It uses the simplified calculation: p = k(k-1) / (2N) 24 | // 25 | // From http://preshing.com/20110504/hash-collision-probabilities/ 26 | func BirthdayProblem(blocks int) string { 27 | k := big.NewInt(int64(blocks)) 28 | km1 := big.NewInt(int64(blocks - 1)) 29 | ksq := k.Mul(k, km1) 30 | n := big.NewInt(0) 31 | n = n.Exp(big.NewInt(2), big.NewInt(int64(HashSize)*8), nil) 32 | twoN := n.Add(n, n) 33 | var t, t2 big.Rat 34 | var res *big.Rat 35 | // 36 | res = t.SetFrac(ksq, twoN) 37 | f64, _ := res.Float64() 38 | inv := t2.Inv(res).FloatString(0) 39 | invs := fmt.Sprintf(" ~ 1/%s ~ %v", inv, f64) 40 | 41 | return "Collision probability is" + invs 42 | } 43 | -------------------------------------------------------------------------------- /filesplitter.go: -------------------------------------------------------------------------------- 1 | //+build ignore 2 | 3 | // DISABLED, since I have found no scenarios where it provides improvement 4 | 5 | package dedup 6 | 7 | import ( 8 | "bytes" 9 | "errors" 10 | "io" 11 | "io/ioutil" 12 | "testing" 13 | ) 14 | 15 | const ( 16 | // Dynamic block size, including split on file signatures. 17 | // There are a number of typical file signartures builtin, 18 | // or you can use AddSignature to add your own. 19 | ModeDynamicSignatures = 2 20 | 21 | // Dynamic block size only split on file signatures 22 | ModeSignaturesOnly = 3 23 | ) 24 | 25 | // Split on zpaq hash, file signatures and maximum block size. 26 | func (z *zpaqWriter) writeFile(w *writer, b []byte) (int, error) { 27 | c1 := z.c1 28 | 29 | for i, c := range b { 30 | split := false 31 | v := sigmap[c] 32 | if len(v) > 0 && i < len(b)-6 { 33 | for _, s := range v { 34 | split = true 35 | for j, expect := range s { 36 | if b[j+1] != expect { 37 | split = false 38 | break 39 | } 40 | } 41 | } 42 | } 43 | if c == z.o1[c1] { 44 | z.h = (z.h + uint32(c) + 1) * 314159265 45 | } else { 46 | z.h = (z.h + uint32(c) + 1) * 271828182 47 | } 48 | z.o1[c1] = c 49 | c1 = c 50 | w.cur[w.off] = c 51 | w.off++ 52 | 53 | // Filled the buffer? Send it off! 54 | if w.off >= z.minFragment && (z.h < z.maxHash || split || w.off >= z.maxFragment) { 55 | b := <-w.buffers 56 | // Swap block with current 57 | w.cur, b.data = b.data[:w.maxSize], w.cur[:w.off] 58 | b.N = w.nblocks 59 | 60 | w.input <- b 61 | w.write <- b 62 | w.nblocks++ 63 | w.off = 0 64 | z.h = 0 65 | c1 = 0 66 | } 67 | } 68 | z.c1 = c1 69 | return len(b), nil 70 | } 71 | 72 | // Split on maximum size and file signatures only. 73 | func fileSplitOnly(w *writer, b []byte) (int, error) { 74 | for i, c := range b { 75 | split := false 76 | v := sigmap[c] 77 | if len(v) > 0 && i < len(b)-6 { 78 | for _, s := range v { 79 | split = true 80 | for j, expect := range s { 81 | if b[j+1] != expect { 82 | split = false 83 | break 84 | } 85 | } 86 | } 87 | } 88 | w.cur[w.off] = c 89 | w.off++ 90 | 91 | // Filled the buffer? Send it off! 
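// A block is cut when a file signature match was found at this position
// or when the maximum block size has been reached.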
92 | if split || w.off >= w.maxSize { 93 | b := <-w.buffers 94 | // Swap block with current 95 | w.cur, b.data = b.data[:w.maxSize], w.cur[:w.off] 96 | b.N = w.nblocks 97 | 98 | w.input <- b 99 | w.write <- b 100 | w.nblocks++ 101 | w.off = 0 102 | } 103 | } 104 | return len(b), nil 105 | } 106 | 107 | // 4 times faster than map[byte][][]byte 108 | // 2 times faster than generated code (switch byte 0, if) 109 | var sigmap [256][][]byte 110 | 111 | func init() { 112 | for _, sig := range signatures { 113 | l := sig[0] 114 | err := AddSignature(sig[1 : 1+l]) 115 | if err != nil { 116 | panic(err) 117 | } 118 | } 119 | } 120 | 121 | // ErrSignatureTooShort is returned if AddSignature is called 122 | // with a signature shorter than 3 bytes 123 | var ErrSignatureTooShort = errors.New("signature should be at least 2 bytes") 124 | 125 | // AddSignature will add a signature that will cause a block 126 | // split. The signature must be more than 1 byte (at least 3 is recommended), 127 | // and only up to 7 bytes are compared. 128 | func AddSignature(b []byte) error { 129 | if len(b) <= 1 { 130 | return ErrSignatureTooShort 131 | } 132 | if len(b) > 7 { 133 | b = b[:7] 134 | } 135 | x := sigmap[b[0]] 136 | dst := make([]byte, len(b)-1) 137 | copy(dst, b[1:]) 138 | x = append(x, dst) 139 | sigmap[b[0]] = x 140 | return nil 141 | } 142 | 143 | // File start signatures 144 | // 8 bytes, 1 byte length (1 to 7), 1-7 bytes identifier literals, 7-length padding. 145 | var signatures = [][8]byte{ 146 | [8]byte{3, 0x42, 0x5A, 0x68, 0, 0, 0, 0}, //bzip 2 147 | [8]byte{3, 0x1f, 0x8b, 0x00, 0, 0, 0, 0}, //gzip (store) 148 | [8]byte{3, 0x1f, 0x8b, 0x08, 0, 0, 0, 0}, //gzip (deflate) 149 | [8]byte{6, 0x47, 0x49, 0x46, 0x38, 0x37, 0x61, 0}, //GIF87a 150 | [8]byte{6, 0x47, 0x49, 0x46, 0x38, 0x39, 0x61, 0}, //GIF89a 151 | [8]byte{4, 0x49, 0x49, 0x2A, 0x0, 0, 0, 0}, //TIFF 152 | [8]byte{4, 0x4D, 0x4D, 0x00, 0x2A, 0, 0, 0}, //TIFF 153 | [8]byte{3, 0xFF, 0xD8, 0xFF, 0, 0, 0, 0}, //JPEG 154 | [8]byte{4, 0x46, 0x4F, 0x52, 0x4D, 0, 0, 0}, //IFF (FORM) 155 | [8]byte{4, 0x50, 0x4B, 0x03, 0x04, 0, 0, 0}, //ZIP 156 | [8]byte{4, 0x50, 0x4B, 0x07, 0x08, 0, 0, 0}, //ZIP 157 | [8]byte{7, 0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x00}, //RAR 158 | [8]byte{4, 0x7F, 0x45, 0x4C, 0x46, 0, 0, 0}, //ELF 159 | [8]byte{7, 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A}, //PNG 160 | [8]byte{4, 0xCA, 0xFE, 0xBA, 0xBE, 0, 0, 0}, //Java Class 161 | [8]byte{3, 0xEF, 0xBB, 0xBF, 0, 0, 0, 0}, //Unicode byte order mark 162 | [8]byte{4, 0xFE, 0xED, 0xFA, 0xCE, 0, 0, 0}, //Mach-O binary (32-bit) 163 | [8]byte{4, 0xFE, 0xED, 0xFA, 0xCF, 0, 0, 0}, //Mach-O binary (64-bit) 164 | [8]byte{4, 0xCE, 0xFA, 0xED, 0xFE, 0, 0, 0}, //Mach-O binary (32-bit) 165 | [8]byte{4, 0xCF, 0xFA, 0xED, 0xFE, 0, 0, 0}, //Mach-O binary (64-bit) 166 | [8]byte{4, 0xFF, 0xFE, 0x00, 0x00, 0, 0, 0}, //BOM 32-bit Unicode Transfer Format 167 | [8]byte{4, 0x50, 0x45, 0x00, 0x00, 0, 0, 0}, //PE (PE Header) 168 | [8]byte{4, 0x25, 0x21, 0x50, 0x53, 0, 0, 0}, //PS 169 | [8]byte{4, 0x25, 0x50, 0x44, 0x46, 0, 0, 0}, //PDF 170 | [8]byte{7, 0x30, 0x26, 0xB2, 0x75, 0x8E, 0x66, 0xCF}, //ASF 171 | [8]byte{7, 0xA6, 0xD9, 0x00, 0xAA, 0x00, 0x62, 0xCE}, //WMV 172 | [8]byte{7, 0x24, 0x53, 0x44, 0x49, 0x30, 0x30, 0x30}, //SDI 173 | [8]byte{4, 0x4F, 0x67, 0x67, 0x53, 0, 0, 0}, //OGG 174 | [8]byte{4, 0x38, 0x42, 0x50, 0x53, 0, 0, 0}, //PSD 175 | [8]byte{4, 0x52, 0x49, 0x46, 0x46, 0, 0, 0}, //WAV/AVI 176 | [8]byte{3, 0x49, 0x44, 0x33, 0, 0, 0, 0}, //MP3 (ID3 v2, all versions) 177 | [8]byte{5, 0x43, 0x44, 0x30, 
0x30, 0x31, 0, 0}, //ISO 178 | [8]byte{3, 0x4B, 0x44, 0x4D, 0, 0, 0, 0}, //VMDK 179 | [8]byte{4, 0x66, 0x4C, 0x61, 0x43, 0, 0, 0}, //FLAC 180 | [8]byte{4, 0x4D, 0x54, 0x68, 0x64, 0, 0, 0}, //MIDI 181 | [8]byte{5, 0x1A, 0x45, 0xDF, 0xA3, 0, 0}, //MKV 182 | [8]byte{5, 0x1F, 0x43, 0xB6, 0x75, 0, 0}, //MKV Cluster 183 | [8]byte{4, 0x46, 0x4c, 0x56, 0x01, 0, 0, 0}, //FLV (old format) 184 | [8]byte{7, 0x66, 0x74, 0x79, 0x70, 0x33, 0x67, 0x70}, //3GG/MP4 185 | [8]byte{6, 0x37, 0x7a, 0xbc, 0xaf, 0x27, 0x1c, 0}, //7zip 186 | [8]byte{6, 0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00, 0}, //XZ format 187 | [8]byte{7, 0x42, 0x4f, 0x4f, 0x4b, 0x4d, 0x4f, 0x42}, //MOBI book format 188 | [8]byte{7, 0x53, 0x51, 0x4c, 0x69, 0x74, 0x65, 0x20}, //SQLite DB 189 | [8]byte{6, 0x7b, 0x5c, 0x72, 0x74, 0x66, 0x31, 0}, //RTF '{\rtf1\' 190 | [8]byte{7, '<', '!', 'D', 'O', 'C', 'T', 'Y'}, //HTML Doctype 191 | [8]byte{4, 0x49, 0x54, 0x53, 0x46, 0, 0, 0}, //CHM Fomrat 192 | [8]byte{6, '<', '?', 'x', 'm', 'l', ' ', 0}, //XML Doctype 193 | [8]byte{5, 0x2e, 0x70, 0x6e, 0x20, 0x30, 0, 0}, //troff page #0 194 | [8]byte{4, 0xfe, 0x62, 0x69, 0x6e, 0, 0, 0}, //MySQL binlog 195 | [8]byte{5, 'K', 'D', 'M', 'V', 0x01, 0, 0}, //Virtual machine disk image 196 | [8]byte{5, 'M', 'R', 'V', 'N', 0x01, 0, 0}, //VMware nvram image 197 | 198 | // Exotics: 199 | //[8]byte{7, 0x46, 0x55, 0x4a, 0x49, 0x46, 0x49, 0x4c}, //FUJI Raw format 200 | //[8]byte{7, 0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a}, //MSI format 201 | //[8]byte{5, 0x46, 0x4f, 0x56, 0x62, 0x00, 0, 0}, //X3F format 202 | 203 | //[8]byte{4, 0x50, 0x4B, 0x05, 0x06, 0, 0, 0}, //ZIP empty archive 204 | } 205 | 206 | // Tests: 207 | 208 | // Maximum block size: 64k 209 | func BenchmarkDynamicSigsWriter64K(t *testing.B) { 210 | const totalinput = 10 << 20 211 | input := getBufferSize(totalinput) 212 | 213 | const size = 64 << 10 214 | b := input.Bytes() 215 | // Create some duplicates 216 | for i := 0; i < 50; i++ { 217 | // Read from 10 first blocks 218 | src := b[(i%10)*size : (i%10)*size+size] 219 | // Write into the following ones 220 | dst := b[(10+i)*size : (i+10)*size+size] 221 | copy(dst, src) 222 | } 223 | t.ResetTimer() 224 | t.SetBytes(totalinput) 225 | for i := 0; i < t.N; i++ { 226 | input = bytes.NewBuffer(b) 227 | w, _ := dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeDynamicSignatures, size, 0) 228 | io.Copy(w, input) 229 | err := w.Close() 230 | if err != nil { 231 | t.Fatal(err) 232 | } 233 | } 234 | } 235 | 236 | // Maximum block size: 64k 237 | func BenchmarkSigsOnlyWriter64K(t *testing.B) { 238 | const totalinput = 10 << 20 239 | input := getBufferSize(totalinput) 240 | 241 | const size = 64 << 10 242 | b := input.Bytes() 243 | // Create some duplicates 244 | for i := 0; i < 50; i++ { 245 | // Read from 10 first blocks 246 | src := b[(i%10)*size : (i%10)*size+size] 247 | // Write into the following ones 248 | dst := b[(10+i)*size : (i+10)*size+size] 249 | copy(dst, src) 250 | } 251 | t.ResetTimer() 252 | t.SetBytes(totalinput) 253 | for i := 0; i < t.N; i++ { 254 | input = bytes.NewBuffer(b) 255 | w, _ := dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeSignaturesOnly, size, 0) 256 | io.Copy(w, input) 257 | err := w.Close() 258 | if err != nil { 259 | t.Fatal(err) 260 | } 261 | } 262 | } 263 | -------------------------------------------------------------------------------- /reader.go: -------------------------------------------------------------------------------- 1 | package dedup 2 | 3 | import ( 4 | "bufio" 5 | "encoding/binary" 6 | "errors" 7 | "fmt" 8 
| "io" 9 | "math" 10 | ) 11 | 12 | // A Reader will decode a deduplicated stream and 13 | // return the data as it was encoded. 14 | // Use Close when done to release resources. 15 | type Reader interface { 16 | io.ReadCloser 17 | 18 | io.WriterTo 19 | 20 | // MaxMem returns the *maximum* memory required to decode the stream. 21 | MaxMem() int 22 | } 23 | 24 | // IndexedReader gives access to internal information on 25 | // block sizes available on indexed streams. 26 | type IndexedReader interface { 27 | Reader 28 | 29 | // Blocksizes will return the sizes of each block. 30 | // Will be available if an index was provided. 31 | BlockSizes() []int 32 | } 33 | 34 | type reader struct { 35 | streamReader 36 | blocks []*rblock 37 | } 38 | 39 | type streamReader struct { 40 | size int 41 | maxLength uint64 // Maxmimum backreference count 42 | curBlock int 43 | curData []byte 44 | ready chan *rblock 45 | closeReader chan struct{} 46 | readerClosed chan struct{} 47 | } 48 | 49 | // rblock contains read information about a single block 50 | type rblock struct { 51 | data []byte 52 | readData int 53 | first int // Index of first occurrence 54 | last int // Index of last occurrence 55 | offset int64 // Expected offset in data file (format 1) 56 | err error // Read error? 57 | } 58 | 59 | func (r *rblock) String() string { 60 | if r == nil { 61 | return "" 62 | } 63 | return fmt.Sprintf("{Read:%d; [%d:%d], offset:%d}", r.readData, r.first, r.last, r.offset) 64 | } 65 | 66 | var ErrUnknownFormat = errors.New("unknown index format") 67 | 68 | // NewReader returns a reader that will decode the supplied index and data stream. 69 | // 70 | // This is compatible content from the NewWriter function. 71 | // The function will decode the index before returning. 72 | // 73 | // When you are done with the Reader, use Close to release resources. 74 | func NewReader(index io.Reader, blocks io.Reader) (IndexedReader, error) { 75 | f := &reader{streamReader: streamReader{ 76 | ready: make(chan *rblock, 8), // Read up to 8 blocks ahead 77 | closeReader: make(chan struct{}, 0), 78 | readerClosed: make(chan struct{}, 0), 79 | curBlock: 0, 80 | }} 81 | idx := bufio.NewReader(index) 82 | format, err := binary.ReadUvarint(idx) 83 | if err != nil { 84 | return nil, err 85 | } 86 | 87 | switch format { 88 | case 1: 89 | err = f.readFormat1(idx) 90 | default: 91 | err = ErrUnknownFormat 92 | } 93 | go f.blockReader(blocks) 94 | 95 | return f, err 96 | } 97 | 98 | // NewStreamReader returns a reader that will decode the supplied data stream. 99 | // 100 | // This is compatible content from the NewStreamWriter function. 101 | // 102 | // When you are done with the Reader, use Close to release resources. 103 | func NewStreamReader(in io.Reader) (Reader, error) { 104 | f := &streamReader{ 105 | ready: make(chan *rblock, 8), // Read up to 8 blocks ahead 106 | closeReader: make(chan struct{}, 0), 107 | readerClosed: make(chan struct{}, 0), 108 | curBlock: 0, 109 | } 110 | br := bufio.NewReader(in) 111 | format, err := binary.ReadUvarint(br) 112 | if err != nil { 113 | return nil, err 114 | } 115 | 116 | switch format { 117 | case 2: 118 | err = f.readFormat2(br) 119 | if err != nil { 120 | return nil, err 121 | } 122 | default: 123 | return nil, ErrUnknownFormat 124 | } 125 | 126 | go f.streamReader(br) 127 | 128 | return f, nil 129 | } 130 | 131 | // NewSeekRead returns a reader that will decode the supplied index and data stream. 132 | // 133 | // This is compatible content from the NewWriter function. 
134 | // 135 | // No blocks will be kept in memory, but the block data input must be seekable. 136 | // The function will decode the index before returning. 137 | // 138 | // When you are done with the Reader, use Close to release resources. 139 | func NewSeekReader(index io.Reader, blocks io.ReadSeeker) (IndexedReader, error) { 140 | f := &reader{streamReader: streamReader{ 141 | ready: make(chan *rblock, 8), // Read up to 8 blocks ahead 142 | closeReader: make(chan struct{}, 0), 143 | readerClosed: make(chan struct{}, 0), 144 | curBlock: 0, 145 | maxLength: 8, // We have 8 blocks readahead. 146 | }} 147 | idx := bufio.NewReader(index) 148 | format, err := binary.ReadUvarint(idx) 149 | if err != nil { 150 | return nil, err 151 | } 152 | 153 | switch format { 154 | case 1: 155 | err = f.readFormat1(idx) 156 | default: 157 | err = ErrUnknownFormat 158 | } 159 | 160 | go f.seekReader(blocks) 161 | 162 | return f, err 163 | } 164 | 165 | // readFormat1 will read the index of format 1 166 | // and prepare decoding 167 | func (f *reader) readFormat1(idx io.ByteReader) error { 168 | size, err := binary.ReadUvarint(idx) 169 | if err != nil { 170 | return err 171 | } 172 | f.size = int(size) 173 | 174 | // Insert empty block 0 175 | f.blocks = append(f.blocks, nil) 176 | i := 0 177 | var foffset int64 178 | // Read blocks 179 | for { 180 | i++ 181 | offset, err := binary.ReadUvarint(idx) 182 | if err != nil { 183 | return err 184 | } 185 | switch offset { 186 | // new block 187 | case 0: 188 | r, err := binary.ReadUvarint(idx) 189 | if err != nil { 190 | return err 191 | } 192 | if r > size { 193 | return fmt.Errorf("invalid size for block %d, %d > %d", i, r, size) 194 | } 195 | f.blocks = append(f.blocks, &rblock{first: i, last: i, readData: int(size - r), offset: foffset}) 196 | foffset += int64(size - r) 197 | // Last block 198 | case math.MaxUint64: 199 | r, err := binary.ReadUvarint(idx) 200 | if err != nil { 201 | return err 202 | } 203 | if r > size { 204 | return fmt.Errorf("invalid size for block %d, %d > %d", i, r, size) 205 | } 206 | f.blocks = append(f.blocks, &rblock{readData: int(size - r), offset: foffset}) 207 | foffset += int64(size - r) 208 | // Continuation should be 0 209 | r, err = binary.ReadUvarint(idx) 210 | if err != nil { 211 | return err 212 | } 213 | if r != 0 { 214 | return fmt.Errorf("invalid continuation, should be 0, was %d", r) 215 | } 216 | return nil 217 | // Deduplicated block 218 | default: 219 | pos := len(f.blocks) - int(offset) 220 | if pos <= 0 || pos >= len(f.blocks) { 221 | return fmt.Errorf("invalid offset encountered at block %d, offset was %d", len(f.blocks), offset) 222 | } 223 | // Update last position. 224 | org := f.blocks[pos] 225 | org.last = i 226 | f.blocks = append(f.blocks, org) 227 | } 228 | } 229 | } 230 | 231 | // readFormat2 will read the header data of format 2 232 | // and stop at the first block. 233 | func (f *streamReader) readFormat2(rd io.ByteReader) error { 234 | size, err := binary.ReadUvarint(rd) 235 | if err != nil { 236 | return err 237 | } 238 | if size < MinBlockSize { 239 | return ErrSizeTooSmall 240 | } 241 | f.size = int(size) 242 | 243 | maxLength, err := binary.ReadUvarint(rd) 244 | if err != nil { 245 | return err 246 | } 247 | if maxLength < 1 { 248 | return ErrMaxMemoryTooSmall 249 | } 250 | f.maxLength = maxLength 251 | return nil 252 | } 253 | 254 | // Read will read from the input stream and return the 255 | // deduplicated data. 
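// Read fills b completely unless the end of the stream is reached or an
// error occurs; io.EOF is returned once all blocks have been delivered.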
256 | func (f *streamReader) Read(b []byte) (int, error) { 257 | read := 0 258 | for len(b) > 0 { 259 | // Read next 260 | if len(f.curData) == 0 { 261 | f.curBlock++ 262 | next, ok := <-f.ready 263 | if !ok { 264 | return read, io.EOF 265 | } 266 | if next.err != nil { 267 | return read, next.err 268 | } 269 | f.curData = next.data 270 | // We don't want to keep it, if this is the last block 271 | if f.curBlock == next.last { 272 | next.data = nil 273 | } 274 | if len(f.curData) == 0 { 275 | continue 276 | } 277 | } 278 | n := copy(b, f.curData) 279 | read += n 280 | b = b[n:] 281 | f.curData = f.curData[n:] 282 | } 283 | return read, nil 284 | } 285 | 286 | // WriteTo writes data to w until there's no more data to write or when an error occurs. 287 | // The return value n is the number of bytes written. 288 | // Any error encountered during the write is also returned. 289 | func (f *streamReader) WriteTo(w io.Writer) (int64, error) { 290 | written := int64(0) 291 | for { 292 | next, ok := <-f.ready 293 | if !ok { 294 | return written, io.EOF 295 | } 296 | if next.err != nil { 297 | return written, next.err 298 | } 299 | f.curBlock++ 300 | f.curData = next.data 301 | 302 | // We don't want to keep it, if this is the last block 303 | if f.curBlock == next.last { 304 | next.data = nil 305 | } 306 | n, err := w.Write(f.curData) 307 | written += int64(n) 308 | if err != nil { 309 | return written, err 310 | } 311 | } 312 | } 313 | 314 | // MaxMem returns the estimated maximum RAM usage needed to 315 | // unpack this content. 316 | func (f *streamReader) MaxMem() int { 317 | if f.maxLength > 0 { 318 | return int(f.maxLength) * f.size 319 | } 320 | return -1 321 | } 322 | 323 | // MaxMem returns the estimated maximum RAM usage needed to 324 | // unpack this content. 325 | func (f *reader) MaxMem() int { 326 | i := 1 // Current block 327 | curUse := 0 328 | maxUse := 0 329 | for { 330 | b := f.blocks[i] 331 | if b.first == i { 332 | curUse += b.readData 333 | } 334 | if curUse > maxUse { 335 | maxUse = curUse 336 | } 337 | 338 | if b.last == i { 339 | curUse -= b.readData 340 | } 341 | 342 | i++ 343 | // We read them all 344 | if i == len(f.blocks) { 345 | break 346 | } 347 | } 348 | return maxUse 349 | } 350 | 351 | func (f *reader) BlockSizes() []int { 352 | if len(f.blocks) < 2 { 353 | return nil 354 | } 355 | 356 | ret := make([]int, len(f.blocks)-1) 357 | for i, bl := range f.blocks[1:] { 358 | ret[i] = bl.readData 359 | } 360 | return ret 361 | } 362 | 363 | // blockReader will read format 1 blocks and deliver them 364 | // to the ready channel. 365 | // The function will return if the stream is finished, 366 | // or an error occurs 367 | func (f *reader) blockReader(in io.Reader) { 368 | defer close(f.readerClosed) 369 | defer close(f.ready) 370 | 371 | i := 1 // Current block 372 | totalRead := 0 373 | for { 374 | b := f.blocks[i] 375 | // Read it? 
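// Only the first occurrence of a block carries data in the data stream;
// deduplicated entries share the same *rblock, so their data has already
// been read by the time they are reached.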
376 | if len(b.data) != b.readData { 377 | b.data = make([]byte, b.readData) 378 | n, err := io.ReadFull(in, b.data) 379 | if err != nil { 380 | b.err = err 381 | } else if n != b.readData { 382 | b.err = io.ErrUnexpectedEOF 383 | } 384 | totalRead += n 385 | } 386 | // Send or close 387 | select { 388 | case <-f.closeReader: 389 | return 390 | case f.ready <- b: 391 | } 392 | // Exit because of an error 393 | if b.err != nil { 394 | return 395 | } 396 | i++ 397 | // We read them all 398 | if i == len(f.blocks) { 399 | return 400 | } 401 | } 402 | } 403 | 404 | // streamReader will read blocks from a single stream 405 | // and deliver them to the "ready" channel. 406 | // The function will return if an error occurs or 407 | // the stream is finished. 408 | func (f *streamReader) streamReader(stream *bufio.Reader) { 409 | defer close(f.readerClosed) 410 | defer close(f.ready) 411 | 412 | totalRead := 0 413 | 414 | // Create backreference buffers 415 | blocks := make([][]byte, f.maxLength) 416 | for i := range blocks { 417 | blocks[i] = make([]byte, f.size) 418 | } 419 | 420 | i := uint64(1) // Current block 421 | for { 422 | b := &rblock{} 423 | lastBlock := false 424 | 425 | b.err = func() error { 426 | offset, err := binary.ReadUvarint(stream) 427 | if err != nil { 428 | return err 429 | } 430 | // Read it? 431 | if offset == 0 || offset == math.MaxUint64 { 432 | s, err := binary.ReadUvarint(stream) 433 | if err != nil { 434 | return err 435 | } 436 | size := f.size - int(s) 437 | if offset == math.MaxUint64 && size == 0 { 438 | lastBlock = true 439 | return nil 440 | } 441 | if size > f.size || size <= 0 { 442 | return fmt.Errorf("invalid size encountered at block %d, size was %d", i, size) 443 | } 444 | b.data = make([]byte, size) 445 | n, err := io.ReadFull(stream, b.data) 446 | if err != nil { 447 | return err 448 | } else if n != len(b.data) { 449 | return io.ErrUnexpectedEOF 450 | } 451 | totalRead += n 452 | if offset == math.MaxUint64 { 453 | lastBlock = true 454 | } 455 | } else { 456 | if offset > f.maxLength { 457 | return fmt.Errorf("invalid offset encountered at block %d, offset was %d", i, offset) 458 | } 459 | pos := i - offset 460 | if pos <= 0 { 461 | return fmt.Errorf("invalid offset encountered at block %d, offset was %d", i, offset) 462 | } 463 | src := blocks[pos%f.maxLength] 464 | b.data = src 465 | } 466 | 467 | blocks[i%f.maxLength] = b.data 468 | return nil 469 | }() 470 | // Read continuation 471 | if lastBlock { 472 | r, err := binary.ReadUvarint(stream) 473 | if err != nil { 474 | b.err = err 475 | } 476 | if r != 0 { 477 | b.err = fmt.Errorf("invalid continuation, should be 0, was %d", r) 478 | } 479 | } 480 | 481 | // Send or close 482 | select { 483 | case <-f.closeReader: 484 | return 485 | case f.ready <- b: 486 | } 487 | // Exit because of an error 488 | if b.err != nil || lastBlock { 489 | return 490 | } 491 | i++ 492 | } 493 | } 494 | 495 | // seekReader will read format 1 blocks and deliver them 496 | // to the ready channel. 497 | // The function will return if the stream is finished, 498 | // or an error occurs 499 | func (f *reader) seekReader(in io.ReadSeeker) { 500 | defer close(f.readerClosed) 501 | defer close(f.ready) 502 | 503 | i := 1 // Current block 504 | var foffset int64 505 | for { 506 | // Copy b, we are modifying it. 
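// Deduplicated entries point at the same rblock as their first occurrence,
// and the seek reader re-reads every occurrence from the data stream, so it
// works on a copy rather than mutating the shared struct.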
507 | b := *f.blocks[i] 508 | 509 | // Seek to offset if needed, and 510 | if b.offset != foffset { 511 | _, err := in.Seek(b.offset, 0) 512 | if err != nil { 513 | b.err = err 514 | } 515 | } 516 | if b.err == nil { 517 | b.data = make([]byte, b.readData) 518 | n, err := io.ReadFull(in, b.data) 519 | if err != nil { 520 | b.err = err 521 | } else if n != b.readData { 522 | b.err = io.ErrUnexpectedEOF 523 | } 524 | foffset = b.offset + int64(n) 525 | } 526 | 527 | // Always release the memory of this block 528 | b.last = i 529 | 530 | // Send or close 531 | select { 532 | case <-f.closeReader: 533 | return 534 | case f.ready <- &b: 535 | } 536 | // Exit because of an error 537 | if b.err != nil { 538 | return 539 | } 540 | i++ 541 | // We read them all 542 | if i == len(f.blocks) { 543 | return 544 | } 545 | } 546 | } 547 | 548 | // Close the reader and shut down the running goroutines. 549 | func (f *streamReader) Close() error { 550 | select { 551 | case <-f.readerClosed: 552 | case f.closeReader <- struct{}{}: 553 | <-f.readerClosed 554 | } 555 | return nil 556 | } 557 | -------------------------------------------------------------------------------- /reader_test.go: -------------------------------------------------------------------------------- 1 | package dedup_test 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | "testing" 7 | 8 | "io/ioutil" 9 | 10 | "fmt" 11 | 12 | "github.com/klauspost/dedup" 13 | ) 14 | 15 | func TestReader(t *testing.T) { 16 | idx := bytes.Buffer{} 17 | data := bytes.Buffer{} 18 | 19 | const totalinput = 10<<20 + 65 20 | input := getBufferSize(totalinput) 21 | 22 | const size = 64 << 10 23 | b := input.Bytes() 24 | // Create some duplicates 25 | for i := 0; i < 50; i++ { 26 | // Read from 10 first blocks 27 | src := b[(i%10)*size : (i%10)*size+size] 28 | // Write into the following ones 29 | dst := b[(10+i)*size : (i+10)*size+size] 30 | copy(dst, src) 31 | } 32 | input = bytes.NewBuffer(b) 33 | w, err := dedup.NewWriter(&idx, &data, dedup.ModeFixed, size, 0) 34 | if err != nil { 35 | t.Fatal(err) 36 | } 37 | io.Copy(w, input) 38 | err = w.Close() 39 | if err != nil { 40 | t.Fatal(err) 41 | } 42 | 43 | t.Log("Fixed Index size:", idx.Len()) 44 | t.Log("Fixed Data size:", data.Len(), "-", data.Len()*100/totalinput, "%") 45 | 46 | r, err := dedup.NewReader(&idx, &data) 47 | if err != nil { 48 | t.Fatal(err) 49 | } 50 | 51 | t.Log("Maximum estimated memory:", r.MaxMem(), "bytes") 52 | 53 | out, err := ioutil.ReadAll(r) 54 | if err != io.EOF && err != nil { 55 | t.Fatal(err) 56 | } 57 | if len(b) != len(out) { 58 | t.Fatalf("Expected len %d, got %d", len(b), len(out)) 59 | } 60 | if bytes.Compare(b, out) != 0 { 61 | t.Fatal("Output mismatch") 62 | } 63 | err = r.Close() 64 | if err != nil { 65 | t.Fatal(err) 66 | } 67 | blocks := r.BlockSizes() 68 | for _, s := range blocks[:len(blocks)-1] { 69 | if s != size { 70 | t.Fatal("wrong size, expected", size, "got", s) 71 | } 72 | } 73 | } 74 | 75 | func TestReaderStream(t *testing.T) { 76 | data := bytes.Buffer{} 77 | 78 | const totalinput = 10<<20 + 65 79 | input := getBufferSize(totalinput) 80 | 81 | const size = 64 << 10 82 | b := input.Bytes() 83 | // Create some duplicates 84 | for i := 0; i < 50; i++ { 85 | // Read from 10 first blocks 86 | src := b[(i%10)*size : (i%10)*size+size] 87 | // Write into the following ones 88 | dst := b[(10+i)*size : (i+10)*size+size] 89 | copy(dst, src) 90 | } 91 | input = bytes.NewBuffer(b) 92 | w, err := dedup.NewStreamWriter(&data, dedup.ModeFixed, size, 10*size) 93 | if err != nil { 94 | 
t.Fatal(err) 95 | } 96 | io.Copy(w, input) 97 | err = w.Close() 98 | if err != nil { 99 | t.Fatal(err) 100 | } 101 | 102 | t.Log("Fixed Data size:", data.Len(), "-", data.Len()*100/totalinput, "%") 103 | 104 | r, err := dedup.NewStreamReader(&data) 105 | if err != nil { 106 | t.Fatal(err) 107 | } 108 | 109 | t.Log("Maximum estimated memory:", r.MaxMem(), "bytes") 110 | 111 | out, err := ioutil.ReadAll(r) 112 | if err != io.EOF && err != nil { 113 | t.Fatal(err) 114 | } 115 | if len(b) != len(out) { 116 | t.Fatalf("Expected len %d, got %d", len(b), len(out)) 117 | } 118 | if bytes.Compare(b, out) != 0 { 119 | t.Fatal("Output mismatch") 120 | } 121 | err = r.Close() 122 | if err != nil { 123 | t.Fatal(err) 124 | } 125 | } 126 | 127 | func TestSeekReader(t *testing.T) { 128 | idx := bytes.Buffer{} 129 | data := bytes.Buffer{} 130 | 131 | const totalinput = 50<<20 + 65 132 | input := getBufferSize(totalinput) 133 | 134 | const size = 64 << 10 135 | b := input.Bytes() 136 | // Create some duplicates 137 | for i := 0; i < 500; i++ { 138 | // Read from 10 first blocks 139 | src := b[(i%100)*size : (i%100)*size+size] 140 | // Write into the following ones 141 | dst := b[(100+i)*size : (i+100)*size+size] 142 | copy(dst, src) 143 | } 144 | input = bytes.NewBuffer(b) 145 | w, err := dedup.NewWriter(&idx, &data, dedup.ModeFixed, size, 0) 146 | if err != nil { 147 | t.Fatal(err) 148 | } 149 | io.Copy(w, input) 150 | err = w.Close() 151 | if err != nil { 152 | t.Fatal(err) 153 | } 154 | 155 | t.Log("Fixed Index size:", idx.Len()) 156 | t.Log("Fixed Data size:", data.Len(), "-", data.Len()*100/totalinput, "%") 157 | 158 | r, err := dedup.NewSeekReader(&idx, bytes.NewReader(data.Bytes())) 159 | if err != nil { 160 | t.Fatal(err) 161 | } 162 | 163 | t.Log("Maximum estimated memory:", r.MaxMem(), "bytes") 164 | 165 | out, err := ioutil.ReadAll(r) 166 | if err != io.EOF && err != nil { 167 | t.Fatal(err) 168 | } 169 | if len(b) != len(out) { 170 | t.Fatalf("Expected len %d, got %d", len(b), len(out)) 171 | } 172 | if bytes.Compare(b, out) != 0 { 173 | t.Fatal("Output mismatch") 174 | } 175 | err = r.Close() 176 | if err != nil { 177 | t.Fatal(err) 178 | } 179 | } 180 | 181 | func TestDynamicRoundtrip(t *testing.T) { 182 | idx := bytes.Buffer{} 183 | data := bytes.Buffer{} 184 | 185 | const totalinput = 10<<20 + 65 186 | input := getBufferSize(totalinput) 187 | 188 | const size = 64 << 10 189 | b := input.Bytes() 190 | // Create some duplicates 191 | for i := 0; i < 50; i++ { 192 | // Read from 10 first blocks 193 | src := b[(i%10)*size : (i%10)*size+size] 194 | // Write into the following ones 195 | dst := b[(10+i)*size : (i+10)*size+size] 196 | copy(dst, src) 197 | } 198 | input = bytes.NewBuffer(b) 199 | w, err := dedup.NewWriter(&idx, &data, dedup.ModeDynamic, size, 0) 200 | if err != nil { 201 | t.Fatal(err) 202 | } 203 | io.Copy(w, input) 204 | err = w.Close() 205 | if err != nil { 206 | t.Fatal(err) 207 | } 208 | 209 | t.Log("Dynamic Index size:", idx.Len()) 210 | t.Log("Dynamic Data size:", data.Len()) 211 | 212 | r, err := dedup.NewReader(&idx, &data) 213 | if err != nil { 214 | t.Fatal(err) 215 | } 216 | 217 | t.Log("Maximum estimated memory:", r.MaxMem(), "bytes") 218 | blocks := r.BlockSizes() 219 | avg := 0 220 | for _, v := range blocks { 221 | if v > size { 222 | t.Fatal("too big block returned, should not be >", size, "was", v) 223 | } 224 | avg += v 225 | } 226 | t.Log("Average block size:", avg/len(blocks), "bytes") 227 | 228 | out, err := ioutil.ReadAll(r) 229 | if err != io.EOF && err != 
nil { 230 | t.Fatal(err) 231 | } 232 | if len(b) != len(out) { 233 | t.Fatalf("Expected len %d, got %d", len(b), len(out)) 234 | } 235 | if bytes.Compare(b, out) != 0 { 236 | t.Fatal("Output mismatch") 237 | } 238 | err = r.Close() 239 | if err != nil { 240 | t.Fatal(err) 241 | } 242 | } 243 | 244 | func TestReaderWriteTo(t *testing.T) { 245 | idx := bytes.Buffer{} 246 | data := bytes.Buffer{} 247 | 248 | const totalinput = 10<<20 + 65 249 | input := getBufferSize(totalinput) 250 | 251 | const size = 64 << 10 252 | b := input.Bytes() 253 | // Create some duplicates 254 | for i := 0; i < 50; i++ { 255 | // Read from 10 first blocks 256 | src := b[(i%10)*size : (i%10)*size+size] 257 | // Write into the following ones 258 | dst := b[(10+i)*size : (i+10)*size+size] 259 | copy(dst, src) 260 | } 261 | input = bytes.NewBuffer(b) 262 | w, err := dedup.NewWriter(&idx, &data, dedup.ModeFixed, size, 0) 263 | if err != nil { 264 | t.Fatal(err) 265 | } 266 | io.Copy(w, input) 267 | err = w.Close() 268 | if err != nil { 269 | t.Fatal(err) 270 | } 271 | 272 | r, err := dedup.NewReader(&idx, &data) 273 | if err != nil { 274 | t.Fatal(err) 275 | } 276 | 277 | dst := &bytes.Buffer{} 278 | n, err := r.WriteTo(dst) 279 | if err != io.EOF && err != nil { 280 | t.Fatal(err) 281 | } 282 | if len(b) != int(n) { 283 | t.Errorf("Write count, expected n %d, got %d", len(b), n) 284 | } 285 | 286 | out := dst.Bytes() 287 | if len(b) != len(out) { 288 | t.Fatalf("Expected len %d, got %d", len(b), len(out)) 289 | } 290 | if len(b) != len(out) { 291 | t.Fatalf("Expected len %d, got %d", len(b), len(out)) 292 | } 293 | if bytes.Compare(b, out) != 0 { 294 | t.Fatal("Output mismatch") 295 | } 296 | err = r.Close() 297 | if err != nil { 298 | t.Fatal(err) 299 | } 300 | blocks := r.BlockSizes() 301 | for _, s := range blocks[:len(blocks)-1] { 302 | if s != size { 303 | t.Fatal("wrong size, expected", size, "got", s) 304 | } 305 | } 306 | } 307 | 308 | // Indexed stream, 10MB input, 64K blocks 309 | func BenchmarkReader64K(t *testing.B) { 310 | idx := &bytes.Buffer{} 311 | data := &bytes.Buffer{} 312 | 313 | const totalinput = 10 << 20 314 | input := getBufferSize(totalinput) 315 | 316 | const size = 64 << 10 317 | b := input.Bytes() 318 | // Create some duplicates 319 | for i := 0; i < 50; i++ { 320 | // Read from 10 first blocks 321 | src := b[(i%10)*size : (i%10)*size+size] 322 | // Write into the following ones 323 | dst := b[(10+i)*size : (i+10)*size+size] 324 | copy(dst, src) 325 | } 326 | input = bytes.NewBuffer(b) 327 | w, err := dedup.NewWriter(idx, data, dedup.ModeFixed, size, 0) 328 | if err != nil { 329 | t.Fatal(err) 330 | } 331 | _, err = io.Copy(w, input) 332 | if err != nil { 333 | t.Fatal(err) 334 | } 335 | err = w.Close() 336 | if err != nil { 337 | t.Fatal(err) 338 | } 339 | 340 | index := idx.Bytes() 341 | alldata := data.Bytes() 342 | 343 | t.ResetTimer() 344 | t.SetBytes(totalinput) 345 | for i := 0; i < t.N; i++ { 346 | idx = bytes.NewBuffer(index) 347 | data = bytes.NewBuffer(alldata) 348 | r, err := dedup.NewReader(idx, data) 349 | if err != nil { 350 | t.Fatal(err) 351 | } 352 | n, err := io.Copy(ioutil.Discard, r) 353 | if err != nil && err != io.EOF { 354 | t.Fatal(err) 355 | } 356 | if n != int64(len(b)) { 357 | t.Fatal("read was short, expected", len(b), "was", n) 358 | } 359 | err = r.Close() 360 | if err != nil { 361 | t.Fatal(err) 362 | } 363 | } 364 | } 365 | 366 | // Indexed stream, 10MB input, 4K blocks 367 | func BenchmarkReader4K(t *testing.B) { 368 | idx := &bytes.Buffer{} 369 | data := 
&bytes.Buffer{} 370 | 371 | const totalinput = 10 << 20 372 | input := getBufferSize(totalinput) 373 | 374 | const size = 4 << 10 375 | b := input.Bytes() 376 | // Create some duplicates 377 | for i := 0; i < 500; i++ { 378 | // Read from 10 first blocks 379 | src := b[(i%10)*size : (i%10)*size+size] 380 | // Write into the following ones 381 | dst := b[(10+i)*size : (i+10)*size+size] 382 | copy(dst, src) 383 | } 384 | input = bytes.NewBuffer(b) 385 | w, err := dedup.NewWriter(idx, data, dedup.ModeFixed, size, 0) 386 | if err != nil { 387 | t.Fatal(err) 388 | } 389 | _, err = io.Copy(w, input) 390 | if err != nil { 391 | t.Fatal(err) 392 | } 393 | err = w.Close() 394 | if err != nil { 395 | t.Fatal(err) 396 | } 397 | 398 | index := idx.Bytes() 399 | alldata := data.Bytes() 400 | 401 | t.ResetTimer() 402 | t.SetBytes(totalinput) 403 | for i := 0; i < t.N; i++ { 404 | idx := bytes.NewBuffer(index) 405 | data := bytes.NewBuffer(alldata) 406 | r, err := dedup.NewReader(idx, data) 407 | if err != nil { 408 | t.Fatal(err) 409 | } 410 | n, err := io.Copy(ioutil.Discard, r) 411 | if err != nil && err != io.EOF { 412 | t.Fatal(err) 413 | } 414 | if n != int64(len(b)) { 415 | t.Fatal("read was short, expected", len(b), "was", n) 416 | } 417 | err = r.Close() 418 | if err != nil { 419 | t.Fatal(err) 420 | } 421 | } 422 | } 423 | 424 | // Indexed stream, 10MB input, 1K blocks 425 | func BenchmarkReader1K(t *testing.B) { 426 | idx := &bytes.Buffer{} 427 | data := &bytes.Buffer{} 428 | 429 | const totalinput = 10 << 20 430 | input := getBufferSize(totalinput) 431 | 432 | const size = 1 << 10 433 | b := input.Bytes() 434 | // Create some duplicates 435 | for i := 0; i < 500; i++ { 436 | // Read from 10 first blocks 437 | src := b[(i%10)*size : (i%10)*size+size] 438 | // Write into the following ones 439 | dst := b[(10+i)*size : (i+10)*size+size] 440 | copy(dst, src) 441 | } 442 | input = bytes.NewBuffer(b) 443 | w, err := dedup.NewWriter(idx, data, dedup.ModeFixed, size, 0) 444 | if err != nil { 445 | t.Fatal(err) 446 | } 447 | _, err = io.Copy(w, input) 448 | if err != nil { 449 | t.Fatal(err) 450 | } 451 | err = w.Close() 452 | if err != nil { 453 | t.Fatal(err) 454 | } 455 | 456 | index := idx.Bytes() 457 | alldata := data.Bytes() 458 | 459 | t.ResetTimer() 460 | t.SetBytes(totalinput) 461 | for i := 0; i < t.N; i++ { 462 | idx := bytes.NewBuffer(index) 463 | data := bytes.NewBuffer(alldata) 464 | r, err := dedup.NewReader(idx, data) 465 | if err != nil { 466 | t.Fatal(err) 467 | } 468 | n, err := io.Copy(ioutil.Discard, r) 469 | if err != nil && err != io.EOF { 470 | t.Fatal(err) 471 | } 472 | if n != int64(len(b)) { 473 | t.Fatal("read was short, expected", len(b), "was", n) 474 | } 475 | err = r.Close() 476 | if err != nil { 477 | t.Fatal(err) 478 | } 479 | } 480 | } 481 | 482 | // Stream, 64K blocks on 10MB data. 
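// The stream writer below is given a memory limit of 100*size bytes,
// i.e. a backreference window of roughly 100 blocks.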
483 | func BenchmarkReaderStream64K(t *testing.B) { 484 | data := &bytes.Buffer{} 485 | 486 | const totalinput = 10 << 20 487 | input := getBufferSize(totalinput) 488 | 489 | const size = 64 << 10 490 | b := input.Bytes() 491 | // Create some duplicates 492 | for i := 0; i < 50; i++ { 493 | // Read from 10 first blocks 494 | src := b[(i%10)*size : (i%10)*size+size] 495 | // Write into the following ones 496 | dst := b[(10+i)*size : (i+10)*size+size] 497 | copy(dst, src) 498 | } 499 | input = bytes.NewBuffer(b) 500 | w, err := dedup.NewStreamWriter(data, dedup.ModeFixed, size, 100*size) 501 | if err != nil { 502 | t.Fatal(err) 503 | } 504 | io.Copy(w, input) 505 | err = w.Close() 506 | if err != nil { 507 | t.Fatal(err) 508 | } 509 | 510 | alldata := data.Bytes() 511 | 512 | t.ResetTimer() 513 | t.SetBytes(totalinput) 514 | for i := 0; i < t.N; i++ { 515 | input := bytes.NewBuffer(alldata) 516 | r, err := dedup.NewStreamReader(input) 517 | if err != nil { 518 | t.Fatal(err) 519 | } 520 | 521 | n, err := io.Copy(ioutil.Discard, r) 522 | if err != io.EOF && err != nil { 523 | t.Fatal(err) 524 | } 525 | if len(b) != int(n) { 526 | t.Fatalf("Expected len %d, got %d", len(b), n) 527 | } 528 | err = r.Close() 529 | if err != nil { 530 | t.Fatal(err) 531 | } 532 | } 533 | } 534 | 535 | // Stream, 4K blocks on 10MB data. 536 | func BenchmarkReaderStream4K(t *testing.B) { 537 | data := &bytes.Buffer{} 538 | 539 | const totalinput = 10 << 20 540 | input := getBufferSize(totalinput) 541 | 542 | const size = 4 << 10 543 | b := input.Bytes() 544 | // Create some duplicates 545 | for i := 0; i < 100; i++ { 546 | // Read from 10 first blocks 547 | src := b[(i%10)*size : (i%10)*size+size] 548 | // Write into the following ones 549 | dst := b[(10+i)*size : (i+10)*size+size] 550 | copy(dst, src) 551 | } 552 | input = bytes.NewBuffer(b) 553 | w, err := dedup.NewStreamWriter(data, dedup.ModeFixed, size, 100*size) 554 | if err != nil { 555 | t.Fatal(err) 556 | } 557 | io.Copy(w, input) 558 | err = w.Close() 559 | if err != nil { 560 | t.Fatal(err) 561 | } 562 | 563 | alldata := data.Bytes() 564 | 565 | t.ResetTimer() 566 | t.SetBytes(totalinput) 567 | for i := 0; i < t.N; i++ { 568 | input := bytes.NewBuffer(alldata) 569 | r, err := dedup.NewStreamReader(input) 570 | if err != nil { 571 | t.Fatal(err) 572 | } 573 | 574 | n, err := io.Copy(ioutil.Discard, r) 575 | if err != io.EOF && err != nil { 576 | t.Fatal(err) 577 | } 578 | if len(b) != int(n) { 579 | t.Fatalf("Expected len %d, got %d", len(b), n) 580 | } 581 | err = r.Close() 582 | if err != nil { 583 | t.Fatal(err) 584 | } 585 | } 586 | } 587 | 588 | // Stream, 1K blocks on 10MB data. 
589 | func BenchmarkReaderStream1K(t *testing.B) { 590 | data := &bytes.Buffer{} 591 | 592 | const totalinput = 10 << 20 593 | input := getBufferSize(totalinput) 594 | 595 | const size = 1 << 10 596 | b := input.Bytes() 597 | // Create some duplicates 598 | for i := 0; i < 500; i++ { 599 | // Read from 10 first blocks 600 | src := b[(i%10)*size : (i%10)*size+size] 601 | // Write into the following ones 602 | dst := b[(10+i)*size : (i+10)*size+size] 603 | copy(dst, src) 604 | } 605 | input = bytes.NewBuffer(b) 606 | w, err := dedup.NewStreamWriter(data, dedup.ModeFixed, size, 100*size) 607 | if err != nil { 608 | t.Fatal(err) 609 | } 610 | io.Copy(w, input) 611 | err = w.Close() 612 | if err != nil { 613 | t.Fatal(err) 614 | } 615 | 616 | alldata := data.Bytes() 617 | 618 | t.ResetTimer() 619 | t.SetBytes(totalinput) 620 | for i := 0; i < t.N; i++ { 621 | input := bytes.NewBuffer(alldata) 622 | r, err := dedup.NewStreamReader(input) 623 | if err != nil { 624 | t.Fatal(err) 625 | } 626 | 627 | n, err := io.Copy(ioutil.Discard, r) 628 | if err != io.EOF && err != nil { 629 | t.Fatal(err) 630 | } 631 | if len(b) != int(n) { 632 | t.Fatalf("Expected len %d, got %d", len(b), n) 633 | } 634 | err = r.Close() 635 | if err != nil { 636 | t.Fatal(err) 637 | } 638 | } 639 | } 640 | 641 | // This will deduplicate a buffer of zeros to an indexed stream 642 | func ExampleNewReader() { 643 | // Create data we can read. 644 | var idx, data bytes.Buffer 645 | input := bytes.NewBuffer(make([]byte, 50000)) 646 | w, _ := dedup.NewWriter(&idx, &data, dedup.ModeFixed, 1000, 0) 647 | _, _ = io.Copy(w, input) 648 | _ = w.Close() 649 | 650 | // Create a new reader. 651 | r, err := dedup.NewReader(&idx, &data) 652 | if err != nil { 653 | panic(err) 654 | } 655 | 656 | // Inspect how much memory it will use. 657 | fmt.Println("Memory use:", r.MaxMem()) 658 | 659 | var dst bytes.Buffer 660 | 661 | // Read everything 662 | _, err = io.Copy(&dst, r) 663 | if err != nil && err != io.EOF { 664 | panic(err) 665 | } 666 | 667 | // Let us inspect what was written: 668 | fmt.Println("Returned data length:", dst.Len()) 669 | fmt.Println("Everything zero:", 0 == bytes.Compare(dst.Bytes(), make([]byte, 50000))) 670 | 671 | // OUTPUT: Memory use: 1000 672 | // Returned data length: 50000 673 | // Everything zero: true 674 | } 675 | 676 | // This will deduplicate a buffer of zeros to an indexed stream 677 | func ExampleNewStreamReader() { 678 | // Create data we can read. 679 | var data bytes.Buffer 680 | input := bytes.NewBuffer(make([]byte, 50000)) 681 | // Set the memory limit to 10000 bytes 682 | w, _ := dedup.NewStreamWriter(&data, dedup.ModeFixed, 1000, 10000) 683 | _, _ = io.Copy(w, input) 684 | _ = w.Close() 685 | 686 | // Create a new stream reader: 687 | r, err := dedup.NewStreamReader(&data) 688 | if err != nil { 689 | panic(err) 690 | } 691 | 692 | // Inspect how much memory it will use. 
693 | // Since this is a stream, it will print the worst possible scenario 694 | fmt.Println("Memory use:", r.MaxMem()) 695 | 696 | var dst bytes.Buffer 697 | 698 | // Read everything 699 | _, err = io.Copy(&dst, r) 700 | if err != nil && err != io.EOF { 701 | panic(err) 702 | } 703 | 704 | // Let us inspect what was written: 705 | fmt.Println("Returned data length:", dst.Len()) 706 | fmt.Println("Everything zero:", 0 == bytes.Compare(dst.Bytes(), make([]byte, 50000))) 707 | 708 | // OUTPUT: Memory use: 10000 709 | // Returned data length: 50000 710 | // Everything zero: true 711 | } 712 | -------------------------------------------------------------------------------- /sort/hashsort.go: -------------------------------------------------------------------------------- 1 | package sort 2 | 3 | // Adapted from https://github.com/AlasdairF/Sort/tree/master/Int 4 | // - no LICENSE, see https://github.com/AlasdairF/Sort/issues/1 5 | // ================= COMMON ================= 6 | 7 | func min(a, b int) int { 8 | if a < b { 9 | return a 10 | } 11 | return b 12 | } 13 | 14 | // ------------- ASCENDING ------------- 15 | 16 | func heapSortAsc(data []int, a, b int) { 17 | first := a 18 | lo := 0 19 | hi := b - a 20 | for i := (hi - 1) / 2; i >= 0; i-- { 21 | siftDownAsc(data, i, hi, first) 22 | } 23 | for i := hi - 1; i >= 0; i-- { 24 | data[first], data[first+i] = data[first+i], data[first] 25 | siftDownAsc(data, lo, i, first) 26 | } 27 | } 28 | 29 | func insertionSortAsc(data []int, a, b int) { 30 | var j int 31 | for i := a + 1; i < b; i++ { 32 | for j = i; j > a && data[j] < data[j-1]; j-- { 33 | data[j], data[j-1] = data[j-1], data[j] 34 | } 35 | } 36 | } 37 | 38 | func siftDownAsc(data []int, lo, hi, first int) { 39 | root := lo 40 | for { 41 | child := 2*root + 1 42 | if child >= hi { 43 | break 44 | } 45 | if child+1 < hi && data[first+child] < data[first+child+1] { 46 | child++ 47 | } 48 | if data[first+root] >= data[first+child] { 49 | return 50 | } 51 | data[first+root], data[first+child] = data[first+child], data[first+root] 52 | root = child 53 | } 54 | } 55 | 56 | func medianOfThreeAsc(data []int, m1, m0, m2 int) { 57 | // bubble sort on 3 elements 58 | if data[m1] < data[m0] { 59 | data[m1], data[m0] = data[m0], data[m1] 60 | } 61 | if data[m2] < data[m1] { 62 | data[m2], data[m1] = data[m1], data[m2] 63 | } 64 | if data[m1] < data[m0] { 65 | data[m1], data[m0] = data[m0], data[m1] 66 | } 67 | } 68 | 69 | func swapRangeAsc(data []int, a, b, n int) { 70 | for i := 0; i < n; i++ { 71 | data[a], data[b] = data[b], data[a] 72 | a++ 73 | b++ 74 | } 75 | } 76 | 77 | func doPivotAsc(data []int, lo, hi int) (midlo, midhi int) { 78 | m := lo + (hi-lo)/2 79 | if hi-lo > 40 { 80 | s := (hi - lo) / 8 81 | medianOfThreeAsc(data, lo, lo+s, lo+2*s) 82 | medianOfThreeAsc(data, m, m-s, m+s) 83 | medianOfThreeAsc(data, hi-1, hi-1-s, hi-1-2*s) 84 | } 85 | medianOfThreeAsc(data, lo, m, hi-1) 86 | 87 | pivot := lo 88 | a, b, c, d := lo+1, lo+1, hi, hi 89 | for { 90 | for b < c { 91 | if data[b] < data[pivot] { 92 | b++ 93 | } else if data[pivot] >= data[b] { 94 | data[a], data[b] = data[b], data[a] 95 | a++ 96 | b++ 97 | } else { 98 | break 99 | } 100 | } 101 | for b < c { 102 | if data[pivot] < data[c-1] { 103 | c-- 104 | } else if data[c-1] >= data[pivot] { 105 | data[c-1], data[d-1] = data[d-1], data[c-1] 106 | c-- 107 | d-- 108 | } else { 109 | break 110 | } 111 | } 112 | if b >= c { 113 | break 114 | } 115 | data[b], data[c-1] = data[c-1], data[b] 116 | b++ 117 | c-- 118 | } 119 | 120 | n := 
min(b-a, a-lo) 121 | swapRangeAsc(data, lo, b-n, n) 122 | 123 | n = min(hi-d, d-c) 124 | swapRangeAsc(data, c, hi-n, n) 125 | 126 | return lo + b - a, hi - (d - c) 127 | } 128 | 129 | func quickSortAsc(data []int, a, b, maxDepth int) { 130 | var mlo, mhi int 131 | for b-a > 7 { 132 | if maxDepth == 0 { 133 | heapSortAsc(data, a, b) 134 | return 135 | } 136 | maxDepth-- 137 | mlo, mhi = doPivotAsc(data, a, b) 138 | if mlo-a < b-mhi { 139 | quickSortAsc(data, a, mlo, maxDepth) 140 | a = mhi 141 | } else { 142 | quickSortAsc(data, mhi, b, maxDepth) 143 | b = mlo 144 | } 145 | } 146 | if b-a > 1 { 147 | insertionSortAsc(data, a, b) 148 | } 149 | } 150 | 151 | func Asc(data []int) { 152 | maxDepth := 0 153 | for i := len(data); i > 0; i >>= 1 { 154 | maxDepth++ 155 | } 156 | maxDepth *= 2 157 | quickSortAsc(data, 0, len(data), maxDepth) 158 | } 159 | 160 | func IsSortedAsc(data []int) bool { 161 | for i := len(data) - 1; i > 0; i-- { 162 | if data[i] < data[i-1] { 163 | return false 164 | } 165 | } 166 | return true 167 | } 168 | 169 | func StableAsc(data []int) { 170 | n := len(data) 171 | blockSize := 20 172 | a, b := 0, blockSize 173 | for b <= n { 174 | insertionSortAsc(data, a, b) 175 | a = b 176 | b += blockSize 177 | } 178 | insertionSortAsc(data, a, n) 179 | 180 | for blockSize < n { 181 | a, b = 0, 2*blockSize 182 | for b <= n { 183 | symMergeAsc(data, a, a+blockSize, b) 184 | a = b 185 | b += 2 * blockSize 186 | } 187 | symMergeAsc(data, a, a+blockSize, n) 188 | blockSize *= 2 189 | } 190 | } 191 | 192 | func symMergeAsc(data []int, a, m, b int) { 193 | if a >= m || m >= b { 194 | return 195 | } 196 | mid := a + (b-a)/2 197 | n := mid + m 198 | var start, c, r, p int 199 | if m > mid { 200 | start = n - b 201 | r, p = mid, n-1 202 | for start < r { 203 | c = start + (r-start)/2 204 | if data[p-c] >= data[c] { 205 | start = c + 1 206 | } else { 207 | r = c 208 | } 209 | } 210 | } else { 211 | start = a 212 | r, p = m, n-1 213 | for start < r { 214 | c = start + (r-start)/2 215 | if data[p-c] >= data[c] { 216 | start = c + 1 217 | } else { 218 | r = c 219 | } 220 | } 221 | } 222 | end := n - start 223 | rotateAsc(data, start, m, end) 224 | symMergeAsc(data, a, start, mid) 225 | symMergeAsc(data, mid, end, b) 226 | } 227 | 228 | func rotateAsc(data []int, a, m, b int) { 229 | i := m - a 230 | if i == 0 { 231 | return 232 | } 233 | j := b - m 234 | if j == 0 { 235 | return 236 | } 237 | if i == j { 238 | swapRangeAsc(data, a, m, i) 239 | return 240 | } 241 | p := a + i 242 | for i != j { 243 | if i > j { 244 | swapRangeAsc(data, p-i, p, j) 245 | i -= j 246 | } else { 247 | swapRangeAsc(data, p-i, p+j-i, i) 248 | j -= i 249 | } 250 | } 251 | swapRangeAsc(data, p-i, p, i) 252 | } 253 | 254 | // ------------- DESCENDING ------------- 255 | 256 | func heapSortDesc(data []int, a, b int) { 257 | first := a 258 | lo := 0 259 | hi := b - a 260 | for i := (hi - 1) / 2; i >= 0; i-- { 261 | siftDownDesc(data, i, hi, first) 262 | } 263 | for i := hi - 1; i >= 0; i-- { 264 | data[first], data[first+i] = data[first+i], data[first] 265 | siftDownDesc(data, lo, i, first) 266 | } 267 | } 268 | 269 | func insertionSortDesc(data []int, a, b int) { 270 | var j int 271 | for i := a + 1; i < b; i++ { 272 | for j = i; j > a && data[j] > data[j-1]; j-- { 273 | data[j], data[j-1] = data[j-1], data[j] 274 | } 275 | } 276 | } 277 | 278 | func siftDownDesc(data []int, lo, hi, first int) { 279 | root := lo 280 | for { 281 | child := 2*root + 1 282 | if child >= hi { 283 | break 284 | } 285 | if child+1 < hi && 
data[first+child] > data[first+child+1] { 286 | child++ 287 | } 288 | if data[first+root] <= data[first+child] { 289 | return 290 | } 291 | data[first+root], data[first+child] = data[first+child], data[first+root] 292 | root = child 293 | } 294 | } 295 | 296 | func medianOfThreeDesc(data []int, m1, m0, m2 int) { 297 | // bubble sort on 3 elements 298 | if data[m1] > data[m0] { 299 | data[m1], data[m0] = data[m0], data[m1] 300 | } 301 | if data[m2] > data[m1] { 302 | data[m2], data[m1] = data[m1], data[m2] 303 | } 304 | if data[m1] > data[m0] { 305 | data[m1], data[m0] = data[m0], data[m1] 306 | } 307 | } 308 | 309 | func swapRangeDesc(data []int, a, b, n int) { 310 | for i := 0; i < n; i++ { 311 | data[a], data[b] = data[b], data[a] 312 | a++ 313 | b++ 314 | } 315 | } 316 | 317 | func doPivotDesc(data []int, lo, hi int) (midlo, midhi int) { 318 | m := lo + (hi-lo)/2 319 | if hi-lo > 40 { 320 | s := (hi - lo) / 8 321 | medianOfThreeDesc(data, lo, lo+s, lo+2*s) 322 | medianOfThreeDesc(data, m, m-s, m+s) 323 | medianOfThreeDesc(data, hi-1, hi-1-s, hi-1-2*s) 324 | } 325 | medianOfThreeDesc(data, lo, m, hi-1) 326 | 327 | pivot := lo 328 | a, b, c, d := lo+1, lo+1, hi, hi 329 | for { 330 | for b < c { 331 | if data[b] > data[pivot] { 332 | b++ 333 | } else if data[pivot] <= data[b] { 334 | data[a], data[b] = data[b], data[a] 335 | a++ 336 | b++ 337 | } else { 338 | break 339 | } 340 | } 341 | for b < c { 342 | if data[pivot] > data[c-1] { 343 | c-- 344 | } else if data[c-1] <= data[pivot] { 345 | data[c-1], data[d-1] = data[d-1], data[c-1] 346 | c-- 347 | d-- 348 | } else { 349 | break 350 | } 351 | } 352 | if b >= c { 353 | break 354 | } 355 | data[b], data[c-1] = data[c-1], data[b] 356 | b++ 357 | c-- 358 | } 359 | 360 | n := min(b-a, a-lo) 361 | swapRangeDesc(data, lo, b-n, n) 362 | 363 | n = min(hi-d, d-c) 364 | swapRangeDesc(data, c, hi-n, n) 365 | 366 | return lo + b - a, hi - (d - c) 367 | } 368 | 369 | func quickSortDesc(data []int, a, b, maxDepth int) { 370 | var mlo, mhi int 371 | for b-a > 7 { 372 | if maxDepth == 0 { 373 | heapSortDesc(data, a, b) 374 | return 375 | } 376 | maxDepth-- 377 | mlo, mhi = doPivotDesc(data, a, b) 378 | if mlo-a < b-mhi { 379 | quickSortDesc(data, a, mlo, maxDepth) 380 | a = mhi 381 | } else { 382 | quickSortDesc(data, mhi, b, maxDepth) 383 | b = mlo 384 | } 385 | } 386 | if b-a > 1 { 387 | insertionSortDesc(data, a, b) 388 | } 389 | } 390 | 391 | func Desc(data []int) { 392 | maxDepth := 0 393 | for i := len(data); i > 0; i >>= 1 { 394 | maxDepth++ 395 | } 396 | maxDepth *= 2 397 | quickSortDesc(data, 0, len(data), maxDepth) 398 | } 399 | 400 | func IsSortedDesc(data []int) bool { 401 | for i := len(data) - 1; i > 0; i-- { 402 | if data[i] > data[i-1] { 403 | return false 404 | } 405 | } 406 | return true 407 | } 408 | 409 | func StableDesc(data []int) { 410 | n := len(data) 411 | blockSize := 20 412 | a, b := 0, blockSize 413 | for b <= n { 414 | insertionSortDesc(data, a, b) 415 | a = b 416 | b += blockSize 417 | } 418 | insertionSortDesc(data, a, n) 419 | 420 | for blockSize < n { 421 | a, b = 0, 2*blockSize 422 | for b <= n { 423 | symMergeDesc(data, a, a+blockSize, b) 424 | a = b 425 | b += 2 * blockSize 426 | } 427 | symMergeDesc(data, a, a+blockSize, n) 428 | blockSize *= 2 429 | } 430 | } 431 | 432 | func symMergeDesc(data []int, a, m, b int) { 433 | if a >= m || m >= b { 434 | return 435 | } 436 | mid := a + (b-a)/2 437 | n := mid + m 438 | var start, c, r, p int 439 | if m > mid { 440 | start = n - b 441 | r, p = mid, n-1 442 | for start < r { 
443 | c = start + (r-start)/2 444 | if data[p-c] < data[c] { 445 | start = c + 1 446 | } else { 447 | r = c 448 | } 449 | } 450 | } else { 451 | start = a 452 | r, p = m, n-1 453 | for start < r { 454 | c = start + (r-start)/2 455 | if data[p-c] < data[c] { 456 | start = c + 1 457 | } else { 458 | r = c 459 | } 460 | } 461 | } 462 | end := n - start 463 | rotateDesc(data, start, m, end) 464 | symMergeDesc(data, a, start, mid) 465 | symMergeDesc(data, mid, end, b) 466 | } 467 | 468 | func rotateDesc(data []int, a, m, b int) { 469 | i := m - a 470 | if i == 0 { 471 | return 472 | } 473 | j := b - m 474 | if j == 0 { 475 | return 476 | } 477 | if i == j { 478 | swapRangeDesc(data, a, m, i) 479 | return 480 | } 481 | p := a + i 482 | for i != j { 483 | if i > j { 484 | swapRangeDesc(data, p-i, p, j) 485 | i -= j 486 | } else { 487 | swapRangeDesc(data, p-i, p+j-i, i) 488 | j -= i 489 | } 490 | } 491 | swapRangeDesc(data, p-i, p, i) 492 | } 493 | -------------------------------------------------------------------------------- /testdata/sampledata.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klauspost/dedup/ebb83049c16285be6f30dd296301e32324b4eeab/testdata/sampledata.zip -------------------------------------------------------------------------------- /writer.go: -------------------------------------------------------------------------------- 1 | package dedup 2 | 3 | import ( 4 | "bytes" 5 | hasher "crypto/sha1" 6 | "encoding/binary" 7 | "errors" 8 | "fmt" 9 | "io" 10 | "math" 11 | "math/big" 12 | "runtime" 13 | "sync" 14 | 15 | "github.com/klauspost/dedup/sort" 16 | ) 17 | 18 | type Writer interface { 19 | io.WriteCloser 20 | 21 | // Split content, so a new block begins with next write. 22 | Split() 23 | 24 | // MemUse returns an approximate maximum memory use in bytes for 25 | // encoder (Writer) and decoder (Reader) for the given number of bytes. 26 | MemUse(bytes int) (encoder, decoder int64) 27 | 28 | // Returns the current number of blocks. 29 | // Blocks may still be processing. 30 | Blocks() int 31 | } 32 | 33 | // Size of the underlying hash in bytes for those interested. 34 | const HashSize = hasher.Size 35 | 36 | // The smallest "maximum" block size allowed. 37 | const MinBlockSize = 512 38 | 39 | // ErrMaxMemoryTooSmall is returned if the encoder isn't allowed to store 40 | // even 1 block. 41 | var ErrMaxMemoryTooSmall = errors.New("there must be at be space for 1 block") 42 | 43 | // Deduplication mode used to determine how input is split. 44 | type Mode int 45 | 46 | const ( 47 | // Fixed block size 48 | // 49 | // This is by far the fastest mode, and checks for duplicates 50 | // In fixed block sizes. 51 | // It can be helpful to use the "Split" function to reset offset, which 52 | // will reset duplication search at the position you are at. 53 | ModeFixed Mode = 0 54 | 55 | // Dynamic block size. 56 | // 57 | // This mode will create a deduplicator that will split the contents written 58 | // to it into dynamically sized blocks. 59 | // The size given indicates the maximum block size. Average size is usually maxSize/4. 60 | // Minimum block size is maxSize/64. 61 | ModeDynamic = 1 62 | 63 | // Dynamic block size. 64 | // 65 | // This mode will create a deduplicator that will split the contents written 66 | // to it into dynamically sized blocks. 67 | // The size given indicates the maximum block size. Average size is usually maxSize/4. 68 | // Minimum block size is maxSize/64. 
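// Unlike ModeDynamic, break points in this mode are chosen from a byte-frequency histogram of the current fragment rather than an order-1 byte predictor (see newEntropyWriter).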
69 | ModeDynamicEntropy = 2 70 | ) 71 | 72 | // Fragment is a file fragment. 73 | // It is the data returned by the NewSplitter. 74 | type Fragment struct { 75 | Hash [HashSize]byte // Hash of the fragment 76 | Payload []byte // Data of the fragment. 77 | New bool // Will be true if the data hasn't been encountered before. 78 | N uint // Sequentially incrementing number for each segment. 79 | } 80 | 81 | type writer struct { 82 | blks io.Writer // Block data writer 83 | idx io.Writer // Index writer 84 | frags chan<- Fragment // Fragment output 85 | maxSize int // Maximum block size 86 | maxBlocks int // Maximum backreference distance 87 | index map[[hasher.Size]byte]int // Known hashes and their index 88 | input chan *block // Channel containing blocks to be hashed 89 | write chan *block // Channel containing (ordered) blocks to be written 90 | exited chan struct{} // Closed when the writer exits. 91 | cur []byte // Current block being written 92 | off int // Write offset in current block 93 | buffers chan *block // Buffers ready for re-use. 94 | vari64 []byte // Temporary buffer for writing varints 95 | err error // Error state 96 | mu sync.Mutex // Mutex for error state 97 | nblocks int // Current block number. First block is 1. 98 | writer func(*writer, []byte) (int, error) // Writes are forwarded here. 99 | flush func(*writer) error // Called from Close *before* the writer is closed. 100 | close func(*writer) error // Called from Close *after* the writer is closed. 101 | split func(*writer) // Called when Split is called. 102 | } 103 | 104 | // block contains information about a single block 105 | type block struct { 106 | data []byte 107 | sha1Hash [hasher.Size]byte 108 | hashDone chan error 109 | N int 110 | } 111 | 112 | // ErrSizeTooSmall is returned if the requested block size is smaller than 113 | // MinBlockSize (512 bytes). 114 | var ErrSizeTooSmall = errors.New("maximum block size too small. must be at least 512 bytes") 115 | 116 | // NewWriter will create a deduplicator that will split the contents written 117 | // to it into blocks and de-duplicate these. 118 | // 119 | // The output is delivered as two streams, an index stream and a block stream. 120 | // 121 | // The index stream will contain information about which blocks are deduplicated 122 | // and the block stream will contain uncompressed data blocks. 123 | // 124 | // You can set the maximum memory for the decoder to use. 125 | // This limits the length a match can be made. 126 | // This is very conservative, so you can set this at the absolute limit of memory available. 127 | // If you use dynamic blocks, also note that the average size is 1/4th of the maximum block size. 128 | // Set maxMemory to 0 to disable decoder memory limit. 129 | // 130 | // This function returns data that is compatible with the NewReader function. 131 | // The returned writer must be closed to flush the remaining data. 132 | func NewWriter(index io.Writer, blocks io.Writer, mode Mode, maxSize, maxMemory uint) (Writer, error) { 133 | ncpu := runtime.GOMAXPROCS(0) 134 | // For small block sizes we need to keep a pretty big buffer to keep input fed. 135 | // Constant below appears to be sweet spot measured with 4K blocks.
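// bufmul below scales the channel capacities and the buffer pool so that roughly 256 KiB of input per CPU (and never fewer than two blocks) can be in flight regardless of block size.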
136 | var bufmul = 256 << 10 / int(maxSize) 137 | if bufmul < 2 { 138 | bufmul = 2 139 | } 140 | 141 | w := &writer{ 142 | blks: blocks, 143 | idx: index, 144 | maxSize: int(maxSize), 145 | index: make(map[[hasher.Size]byte]int), 146 | input: make(chan *block, ncpu*bufmul), 147 | write: make(chan *block, ncpu*bufmul), 148 | exited: make(chan struct{}, 0), 149 | cur: make([]byte, maxSize), 150 | vari64: make([]byte, binary.MaxVarintLen64), 151 | buffers: make(chan *block, ncpu*bufmul), 152 | nblocks: 1, 153 | maxBlocks: int(maxMemory / maxSize), 154 | } 155 | 156 | switch mode { 157 | case ModeFixed: 158 | fw := &fixedWriter{} 159 | w.writer = fw.write 160 | w.split = fw.split 161 | case ModeDynamic: 162 | zw := newZpaqWriter(maxSize) 163 | w.writer = zw.write 164 | w.split = zw.split 165 | case ModeDynamicEntropy: 166 | zw := newEntropyWriter(maxSize) 167 | w.writer = zw.write 168 | w.split = zw.split 169 | default: 170 | return nil, fmt.Errorf("dedup: unknown mode") 171 | } 172 | 173 | if w.maxSize < MinBlockSize { 174 | return nil, ErrSizeTooSmall 175 | } 176 | 177 | w.close = idxClose 178 | w.putUint64(1) // Format 179 | w.putUint64(uint64(maxSize)) // Maximum block size 180 | 181 | // Start one goroutine per core 182 | for i := 0; i < ncpu; i++ { 183 | go w.hasher() 184 | } 185 | // Insert the buffers we will use 186 | for i := 0; i < ncpu*bufmul; i++ { 187 | w.buffers <- &block{data: make([]byte, maxSize), hashDone: make(chan error, 1)} 188 | } 189 | go w.blockWriter() 190 | return w, nil 191 | } 192 | 193 | // NewStreamWriter will create a deduplicator that will split the contents written 194 | // to it into blocks and de-duplicate these. 195 | // 196 | // The output is delivered as a single stream, and memory use will remain stable for 197 | // both writing and reading the stream. 198 | // 199 | // This function returns data that is compatible with the NewStreamReader function. 200 | // 201 | // You must set the maximum memory for the decoder to use. 202 | // This limits the length a match can be made. 203 | // If you use dynamic blocks, also note that the average size is 1/4th of the maximum block size. 204 | // 205 | // The returned writer must be closed to flush the remaining data. 206 | func NewStreamWriter(out io.Writer, mode Mode, maxSize, maxMemory uint) (Writer, error) { 207 | ncpu := runtime.GOMAXPROCS(0) 208 | // For small block sizes we need to keep a pretty big buffer to keep input fed. 209 | // Constant below appears to be sweet spot measured with 4K blocks.
210 | var bufmul = 256 << 10 / int(maxSize) 211 | if bufmul < 2 { 212 | bufmul = 2 213 | } 214 | if maxMemory < maxSize { 215 | return nil, ErrMaxMemoryTooSmall 216 | } 217 | w := &writer{ 218 | idx: out, 219 | maxSize: int(maxSize), 220 | index: make(map[[hasher.Size]byte]int), 221 | input: make(chan *block, ncpu*bufmul), 222 | write: make(chan *block, ncpu*bufmul), 223 | exited: make(chan struct{}, 0), 224 | cur: make([]byte, maxSize), 225 | vari64: make([]byte, binary.MaxVarintLen64), 226 | buffers: make(chan *block, ncpu*bufmul), 227 | nblocks: 1, 228 | maxBlocks: int(maxMemory / maxSize), 229 | } 230 | 231 | switch mode { 232 | case ModeFixed: 233 | fw := &fixedWriter{} 234 | w.writer = fw.write 235 | case ModeDynamic: 236 | zw := newZpaqWriter(maxSize) 237 | w.writer = zw.write 238 | case ModeDynamicEntropy: 239 | zw := newEntropyWriter(maxSize) 240 | w.writer = zw.write 241 | /* case ModeDynamicSignatures: 242 | zw := newZpaqWriter(maxSize) 243 | w.writer = zw.writeFile 244 | case ModeSignaturesOnly: 245 | w.writer = fileSplitOnly 246 | */ 247 | default: 248 | return nil, fmt.Errorf("dedup: unknown mode") 249 | } 250 | 251 | if w.maxSize < MinBlockSize { 252 | return nil, ErrSizeTooSmall 253 | } 254 | 255 | w.close = streamClose 256 | w.putUint64(2) // Format 257 | w.putUint64(uint64(maxSize)) // Maximum block size 258 | w.putUint64(uint64(w.maxBlocks)) // Maximum backreference length 259 | 260 | // Start one goroutine per core 261 | for i := 0; i < ncpu; i++ { 262 | go w.hasher() 263 | } 264 | // Insert the buffers we will use 265 | for i := 0; i < ncpu*bufmul; i++ { 266 | w.buffers <- &block{data: make([]byte, maxSize), hashDone: make(chan error, 1)} 267 | } 268 | go w.blockStreamWriter() 269 | return w, nil 270 | } 271 | 272 | // NewSplitter will return a writer you can write data to, 273 | // and the file will be split into separate fragments. 274 | // 275 | // You must supply a fragment channel that will output fragments for 276 | // the data you have written. The channel must accept data while you 277 | // write to the splitter. 278 | // 279 | // For each fragment the SHA-1 hash of the data section is returned, 280 | // along with the raw data of this segment. 281 | // 282 | // When you call Close on the returned Writer, the final fragments 283 | // will be sent and the channel will be closed. 284 | func NewSplitter(fragments chan<- Fragment, mode Mode, maxSize uint) (Writer, error) { 285 | ncpu := runtime.GOMAXPROCS(0) 286 | // For small block sizes we need to keep a pretty big buffer to keep input fed. 287 | // Constant below appears to be sweet spot measured with 4K blocks.
288 | var bufmul = 256 << 10 / int(maxSize) 289 | if bufmul < 2 { 290 | bufmul = 2 291 | } 292 | 293 | w := &writer{ 294 | frags: fragments, 295 | maxSize: int(maxSize), 296 | index: make(map[[hasher.Size]byte]int), 297 | input: make(chan *block, ncpu*bufmul), 298 | write: make(chan *block, ncpu*bufmul), 299 | exited: make(chan struct{}, 0), 300 | cur: make([]byte, maxSize), 301 | vari64: make([]byte, binary.MaxVarintLen64), 302 | buffers: make(chan *block, ncpu*bufmul), 303 | nblocks: 1, 304 | } 305 | 306 | switch mode { 307 | case ModeFixed: 308 | fw := &fixedWriter{} 309 | w.writer = fw.write 310 | w.split = fw.split 311 | case ModeDynamic: 312 | zw := newZpaqWriter(maxSize) 313 | w.writer = zw.write 314 | w.split = zw.split 315 | case ModeDynamicEntropy: 316 | zw := newEntropyWriter(maxSize) 317 | w.writer = zw.write 318 | w.split = zw.split 319 | default: 320 | return nil, fmt.Errorf("dedup: unknown mode") 321 | } 322 | 323 | w.flush = func(w *writer) error { 324 | w.split(w) 325 | return w.err 326 | } 327 | 328 | if w.maxSize < MinBlockSize { 329 | return nil, ErrSizeTooSmall 330 | } 331 | 332 | // Start one goroutine per core 333 | for i := 0; i < ncpu; i++ { 334 | go w.hasher() 335 | } 336 | // Insert the buffers we will use 337 | for i := 0; i < ncpu*bufmul; i++ { 338 | w.buffers <- &block{data: make([]byte, maxSize), hashDone: make(chan error, 1)} 339 | } 340 | go w.fragmentWriter() 341 | return w, nil 342 | } 343 | 344 | // putUint64 will Write a uint64 value to index stream. 345 | func (w *writer) putUint64(v uint64) error { 346 | n := binary.PutUvarint(w.vari64, v) 347 | n2, err := w.idx.Write(w.vari64[:n]) 348 | if err != nil { 349 | return err 350 | } 351 | if n2 != n { 352 | return io.ErrShortWrite 353 | } 354 | return nil 355 | } 356 | 357 | // Split content, so a new block begins with next write 358 | func (w *writer) Split() { 359 | w.split(w) 360 | } 361 | 362 | func (w *writer) Blocks() int { 363 | w.mu.Lock() 364 | b := w.nblocks - 1 365 | w.mu.Unlock() 366 | return b 367 | } 368 | 369 | // Write contents to the deduplicator. 370 | func (w *writer) Write(b []byte) (n int, err error) { 371 | w.mu.Lock() 372 | err = w.err 373 | w.mu.Unlock() 374 | if err != nil { 375 | return 0, err 376 | } 377 | return w.writer(w, b) 378 | } 379 | 380 | // setErr will set the error state of the writer. 381 | func (w *writer) setErr(err error) { 382 | if err == nil { 383 | return 384 | } 385 | w.mu.Lock() 386 | w.err = err 387 | w.mu.Unlock() 388 | } 389 | 390 | // idxClose will flush the remainder of an index based stream 391 | func idxClose(w *writer) (err error) { 392 | // Insert length of remaining data into index 393 | w.putUint64(uint64(math.MaxUint64)) 394 | w.putUint64(uint64(w.maxSize - w.off)) 395 | w.putUint64(0) // Stream continuation possibility, should be 0. 
396 | 397 | buf := bytes.NewBuffer(w.cur[0:w.off]) 398 | n, err := io.Copy(w.blks, buf) 399 | if err != nil { 400 | return err 401 | } 402 | if int(n) != w.off { 403 | return errors.New("idxClose: r.cur short write") 404 | } 405 | return nil 406 | } 407 | 408 | // streamClose will flush the remainder of an single stream 409 | func streamClose(w *writer) (err error) { 410 | // Insert length of remaining data into index 411 | w.putUint64(uint64(math.MaxUint64)) 412 | w.putUint64(uint64(w.maxSize - w.off)) 413 | 414 | buf := bytes.NewBuffer(w.cur[0:w.off]) 415 | n, err := io.Copy(w.idx, buf) 416 | if err != nil { 417 | return err 418 | } 419 | if int(n) != w.off { 420 | return errors.New("streamClose: r.cur short write") 421 | } 422 | w.putUint64(0) // Stream continuation possibility, should be 0. 423 | return nil 424 | } 425 | 426 | // Close and flush the remaining data to output. 427 | func (w *writer) Close() (err error) { 428 | select { 429 | case <-w.exited: 430 | return w.err 431 | default: 432 | } 433 | if w.flush != nil { 434 | err := w.flush(w) 435 | if err != nil { 436 | return err 437 | } 438 | } 439 | close(w.input) 440 | close(w.write) 441 | <-w.exited 442 | 443 | if w.close != nil { 444 | err := w.close(w) 445 | if err != nil { 446 | return err 447 | } 448 | } 449 | return w.err 450 | } 451 | 452 | // hasher will hash incoming blocks 453 | // and signal the writer when done. 454 | func (w *writer) hasher() { 455 | h := hasher.New() 456 | for b := range w.input { 457 | buf := bytes.NewBuffer(b.data) 458 | h.Reset() 459 | n, err := io.Copy(h, buf) 460 | if err != nil { 461 | w.setErr(err) 462 | return 463 | } 464 | if int(n) != len(b.data) { 465 | w.setErr(errors.New("short copy in hasher")) 466 | return 467 | } 468 | _ = h.Sum(b.sha1Hash[:0]) 469 | b.hashDone <- nil 470 | } 471 | } 472 | 473 | // blockWriter will write hashed blocks to the output 474 | // and recycle the buffers. 475 | func (w *writer) blockWriter() { 476 | defer close(w.exited) 477 | 478 | sortA := make([]int, w.maxBlocks+1) 479 | 480 | for b := range w.write { 481 | _ = <-b.hashDone 482 | match, ok := w.index[b.sha1Hash] 483 | if !ok { 484 | buf := bytes.NewBuffer(b.data) 485 | n, err := io.Copy(w.blks, buf) 486 | if err != nil { 487 | w.setErr(err) 488 | return 489 | } 490 | if int(n) != len(b.data) { 491 | // This should not be possible with io.copy without an error, 492 | // but we test anyway. 493 | w.setErr(errors.New("error: short write on copy")) 494 | return 495 | } 496 | w.putUint64(0) 497 | w.putUint64(uint64(w.maxSize) - uint64(n)) 498 | } else { 499 | offset := b.N - match 500 | if offset <= 0 { 501 | // should be impossible, indicated an internal error 502 | w.setErr(errors.New("internal error: negative offset")) 503 | return 504 | } 505 | w.putUint64(uint64(offset)) 506 | } 507 | // Update hash to latest match 508 | w.index[b.sha1Hash] = b.N 509 | 510 | // Purge the entries with the oldest matches 511 | if w.maxBlocks > 0 && len(w.index) > w.maxBlocks { 512 | ar := sortA[0:len(w.index)] 513 | i := 0 514 | for _, v := range w.index { 515 | ar[i] = v 516 | i++ 517 | } 518 | sort.Asc(ar) 519 | // Cut the oldest quarter blocks 520 | // since this isn't free 521 | cutoff := ar[w.maxBlocks/4] 522 | for k, v := range w.index { 523 | if v < cutoff { 524 | delete(w.index, k) 525 | } 526 | } 527 | } 528 | 529 | // Done, reinsert buffer 530 | w.buffers <- b 531 | } 532 | } 533 | 534 | // blockStreamWriter will write blocks and indexes to the output stream 535 | // and recycle the buffers. 
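// For every block it emits either a back-reference (a single uvarint holding the distance to an identical earlier block) or a literal record (a zero uvarint, the difference between maxSize and the block length, and the raw block bytes appended to the same stream).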
536 | func (w *writer) blockStreamWriter() { 537 | defer close(w.exited) 538 | for b := range w.write { 539 | _ = <-b.hashDone 540 | match, ok := w.index[b.sha1Hash] 541 | if w.maxBlocks > 0 && (b.N-match) > w.maxBlocks { 542 | ok = false 543 | } 544 | if !ok { 545 | w.putUint64(0) 546 | w.putUint64(uint64(w.maxSize) - uint64(len(b.data))) 547 | buf := bytes.NewBuffer(b.data) 548 | n, err := io.Copy(w.idx, buf) 549 | if err != nil { 550 | w.setErr(err) 551 | return 552 | } 553 | if int(n) != len(b.data) { 554 | // This should not be possible with io.Copy without an error, 555 | // but we test anyway. 556 | w.setErr(errors.New("error: short write on copy")) 557 | return 558 | } 559 | } else { 560 | offset := b.N - match 561 | if offset <= 0 { 562 | // should be impossible, indicated an internal error 563 | w.setErr(errors.New("internal error: negative offset")) 564 | return 565 | } 566 | w.putUint64(uint64(offset)) 567 | } 568 | // Update hash to latest match 569 | w.index[b.sha1Hash] = b.N 570 | 571 | // Purge old entries once in a while 572 | if w.maxBlocks > 0 && b.N&65535 == 65535 { 573 | for k, v := range w.index { 574 | if (b.N - v) > w.maxBlocks { 575 | delete(w.index, k) 576 | } 577 | } 578 | } 579 | // Done, reinsert buffer 580 | w.buffers <- b 581 | } 582 | } 583 | 584 | // fragmentWriter will write hashed blocks to the output channel 585 | // and recycle the buffers. 586 | func (w *writer) fragmentWriter() { 587 | defer close(w.exited) 588 | defer close(w.frags) 589 | n := uint(0) 590 | for b := range w.write { 591 | _ = <-b.hashDone 592 | var f Fragment 593 | f.N = n 594 | copy(f.Hash[:], b.sha1Hash[:]) 595 | _, ok := w.index[b.sha1Hash] 596 | f.Payload = make([]byte, len(b.data)) 597 | copy(f.Payload, b.data) 598 | if !ok { 599 | w.index[b.sha1Hash] = 0 600 | f.New = !ok 601 | } 602 | w.frags <- f 603 | // Done, reinsert buffer 604 | w.buffers <- b 605 | n++ 606 | } 607 | } 608 | 609 | type fixedWriter struct{} 610 | 611 | // Write blocks of similar size. 612 | func (f *fixedWriter) write(w *writer, b []byte) (n int, err error) { 613 | written := 0 614 | for len(b) > 0 { 615 | n := copy(w.cur[w.off:], b) 616 | b = b[n:] 617 | w.off += n 618 | written += n 619 | // Filled the buffer? Send it off! 620 | if w.off == w.maxSize { 621 | b := <-w.buffers 622 | // Swap block with current 623 | w.cur, b.data = b.data, w.cur 624 | w.mu.Lock() 625 | b.N = w.nblocks 626 | w.nblocks++ 627 | w.mu.Unlock() 628 | 629 | w.input <- b 630 | w.write <- b 631 | w.off = 0 632 | } 633 | } 634 | return written, nil 635 | } 636 | 637 | // Split content, so a new block begins with next write 638 | func (f *fixedWriter) split(w *writer) { 639 | if w.off == 0 { 640 | return 641 | } 642 | b := <-w.buffers 643 | // Swap block with current 644 | w.cur, b.data = b.data[:w.maxSize], w.cur[:w.off] 645 | w.mu.Lock() 646 | b.N = w.nblocks 647 | w.nblocks++ 648 | w.mu.Unlock() 649 | 650 | w.input <- b 651 | w.write <- b 652 | w.off = 0 653 | } 654 | 655 | // MemUse returns an approximate maximum memory use in bytes for 656 | // encoder (Writer) and decoder (Reader) for the given number of bytes. 
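// A rough, illustrative sketch (the names idx and data are placeholders; actual
// figures depend on the block size and maxMemory given to the constructor):
//
//	w, _ := NewWriter(&idx, &data, ModeFixed, 64<<10, 0)
//	enc, dec := w.MemUse(1 << 30) // approximate worst-case bytes for a 1 GiB input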
657 | func (w *writer) MemUse(bytes int) (encoder, decoder int64) { 658 | blocks := (bytes + w.maxSize - 1) / w.maxSize 659 | if w.maxBlocks > 0 { 660 | if w.maxBlocks < blocks { 661 | blocks = w.maxBlocks 662 | } 663 | } 664 | // Data length 665 | data := big.NewInt(int64(blocks)) 666 | data = data.Mul(data, big.NewInt(int64(w.maxSize))) 667 | d := data.Int64() 668 | if data.BitLen() > 63 { 669 | d = math.MaxInt64 670 | } 671 | // Index length 672 | bl := big.NewInt(int64(blocks)) 673 | perBlock := big.NewInt(int64(HashSize + 8 /*int64*/ + 24 /* map entry*/)) 674 | total := bl.Mul(bl, perBlock) 675 | if total.BitLen() > 63 { 676 | return math.MaxInt64, d 677 | } 678 | return total.Int64(), d 679 | } 680 | 681 | // Split blocks like ZPAQ: (public domain) 682 | type zpaqWriter struct { 683 | h uint32 // rolling hash for finding fragment boundaries 684 | c1 byte // last byte 685 | maxFragment int 686 | minFragment int 687 | maxHash uint32 688 | o1 [256]byte // order 1 context -> predicted byte 689 | } 690 | 691 | // Split blocks. Typically block size will be maxSize / 4 692 | // Minimum block size is maxSize/64. 693 | // 694 | // The break point is content dependent. 695 | // Any insertions, deletions, or edits that occur before the start of the 32+ byte dependency window 696 | // don't affect the break point. 697 | // This makes it likely for two files to still have identical fragments far away from any edits. 698 | func newZpaqWriter(maxSize uint) *zpaqWriter { 699 | fragment := math.Log2(float64(maxSize) / (64 * 64)) 700 | mh := math.Exp2(22 - fragment) 701 | return &zpaqWriter{ 702 | maxFragment: int(maxSize), 703 | minFragment: int(maxSize / 64), 704 | maxHash: uint32(mh), 705 | } 706 | } 707 | 708 | // h is a 32 bit hash that depends on the last 32 bytes that were mispredicted by the order 1 model o1[]. 709 | // h < maxhash therefore occurs with probability 2^-16, giving an average fragment size of 64K. 710 | // The variable size dependency window works because one constant is odd (correct prediction, no shift), 711 | // and the other is even but not a multiple of 4 (missed prediction, 1 bit shift left). 712 | // This is different from a normal Rabin filter, which uses a large fixed-sized dependency window 713 | // and two multiply operations, one at the window entry and the inverse at the window exit. 714 | func (z *zpaqWriter) write(w *writer, b []byte) (int, error) { 715 | // Transfer to local variables ~30% faster. 716 | c1 := z.c1 717 | h := z.h 718 | off := w.off 719 | for _, c := range b { 720 | if c == z.o1[c1] { 721 | h = (h + uint32(c) + 1) * 314159265 722 | } else { 723 | h = (h + uint32(c) + 1) * 271828182 724 | } 725 | z.o1[c1] = c 726 | c1 = c 727 | w.cur[off] = c 728 | off++ 729 | 730 | // At a break point? Send it off! 
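// A break point is taken when the rolling hash falls below maxHash, which newZpaqWriter tunes so that this fires on average about every maxSize/4 bytes; it is never taken before minFragment bytes and is forced at maxFragment.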
731 | if (off >= z.minFragment && h < z.maxHash) || off >= z.maxFragment { 732 | b := <-w.buffers 733 | // Swap block with current 734 | w.cur, b.data = b.data[:w.maxSize], w.cur[:off] 735 | b.N = w.nblocks 736 | 737 | w.input <- b 738 | w.write <- b 739 | w.nblocks++ 740 | off = 0 741 | h = 0 742 | c1 = 0 743 | } 744 | } 745 | w.off = off 746 | z.h = h 747 | z.c1 = c1 748 | return len(b), nil 749 | } 750 | 751 | // Split content, so a new block begins with next write 752 | func (z *zpaqWriter) split(w *writer) { 753 | if w.off == 0 { 754 | return 755 | } 756 | b := <-w.buffers 757 | // Swap block with current 758 | w.cur, b.data = b.data[:w.maxSize], w.cur[:w.off] 759 | w.mu.Lock() 760 | b.N = w.nblocks 761 | w.nblocks++ 762 | w.mu.Unlock() 763 | 764 | w.input <- b 765 | w.write <- b 766 | w.off = 0 767 | z.h = 0 768 | z.c1 = 0 769 | } 770 | 771 | // Split blocks based on entropy distribution. 772 | type entWriter struct { 773 | h uint32 // rolling hash for finding fragment boundaries 774 | maxFragment int 775 | minFragment int 776 | maxHash uint32 777 | hist [256]uint16 // histogram of current accumulated 778 | histLen int 779 | avgHist uint16 780 | } 781 | 782 | // Split blocks. Typically block size will be maxSize / 4 783 | // Minimum block size is maxSize/32. 784 | // 785 | // The break point is content dependent. 786 | // Any insertions, deletions, or edits that occur before the start of the 32+ byte dependency window 787 | // don't affect the break point. 788 | // This makes it likely for two files to still have identical fragments far away from any edits. 789 | func newEntropyWriter(maxSize uint) *entWriter { 790 | fragment := math.Log2(float64(maxSize) / (64 * 64)) 791 | mh := math.Exp2(22 - fragment) 792 | e := &entWriter{ 793 | maxFragment: int(maxSize), 794 | minFragment: int(maxSize / 32), 795 | maxHash: uint32(mh), 796 | } 797 | if e.minFragment > 65535 { 798 | e.minFragment = 65535 799 | } 800 | if e.minFragment < 512 { 801 | e.minFragment = 512 802 | } 803 | e.avgHist = uint16(e.minFragment / 255) 804 | return e 805 | } 806 | 807 | // h is a 32 bit hash that depends on the last 32 bytes that were mispredicted by the order 1 model o1[]. 808 | // h < maxhash therefore occurs with probability 2^-16, giving an average fragment size of 64K. 809 | // The variable size dependency window works because one constant is odd (correct prediction, no shift), 810 | // and the other is even but not a multiple of 4 (missed prediction, 1 bit shift left). 811 | // This is different from a normal Rabin filter, which uses a large fixed-sized dependency window 812 | // and two multiply operations, one at the window entry and the inverse at the window exit. 813 | func (e *entWriter) write(w *writer, b []byte) (int, error) { 814 | inLen := len(b) 815 | if e.histLen < e.minFragment { 816 | b2 := b 817 | if len(b2)+e.histLen > e.minFragment { 818 | b2 = b2[:e.minFragment-e.histLen] 819 | } 820 | off := w.off 821 | for i := range b2 { 822 | v := b2[i] 823 | e.hist[v]++ 824 | w.cur[off+i] = v 825 | } 826 | e.histLen += len(b2) 827 | w.off += len(b2) 828 | b = b[len(b2):] 829 | } 830 | if len(b) == 0 { 831 | return inLen, nil 832 | } 833 | 834 | // Transfer to local variables ~30% faster. 835 | h := e.h 836 | off := w.off 837 | for _, c := range b { 838 | if e.hist[c] >= e.avgHist { 839 | h = (h + uint32(c) + 1) * 314159265 840 | } else { 841 | h = (h + uint32(c) + 1) * 271828182 842 | } 843 | w.cur[off] = c 844 | off++ 845 | 846 | // At a break point? Send it off! 
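// Same break-point rule as the ZPAQ splitter, except the hash step depends on whether each byte was frequent (at or above average) in the first minFragment bytes of the fragment, and minFragment here is maxSize/32 (clamped between 512 and 65535).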
847 | if (off >= e.minFragment && h < e.maxHash) || off >= e.maxFragment { 848 | b := <-w.buffers 849 | // Swap block with current 850 | w.cur, b.data = b.data[:w.maxSize], w.cur[:off] 851 | b.N = w.nblocks 852 | 853 | w.input <- b 854 | w.write <- b 855 | e.histLen = 0 856 | for i := range e.hist { 857 | e.hist[i] = 0 858 | } 859 | w.nblocks++ 860 | off = 0 861 | h = 0 862 | } 863 | } 864 | w.off = off 865 | e.h = h 866 | return inLen, nil 867 | } 868 | 869 | // Split content, so a new block begins with next write 870 | func (e *entWriter) split(w *writer) { 871 | if w.off == 0 { 872 | return 873 | } 874 | b := <-w.buffers 875 | // Swap block with current 876 | w.cur, b.data = b.data[:w.maxSize], w.cur[:w.off] 877 | w.mu.Lock() 878 | b.N = w.nblocks 879 | w.nblocks++ 880 | w.mu.Unlock() 881 | 882 | w.input <- b 883 | w.write <- b 884 | w.off = 0 885 | e.h = 0 886 | e.histLen = 0 887 | for i := range e.hist { 888 | e.hist[i] = 0 889 | } 890 | } 891 | -------------------------------------------------------------------------------- /writer_test.go: -------------------------------------------------------------------------------- 1 | package dedup_test 2 | 3 | import ( 4 | "bytes" 5 | "encoding/hex" 6 | "fmt" 7 | "io" 8 | "io/ioutil" 9 | "math/rand" 10 | "os" 11 | "sync" 12 | "testing" 13 | 14 | "github.com/klauspost/dedup" 15 | ) 16 | 17 | // Returns a deterministic buffer of size n 18 | func getBufferSize(n int) *bytes.Buffer { 19 | rand.Seed(0) 20 | b := make([]byte, n) 21 | for i := range b { 22 | b[i] = byte(rand.Intn(255)) 23 | } 24 | return bytes.NewBuffer(b) 25 | } 26 | 27 | func TestFixedWriter(t *testing.T) { 28 | idx := bytes.Buffer{} 29 | data := bytes.Buffer{} 30 | 31 | const totalinput = 10 << 20 32 | input := getBufferSize(totalinput) 33 | 34 | const size = 64 << 10 35 | b := input.Bytes() 36 | // Create some duplicates 37 | for i := 0; i < 50; i++ { 38 | // Read from 10 first blocks 39 | src := b[(i%10)*size : (i%10)*size+size] 40 | // Write into the following ones 41 | dst := b[(10+i)*size : (i+10)*size+size] 42 | copy(dst, src) 43 | } 44 | input = bytes.NewBuffer(b) 45 | w, err := dedup.NewWriter(&idx, &data, dedup.ModeFixed, size, size*10) 46 | if err != nil { 47 | t.Fatal(err) 48 | } 49 | io.Copy(w, input) 50 | err = w.Close() 51 | if err != nil { 52 | t.Fatal(err) 53 | } 54 | removed := ((totalinput) - data.Len()) / size 55 | 56 | t.Log(dedup.BirthdayProblem(totalinput / size)) 57 | t.Log("Index size:", idx.Len()) 58 | t.Log("Data size:", data.Len()) 59 | t.Log("Removed", removed, "blocks") 60 | // We should get at least 50 blocks 61 | if removed < 50 { 62 | t.Fatal("didn't remove at least 50 blocks") 63 | } 64 | if removed > 60 { 65 | t.Fatal("removed unreasonable high amount of blocks") 66 | } 67 | } 68 | 69 | func TestFixedWriterLimit(t *testing.T) { 70 | idx := bytes.Buffer{} 71 | data := bytes.Buffer{} 72 | 73 | const totalinput = 10 << 20 74 | const limit = 9 75 | input := getBufferSize(totalinput) 76 | 77 | const size = 64 << 10 78 | b := input.Bytes() 79 | // Create some duplicates 80 | for i := 0; i < 50; i++ { 81 | // Read from 10 first blocks 82 | src := b[(i%10)*size : (i%10)*size+size] 83 | // Write into the following ones 84 | dst := b[(10+50-i)*size : (10+50-i)*size+size] 85 | copy(dst, src) 86 | } 87 | input = bytes.NewBuffer(b) 88 | w, err := dedup.NewWriter(&idx, &data, dedup.ModeFixed, size, limit*size) 89 | if err != nil { 90 | t.Fatal(err) 91 | } 92 | io.Copy(w, input) 93 | err = w.Close() 94 | if err != nil { 95 | t.Fatal(err) 96 | } 97 | removed := 
((totalinput) - data.Len()) / size 98 | 99 | t.Log("Index size:", idx.Len()) 100 | t.Log("Data size:", data.Len()) 101 | t.Log("Removed", removed, "blocks") 102 | // We should get at least 50 blocks 103 | if removed > 10 { 104 | t.Fatal("it did not appear to respect the limit") 105 | } 106 | if removed < 8 { 107 | t.Fatal("removed too many blocks") 108 | } 109 | r, err := dedup.NewReader(&idx, &data) 110 | if err != nil { 111 | t.Fatal(err) 112 | } 113 | 114 | useBlocks := r.MaxMem() / size 115 | if useBlocks > 9 { 116 | t.Fatal("Uses too much memory, expected", limit, "got", useBlocks) 117 | } 118 | t.Log("Maximum estimated use:", r.MaxMem(), "bytes,", useBlocks, "blocks") 119 | r.Close() 120 | } 121 | 122 | func TestFixedFragmentSplitter(t *testing.T) { 123 | const totalinput = 10<<20 + 500 124 | input := getBufferSize(totalinput) 125 | 126 | const size = 64 << 10 127 | b := input.Bytes() 128 | // Create some duplicates 129 | for i := 0; i < 50; i++ { 130 | // Read from 10 first blocks 131 | src := b[(i%10)*size : (i%10)*size+size] 132 | // Write into the following ones 133 | dst := b[(10+i)*size : (i+10)*size+size] 134 | copy(dst, src) 135 | } 136 | out := make(chan dedup.Fragment, 10) 137 | count := make(chan int, 0) 138 | go func() { 139 | n := 0 140 | off := 0 141 | for f := range out { 142 | if !bytes.Equal(b[off:off+len(f.Payload)], f.Payload) { 143 | panic(fmt.Sprintf("output mismatch at offset %d", n)) 144 | } 145 | off += len(f.Payload) 146 | if f.New { 147 | n += len(f.Payload) 148 | } 149 | } 150 | count <- n 151 | count <- off 152 | }() 153 | input = bytes.NewBuffer(b) 154 | w, err := dedup.NewSplitter(out, dedup.ModeFixed, size) 155 | if err != nil { 156 | t.Fatal(err) 157 | } 158 | io.Copy(w, input) 159 | err = w.Close() 160 | if err != nil { 161 | t.Fatal(err) 162 | } 163 | datalen := <-count 164 | gotLen := <-count 165 | removed := ((totalinput) - datalen) / size 166 | 167 | if gotLen != totalinput { 168 | t.Fatalf("did not get all data, want %d, got %d", totalinput, gotLen) 169 | } 170 | t.Log("Data size:", datalen) 171 | t.Log("Removed", removed, "blocks") 172 | // We should get at least 50 blocks 173 | if removed < 50 { 174 | t.Fatal("didn't remove at least 50 blocks") 175 | } 176 | if removed > 60 { 177 | t.Fatal("removed unreasonable high amount of blocks") 178 | } 179 | } 180 | 181 | func TestDynamicFragmentSplitter(t *testing.T) { 182 | const totalinput = 10 << 20 183 | input := getBufferSize(totalinput) 184 | 185 | const size = 64 << 10 186 | b := input.Bytes() 187 | // Create some duplicates 188 | for i := 0; i < 50; i++ { 189 | // Read from 10 first blocks 190 | src := b[(i%10)*size : (i%10)*size+size] 191 | // Write into the following ones 192 | dst := b[(10+i)*size : (i+10)*size+size] 193 | copy(dst, src) 194 | } 195 | out := make(chan dedup.Fragment, 10) 196 | count := make(chan int, 0) 197 | go func() { 198 | n := 0 199 | off := 0 200 | for f := range out { 201 | if !bytes.Equal(b[off:off+len(f.Payload)], f.Payload) { 202 | panic(fmt.Sprintf("output mismatch at offset %d", n)) 203 | } 204 | off += len(f.Payload) 205 | if f.New { 206 | n += len(f.Payload) 207 | } 208 | } 209 | count <- n 210 | count <- off 211 | }() 212 | input = bytes.NewBuffer(b) 213 | w, err := dedup.NewSplitter(out, dedup.ModeDynamic, size) 214 | if err != nil { 215 | t.Fatal(err) 216 | } 217 | io.Copy(w, input) 218 | err = w.Close() 219 | if err != nil { 220 | t.Fatal(err) 221 | } 222 | datalen := <-count 223 | gotLen := <-count 224 | removed := ((totalinput) - datalen) / size 225 | 226 
| if gotLen != totalinput { 227 | t.Fatalf("did not get all data, want %d, got %d", totalinput, gotLen) 228 | } 229 | t.Log("Data size:", datalen) 230 | t.Log("Removed", removed, "blocks") 231 | // We should get at least 50 blocks 232 | if removed < 45 { 233 | t.Fatal("didn't remove at least 45 blocks") 234 | } 235 | if removed > 60 { 236 | t.Fatal("removed unreasonable high amount of blocks") 237 | } 238 | } 239 | 240 | func TestDynamicEntropySplitter(t *testing.T) { 241 | const totalinput = 10 << 20 242 | input := getBufferSize(totalinput) 243 | 244 | const size = 64 << 10 245 | b := input.Bytes() 246 | // Create some duplicates 247 | for i := 0; i < 50; i++ { 248 | // Read from 10 first blocks 249 | src := b[(i%10)*size : (i%10)*size+size] 250 | // Write into the following ones 251 | dst := b[(10+i)*size : (i+10)*size+size] 252 | copy(dst, src) 253 | } 254 | out := make(chan dedup.Fragment, 10) 255 | count := make(chan int, 0) 256 | go func() { 257 | n := 0 258 | off := 0 259 | for f := range out { 260 | if !bytes.Equal(b[off:off+len(f.Payload)], f.Payload) { 261 | panic(fmt.Sprintf("output mismatch at offset %d", n)) 262 | } 263 | off += len(f.Payload) 264 | if f.New { 265 | n += len(f.Payload) 266 | } 267 | } 268 | count <- n 269 | count <- off 270 | }() 271 | input = bytes.NewBuffer(b) 272 | w, err := dedup.NewSplitter(out, dedup.ModeDynamic, size) 273 | if err != nil { 274 | t.Fatal(err) 275 | } 276 | io.Copy(w, input) 277 | err = w.Close() 278 | if err != nil { 279 | t.Fatal(err) 280 | } 281 | datalen := <-count 282 | gotLen := <-count 283 | removed := ((totalinput) - datalen) / size 284 | 285 | if gotLen != totalinput { 286 | t.Fatalf("did not get all data, want %d, got %d", totalinput, gotLen) 287 | } 288 | t.Log("Data size:", datalen) 289 | t.Log("Removed", removed, "blocks") 290 | // We should get at least 45 blocks 291 | if removed < 45 { 292 | t.Fatal("didn't remove at least 50 blocks") 293 | } 294 | if removed > 60 { 295 | t.Fatal("removed unreasonable high amount of blocks") 296 | } 297 | } 298 | 299 | func TestDynamicWriter(t *testing.T) { 300 | idx := bytes.Buffer{} 301 | data := bytes.Buffer{} 302 | 303 | const totalinput = 10 << 20 304 | input := getBufferSize(totalinput) 305 | 306 | const size = 64 << 10 307 | b := input.Bytes() 308 | // Create some duplicates 309 | for i := 0; i < 50; i++ { 310 | // Read from 10 first blocks 311 | src := b[(i%10)*size : (i%10)*size+size] 312 | // Write into the following ones 313 | dst := b[(10+i)*size : (i+10)*size+size] 314 | copy(dst, src) 315 | } 316 | input = bytes.NewBuffer(b) 317 | w, err := dedup.NewWriter(&idx, &data, dedup.ModeDynamic, size, 10*8*size) 318 | if err != nil { 319 | t.Fatal(err) 320 | } 321 | io.Copy(w, input) 322 | err = w.Close() 323 | if err != nil { 324 | t.Fatal(err) 325 | } 326 | removed := ((totalinput) - data.Len()) / size 327 | 328 | t.Log("Dynamic Index size:", idx.Len()) 329 | t.Log("Dynamic Data size:", data.Len()) 330 | t.Log("Removed", removed, "blocks") 331 | // We don't know how many, but it should remove some blocks 332 | if removed < 40 { 333 | t.Fatal("didn't remove at least 40 blocks") 334 | } 335 | } 336 | 337 | func TestDynamicEntropyWriter(t *testing.T) { 338 | idx := bytes.Buffer{} 339 | data := bytes.Buffer{} 340 | 341 | const totalinput = 10 << 20 342 | input := getBufferSize(totalinput) 343 | 344 | const size = 64 << 10 345 | b := input.Bytes() 346 | // Create some duplicates 347 | for i := 0; i < 50; i++ { 348 | // Read from 10 first blocks 349 | src := b[(i%10)*size : 
(i%10)*size+size] 350 | // Write into the following ones 351 | dst := b[(10+i)*size : (i+10)*size+size] 352 | copy(dst, src) 353 | } 354 | input = bytes.NewBuffer(b) 355 | w, err := dedup.NewWriter(&idx, &data, dedup.ModeDynamicEntropy, size, 10*8*size) 356 | if err != nil { 357 | t.Fatal(err) 358 | } 359 | io.Copy(w, input) 360 | err = w.Close() 361 | if err != nil { 362 | t.Fatal(err) 363 | } 364 | removed := ((totalinput) - data.Len()) / size 365 | 366 | t.Log("Dynamic Index size:", idx.Len()) 367 | t.Log("Dynamic Data size:", data.Len()) 368 | t.Log("Removed", removed, "blocks") 369 | // We don't know how many, but it should remove some blocks 370 | if removed < 40 { 371 | t.Fatal("didn't remove at least 40 blocks") 372 | } 373 | } 374 | 375 | func TestFixedStreamWriter(t *testing.T) { 376 | data := bytes.Buffer{} 377 | 378 | const totalinput = 10 << 20 379 | input := getBufferSize(totalinput) 380 | 381 | const size = 64 << 10 382 | b := input.Bytes() 383 | // Create some duplicates 384 | for i := 0; i < 50; i++ { 385 | // Read from 10 first blocks 386 | src := b[(i%10)*size : (i%10)*size+size] 387 | // Write into the following ones 388 | dst := b[(10+i)*size : (i+10)*size+size] 389 | copy(dst, src) 390 | } 391 | input = bytes.NewBuffer(b) 392 | w, err := dedup.NewStreamWriter(&data, dedup.ModeFixed, size, 10*size) 393 | if err != nil { 394 | t.Fatal(err) 395 | } 396 | io.Copy(w, input) 397 | err = w.Close() 398 | if err != nil { 399 | t.Fatal(err) 400 | } 401 | removed := ((totalinput) - data.Len()) / size 402 | 403 | t.Log("Data size:", data.Len()) 404 | t.Log("Removed", removed, "blocks") 405 | // We should get at least 50 blocks, but there is a little overhead 406 | if removed < 49 { 407 | t.Fatal("didn't remove at least 49 blocks") 408 | } 409 | if removed > 60 { 410 | t.Fatal("removed unreasonable high amount of blocks") 411 | } 412 | } 413 | 414 | func TestDynamicStreamWriter(t *testing.T) { 415 | data := bytes.Buffer{} 416 | 417 | const totalinput = 10 << 20 418 | input := getBufferSize(totalinput) 419 | 420 | const size = 64 << 10 421 | b := input.Bytes() 422 | // Create some duplicates 423 | for i := 0; i < 50; i++ { 424 | // Read from 10 first blocks 425 | src := b[(i%10)*size : (i%10)*size+size] 426 | // Write into the following ones 427 | dst := b[(10+i)*size : (i+10)*size+size] 428 | copy(dst, src) 429 | } 430 | input = bytes.NewBuffer(b) 431 | w, err := dedup.NewStreamWriter(&data, dedup.ModeDynamic, size, 10*8*size) 432 | if err != nil { 433 | t.Fatal(err) 434 | } 435 | io.Copy(w, input) 436 | err = w.Close() 437 | if err != nil { 438 | t.Fatal(err) 439 | } 440 | removed := ((totalinput) - data.Len()) / size 441 | 442 | t.Log("Dynamic Data size:", data.Len()) 443 | t.Log("Removed", removed, "blocks") 444 | // We don't know how many, but it should remove some blocks 445 | if removed < 40 { 446 | t.Fatal("didn't remove at least 40 blocks") 447 | } 448 | } 449 | 450 | func BenchmarkFixedWriter64K(t *testing.B) { 451 | const totalinput = 10 << 20 452 | input := getBufferSize(totalinput) 453 | 454 | const size = 64 << 10 455 | b := input.Bytes() 456 | // Create some duplicates 457 | for i := 0; i < 50; i++ { 458 | // Read from 10 first blocks 459 | src := b[(i%10)*size : (i%10)*size+size] 460 | // Write into the following ones 461 | dst := b[(10+i)*size : (i+10)*size+size] 462 | copy(dst, src) 463 | } 464 | t.ResetTimer() 465 | t.SetBytes(totalinput) 466 | for i := 0; i < t.N; i++ { 467 | input = bytes.NewBuffer(b) 468 | w, _ := dedup.NewWriter(ioutil.Discard, 
ioutil.Discard, dedup.ModeFixed, size, 0) 469 | io.Copy(w, input) 470 | err := w.Close() 471 | if err != nil { 472 | t.Fatal(err) 473 | } 474 | } 475 | } 476 | 477 | func BenchmarkFixedWriter4K(t *testing.B) { 478 | const totalinput = 10 << 20 479 | input := getBufferSize(totalinput) 480 | 481 | const size = 4 << 10 482 | b := input.Bytes() 483 | // Create some duplicates 484 | for i := 0; i < 500; i++ { 485 | // Read from 10 first blocks 486 | src := b[(i%10)*size : (i%10)*size+size] 487 | // Write into the following ones 488 | dst := b[(10+i)*size : (i+10)*size+size] 489 | copy(dst, src) 490 | } 491 | t.ResetTimer() 492 | t.SetBytes(totalinput) 493 | for i := 0; i < t.N; i++ { 494 | input = bytes.NewBuffer(b) 495 | w, _ := dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeFixed, size, 0) 496 | io.Copy(w, input) 497 | err := w.Close() 498 | if err != nil { 499 | t.Fatal(err) 500 | } 501 | } 502 | } 503 | 504 | func BenchmarkFixedWriter1K(t *testing.B) { 505 | const totalinput = 10 << 20 506 | input := getBufferSize(totalinput) 507 | 508 | const size = 1 << 10 509 | b := input.Bytes() 510 | // Create some duplicates 511 | for i := 0; i < 500; i++ { 512 | // Read from 10 first blocks 513 | src := b[(i%10)*size : (i%10)*size+size] 514 | // Write into the following ones 515 | dst := b[(10+i)*size : (i+10)*size+size] 516 | copy(dst, src) 517 | } 518 | t.ResetTimer() 519 | t.SetBytes(totalinput) 520 | for i := 0; i < t.N; i++ { 521 | input = bytes.NewBuffer(b) 522 | w, _ := dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeFixed, size, 0) 523 | io.Copy(w, input) 524 | err := w.Close() 525 | if err != nil { 526 | t.Fatal(err) 527 | } 528 | } 529 | } 530 | 531 | // Maximum block size:64k 532 | func BenchmarkDynamicWriter64K(t *testing.B) { 533 | const totalinput = 10 << 20 534 | input := getBufferSize(totalinput) 535 | 536 | const size = 64 << 10 537 | b := input.Bytes() 538 | // Create some duplicates 539 | for i := 0; i < 50; i++ { 540 | // Read from 10 first blocks 541 | src := b[(i%10)*size : (i%10)*size+size] 542 | // Write into the following ones 543 | dst := b[(10+i)*size : (i+10)*size+size] 544 | copy(dst, src) 545 | } 546 | t.ResetTimer() 547 | t.SetBytes(totalinput) 548 | for i := 0; i < t.N; i++ { 549 | input = bytes.NewBuffer(b) 550 | w, _ := dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeDynamic, size, 0) 551 | io.Copy(w, input) 552 | err := w.Close() 553 | if err != nil { 554 | t.Fatal(err) 555 | } 556 | } 557 | } 558 | 559 | // Maximum block size:64k 560 | func BenchmarkDynamicFragments64K(t *testing.B) { 561 | const totalinput = 10 << 20 562 | input := getBufferSize(totalinput) 563 | 564 | const size = 64 << 10 565 | b := input.Bytes() 566 | // Create some duplicates 567 | for i := 0; i < 50; i++ { 568 | // Read from 10 first blocks 569 | src := b[(i%10)*size : (i%10)*size+size] 570 | // Write into the following ones 571 | dst := b[(10+i)*size : (i+10)*size+size] 572 | copy(dst, src) 573 | } 574 | t.ResetTimer() 575 | t.SetBytes(totalinput) 576 | for i := 0; i < t.N; i++ { 577 | out := make(chan dedup.Fragment, 10) 578 | go func() { 579 | for _ = range out { 580 | } 581 | }() 582 | input = bytes.NewBuffer(b) 583 | w, _ := dedup.NewSplitter(out, dedup.ModeDynamic, size) 584 | io.Copy(w, input) 585 | err := w.Close() 586 | if err != nil { 587 | t.Fatal(err) 588 | } 589 | } 590 | } 591 | 592 | // Maximum block size:64k 593 | func BenchmarkDynamicEntropyFragments64K(t *testing.B) { 594 | const totalinput = 10 << 20 595 | input := getBufferSize(totalinput) 
596 | 597 | const size = 64 << 10 598 | b := input.Bytes() 599 | // Create some duplicates 600 | for i := 0; i < 50; i++ { 601 | // Read from 10 first blocks 602 | src := b[(i%10)*size : (i%10)*size+size] 603 | // Write into the following ones 604 | dst := b[(10+i)*size : (i+10)*size+size] 605 | copy(dst, src) 606 | } 607 | t.ResetTimer() 608 | t.SetBytes(totalinput) 609 | for i := 0; i < t.N; i++ { 610 | out := make(chan dedup.Fragment, 10) 611 | go func() { 612 | for _ = range out { 613 | } 614 | }() 615 | input = bytes.NewBuffer(b) 616 | w, _ := dedup.NewSplitter(out, dedup.ModeDynamicEntropy, size) 617 | io.Copy(w, input) 618 | err := w.Close() 619 | if err != nil { 620 | t.Fatal(err) 621 | } 622 | } 623 | } 624 | 625 | // Maximum block size:4k 626 | func BenchmarkDynamicEntropyFragments4K(t *testing.B) { 627 | const totalinput = 10 << 20 628 | input := getBufferSize(totalinput) 629 | 630 | const size = 4 << 10 631 | b := input.Bytes() 632 | // Create some duplicates 633 | for i := 0; i < 50; i++ { 634 | // Read from 10 first blocks 635 | src := b[(i%10)*size : (i%10)*size+size] 636 | // Write into the following ones 637 | dst := b[(10+i)*size : (i+10)*size+size] 638 | copy(dst, src) 639 | } 640 | t.ResetTimer() 641 | t.SetBytes(totalinput) 642 | for i := 0; i < t.N; i++ { 643 | out := make(chan dedup.Fragment, 10) 644 | go func() { 645 | for _ = range out { 646 | } 647 | }() 648 | input = bytes.NewBuffer(b) 649 | w, _ := dedup.NewSplitter(out, dedup.ModeDynamicEntropy, size) 650 | io.Copy(w, input) 651 | err := w.Close() 652 | if err != nil { 653 | t.Fatal(err) 654 | } 655 | } 656 | } 657 | 658 | // Maximum block size:4k 659 | func BenchmarkDynamicWriter4K(t *testing.B) { 660 | const totalinput = 10 << 20 661 | input := getBufferSize(totalinput) 662 | 663 | const size = 4 << 10 664 | b := input.Bytes() 665 | // Create some duplicates 666 | for i := 0; i < 50; i++ { 667 | // Read from 10 first blocks 668 | src := b[(i%10)*size : (i%10)*size+size] 669 | // Write into the following ones 670 | dst := b[(10+i)*size : (i+10)*size+size] 671 | copy(dst, src) 672 | } 673 | t.ResetTimer() 674 | t.SetBytes(totalinput) 675 | for i := 0; i < t.N; i++ { 676 | input = bytes.NewBuffer(b) 677 | w, _ := dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeDynamic, size, 0) 678 | io.Copy(w, input) 679 | err := w.Close() 680 | if err != nil { 681 | t.Fatal(err) 682 | } 683 | } 684 | } 685 | 686 | func BenchmarkFixedStreamWriter4K(t *testing.B) { 687 | const totalinput = 10 << 20 688 | input := getBufferSize(totalinput) 689 | 690 | const size = 4 << 10 691 | b := input.Bytes() 692 | // Create some duplicates 693 | for i := 0; i < 500; i++ { 694 | // Read from 10 first blocks 695 | src := b[(i%10)*size : (i%10)*size+size] 696 | // Write into the following ones 697 | dst := b[(10+i)*size : (i+10)*size+size] 698 | copy(dst, src) 699 | } 700 | t.ResetTimer() 701 | t.SetBytes(totalinput) 702 | for i := 0; i < t.N; i++ { 703 | input = bytes.NewBuffer(b) 704 | w, _ := dedup.NewStreamWriter(ioutil.Discard, dedup.ModeFixed, size, 10*size) 705 | io.Copy(w, input) 706 | err := w.Close() 707 | if err != nil { 708 | t.Fatal(err) 709 | } 710 | } 711 | } 712 | 713 | // This doesn't actually test anything, but prints probabilities to log 714 | func TestBirthdayProblem(t *testing.T) { 715 | t.Log("Hash size is", dedup.HashSize*8, "bits") 716 | t.Log("1GiB, 1KiB blocks:") 717 | t.Log(dedup.BirthdayProblem((1 << 30) / (1 << 10))) 718 | w, _ := dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeFixed, 1<<10, 0) 
719 | e, _ := w.MemUse(1 << 30) 720 | t.Logf("It will use %d MiB for encoder.", e>>20) 721 | 722 | t.Log("1TiB, 4KiB blocks:") 723 | t.Log(dedup.BirthdayProblem((1 << 40) / (4 << 10))) 724 | w, _ = dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeFixed, 4<<10, 0) 725 | e, _ = w.MemUse(1 << 40) 726 | t.Logf("It will use %d MiB for encoder.", e>>20) 727 | 728 | t.Log("1PiB, 4KiB blocks:") 729 | t.Log(dedup.BirthdayProblem((1 << 50) / (4 << 10))) 730 | e, _ = w.MemUse(1 << 50) 731 | t.Logf("It will use %d MiB for encoder.", e>>20) 732 | 733 | t.Log("1EiB, 64KiB blocks:") 734 | t.Log(dedup.BirthdayProblem((1 << 60) / (64 << 10))) 735 | w, _ = dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeFixed, 64<<10, 0) 736 | e, _ = w.MemUse(1 << 60) 737 | t.Logf("It will use %d MiB for encoder.", e>>20) 738 | 739 | t.Log("1EiB, 1KiB blocks:") 740 | t.Log(dedup.BirthdayProblem((1 << 60) / (1 << 10))) 741 | w, _ = dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeFixed, 1<<10, 0) 742 | e, _ = w.MemUse(1 << 60) 743 | t.Logf("It will use %d MiB for encoder.", e>>20) 744 | } 745 | 746 | // This will deduplicate a buffer of zeros to an indexed stream 747 | func ExampleNewWriter() { 748 | // We will write to these 749 | idx := bytes.Buffer{} 750 | data := bytes.Buffer{} 751 | 752 | // This is our input: 753 | input := bytes.NewBuffer(make([]byte, 50000)) 754 | 755 | // Create a new writer, with each block being 1000 bytes 756 | w, err := dedup.NewWriter(&idx, &data, dedup.ModeFixed, 1000, 0) 757 | if err != nil { 758 | panic(err) 759 | } 760 | 761 | // Copy our input to the writer. 762 | io.Copy(w, input) 763 | 764 | // Close the writer 765 | err = w.Close() 766 | if err != nil { 767 | panic(err) 768 | } 769 | 770 | // Let us inspect what was written: 771 | fmt.Println("Blocks:", w.Blocks()) 772 | fmt.Println("Index size:", idx.Len()) 773 | fmt.Println("Data size:", data.Len()) 774 | 775 | // OUTPUT: Blocks: 50 776 | // Index size: 67 777 | // Data size: 1000 778 | } 779 | 780 | // This will deduplicate a buffer of zeros to an non-indexed stream 781 | func ExampleNewStreamWriter() { 782 | // We will write to this 783 | data := bytes.Buffer{} 784 | 785 | // This is our input: 786 | input := bytes.NewBuffer(make([]byte, 50000)) 787 | 788 | // Create a new writer, with each block being 1000 bytes, 789 | // And allow it to use 10000 bytes of memory 790 | w, err := dedup.NewStreamWriter(&data, dedup.ModeFixed, 1000, 10000) 791 | if err != nil { 792 | panic(err) 793 | } 794 | // Copy our input to the writer. 795 | io.Copy(w, input) 796 | 797 | // Close the writer 798 | err = w.Close() 799 | if err != nil { 800 | panic(err) 801 | } 802 | 803 | // Let us inspect what was written: 804 | fmt.Println("Blocks:", w.Blocks()) 805 | fmt.Println("Data size:", data.Len()) 806 | 807 | // OUTPUT: Blocks: 50 808 | // Data size: 1068 809 | } 810 | 811 | // This will deduplicate a buffer of zeros, 812 | // and return each block on a channel in order. 813 | func ExampleNewSplitter() { 814 | // We will write to this 815 | // We set a small buffer 816 | out := make(chan dedup.Fragment, 10) 817 | 818 | // This will consume our blocks as they are returned 819 | // and send information about what was received. 
820 | info := make(chan int, 0) 821 | go func() { 822 | n := 0 823 | size := 0 824 | for f := range out { 825 | n++ 826 | if f.New { 827 | size += len(f.Payload) 828 | } 829 | } 830 | info <- n 831 | info <- size 832 | }() 833 | 834 | // This is our input: 835 | input := bytes.NewBuffer(make([]byte, 50050)) 836 | 837 | // Create a new writer, with each block being 1000 bytes, 838 | w, err := dedup.NewSplitter(out, dedup.ModeFixed, 1000) 839 | if err != nil { 840 | panic(err) 841 | } 842 | // Copy our input to the writer. 843 | io.Copy(w, input) 844 | 845 | // Close the writer 846 | err = w.Close() 847 | if err != nil { 848 | panic(err) 849 | } 850 | 851 | // Let us inspect what was written: 852 | fmt.Println("Blocks:", <-info) 853 | // Size of one (repeated) block + 50 bytes for last. 854 | fmt.Println("Data size:", <-info) 855 | 856 | // OUTPUT: Blocks: 51 857 | // Data size: 1050 858 | } 859 | 860 | // This will deduplicate a file 861 | // and return each block on a channel in order. 862 | func ExampleNewSplitter_file() { 863 | // Our input 864 | f, _ := os.Open("testdata/sampledata.zip") 865 | defer f.Close() 866 | 867 | // We will receive fragments on this channel 868 | ch := make(chan dedup.Fragment, 10) 869 | 870 | var wg sync.WaitGroup 871 | wg.Add(1) 872 | 873 | // Start a goroutine that will consume the fragments 874 | go func() { 875 | defer wg.Done() 876 | for { 877 | select { 878 | case f, ok := <-ch: 879 | if !ok { 880 | return 881 | } 882 | if f.New { 883 | fmt.Printf("Got NEW fragment #%d, size %d, hash:%s\n", f.N, len(f.Payload), hex.EncodeToString(f.Hash[:])) 884 | // Insert payload into data store 885 | } else { 886 | fmt.Printf("Got OLD fragment #%d, size %d, hash:%s\n", f.N, len(f.Payload), hex.EncodeToString(f.Hash[:])) 887 | } 888 | // Add hash to list of hashes required to reconstruct the file. 889 | } 890 | } 891 | }() 892 | 893 | // Create a dynamic splitter with average size of 1024 bytes. 894 | w, _ := dedup.NewSplitter(ch, dedup.ModeDynamic, 4*1024) 895 | 896 | // Copy data to the splitter 897 | _, _ = io.Copy(w, f) 898 | 899 | // Flush the remaining fragments 900 | _ = w.Close() 901 | 902 | // Wait for input to be received. 903 | wg.Wait() 904 | 905 | // OUTPUT: 906 | // Got NEW fragment #0, size 893, hash:7f8455127e82f90ea7e97716ccaefa9317279b4b 907 | // Got NEW fragment #1, size 559, hash:b554708bbfda24f1eb8fcd75a155d23bd36939d3 908 | // Got NEW fragment #2, size 3482, hash:59bca870477e14e97ae8650e74ef52abcb6340e8 909 | // Got NEW fragment #3, size 165, hash:6fb05a63e28a1bb2e880e051940f517115e7b16c 910 | // Got NEW fragment #4, size 852, hash:6671826ffff6edd32951a0e774efccb5101ba629 911 | // Got NEW fragment #5, size 3759, hash:0fae545a20195720d8e9bb9540069418d7db0873 912 | // Got OLD fragment #6, size 3482, hash:59bca870477e14e97ae8650e74ef52abcb6340e8 913 | // Got OLD fragment #7, size 165, hash:6fb05a63e28a1bb2e880e051940f517115e7b16c 914 | // Got OLD fragment #8, size 852, hash:6671826ffff6edd32951a0e774efccb5101ba629 915 | // Got NEW fragment #9, size 2380, hash:1507aa13e215517ce982b9235a0221018128ed4e 916 | // Got NEW fragment #10, size 71, hash:f262fcf4af26ee75ff3045db2af21f2acca235cd 917 | } 918 | 919 | // This will deduplicate a file 920 | // and return each block on a channel in order. 
921 | func ExampleNewSplitter_entropy() { 922 | // Our input 923 | f, _ := os.Open("testdata/sampledata.zip") 924 | defer f.Close() 925 | 926 | // We will receive fragments on this channel 927 | ch := make(chan dedup.Fragment, 10) 928 | 929 | var wg sync.WaitGroup 930 | wg.Add(1) 931 | 932 | // Start a goroutine that will consume the fragments 933 | go func() { 934 | defer wg.Done() 935 | for { 936 | select { 937 | case f, ok := <-ch: 938 | if !ok { 939 | return 940 | } 941 | if f.New { 942 | fmt.Printf("Got NEW fragment #%d, size %d, hash:%s\n", f.N, len(f.Payload), hex.EncodeToString(f.Hash[:])) 943 | // Insert payload into data store 944 | } else { 945 | fmt.Printf("Got OLD fragment #%d, size %d, hash:%s\n", f.N, len(f.Payload), hex.EncodeToString(f.Hash[:])) 946 | } 947 | // Add hash to list of hashes required to reconstruct the file. 948 | } 949 | } 950 | }() 951 | 952 | // Create a dynamic splitter with average size of 1024 bytes. 953 | w, _ := dedup.NewSplitter(ch, dedup.ModeDynamicEntropy, 4*1024) 954 | 955 | // Copy data to the splitter 956 | _, _ = io.Copy(w, f) 957 | 958 | // Flush the remaining fragments 959 | _ = w.Close() 960 | 961 | // Wait for input to be received. 962 | wg.Wait() 963 | 964 | // OUTPUT: 965 | //Got NEW fragment #0, size 521, hash:0c5989843e85f31aed26f249bd203240dd72f77a 966 | //Got NEW fragment #1, size 1563, hash:308ff2e0b4776c2a08fe549422c7ebfbf646bb22 967 | //Got NEW fragment #2, size 919, hash:9d68759ef33ae919b656faf52bb1177e803f810b 968 | //Got NEW fragment #3, size 1326, hash:c272c26dff010417ca2120a8e82addfdadb4efeb 969 | //Got NEW fragment #4, size 1284, hash:9bbe891ccb1b141e0e122110e730e8df9743331e 970 | //Got NEW fragment #5, size 1220, hash:5019f56fa9395060fbe2e957ad518a35cd667f9b 971 | //Got NEW fragment #6, size 3509, hash:e0d7c8acfdd5b399a92b5e495a0794ffa842ee73 972 | //Got OLD fragment #7, size 919, hash:9d68759ef33ae919b656faf52bb1177e803f810b 973 | //Got OLD fragment #8, size 1326, hash:c272c26dff010417ca2120a8e82addfdadb4efeb 974 | //Got OLD fragment #9, size 1284, hash:9bbe891ccb1b141e0e122110e730e8df9743331e 975 | //Got OLD fragment #10, size 1220, hash:5019f56fa9395060fbe2e957ad518a35cd667f9b 976 | //Got NEW fragment #11, size 1569, hash:5ae2760535662c13b336d1ae4a0a7fdcba789d83 977 | } 978 | 979 | // This example will show how to write data to two files. 980 | // Running this example will deduplicate an empty byte slice 981 | // of 500000 bytes into an 'output.data' and 'output.idx' file. 982 | // 983 | // In the real world, you would likely want to add a bufio.NewWriter 984 | // to the output, but to keep it simple, we don't do that here. 985 | func ExampleNewWriter_file() { 986 | data, err := os.Create("output.data") 987 | if err != nil { 988 | panic(err) 989 | } 990 | // Close, print stats and remove it 991 | defer func() { 992 | data.Close() 993 | stat, _ := os.Stat("output.data") 994 | fmt.Println("Data size:", stat.Size()) 995 | os.Remove("output.data") 996 | }() 997 | 998 | idx, err := os.Create("output.idx") 999 | if err != nil { 1000 | panic(err) 1001 | } 1002 | // Close, print stats and remove it 1003 | defer func() { 1004 | idx.Close() 1005 | stat, _ := os.Stat("output.idx") 1006 | fmt.Println("Index size:", stat.Size()) 1007 | os.Remove("output.idx") 1008 | }() 1009 | 1010 | // This is our input: 1011 | input := bytes.NewBuffer(make([]byte, 500000)) 1012 | 1013 | // Create a new writer, with each block being 1000 bytes fixed size. 
1014 | 	w, err := dedup.NewWriter(idx, data, dedup.ModeFixed, 1000, 0)
1015 | 	if err != nil {
1016 | 		panic(err)
1017 | 	}
1018 | 	defer w.Close()
1019 | 
1020 | 	// Copy our input to the writer.
1021 | 	io.Copy(w, input)
1022 | 
1023 | 	// Print the number of blocks written
1024 | 	fmt.Println("Blocks:", w.Blocks())
1025 | 
1026 | 	// OUTPUT: Blocks: 500
1027 | 	// Index size: 517
1028 | 	// Data size: 1000
1029 | }
1030 | 
1031 | // This will deduplicate a buffer of zeros to a non-indexed stream
1032 | // written to a file.
1033 | // It is not recommended to use a single stream when you are writing to
1034 | // a file; prefer the indexed format shown above.
1035 | func ExampleNewStreamWriter_file() {
1036 | 	// We will write to this
1037 | 	data, err := os.Create("outputstream.data")
1038 | 	if err != nil {
1039 | 		panic(err)
1040 | 	}
1041 | 	// Close, print stats and remove it
1042 | 	defer func() {
1043 | 		data.Close()
1044 | 		stat, _ := os.Stat("outputstream.data")
1045 | 		fmt.Println("Stream size:", stat.Size())
1046 | 		os.Remove("outputstream.data")
1047 | 	}()
1048 | 
1049 | 	// This is our input:
1050 | 	input := bytes.NewBuffer(make([]byte, 500000))
1051 | 
1052 | 	// Create a new writer, with each block being 1000 bytes,
1053 | 	// and allow it to use 10000 bytes of memory
1054 | 	w, err := dedup.NewStreamWriter(data, dedup.ModeFixed, 1000, 10000)
1055 | 	if err != nil {
1056 | 		panic(err)
1057 | 	}
1058 | 	defer w.Close()
1059 | 
1060 | 	// Copy our input to the writer.
1061 | 	io.Copy(w, input)
1062 | 
1063 | 	// Print the number of blocks written
1064 | 	fmt.Println("Blocks:", w.Blocks())
1065 | 
1066 | 	// OUTPUT: Blocks: 500
1067 | 	// Stream size: 1518
1068 | }
1069 | 
1070 | // This shows an example of a birthday problem calculation.
1071 | // We calculate the probability of a collision of SHA-1 hashes
1072 | // on 1 Terabyte of data, using 1 Kilobyte blocks.
1073 | // With SHA-1, that gives a 1 in 2535301202817642046627252275200 chance
1074 | // of a collision occurring.
1075 | func ExampleBirthdayProblem() {
1076 | 	fmt.Println("Hash size is", dedup.HashSize*8, "bits")
1077 | 	fmt.Println("1TiB, 1KiB blocks:")
1078 | 	fmt.Println(dedup.BirthdayProblem((1 << 40) / (1 << 10)))
1079 | 	// Output: Hash size is 160 bits
1080 | 	// 1TiB, 1KiB blocks:
1081 | 	// Collision probability is ~ 1/2535301202817642046627252275200 ~ 3.944304522431639e-31
1082 | }
1083 | --------------------------------------------------------------------------------
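
For readers who want to check the figure printed by `ExampleBirthdayProblem` above, it agrees with the standard birthday-bound approximation for hashing `n` blocks into a 160-bit space. The derivation below is a sketch of that standard approximation, not a statement of the exact arithmetic inside `dedup.BirthdayProblem`:

```latex
p \approx \frac{n(n-1)}{2 \cdot 2^{160}}, \qquad
n = \frac{2^{40}}{2^{10}} = 2^{30} \text{ blocks (1 TiB in 1 KiB blocks)}
```

```latex
p \approx \frac{2^{30}\,(2^{30}-1)}{2^{161}}
  \approx 3.944 \times 10^{-31}
  \approx \frac{1}{2.535 \times 10^{30}}
```

Taking the reciprocal of this approximation reproduces the "1 in 2535301202817642046627252275200" figure in the example output to the precision shown.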
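
`TestBirthdayProblem` above also uses `Writer.MemUse` to report how much memory the encoder would need for a given input size at different block sizes. The sketch below shows how those estimates could be compared up front when picking a fixed block size for a large input. It is illustrative only: it assumes the package imports as `github.com/klauspost/dedup`, discards the second return value of `MemUse` exactly as the test does, and (like the test) assumes a 64-bit platform for the large size constants. Smaller blocks deduplicate better but cost more encoder memory.

```Go
package main

import (
	"fmt"
	"io/ioutil"

	"github.com/klauspost/dedup"
)

func main() {
	const input = 1 << 40 // planning for roughly 1 TiB of input

	// 1 KiB fixed blocks: finest deduplication, highest memory use.
	// The writer is only used to obtain the estimate, as in TestBirthdayProblem.
	w, err := dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeFixed, 1<<10, 0)
	if err != nil {
		panic(err)
	}
	enc, _ := w.MemUse(input)
	fmt.Printf("1 KiB blocks:  ~%d MiB encoder memory\n", enc>>20)

	// 4 KiB fixed blocks.
	w, err = dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeFixed, 4<<10, 0)
	if err != nil {
		panic(err)
	}
	enc, _ = w.MemUse(input)
	fmt.Printf("4 KiB blocks:  ~%d MiB encoder memory\n", enc>>20)

	// 64 KiB fixed blocks: least memory, coarsest deduplication.
	w, err = dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeFixed, 64<<10, 0)
	if err != nil {
		panic(err)
	}
	enc, _ = w.MemUse(input)
	fmt.Printf("64 KiB blocks: ~%d MiB encoder memory\n", enc>>20)
}
```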
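
The two `NewSplitter` examples above receive `dedup.Fragment` values and only hint, in comments, at what a consumer does with them ("Insert payload into data store", "Add hash to list of hashes required to reconstruct the file"). The standalone sketch below fills in that hint under stated assumptions: it assumes the import path `github.com/klauspost/dedup`, relies on `Close` closing the output channel (as the channel range in `ExampleNewSplitter` does), and uses a plain map plus a slice as stand-ins for a real data store; `store` and `order` are illustrative names, not part of the package.

```Go
package main

import (
	"bytes"
	"encoding/hex"
	"fmt"
	"io"
	"sync"

	"github.com/klauspost/dedup"
)

func main() {
	// The splitter will deliver fragments on this channel.
	ch := make(chan dedup.Fragment, 10)

	// Illustrative content-addressed store: unique payloads keyed by the hex
	// of their hash, plus the ordered hash list needed to rebuild the input.
	store := make(map[string][]byte)
	var order []string

	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		for f := range ch {
			key := hex.EncodeToString(f.Hash[:])
			if f.New {
				// Only fragments marked New carry data we have not stored yet.
				store[key] = append([]byte(nil), f.Payload...)
			}
			// Every fragment, new or deduplicated, is part of the rebuild order.
			order = append(order, key)
		}
	}()

	// Split 50000 zero bytes into fixed 1000-byte blocks.
	w, err := dedup.NewSplitter(ch, dedup.ModeFixed, 1000)
	if err != nil {
		panic(err)
	}
	io.Copy(w, bytes.NewBuffer(make([]byte, 50000)))
	if err := w.Close(); err != nil {
		panic(err)
	}
	wg.Wait()

	// Replaying the hash order against the store reconstructs the input.
	var rebuilt bytes.Buffer
	for _, key := range order {
		rebuilt.Write(store[key])
	}
	fmt.Println("unique payloads:", len(store), "fragments:", len(order), "rebuilt bytes:", rebuilt.Len())
	// With this all-zero input it should print:
	// unique payloads: 1 fragments: 50 rebuilt bytes: 50000
}
```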