├── .gitignore ├── .travis.yml ├── FORMAT.md ├── LICENSE ├── README.md ├── dedup.go ├── filesplitter.go ├── reader.go ├── reader_test.go ├── sort └── hashsort.go ├── testdata └── sampledata.zip ├── writer.go └── writer_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.test 24 | *.prof 25 | /.idea 26 | /dedup.iml 27 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | sudo: false 4 | 5 | os: 6 | - linux 7 | - osx 8 | 9 | go: 10 | - 1.7.x 11 | - 1.8.x 12 | - 1.9.x 13 | - master 14 | 15 | script: 16 | - go test -v -cpu=1,2,4 . 17 | - go test -v -cpu=2 -race -short . 18 | 19 | matrix: 20 | allow_failures: 21 | - go: 'master' 22 | fast_finish: true 23 | -------------------------------------------------------------------------------- /FORMAT.md: -------------------------------------------------------------------------------- 1 | # Stream formats 2 | 3 | The streaming format is designed to be easy and fast to write or parse. 4 | 5 | It only uses 64 bit unsigned variable sized ints to store the information. 6 | 7 | The encoding of Uvarint is: 8 | 9 | * unsigned integers are serialized 7 bits at a time, starting with the least significant bits 10 | * the most significant bit (msb) in each output byte indicates if there is a continuation byte (msb = 1) 11 | 12 | All values in this format can fit in a 64 bit unsigned value. 13 | See [encoding/binary.ReadUvarint](https://golang.org/pkg/encoding/binary/#ReadUvarint) for a reference implementation. 14 | 15 | # Format 1 16 | This format has data and index split in two files, so the index can be quickly read before any decoding starts. 17 | 18 | This index allows to keep track of the last occurence of a block, so it can be deallocated immediately afterwards. 19 | 20 | ## Header 21 | 22 | | Content | Type | Values | 23 | |----------------|---------|--------------| 24 | | Format ID | UvarInt | 0x1 (always) | 25 | | MaxBlockSize | UvarInt | >= 512 | 26 | 27 | ## Repeat Blocks 28 | 29 | This is the index parsing algorithm in overall terms. 30 | 31 | ```Go 32 | for { 33 | offset = ReadVarUint() 34 | 35 | switch offset { 36 | 37 | // NEW BLOCK 38 | case 0: 39 | x = ReadVarUint() 40 | if x > MaxBlockSize { ERROR } 41 | blockSize = MaxBlockSize - x 42 | block = ReadBytesFromDataStream(blockSize) 43 | 44 | // END OF STREAM 45 | case 1<<64 - 1: 46 | x = ReadVarUint() 47 | if x > MaxBlockSize { ERROR } 48 | blockSize = MaxBlockSize - x 49 | block = ReadBytesFromDataStream(blockSize) 50 | 51 | // Stream terminator 52 | x := ReadVarUint() 53 | if x != 0 { ERROR } 54 | break 55 | 56 | // DEDUPLICATED BLOCK 57 | default: 58 | SourceBlockNum = CurrentBlock - offset 59 | if SourceBlockNum < 0 { ERROR } 60 | } 61 | } 62 | ``` 63 | 64 | ### Block sizes 65 | Block sizes are stored as `MaxSize - Size`, so fixed block sizes are all stored as size '0'. 66 | 67 | ### Block Offset 68 | The deduplicated offset is backwards from the the current block, so if the current block is the same 69 | as the previous, it will be encoded as '1'. 
If it is two blocks back, 2, etc. 70 | 71 | # Format 2 72 | 73 | Format 2 has block definitions and data interleaved. It only has a minor difference to Format 1, since it includes a 74 | Maximum backreference Length, helping the decoder to deallocate blocks. 75 | 76 | 77 | ## Header 78 | 79 | | Content | Type | Values | 80 | |----------------|---------|--------------| 81 | | Format ID | UvarInt | 0x2 (always) | 82 | | MaxBlockSize | UvarInt | >= 512 | 83 | | MaxLength | UvarInt | >= 1 | 84 | 85 | In addition to Maximum Block Size, a `MaxLength` is also added, which indicates the maximum backreference distance 86 | of this stream. This means that any offsets will be less or equal to MaxLength. 87 | 88 | ## Repeat Blocks 89 | 90 | This is the decoding loop. `MaxLength` blocks should be kept in memory while the decoding is taking place. 91 | Data is read from in between block definitions, if offset is 0. 92 | 93 | ```Go 94 | for { 95 | offset = ReadVarUint() 96 | 97 | switch offset { 98 | 99 | // NEW BLOCK 100 | case 0: 101 | x = ReadVarUint() 102 | if x > MaxBlockSize { ERROR } 103 | blockSize = MaxBlockSize - x 104 | block = ReadBytes(blockSize) 105 | 106 | // END OF STREAM 107 | case 1<<64 - 1: 108 | x = ReadVarUint() 109 | if x > MaxBlockSize { ERROR } 110 | blockSize = MaxBlockSize - x 111 | block = ReadBytes(blockSize) 112 | 113 | // Stream terminator 114 | x := ReadVarUint() 115 | if x != 0 { ERROR } 116 | break 117 | 118 | // DEDUPLICATED BLOCK 119 | default: 120 | if offset > MaxLength { ERROR } 121 | SourceBlockNum = CurrentBlock - offset 122 | if SourceBlockNum < 0 { ERROR } 123 | } 124 | } 125 | 126 | ``` 127 | 128 | 129 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Klaus Post 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dedup 2 | A Streaming Deduplication package for Go 3 | 4 | This package implements streaming deduplication, allowing you to remove duplicated data in streams. It implements variable block sizes and automatic content block adaptation. It has a fully streaming mode and an indexed mode, that has significantly reduced memory requirements. 
5 | 
6 | For an introduction to deduplication read this blog post [Fast Stream Deduplication in Go](https://blog.klauspost.com/fast-stream-deduplication-in-go/).
7 | 
8 | Package home: https://github.com/klauspost/dedup
9 | 
10 | Godoc: https://godoc.org/github.com/klauspost/dedup
11 | 
12 | [![Build Status](https://travis-ci.org/klauspost/dedup.svg?branch=master)](https://travis-ci.org/klauspost/dedup)
13 | [![GoDoc][1]][2]
14 | 
15 | [1]: https://godoc.org/github.com/klauspost/dedup?status.svg
16 | [2]: https://godoc.org/github.com/klauspost/dedup
17 | 
18 | # Installation
19 | To get the package use the standard:
20 | ```bash
21 | go get -u github.com/klauspost/dedup
22 | ```
23 | 
24 | # Usage
25 | 
26 | If you haven't already, you should read the [Fast Stream Deduplication in Go](https://blog.klauspost.com/fast-stream-deduplication-in-go/) blog post, since it introduces the different aspects of the package and will help you make choices for your setup.
27 | 
28 | There are two symmetric function pairs: [`NewWriter`](https://godoc.org/github.com/klauspost/dedup#NewWriter)/[`NewReader`](https://godoc.org/github.com/klauspost/dedup#NewReader) and [`NewStreamWriter`](https://godoc.org/github.com/klauspost/dedup#NewStreamWriter)/[`NewStreamReader`](https://godoc.org/github.com/klauspost/dedup#NewStreamReader). The first pair creates an *indexed* stream, which writes the index and data to two separate streams. This makes it possible to decode the deduplicated stream with much less memory. The second pair writes all data to a *single stream*. This allows on-the-fly transfers, but requires more memory on the receiving end.
29 | 
30 | When you create a deduplicating stream, you can choose between *fixed* and *dynamic* block sizes. Dynamic blocks adapt block splits to the incoming content, but are slower than fixed-size blocks and have to use more conservative memory estimations.
31 | 
32 | Here is an example of a full roundtrip with indexed streams. For more examples see the [godoc examples](https://godoc.org/github.com/klauspost/dedup#pkg-examples).
33 | 
34 | ```Go
35 | package main
36 | 
37 | import (
38 | 	"bytes"
39 | 	"fmt"
40 | 	"io"
41 | 
42 | 	"github.com/klauspost/dedup"
43 | )
44 | 
45 | // This will deduplicate a buffer of zeros to an indexed stream
46 | func main() {
47 | 	// We will write out deduplicated data to these
48 | 	idx := bytes.Buffer{}
49 | 	data := bytes.Buffer{}
50 | 
51 | 	// This is our input:
52 | 	input := bytes.NewBuffer(make([]byte, 50000))
53 | 
54 | 	// Create a new writer, with each block being 1000 bytes fixed size.
55 | 	w, err := dedup.NewWriter(&idx, &data, dedup.ModeFixed, 1000, 0)
56 | 	if err != nil {
57 | 		panic(err)
58 | 	}
59 | 	// Copy our input to the writer.
60 | 	io.Copy(w, input)
61 | 
62 | 	// Close to flush the remaining buffers
63 | 	err = w.Close()
64 | 	if err != nil {
65 | 		panic(err)
66 | 	}
67 | 
68 | 	// Create a new indexed stream reader:
69 | 	r, err := dedup.NewReader(&idx, &data)
70 | 	if err != nil {
71 | 		panic(err)
72 | 	}
73 | 
74 | 	// Inspect how much memory it will use.
75 | 	fmt.Println("Memory use:", r.MaxMem())
76 | 
77 | 	var dst bytes.Buffer
78 | 
79 | 	// Read everything
80 | 	_, err = io.Copy(&dst, r)
81 | 	if err != nil && err != io.EOF {
82 | 		panic(err)
83 | 	}
84 | 
85 | 	// Let us inspect what was written:
86 | 	fmt.Println("Returned data length:", dst.Len())
87 | 	fmt.Println("Everything zero:", 0 == bytes.Compare(dst.Bytes(), make([]byte, 50000)))
88 | }
89 | ```
90 | 
91 | Note that there is no error resilience built in. If any data is corrupted in any way, it will probably not be detected, and there is no way to recover corrupted data. So if you are in an environment where that could occur, you should add additional checks to ensure that data is recoverable.
92 | 
93 | ## Input Splitting
94 | 
95 | If you want to simply split the input, that functionality is also exposed.
96 | 
97 | This can be useful if you want to deduplicate into your own key-value store.
98 | In this case, you simply feed the input to a *NewSplitter*.
99 | This will return the individual fragments along with a hash.
100 | This allows you to store your files as a stream of hashes, and keep the block data in your own data store.
101 | 
102 | See the examples attached to the [NewSplitter](https://godoc.org/github.com/klauspost/dedup#example-NewSplitter) function on how to use this.
103 | 
104 | ## Hash collisions
105 | 
106 | The encoder uses SHA-1 to identify and "remember" unique blocks. No hash is secure from collisions, but SHA-1 offers 160 bits of entropy.
107 | 
108 | For example, the chance of a random hash collision occurring when encoding 1 TB of data in 1 KB blocks is 3.94×10^-31 : 1, or one in "2.5 thousand billion billion billion". This of course assumes a uniform hash distribution and no deliberate hash collision attacks.
109 | 
110 | If SHA-1 doesn't provide sufficient security, you can easily create a stronger version by simply changing the import:
111 | 
112 | ```Go
113 | import hasher "crypto/sha1"
114 | ```
115 | You can use [sha256](https://golang.org/pkg/crypto/sha256/) or [sha512](https://golang.org/pkg/crypto/sha512/) for stronger hashes, or [md5](https://golang.org/pkg/crypto/md5/) for a faster hash.
116 | 
117 | To help you calculate the birthday problem likelihood with a given number of blocks, I have provided the [BirthdayProblem function](https://godoc.org/github.com/klauspost/dedup#BirthdayProblem).
118 | 
119 | ## Why is this not compression?
120 | 
121 | Deduplication does the same as compression, but on a higher level. Instead of looking for small matches, it attempts to find "bigger" matches: it will match and eliminate whole blocks whose content is identical.
122 | 
123 | This can be useful when backing up disk images or other content where you have duplicated files, etc.
124 | 
125 | Deduplication is a good step *before* compression. You will still be able to compress your data, since unique blocks are passed through as-is, in order and without any modification.
126 | 
127 | # License
128 | 
129 | This code is published under an MIT license. See LICENSE file for more information.
130 | 
--------------------------------------------------------------------------------
/dedup.go:
--------------------------------------------------------------------------------
1 | // dedup: A Streaming Deduplication package
2 | //
3 | // This package implements streaming deduplication, allowing you to remove duplicated data in streams.
4 | // It implements variable block sizes and automatic content block adaptation.
5 | // It has a fully streaming mode and an indexed mode that has significantly reduced memory requirements.
6 | // 7 | // Read for an introduction to deduplication: https://blog.klauspost.com/fast-stream-deduplication-in-go 8 | // 9 | // Package home: https://github.com/klauspost/dedup 10 | // 11 | // Godoc: https://godoc.org/github.com/klauspost/dedup 12 | // 13 | package dedup 14 | 15 | import ( 16 | "fmt" 17 | "math/big" 18 | ) 19 | 20 | // Returns an approximate Birthday probability calculation 21 | // based on the number of blocks given and the hash size. 22 | // 23 | // It uses the simplified calculation: p = k(k-1) / (2N) 24 | // 25 | // From http://preshing.com/20110504/hash-collision-probabilities/ 26 | func BirthdayProblem(blocks int) string { 27 | k := big.NewInt(int64(blocks)) 28 | km1 := big.NewInt(int64(blocks - 1)) 29 | ksq := k.Mul(k, km1) 30 | n := big.NewInt(0) 31 | n = n.Exp(big.NewInt(2), big.NewInt(int64(HashSize)*8), nil) 32 | twoN := n.Add(n, n) 33 | var t, t2 big.Rat 34 | var res *big.Rat 35 | // 36 | res = t.SetFrac(ksq, twoN) 37 | f64, _ := res.Float64() 38 | inv := t2.Inv(res).FloatString(0) 39 | invs := fmt.Sprintf(" ~ 1/%s ~ %v", inv, f64) 40 | 41 | return "Collision probability is" + invs 42 | } 43 | -------------------------------------------------------------------------------- /filesplitter.go: -------------------------------------------------------------------------------- 1 | //+build ignore 2 | 3 | // DISABLED, since I have found no scenarios where it provides improvement 4 | 5 | package dedup 6 | 7 | import ( 8 | "bytes" 9 | "errors" 10 | "io" 11 | "io/ioutil" 12 | "testing" 13 | ) 14 | 15 | const ( 16 | // Dynamic block size, including split on file signatures. 17 | // There are a number of typical file signartures builtin, 18 | // or you can use AddSignature to add your own. 19 | ModeDynamicSignatures = 2 20 | 21 | // Dynamic block size only split on file signatures 22 | ModeSignaturesOnly = 3 23 | ) 24 | 25 | // Split on zpaq hash, file signatures and maximum block size. 26 | func (z *zpaqWriter) writeFile(w *writer, b []byte) (int, error) { 27 | c1 := z.c1 28 | 29 | for i, c := range b { 30 | split := false 31 | v := sigmap[c] 32 | if len(v) > 0 && i < len(b)-6 { 33 | for _, s := range v { 34 | split = true 35 | for j, expect := range s { 36 | if b[j+1] != expect { 37 | split = false 38 | break 39 | } 40 | } 41 | } 42 | } 43 | if c == z.o1[c1] { 44 | z.h = (z.h + uint32(c) + 1) * 314159265 45 | } else { 46 | z.h = (z.h + uint32(c) + 1) * 271828182 47 | } 48 | z.o1[c1] = c 49 | c1 = c 50 | w.cur[w.off] = c 51 | w.off++ 52 | 53 | // Filled the buffer? Send it off! 54 | if w.off >= z.minFragment && (z.h < z.maxHash || split || w.off >= z.maxFragment) { 55 | b := <-w.buffers 56 | // Swap block with current 57 | w.cur, b.data = b.data[:w.maxSize], w.cur[:w.off] 58 | b.N = w.nblocks 59 | 60 | w.input <- b 61 | w.write <- b 62 | w.nblocks++ 63 | w.off = 0 64 | z.h = 0 65 | c1 = 0 66 | } 67 | } 68 | z.c1 = c1 69 | return len(b), nil 70 | } 71 | 72 | // Split on maximum size and file signatures only. 73 | func fileSplitOnly(w *writer, b []byte) (int, error) { 74 | for i, c := range b { 75 | split := false 76 | v := sigmap[c] 77 | if len(v) > 0 && i < len(b)-6 { 78 | for _, s := range v { 79 | split = true 80 | for j, expect := range s { 81 | if b[j+1] != expect { 82 | split = false 83 | break 84 | } 85 | } 86 | } 87 | } 88 | w.cur[w.off] = c 89 | w.off++ 90 | 91 | // Filled the buffer? Send it off! 
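// A block is cut when a file signature match was found at this position
// or when the maximum block size has been reached.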
92 | if split || w.off >= w.maxSize { 93 | b := <-w.buffers 94 | // Swap block with current 95 | w.cur, b.data = b.data[:w.maxSize], w.cur[:w.off] 96 | b.N = w.nblocks 97 | 98 | w.input <- b 99 | w.write <- b 100 | w.nblocks++ 101 | w.off = 0 102 | } 103 | } 104 | return len(b), nil 105 | } 106 | 107 | // 4 times faster than map[byte][][]byte 108 | // 2 times faster than generated code (switch byte 0, if) 109 | var sigmap [256][][]byte 110 | 111 | func init() { 112 | for _, sig := range signatures { 113 | l := sig[0] 114 | err := AddSignature(sig[1 : 1+l]) 115 | if err != nil { 116 | panic(err) 117 | } 118 | } 119 | } 120 | 121 | // ErrSignatureTooShort is returned if AddSignature is called 122 | // with a signature shorter than 3 bytes 123 | var ErrSignatureTooShort = errors.New("signature should be at least 2 bytes") 124 | 125 | // AddSignature will add a signature that will cause a block 126 | // split. The signature must be more than 1 byte (at least 3 is recommended), 127 | // and only up to 7 bytes are compared. 128 | func AddSignature(b []byte) error { 129 | if len(b) <= 1 { 130 | return ErrSignatureTooShort 131 | } 132 | if len(b) > 7 { 133 | b = b[:7] 134 | } 135 | x := sigmap[b[0]] 136 | dst := make([]byte, len(b)-1) 137 | copy(dst, b[1:]) 138 | x = append(x, dst) 139 | sigmap[b[0]] = x 140 | return nil 141 | } 142 | 143 | // File start signatures 144 | // 8 bytes, 1 byte length (1 to 7), 1-7 bytes identifier literals, 7-length padding. 145 | var signatures = [][8]byte{ 146 | [8]byte{3, 0x42, 0x5A, 0x68, 0, 0, 0, 0}, //bzip 2 147 | [8]byte{3, 0x1f, 0x8b, 0x00, 0, 0, 0, 0}, //gzip (store) 148 | [8]byte{3, 0x1f, 0x8b, 0x08, 0, 0, 0, 0}, //gzip (deflate) 149 | [8]byte{6, 0x47, 0x49, 0x46, 0x38, 0x37, 0x61, 0}, //GIF87a 150 | [8]byte{6, 0x47, 0x49, 0x46, 0x38, 0x39, 0x61, 0}, //GIF89a 151 | [8]byte{4, 0x49, 0x49, 0x2A, 0x0, 0, 0, 0}, //TIFF 152 | [8]byte{4, 0x4D, 0x4D, 0x00, 0x2A, 0, 0, 0}, //TIFF 153 | [8]byte{3, 0xFF, 0xD8, 0xFF, 0, 0, 0, 0}, //JPEG 154 | [8]byte{4, 0x46, 0x4F, 0x52, 0x4D, 0, 0, 0}, //IFF (FORM) 155 | [8]byte{4, 0x50, 0x4B, 0x03, 0x04, 0, 0, 0}, //ZIP 156 | [8]byte{4, 0x50, 0x4B, 0x07, 0x08, 0, 0, 0}, //ZIP 157 | [8]byte{7, 0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x00}, //RAR 158 | [8]byte{4, 0x7F, 0x45, 0x4C, 0x46, 0, 0, 0}, //ELF 159 | [8]byte{7, 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A}, //PNG 160 | [8]byte{4, 0xCA, 0xFE, 0xBA, 0xBE, 0, 0, 0}, //Java Class 161 | [8]byte{3, 0xEF, 0xBB, 0xBF, 0, 0, 0, 0}, //Unicode byte order mark 162 | [8]byte{4, 0xFE, 0xED, 0xFA, 0xCE, 0, 0, 0}, //Mach-O binary (32-bit) 163 | [8]byte{4, 0xFE, 0xED, 0xFA, 0xCF, 0, 0, 0}, //Mach-O binary (64-bit) 164 | [8]byte{4, 0xCE, 0xFA, 0xED, 0xFE, 0, 0, 0}, //Mach-O binary (32-bit) 165 | [8]byte{4, 0xCF, 0xFA, 0xED, 0xFE, 0, 0, 0}, //Mach-O binary (64-bit) 166 | [8]byte{4, 0xFF, 0xFE, 0x00, 0x00, 0, 0, 0}, //BOM 32-bit Unicode Transfer Format 167 | [8]byte{4, 0x50, 0x45, 0x00, 0x00, 0, 0, 0}, //PE (PE Header) 168 | [8]byte{4, 0x25, 0x21, 0x50, 0x53, 0, 0, 0}, //PS 169 | [8]byte{4, 0x25, 0x50, 0x44, 0x46, 0, 0, 0}, //PDF 170 | [8]byte{7, 0x30, 0x26, 0xB2, 0x75, 0x8E, 0x66, 0xCF}, //ASF 171 | [8]byte{7, 0xA6, 0xD9, 0x00, 0xAA, 0x00, 0x62, 0xCE}, //WMV 172 | [8]byte{7, 0x24, 0x53, 0x44, 0x49, 0x30, 0x30, 0x30}, //SDI 173 | [8]byte{4, 0x4F, 0x67, 0x67, 0x53, 0, 0, 0}, //OGG 174 | [8]byte{4, 0x38, 0x42, 0x50, 0x53, 0, 0, 0}, //PSD 175 | [8]byte{4, 0x52, 0x49, 0x46, 0x46, 0, 0, 0}, //WAV/AVI 176 | [8]byte{3, 0x49, 0x44, 0x33, 0, 0, 0, 0}, //MP3 (ID3 v2, all versions) 177 | [8]byte{5, 0x43, 0x44, 0x30, 
0x30, 0x31, 0, 0}, //ISO 178 | [8]byte{3, 0x4B, 0x44, 0x4D, 0, 0, 0, 0}, //VMDK 179 | [8]byte{4, 0x66, 0x4C, 0x61, 0x43, 0, 0, 0}, //FLAC 180 | [8]byte{4, 0x4D, 0x54, 0x68, 0x64, 0, 0, 0}, //MIDI 181 | [8]byte{5, 0x1A, 0x45, 0xDF, 0xA3, 0, 0}, //MKV 182 | [8]byte{5, 0x1F, 0x43, 0xB6, 0x75, 0, 0}, //MKV Cluster 183 | [8]byte{4, 0x46, 0x4c, 0x56, 0x01, 0, 0, 0}, //FLV (old format) 184 | [8]byte{7, 0x66, 0x74, 0x79, 0x70, 0x33, 0x67, 0x70}, //3GG/MP4 185 | [8]byte{6, 0x37, 0x7a, 0xbc, 0xaf, 0x27, 0x1c, 0}, //7zip 186 | [8]byte{6, 0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00, 0}, //XZ format 187 | [8]byte{7, 0x42, 0x4f, 0x4f, 0x4b, 0x4d, 0x4f, 0x42}, //MOBI book format 188 | [8]byte{7, 0x53, 0x51, 0x4c, 0x69, 0x74, 0x65, 0x20}, //SQLite DB 189 | [8]byte{6, 0x7b, 0x5c, 0x72, 0x74, 0x66, 0x31, 0}, //RTF '{\rtf1\' 190 | [8]byte{7, '<', '!', 'D', 'O', 'C', 'T', 'Y'}, //HTML Doctype 191 | [8]byte{4, 0x49, 0x54, 0x53, 0x46, 0, 0, 0}, //CHM Fomrat 192 | [8]byte{6, '<', '?', 'x', 'm', 'l', ' ', 0}, //XML Doctype 193 | [8]byte{5, 0x2e, 0x70, 0x6e, 0x20, 0x30, 0, 0}, //troff page #0 194 | [8]byte{4, 0xfe, 0x62, 0x69, 0x6e, 0, 0, 0}, //MySQL binlog 195 | [8]byte{5, 'K', 'D', 'M', 'V', 0x01, 0, 0}, //Virtual machine disk image 196 | [8]byte{5, 'M', 'R', 'V', 'N', 0x01, 0, 0}, //VMware nvram image 197 | 198 | // Exotics: 199 | //[8]byte{7, 0x46, 0x55, 0x4a, 0x49, 0x46, 0x49, 0x4c}, //FUJI Raw format 200 | //[8]byte{7, 0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a}, //MSI format 201 | //[8]byte{5, 0x46, 0x4f, 0x56, 0x62, 0x00, 0, 0}, //X3F format 202 | 203 | //[8]byte{4, 0x50, 0x4B, 0x05, 0x06, 0, 0, 0}, //ZIP empty archive 204 | } 205 | 206 | // Tests: 207 | 208 | // Maximum block size: 64k 209 | func BenchmarkDynamicSigsWriter64K(t *testing.B) { 210 | const totalinput = 10 << 20 211 | input := getBufferSize(totalinput) 212 | 213 | const size = 64 << 10 214 | b := input.Bytes() 215 | // Create some duplicates 216 | for i := 0; i < 50; i++ { 217 | // Read from 10 first blocks 218 | src := b[(i%10)*size : (i%10)*size+size] 219 | // Write into the following ones 220 | dst := b[(10+i)*size : (i+10)*size+size] 221 | copy(dst, src) 222 | } 223 | t.ResetTimer() 224 | t.SetBytes(totalinput) 225 | for i := 0; i < t.N; i++ { 226 | input = bytes.NewBuffer(b) 227 | w, _ := dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeDynamicSignatures, size, 0) 228 | io.Copy(w, input) 229 | err := w.Close() 230 | if err != nil { 231 | t.Fatal(err) 232 | } 233 | } 234 | } 235 | 236 | // Maximum block size: 64k 237 | func BenchmarkSigsOnlyWriter64K(t *testing.B) { 238 | const totalinput = 10 << 20 239 | input := getBufferSize(totalinput) 240 | 241 | const size = 64 << 10 242 | b := input.Bytes() 243 | // Create some duplicates 244 | for i := 0; i < 50; i++ { 245 | // Read from 10 first blocks 246 | src := b[(i%10)*size : (i%10)*size+size] 247 | // Write into the following ones 248 | dst := b[(10+i)*size : (i+10)*size+size] 249 | copy(dst, src) 250 | } 251 | t.ResetTimer() 252 | t.SetBytes(totalinput) 253 | for i := 0; i < t.N; i++ { 254 | input = bytes.NewBuffer(b) 255 | w, _ := dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeSignaturesOnly, size, 0) 256 | io.Copy(w, input) 257 | err := w.Close() 258 | if err != nil { 259 | t.Fatal(err) 260 | } 261 | } 262 | } 263 | -------------------------------------------------------------------------------- /reader.go: -------------------------------------------------------------------------------- 1 | package dedup 2 | 3 | import ( 4 | "bufio" 5 | "encoding/binary" 6 | "errors" 7 | "fmt" 8 
| "io" 9 | "math" 10 | ) 11 | 12 | // A Reader will decode a deduplicated stream and 13 | // return the data as it was encoded. 14 | // Use Close when done to release resources. 15 | type Reader interface { 16 | io.ReadCloser 17 | 18 | io.WriterTo 19 | 20 | // MaxMem returns the *maximum* memory required to decode the stream. 21 | MaxMem() int 22 | } 23 | 24 | // IndexedReader gives access to internal information on 25 | // block sizes available on indexed streams. 26 | type IndexedReader interface { 27 | Reader 28 | 29 | // Blocksizes will return the sizes of each block. 30 | // Will be available if an index was provided. 31 | BlockSizes() []int 32 | } 33 | 34 | type reader struct { 35 | streamReader 36 | blocks []*rblock 37 | } 38 | 39 | type streamReader struct { 40 | size int 41 | maxLength uint64 // Maxmimum backreference count 42 | curBlock int 43 | curData []byte 44 | ready chan *rblock 45 | closeReader chan struct{} 46 | readerClosed chan struct{} 47 | } 48 | 49 | // rblock contains read information about a single block 50 | type rblock struct { 51 | data []byte 52 | readData int 53 | first int // Index of first occurrence 54 | last int // Index of last occurrence 55 | offset int64 // Expected offset in data file (format 1) 56 | err error // Read error? 57 | } 58 | 59 | func (r *rblock) String() string { 60 | if r == nil { 61 | return "" 62 | } 63 | return fmt.Sprintf("{Read:%d; [%d:%d], offset:%d}", r.readData, r.first, r.last, r.offset) 64 | } 65 | 66 | var ErrUnknownFormat = errors.New("unknown index format") 67 | 68 | // NewReader returns a reader that will decode the supplied index and data stream. 69 | // 70 | // This is compatible content from the NewWriter function. 71 | // The function will decode the index before returning. 72 | // 73 | // When you are done with the Reader, use Close to release resources. 74 | func NewReader(index io.Reader, blocks io.Reader) (IndexedReader, error) { 75 | f := &reader{streamReader: streamReader{ 76 | ready: make(chan *rblock, 8), // Read up to 8 blocks ahead 77 | closeReader: make(chan struct{}, 0), 78 | readerClosed: make(chan struct{}, 0), 79 | curBlock: 0, 80 | }} 81 | idx := bufio.NewReader(index) 82 | format, err := binary.ReadUvarint(idx) 83 | if err != nil { 84 | return nil, err 85 | } 86 | 87 | switch format { 88 | case 1: 89 | err = f.readFormat1(idx) 90 | default: 91 | err = ErrUnknownFormat 92 | } 93 | go f.blockReader(blocks) 94 | 95 | return f, err 96 | } 97 | 98 | // NewStreamReader returns a reader that will decode the supplied data stream. 99 | // 100 | // This is compatible content from the NewStreamWriter function. 101 | // 102 | // When you are done with the Reader, use Close to release resources. 103 | func NewStreamReader(in io.Reader) (Reader, error) { 104 | f := &streamReader{ 105 | ready: make(chan *rblock, 8), // Read up to 8 blocks ahead 106 | closeReader: make(chan struct{}, 0), 107 | readerClosed: make(chan struct{}, 0), 108 | curBlock: 0, 109 | } 110 | br := bufio.NewReader(in) 111 | format, err := binary.ReadUvarint(br) 112 | if err != nil { 113 | return nil, err 114 | } 115 | 116 | switch format { 117 | case 2: 118 | err = f.readFormat2(br) 119 | if err != nil { 120 | return nil, err 121 | } 122 | default: 123 | return nil, ErrUnknownFormat 124 | } 125 | 126 | go f.streamReader(br) 127 | 128 | return f, nil 129 | } 130 | 131 | // NewSeekRead returns a reader that will decode the supplied index and data stream. 132 | // 133 | // This is compatible content from the NewWriter function. 
134 | // 135 | // No blocks will be kept in memory, but the block data input must be seekable. 136 | // The function will decode the index before returning. 137 | // 138 | // When you are done with the Reader, use Close to release resources. 139 | func NewSeekReader(index io.Reader, blocks io.ReadSeeker) (IndexedReader, error) { 140 | f := &reader{streamReader: streamReader{ 141 | ready: make(chan *rblock, 8), // Read up to 8 blocks ahead 142 | closeReader: make(chan struct{}, 0), 143 | readerClosed: make(chan struct{}, 0), 144 | curBlock: 0, 145 | maxLength: 8, // We have 8 blocks readahead. 146 | }} 147 | idx := bufio.NewReader(index) 148 | format, err := binary.ReadUvarint(idx) 149 | if err != nil { 150 | return nil, err 151 | } 152 | 153 | switch format { 154 | case 1: 155 | err = f.readFormat1(idx) 156 | default: 157 | err = ErrUnknownFormat 158 | } 159 | 160 | go f.seekReader(blocks) 161 | 162 | return f, err 163 | } 164 | 165 | // readFormat1 will read the index of format 1 166 | // and prepare decoding 167 | func (f *reader) readFormat1(idx io.ByteReader) error { 168 | size, err := binary.ReadUvarint(idx) 169 | if err != nil { 170 | return err 171 | } 172 | f.size = int(size) 173 | 174 | // Insert empty block 0 175 | f.blocks = append(f.blocks, nil) 176 | i := 0 177 | var foffset int64 178 | // Read blocks 179 | for { 180 | i++ 181 | offset, err := binary.ReadUvarint(idx) 182 | if err != nil { 183 | return err 184 | } 185 | switch offset { 186 | // new block 187 | case 0: 188 | r, err := binary.ReadUvarint(idx) 189 | if err != nil { 190 | return err 191 | } 192 | if r > size { 193 | return fmt.Errorf("invalid size for block %d, %d > %d", i, r, size) 194 | } 195 | f.blocks = append(f.blocks, &rblock{first: i, last: i, readData: int(size - r), offset: foffset}) 196 | foffset += int64(size - r) 197 | // Last block 198 | case math.MaxUint64: 199 | r, err := binary.ReadUvarint(idx) 200 | if err != nil { 201 | return err 202 | } 203 | if r > size { 204 | return fmt.Errorf("invalid size for block %d, %d > %d", i, r, size) 205 | } 206 | f.blocks = append(f.blocks, &rblock{readData: int(size - r), offset: foffset}) 207 | foffset += int64(size - r) 208 | // Continuation should be 0 209 | r, err = binary.ReadUvarint(idx) 210 | if err != nil { 211 | return err 212 | } 213 | if r != 0 { 214 | return fmt.Errorf("invalid continuation, should be 0, was %d", r) 215 | } 216 | return nil 217 | // Deduplicated block 218 | default: 219 | pos := len(f.blocks) - int(offset) 220 | if pos <= 0 || pos >= len(f.blocks) { 221 | return fmt.Errorf("invalid offset encountered at block %d, offset was %d", len(f.blocks), offset) 222 | } 223 | // Update last position. 224 | org := f.blocks[pos] 225 | org.last = i 226 | f.blocks = append(f.blocks, org) 227 | } 228 | } 229 | } 230 | 231 | // readFormat2 will read the header data of format 2 232 | // and stop at the first block. 233 | func (f *streamReader) readFormat2(rd io.ByteReader) error { 234 | size, err := binary.ReadUvarint(rd) 235 | if err != nil { 236 | return err 237 | } 238 | if size < MinBlockSize { 239 | return ErrSizeTooSmall 240 | } 241 | f.size = int(size) 242 | 243 | maxLength, err := binary.ReadUvarint(rd) 244 | if err != nil { 245 | return err 246 | } 247 | if maxLength < 1 { 248 | return ErrMaxMemoryTooSmall 249 | } 250 | f.maxLength = maxLength 251 | return nil 252 | } 253 | 254 | // Read will read from the input stream and return the 255 | // deduplicated data. 
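// Read fills b completely unless the end of the stream is reached or an
// error occurs; io.EOF is returned once all blocks have been delivered.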
256 | func (f *streamReader) Read(b []byte) (int, error) { 257 | read := 0 258 | for len(b) > 0 { 259 | // Read next 260 | if len(f.curData) == 0 { 261 | f.curBlock++ 262 | next, ok := <-f.ready 263 | if !ok { 264 | return read, io.EOF 265 | } 266 | if next.err != nil { 267 | return read, next.err 268 | } 269 | f.curData = next.data 270 | // We don't want to keep it, if this is the last block 271 | if f.curBlock == next.last { 272 | next.data = nil 273 | } 274 | if len(f.curData) == 0 { 275 | continue 276 | } 277 | } 278 | n := copy(b, f.curData) 279 | read += n 280 | b = b[n:] 281 | f.curData = f.curData[n:] 282 | } 283 | return read, nil 284 | } 285 | 286 | // WriteTo writes data to w until there's no more data to write or when an error occurs. 287 | // The return value n is the number of bytes written. 288 | // Any error encountered during the write is also returned. 289 | func (f *streamReader) WriteTo(w io.Writer) (int64, error) { 290 | written := int64(0) 291 | for { 292 | next, ok := <-f.ready 293 | if !ok { 294 | return written, io.EOF 295 | } 296 | if next.err != nil { 297 | return written, next.err 298 | } 299 | f.curBlock++ 300 | f.curData = next.data 301 | 302 | // We don't want to keep it, if this is the last block 303 | if f.curBlock == next.last { 304 | next.data = nil 305 | } 306 | n, err := w.Write(f.curData) 307 | written += int64(n) 308 | if err != nil { 309 | return written, err 310 | } 311 | } 312 | } 313 | 314 | // MaxMem returns the estimated maximum RAM usage needed to 315 | // unpack this content. 316 | func (f *streamReader) MaxMem() int { 317 | if f.maxLength > 0 { 318 | return int(f.maxLength) * f.size 319 | } 320 | return -1 321 | } 322 | 323 | // MaxMem returns the estimated maximum RAM usage needed to 324 | // unpack this content. 325 | func (f *reader) MaxMem() int { 326 | i := 1 // Current block 327 | curUse := 0 328 | maxUse := 0 329 | for { 330 | b := f.blocks[i] 331 | if b.first == i { 332 | curUse += b.readData 333 | } 334 | if curUse > maxUse { 335 | maxUse = curUse 336 | } 337 | 338 | if b.last == i { 339 | curUse -= b.readData 340 | } 341 | 342 | i++ 343 | // We read them all 344 | if i == len(f.blocks) { 345 | break 346 | } 347 | } 348 | return maxUse 349 | } 350 | 351 | func (f *reader) BlockSizes() []int { 352 | if len(f.blocks) < 2 { 353 | return nil 354 | } 355 | 356 | ret := make([]int, len(f.blocks)-1) 357 | for i, bl := range f.blocks[1:] { 358 | ret[i] = bl.readData 359 | } 360 | return ret 361 | } 362 | 363 | // blockReader will read format 1 blocks and deliver them 364 | // to the ready channel. 365 | // The function will return if the stream is finished, 366 | // or an error occurs 367 | func (f *reader) blockReader(in io.Reader) { 368 | defer close(f.readerClosed) 369 | defer close(f.ready) 370 | 371 | i := 1 // Current block 372 | totalRead := 0 373 | for { 374 | b := f.blocks[i] 375 | // Read it? 
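// Only the first occurrence of a block carries data in the data stream;
// deduplicated entries share the same *rblock, so their data has already
// been read by the time they are reached.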
376 | if len(b.data) != b.readData { 377 | b.data = make([]byte, b.readData) 378 | n, err := io.ReadFull(in, b.data) 379 | if err != nil { 380 | b.err = err 381 | } else if n != b.readData { 382 | b.err = io.ErrUnexpectedEOF 383 | } 384 | totalRead += n 385 | } 386 | // Send or close 387 | select { 388 | case <-f.closeReader: 389 | return 390 | case f.ready <- b: 391 | } 392 | // Exit because of an error 393 | if b.err != nil { 394 | return 395 | } 396 | i++ 397 | // We read them all 398 | if i == len(f.blocks) { 399 | return 400 | } 401 | } 402 | } 403 | 404 | // streamReader will read blocks from a single stream 405 | // and deliver them to the "ready" channel. 406 | // The function will return if an error occurs or 407 | // the stream is finished. 408 | func (f *streamReader) streamReader(stream *bufio.Reader) { 409 | defer close(f.readerClosed) 410 | defer close(f.ready) 411 | 412 | totalRead := 0 413 | 414 | // Create backreference buffers 415 | blocks := make([][]byte, f.maxLength) 416 | for i := range blocks { 417 | blocks[i] = make([]byte, f.size) 418 | } 419 | 420 | i := uint64(1) // Current block 421 | for { 422 | b := &rblock{} 423 | lastBlock := false 424 | 425 | b.err = func() error { 426 | offset, err := binary.ReadUvarint(stream) 427 | if err != nil { 428 | return err 429 | } 430 | // Read it? 431 | if offset == 0 || offset == math.MaxUint64 { 432 | s, err := binary.ReadUvarint(stream) 433 | if err != nil { 434 | return err 435 | } 436 | size := f.size - int(s) 437 | if offset == math.MaxUint64 && size == 0 { 438 | lastBlock = true 439 | return nil 440 | } 441 | if size > f.size || size <= 0 { 442 | return fmt.Errorf("invalid size encountered at block %d, size was %d", i, size) 443 | } 444 | b.data = make([]byte, size) 445 | n, err := io.ReadFull(stream, b.data) 446 | if err != nil { 447 | return err 448 | } else if n != len(b.data) { 449 | return io.ErrUnexpectedEOF 450 | } 451 | totalRead += n 452 | if offset == math.MaxUint64 { 453 | lastBlock = true 454 | } 455 | } else { 456 | if offset > f.maxLength { 457 | return fmt.Errorf("invalid offset encountered at block %d, offset was %d", i, offset) 458 | } 459 | pos := i - offset 460 | if pos <= 0 { 461 | return fmt.Errorf("invalid offset encountered at block %d, offset was %d", i, offset) 462 | } 463 | src := blocks[pos%f.maxLength] 464 | b.data = src 465 | } 466 | 467 | blocks[i%f.maxLength] = b.data 468 | return nil 469 | }() 470 | // Read continuation 471 | if lastBlock { 472 | r, err := binary.ReadUvarint(stream) 473 | if err != nil { 474 | b.err = err 475 | } 476 | if r != 0 { 477 | b.err = fmt.Errorf("invalid continuation, should be 0, was %d", r) 478 | } 479 | } 480 | 481 | // Send or close 482 | select { 483 | case <-f.closeReader: 484 | return 485 | case f.ready <- b: 486 | } 487 | // Exit because of an error 488 | if b.err != nil || lastBlock { 489 | return 490 | } 491 | i++ 492 | } 493 | } 494 | 495 | // seekReader will read format 1 blocks and deliver them 496 | // to the ready channel. 497 | // The function will return if the stream is finished, 498 | // or an error occurs 499 | func (f *reader) seekReader(in io.ReadSeeker) { 500 | defer close(f.readerClosed) 501 | defer close(f.ready) 502 | 503 | i := 1 // Current block 504 | var foffset int64 505 | for { 506 | // Copy b, we are modifying it. 
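// Deduplicated entries point at the same rblock as their first occurrence,
// and the seek reader re-reads every occurrence from the data stream, so it
// works on a copy rather than mutating the shared struct.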
507 | b := *f.blocks[i] 508 | 509 | // Seek to offset if needed, and 510 | if b.offset != foffset { 511 | _, err := in.Seek(b.offset, 0) 512 | if err != nil { 513 | b.err = err 514 | } 515 | } 516 | if b.err == nil { 517 | b.data = make([]byte, b.readData) 518 | n, err := io.ReadFull(in, b.data) 519 | if err != nil { 520 | b.err = err 521 | } else if n != b.readData { 522 | b.err = io.ErrUnexpectedEOF 523 | } 524 | foffset = b.offset + int64(n) 525 | } 526 | 527 | // Always release the memory of this block 528 | b.last = i 529 | 530 | // Send or close 531 | select { 532 | case <-f.closeReader: 533 | return 534 | case f.ready <- &b: 535 | } 536 | // Exit because of an error 537 | if b.err != nil { 538 | return 539 | } 540 | i++ 541 | // We read them all 542 | if i == len(f.blocks) { 543 | return 544 | } 545 | } 546 | } 547 | 548 | // Close the reader and shut down the running goroutines. 549 | func (f *streamReader) Close() error { 550 | select { 551 | case <-f.readerClosed: 552 | case f.closeReader <- struct{}{}: 553 | <-f.readerClosed 554 | } 555 | return nil 556 | } 557 | -------------------------------------------------------------------------------- /reader_test.go: -------------------------------------------------------------------------------- 1 | package dedup_test 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | "testing" 7 | 8 | "io/ioutil" 9 | 10 | "fmt" 11 | 12 | "github.com/klauspost/dedup" 13 | ) 14 | 15 | func TestReader(t *testing.T) { 16 | idx := bytes.Buffer{} 17 | data := bytes.Buffer{} 18 | 19 | const totalinput = 10<<20 + 65 20 | input := getBufferSize(totalinput) 21 | 22 | const size = 64 << 10 23 | b := input.Bytes() 24 | // Create some duplicates 25 | for i := 0; i < 50; i++ { 26 | // Read from 10 first blocks 27 | src := b[(i%10)*size : (i%10)*size+size] 28 | // Write into the following ones 29 | dst := b[(10+i)*size : (i+10)*size+size] 30 | copy(dst, src) 31 | } 32 | input = bytes.NewBuffer(b) 33 | w, err := dedup.NewWriter(&idx, &data, dedup.ModeFixed, size, 0) 34 | if err != nil { 35 | t.Fatal(err) 36 | } 37 | io.Copy(w, input) 38 | err = w.Close() 39 | if err != nil { 40 | t.Fatal(err) 41 | } 42 | 43 | t.Log("Fixed Index size:", idx.Len()) 44 | t.Log("Fixed Data size:", data.Len(), "-", data.Len()*100/totalinput, "%") 45 | 46 | r, err := dedup.NewReader(&idx, &data) 47 | if err != nil { 48 | t.Fatal(err) 49 | } 50 | 51 | t.Log("Maximum estimated memory:", r.MaxMem(), "bytes") 52 | 53 | out, err := ioutil.ReadAll(r) 54 | if err != io.EOF && err != nil { 55 | t.Fatal(err) 56 | } 57 | if len(b) != len(out) { 58 | t.Fatalf("Expected len %d, got %d", len(b), len(out)) 59 | } 60 | if bytes.Compare(b, out) != 0 { 61 | t.Fatal("Output mismatch") 62 | } 63 | err = r.Close() 64 | if err != nil { 65 | t.Fatal(err) 66 | } 67 | blocks := r.BlockSizes() 68 | for _, s := range blocks[:len(blocks)-1] { 69 | if s != size { 70 | t.Fatal("wrong size, expected", size, "got", s) 71 | } 72 | } 73 | } 74 | 75 | func TestReaderStream(t *testing.T) { 76 | data := bytes.Buffer{} 77 | 78 | const totalinput = 10<<20 + 65 79 | input := getBufferSize(totalinput) 80 | 81 | const size = 64 << 10 82 | b := input.Bytes() 83 | // Create some duplicates 84 | for i := 0; i < 50; i++ { 85 | // Read from 10 first blocks 86 | src := b[(i%10)*size : (i%10)*size+size] 87 | // Write into the following ones 88 | dst := b[(10+i)*size : (i+10)*size+size] 89 | copy(dst, src) 90 | } 91 | input = bytes.NewBuffer(b) 92 | w, err := dedup.NewStreamWriter(&data, dedup.ModeFixed, size, 10*size) 93 | if err != nil { 94 | 
t.Fatal(err) 95 | } 96 | io.Copy(w, input) 97 | err = w.Close() 98 | if err != nil { 99 | t.Fatal(err) 100 | } 101 | 102 | t.Log("Fixed Data size:", data.Len(), "-", data.Len()*100/totalinput, "%") 103 | 104 | r, err := dedup.NewStreamReader(&data) 105 | if err != nil { 106 | t.Fatal(err) 107 | } 108 | 109 | t.Log("Maximum estimated memory:", r.MaxMem(), "bytes") 110 | 111 | out, err := ioutil.ReadAll(r) 112 | if err != io.EOF && err != nil { 113 | t.Fatal(err) 114 | } 115 | if len(b) != len(out) { 116 | t.Fatalf("Expected len %d, got %d", len(b), len(out)) 117 | } 118 | if bytes.Compare(b, out) != 0 { 119 | t.Fatal("Output mismatch") 120 | } 121 | err = r.Close() 122 | if err != nil { 123 | t.Fatal(err) 124 | } 125 | } 126 | 127 | func TestSeekReader(t *testing.T) { 128 | idx := bytes.Buffer{} 129 | data := bytes.Buffer{} 130 | 131 | const totalinput = 50<<20 + 65 132 | input := getBufferSize(totalinput) 133 | 134 | const size = 64 << 10 135 | b := input.Bytes() 136 | // Create some duplicates 137 | for i := 0; i < 500; i++ { 138 | // Read from 10 first blocks 139 | src := b[(i%100)*size : (i%100)*size+size] 140 | // Write into the following ones 141 | dst := b[(100+i)*size : (i+100)*size+size] 142 | copy(dst, src) 143 | } 144 | input = bytes.NewBuffer(b) 145 | w, err := dedup.NewWriter(&idx, &data, dedup.ModeFixed, size, 0) 146 | if err != nil { 147 | t.Fatal(err) 148 | } 149 | io.Copy(w, input) 150 | err = w.Close() 151 | if err != nil { 152 | t.Fatal(err) 153 | } 154 | 155 | t.Log("Fixed Index size:", idx.Len()) 156 | t.Log("Fixed Data size:", data.Len(), "-", data.Len()*100/totalinput, "%") 157 | 158 | r, err := dedup.NewSeekReader(&idx, bytes.NewReader(data.Bytes())) 159 | if err != nil { 160 | t.Fatal(err) 161 | } 162 | 163 | t.Log("Maximum estimated memory:", r.MaxMem(), "bytes") 164 | 165 | out, err := ioutil.ReadAll(r) 166 | if err != io.EOF && err != nil { 167 | t.Fatal(err) 168 | } 169 | if len(b) != len(out) { 170 | t.Fatalf("Expected len %d, got %d", len(b), len(out)) 171 | } 172 | if bytes.Compare(b, out) != 0 { 173 | t.Fatal("Output mismatch") 174 | } 175 | err = r.Close() 176 | if err != nil { 177 | t.Fatal(err) 178 | } 179 | } 180 | 181 | func TestDynamicRoundtrip(t *testing.T) { 182 | idx := bytes.Buffer{} 183 | data := bytes.Buffer{} 184 | 185 | const totalinput = 10<<20 + 65 186 | input := getBufferSize(totalinput) 187 | 188 | const size = 64 << 10 189 | b := input.Bytes() 190 | // Create some duplicates 191 | for i := 0; i < 50; i++ { 192 | // Read from 10 first blocks 193 | src := b[(i%10)*size : (i%10)*size+size] 194 | // Write into the following ones 195 | dst := b[(10+i)*size : (i+10)*size+size] 196 | copy(dst, src) 197 | } 198 | input = bytes.NewBuffer(b) 199 | w, err := dedup.NewWriter(&idx, &data, dedup.ModeDynamic, size, 0) 200 | if err != nil { 201 | t.Fatal(err) 202 | } 203 | io.Copy(w, input) 204 | err = w.Close() 205 | if err != nil { 206 | t.Fatal(err) 207 | } 208 | 209 | t.Log("Dynamic Index size:", idx.Len()) 210 | t.Log("Dynamic Data size:", data.Len()) 211 | 212 | r, err := dedup.NewReader(&idx, &data) 213 | if err != nil { 214 | t.Fatal(err) 215 | } 216 | 217 | t.Log("Maximum estimated memory:", r.MaxMem(), "bytes") 218 | blocks := r.BlockSizes() 219 | avg := 0 220 | for _, v := range blocks { 221 | if v > size { 222 | t.Fatal("too big block returned, should not be >", size, "was", v) 223 | } 224 | avg += v 225 | } 226 | t.Log("Average block size:", avg/len(blocks), "bytes") 227 | 228 | out, err := ioutil.ReadAll(r) 229 | if err != io.EOF && err != 
nil { 230 | t.Fatal(err) 231 | } 232 | if len(b) != len(out) { 233 | t.Fatalf("Expected len %d, got %d", len(b), len(out)) 234 | } 235 | if bytes.Compare(b, out) != 0 { 236 | t.Fatal("Output mismatch") 237 | } 238 | err = r.Close() 239 | if err != nil { 240 | t.Fatal(err) 241 | } 242 | } 243 | 244 | func TestReaderWriteTo(t *testing.T) { 245 | idx := bytes.Buffer{} 246 | data := bytes.Buffer{} 247 | 248 | const totalinput = 10<<20 + 65 249 | input := getBufferSize(totalinput) 250 | 251 | const size = 64 << 10 252 | b := input.Bytes() 253 | // Create some duplicates 254 | for i := 0; i < 50; i++ { 255 | // Read from 10 first blocks 256 | src := b[(i%10)*size : (i%10)*size+size] 257 | // Write into the following ones 258 | dst := b[(10+i)*size : (i+10)*size+size] 259 | copy(dst, src) 260 | } 261 | input = bytes.NewBuffer(b) 262 | w, err := dedup.NewWriter(&idx, &data, dedup.ModeFixed, size, 0) 263 | if err != nil { 264 | t.Fatal(err) 265 | } 266 | io.Copy(w, input) 267 | err = w.Close() 268 | if err != nil { 269 | t.Fatal(err) 270 | } 271 | 272 | r, err := dedup.NewReader(&idx, &data) 273 | if err != nil { 274 | t.Fatal(err) 275 | } 276 | 277 | dst := &bytes.Buffer{} 278 | n, err := r.WriteTo(dst) 279 | if err != io.EOF && err != nil { 280 | t.Fatal(err) 281 | } 282 | if len(b) != int(n) { 283 | t.Errorf("Write count, expected n %d, got %d", len(b), n) 284 | } 285 | 286 | out := dst.Bytes() 287 | if len(b) != len(out) { 288 | t.Fatalf("Expected len %d, got %d", len(b), len(out)) 289 | } 290 | if len(b) != len(out) { 291 | t.Fatalf("Expected len %d, got %d", len(b), len(out)) 292 | } 293 | if bytes.Compare(b, out) != 0 { 294 | t.Fatal("Output mismatch") 295 | } 296 | err = r.Close() 297 | if err != nil { 298 | t.Fatal(err) 299 | } 300 | blocks := r.BlockSizes() 301 | for _, s := range blocks[:len(blocks)-1] { 302 | if s != size { 303 | t.Fatal("wrong size, expected", size, "got", s) 304 | } 305 | } 306 | } 307 | 308 | // Indexed stream, 10MB input, 64K blocks 309 | func BenchmarkReader64K(t *testing.B) { 310 | idx := &bytes.Buffer{} 311 | data := &bytes.Buffer{} 312 | 313 | const totalinput = 10 << 20 314 | input := getBufferSize(totalinput) 315 | 316 | const size = 64 << 10 317 | b := input.Bytes() 318 | // Create some duplicates 319 | for i := 0; i < 50; i++ { 320 | // Read from 10 first blocks 321 | src := b[(i%10)*size : (i%10)*size+size] 322 | // Write into the following ones 323 | dst := b[(10+i)*size : (i+10)*size+size] 324 | copy(dst, src) 325 | } 326 | input = bytes.NewBuffer(b) 327 | w, err := dedup.NewWriter(idx, data, dedup.ModeFixed, size, 0) 328 | if err != nil { 329 | t.Fatal(err) 330 | } 331 | _, err = io.Copy(w, input) 332 | if err != nil { 333 | t.Fatal(err) 334 | } 335 | err = w.Close() 336 | if err != nil { 337 | t.Fatal(err) 338 | } 339 | 340 | index := idx.Bytes() 341 | alldata := data.Bytes() 342 | 343 | t.ResetTimer() 344 | t.SetBytes(totalinput) 345 | for i := 0; i < t.N; i++ { 346 | idx = bytes.NewBuffer(index) 347 | data = bytes.NewBuffer(alldata) 348 | r, err := dedup.NewReader(idx, data) 349 | if err != nil { 350 | t.Fatal(err) 351 | } 352 | n, err := io.Copy(ioutil.Discard, r) 353 | if err != nil && err != io.EOF { 354 | t.Fatal(err) 355 | } 356 | if n != int64(len(b)) { 357 | t.Fatal("read was short, expected", len(b), "was", n) 358 | } 359 | err = r.Close() 360 | if err != nil { 361 | t.Fatal(err) 362 | } 363 | } 364 | } 365 | 366 | // Indexed stream, 10MB input, 4K blocks 367 | func BenchmarkReader4K(t *testing.B) { 368 | idx := &bytes.Buffer{} 369 | data := 
&bytes.Buffer{} 370 | 371 | const totalinput = 10 << 20 372 | input := getBufferSize(totalinput) 373 | 374 | const size = 4 << 10 375 | b := input.Bytes() 376 | // Create some duplicates 377 | for i := 0; i < 500; i++ { 378 | // Read from 10 first blocks 379 | src := b[(i%10)*size : (i%10)*size+size] 380 | // Write into the following ones 381 | dst := b[(10+i)*size : (i+10)*size+size] 382 | copy(dst, src) 383 | } 384 | input = bytes.NewBuffer(b) 385 | w, err := dedup.NewWriter(idx, data, dedup.ModeFixed, size, 0) 386 | if err != nil { 387 | t.Fatal(err) 388 | } 389 | _, err = io.Copy(w, input) 390 | if err != nil { 391 | t.Fatal(err) 392 | } 393 | err = w.Close() 394 | if err != nil { 395 | t.Fatal(err) 396 | } 397 | 398 | index := idx.Bytes() 399 | alldata := data.Bytes() 400 | 401 | t.ResetTimer() 402 | t.SetBytes(totalinput) 403 | for i := 0; i < t.N; i++ { 404 | idx := bytes.NewBuffer(index) 405 | data := bytes.NewBuffer(alldata) 406 | r, err := dedup.NewReader(idx, data) 407 | if err != nil { 408 | t.Fatal(err) 409 | } 410 | n, err := io.Copy(ioutil.Discard, r) 411 | if err != nil && err != io.EOF { 412 | t.Fatal(err) 413 | } 414 | if n != int64(len(b)) { 415 | t.Fatal("read was short, expected", len(b), "was", n) 416 | } 417 | err = r.Close() 418 | if err != nil { 419 | t.Fatal(err) 420 | } 421 | } 422 | } 423 | 424 | // Indexed stream, 10MB input, 1K blocks 425 | func BenchmarkReader1K(t *testing.B) { 426 | idx := &bytes.Buffer{} 427 | data := &bytes.Buffer{} 428 | 429 | const totalinput = 10 << 20 430 | input := getBufferSize(totalinput) 431 | 432 | const size = 1 << 10 433 | b := input.Bytes() 434 | // Create some duplicates 435 | for i := 0; i < 500; i++ { 436 | // Read from 10 first blocks 437 | src := b[(i%10)*size : (i%10)*size+size] 438 | // Write into the following ones 439 | dst := b[(10+i)*size : (i+10)*size+size] 440 | copy(dst, src) 441 | } 442 | input = bytes.NewBuffer(b) 443 | w, err := dedup.NewWriter(idx, data, dedup.ModeFixed, size, 0) 444 | if err != nil { 445 | t.Fatal(err) 446 | } 447 | _, err = io.Copy(w, input) 448 | if err != nil { 449 | t.Fatal(err) 450 | } 451 | err = w.Close() 452 | if err != nil { 453 | t.Fatal(err) 454 | } 455 | 456 | index := idx.Bytes() 457 | alldata := data.Bytes() 458 | 459 | t.ResetTimer() 460 | t.SetBytes(totalinput) 461 | for i := 0; i < t.N; i++ { 462 | idx := bytes.NewBuffer(index) 463 | data := bytes.NewBuffer(alldata) 464 | r, err := dedup.NewReader(idx, data) 465 | if err != nil { 466 | t.Fatal(err) 467 | } 468 | n, err := io.Copy(ioutil.Discard, r) 469 | if err != nil && err != io.EOF { 470 | t.Fatal(err) 471 | } 472 | if n != int64(len(b)) { 473 | t.Fatal("read was short, expected", len(b), "was", n) 474 | } 475 | err = r.Close() 476 | if err != nil { 477 | t.Fatal(err) 478 | } 479 | } 480 | } 481 | 482 | // Stream, 64K blocks on 10MB data. 
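// The stream writer below is given a memory limit of 100*size bytes,
// i.e. a backreference window of roughly 100 blocks.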
483 | func BenchmarkReaderStream64K(t *testing.B) { 484 | data := &bytes.Buffer{} 485 | 486 | const totalinput = 10 << 20 487 | input := getBufferSize(totalinput) 488 | 489 | const size = 64 << 10 490 | b := input.Bytes() 491 | // Create some duplicates 492 | for i := 0; i < 50; i++ { 493 | // Read from 10 first blocks 494 | src := b[(i%10)*size : (i%10)*size+size] 495 | // Write into the following ones 496 | dst := b[(10+i)*size : (i+10)*size+size] 497 | copy(dst, src) 498 | } 499 | input = bytes.NewBuffer(b) 500 | w, err := dedup.NewStreamWriter(data, dedup.ModeFixed, size, 100*size) 501 | if err != nil { 502 | t.Fatal(err) 503 | } 504 | io.Copy(w, input) 505 | err = w.Close() 506 | if err != nil { 507 | t.Fatal(err) 508 | } 509 | 510 | alldata := data.Bytes() 511 | 512 | t.ResetTimer() 513 | t.SetBytes(totalinput) 514 | for i := 0; i < t.N; i++ { 515 | input := bytes.NewBuffer(alldata) 516 | r, err := dedup.NewStreamReader(input) 517 | if err != nil { 518 | t.Fatal(err) 519 | } 520 | 521 | n, err := io.Copy(ioutil.Discard, r) 522 | if err != io.EOF && err != nil { 523 | t.Fatal(err) 524 | } 525 | if len(b) != int(n) { 526 | t.Fatalf("Expected len %d, got %d", len(b), n) 527 | } 528 | err = r.Close() 529 | if err != nil { 530 | t.Fatal(err) 531 | } 532 | } 533 | } 534 | 535 | // Stream, 4K blocks on 10MB data. 536 | func BenchmarkReaderStream4K(t *testing.B) { 537 | data := &bytes.Buffer{} 538 | 539 | const totalinput = 10 << 20 540 | input := getBufferSize(totalinput) 541 | 542 | const size = 4 << 10 543 | b := input.Bytes() 544 | // Create some duplicates 545 | for i := 0; i < 100; i++ { 546 | // Read from 10 first blocks 547 | src := b[(i%10)*size : (i%10)*size+size] 548 | // Write into the following ones 549 | dst := b[(10+i)*size : (i+10)*size+size] 550 | copy(dst, src) 551 | } 552 | input = bytes.NewBuffer(b) 553 | w, err := dedup.NewStreamWriter(data, dedup.ModeFixed, size, 100*size) 554 | if err != nil { 555 | t.Fatal(err) 556 | } 557 | io.Copy(w, input) 558 | err = w.Close() 559 | if err != nil { 560 | t.Fatal(err) 561 | } 562 | 563 | alldata := data.Bytes() 564 | 565 | t.ResetTimer() 566 | t.SetBytes(totalinput) 567 | for i := 0; i < t.N; i++ { 568 | input := bytes.NewBuffer(alldata) 569 | r, err := dedup.NewStreamReader(input) 570 | if err != nil { 571 | t.Fatal(err) 572 | } 573 | 574 | n, err := io.Copy(ioutil.Discard, r) 575 | if err != io.EOF && err != nil { 576 | t.Fatal(err) 577 | } 578 | if len(b) != int(n) { 579 | t.Fatalf("Expected len %d, got %d", len(b), n) 580 | } 581 | err = r.Close() 582 | if err != nil { 583 | t.Fatal(err) 584 | } 585 | } 586 | } 587 | 588 | // Stream, 1K blocks on 10MB data. 
589 | func BenchmarkReaderStream1K(t *testing.B) { 590 | data := &bytes.Buffer{} 591 | 592 | const totalinput = 10 << 20 593 | input := getBufferSize(totalinput) 594 | 595 | const size = 1 << 10 596 | b := input.Bytes() 597 | // Create some duplicates 598 | for i := 0; i < 500; i++ { 599 | // Read from 10 first blocks 600 | src := b[(i%10)*size : (i%10)*size+size] 601 | // Write into the following ones 602 | dst := b[(10+i)*size : (i+10)*size+size] 603 | copy(dst, src) 604 | } 605 | input = bytes.NewBuffer(b) 606 | w, err := dedup.NewStreamWriter(data, dedup.ModeFixed, size, 100*size) 607 | if err != nil { 608 | t.Fatal(err) 609 | } 610 | io.Copy(w, input) 611 | err = w.Close() 612 | if err != nil { 613 | t.Fatal(err) 614 | } 615 | 616 | alldata := data.Bytes() 617 | 618 | t.ResetTimer() 619 | t.SetBytes(totalinput) 620 | for i := 0; i < t.N; i++ { 621 | input := bytes.NewBuffer(alldata) 622 | r, err := dedup.NewStreamReader(input) 623 | if err != nil { 624 | t.Fatal(err) 625 | } 626 | 627 | n, err := io.Copy(ioutil.Discard, r) 628 | if err != io.EOF && err != nil { 629 | t.Fatal(err) 630 | } 631 | if len(b) != int(n) { 632 | t.Fatalf("Expected len %d, got %d", len(b), n) 633 | } 634 | err = r.Close() 635 | if err != nil { 636 | t.Fatal(err) 637 | } 638 | } 639 | } 640 | 641 | // This will deduplicate a buffer of zeros to an indexed stream 642 | func ExampleNewReader() { 643 | // Create data we can read. 644 | var idx, data bytes.Buffer 645 | input := bytes.NewBuffer(make([]byte, 50000)) 646 | w, _ := dedup.NewWriter(&idx, &data, dedup.ModeFixed, 1000, 0) 647 | _, _ = io.Copy(w, input) 648 | _ = w.Close() 649 | 650 | // Create a new reader. 651 | r, err := dedup.NewReader(&idx, &data) 652 | if err != nil { 653 | panic(err) 654 | } 655 | 656 | // Inspect how much memory it will use. 657 | fmt.Println("Memory use:", r.MaxMem()) 658 | 659 | var dst bytes.Buffer 660 | 661 | // Read everything 662 | _, err = io.Copy(&dst, r) 663 | if err != nil && err != io.EOF { 664 | panic(err) 665 | } 666 | 667 | // Let us inspect what was written: 668 | fmt.Println("Returned data length:", dst.Len()) 669 | fmt.Println("Everything zero:", 0 == bytes.Compare(dst.Bytes(), make([]byte, 50000))) 670 | 671 | // OUTPUT: Memory use: 1000 672 | // Returned data length: 50000 673 | // Everything zero: true 674 | } 675 | 676 | // This will deduplicate a buffer of zeros to an indexed stream 677 | func ExampleNewStreamReader() { 678 | // Create data we can read. 679 | var data bytes.Buffer 680 | input := bytes.NewBuffer(make([]byte, 50000)) 681 | // Set the memory limit to 10000 bytes 682 | w, _ := dedup.NewStreamWriter(&data, dedup.ModeFixed, 1000, 10000) 683 | _, _ = io.Copy(w, input) 684 | _ = w.Close() 685 | 686 | // Create a new stream reader: 687 | r, err := dedup.NewStreamReader(&data) 688 | if err != nil { 689 | panic(err) 690 | } 691 | 692 | // Inspect how much memory it will use. 
693 | // Since this is a stream, it will print the worst possible scenario 694 | fmt.Println("Memory use:", r.MaxMem()) 695 | 696 | var dst bytes.Buffer 697 | 698 | // Read everything 699 | _, err = io.Copy(&dst, r) 700 | if err != nil && err != io.EOF { 701 | panic(err) 702 | } 703 | 704 | // Let us inspect what was written: 705 | fmt.Println("Returned data length:", dst.Len()) 706 | fmt.Println("Everything zero:", 0 == bytes.Compare(dst.Bytes(), make([]byte, 50000))) 707 | 708 | // OUTPUT: Memory use: 10000 709 | // Returned data length: 50000 710 | // Everything zero: true 711 | } 712 | -------------------------------------------------------------------------------- /sort/hashsort.go: -------------------------------------------------------------------------------- 1 | package sort 2 | 3 | // Adapted from https://github.com/AlasdairF/Sort/tree/master/Int 4 | // - no LICENSE, see https://github.com/AlasdairF/Sort/issues/1 5 | // ================= COMMON ================= 6 | 7 | func min(a, b int) int { 8 | if a < b { 9 | return a 10 | } 11 | return b 12 | } 13 | 14 | // ------------- ASCENDING ------------- 15 | 16 | func heapSortAsc(data []int, a, b int) { 17 | first := a 18 | lo := 0 19 | hi := b - a 20 | for i := (hi - 1) / 2; i >= 0; i-- { 21 | siftDownAsc(data, i, hi, first) 22 | } 23 | for i := hi - 1; i >= 0; i-- { 24 | data[first], data[first+i] = data[first+i], data[first] 25 | siftDownAsc(data, lo, i, first) 26 | } 27 | } 28 | 29 | func insertionSortAsc(data []int, a, b int) { 30 | var j int 31 | for i := a + 1; i < b; i++ { 32 | for j = i; j > a && data[j] < data[j-1]; j-- { 33 | data[j], data[j-1] = data[j-1], data[j] 34 | } 35 | } 36 | } 37 | 38 | func siftDownAsc(data []int, lo, hi, first int) { 39 | root := lo 40 | for { 41 | child := 2*root + 1 42 | if child >= hi { 43 | break 44 | } 45 | if child+1 < hi && data[first+child] < data[first+child+1] { 46 | child++ 47 | } 48 | if data[first+root] >= data[first+child] { 49 | return 50 | } 51 | data[first+root], data[first+child] = data[first+child], data[first+root] 52 | root = child 53 | } 54 | } 55 | 56 | func medianOfThreeAsc(data []int, m1, m0, m2 int) { 57 | // bubble sort on 3 elements 58 | if data[m1] < data[m0] { 59 | data[m1], data[m0] = data[m0], data[m1] 60 | } 61 | if data[m2] < data[m1] { 62 | data[m2], data[m1] = data[m1], data[m2] 63 | } 64 | if data[m1] < data[m0] { 65 | data[m1], data[m0] = data[m0], data[m1] 66 | } 67 | } 68 | 69 | func swapRangeAsc(data []int, a, b, n int) { 70 | for i := 0; i < n; i++ { 71 | data[a], data[b] = data[b], data[a] 72 | a++ 73 | b++ 74 | } 75 | } 76 | 77 | func doPivotAsc(data []int, lo, hi int) (midlo, midhi int) { 78 | m := lo + (hi-lo)/2 79 | if hi-lo > 40 { 80 | s := (hi - lo) / 8 81 | medianOfThreeAsc(data, lo, lo+s, lo+2*s) 82 | medianOfThreeAsc(data, m, m-s, m+s) 83 | medianOfThreeAsc(data, hi-1, hi-1-s, hi-1-2*s) 84 | } 85 | medianOfThreeAsc(data, lo, m, hi-1) 86 | 87 | pivot := lo 88 | a, b, c, d := lo+1, lo+1, hi, hi 89 | for { 90 | for b < c { 91 | if data[b] < data[pivot] { 92 | b++ 93 | } else if data[pivot] >= data[b] { 94 | data[a], data[b] = data[b], data[a] 95 | a++ 96 | b++ 97 | } else { 98 | break 99 | } 100 | } 101 | for b < c { 102 | if data[pivot] < data[c-1] { 103 | c-- 104 | } else if data[c-1] >= data[pivot] { 105 | data[c-1], data[d-1] = data[d-1], data[c-1] 106 | c-- 107 | d-- 108 | } else { 109 | break 110 | } 111 | } 112 | if b >= c { 113 | break 114 | } 115 | data[b], data[c-1] = data[c-1], data[b] 116 | b++ 117 | c-- 118 | } 119 | 120 | n := 
min(b-a, a-lo) 121 | swapRangeAsc(data, lo, b-n, n) 122 | 123 | n = min(hi-d, d-c) 124 | swapRangeAsc(data, c, hi-n, n) 125 | 126 | return lo + b - a, hi - (d - c) 127 | } 128 | 129 | func quickSortAsc(data []int, a, b, maxDepth int) { 130 | var mlo, mhi int 131 | for b-a > 7 { 132 | if maxDepth == 0 { 133 | heapSortAsc(data, a, b) 134 | return 135 | } 136 | maxDepth-- 137 | mlo, mhi = doPivotAsc(data, a, b) 138 | if mlo-a < b-mhi { 139 | quickSortAsc(data, a, mlo, maxDepth) 140 | a = mhi 141 | } else { 142 | quickSortAsc(data, mhi, b, maxDepth) 143 | b = mlo 144 | } 145 | } 146 | if b-a > 1 { 147 | insertionSortAsc(data, a, b) 148 | } 149 | } 150 | 151 | func Asc(data []int) { 152 | maxDepth := 0 153 | for i := len(data); i > 0; i >>= 1 { 154 | maxDepth++ 155 | } 156 | maxDepth *= 2 157 | quickSortAsc(data, 0, len(data), maxDepth) 158 | } 159 | 160 | func IsSortedAsc(data []int) bool { 161 | for i := len(data) - 1; i > 0; i-- { 162 | if data[i] < data[i-1] { 163 | return false 164 | } 165 | } 166 | return true 167 | } 168 | 169 | func StableAsc(data []int) { 170 | n := len(data) 171 | blockSize := 20 172 | a, b := 0, blockSize 173 | for b <= n { 174 | insertionSortAsc(data, a, b) 175 | a = b 176 | b += blockSize 177 | } 178 | insertionSortAsc(data, a, n) 179 | 180 | for blockSize < n { 181 | a, b = 0, 2*blockSize 182 | for b <= n { 183 | symMergeAsc(data, a, a+blockSize, b) 184 | a = b 185 | b += 2 * blockSize 186 | } 187 | symMergeAsc(data, a, a+blockSize, n) 188 | blockSize *= 2 189 | } 190 | } 191 | 192 | func symMergeAsc(data []int, a, m, b int) { 193 | if a >= m || m >= b { 194 | return 195 | } 196 | mid := a + (b-a)/2 197 | n := mid + m 198 | var start, c, r, p int 199 | if m > mid { 200 | start = n - b 201 | r, p = mid, n-1 202 | for start < r { 203 | c = start + (r-start)/2 204 | if data[p-c] >= data[c] { 205 | start = c + 1 206 | } else { 207 | r = c 208 | } 209 | } 210 | } else { 211 | start = a 212 | r, p = m, n-1 213 | for start < r { 214 | c = start + (r-start)/2 215 | if data[p-c] >= data[c] { 216 | start = c + 1 217 | } else { 218 | r = c 219 | } 220 | } 221 | } 222 | end := n - start 223 | rotateAsc(data, start, m, end) 224 | symMergeAsc(data, a, start, mid) 225 | symMergeAsc(data, mid, end, b) 226 | } 227 | 228 | func rotateAsc(data []int, a, m, b int) { 229 | i := m - a 230 | if i == 0 { 231 | return 232 | } 233 | j := b - m 234 | if j == 0 { 235 | return 236 | } 237 | if i == j { 238 | swapRangeAsc(data, a, m, i) 239 | return 240 | } 241 | p := a + i 242 | for i != j { 243 | if i > j { 244 | swapRangeAsc(data, p-i, p, j) 245 | i -= j 246 | } else { 247 | swapRangeAsc(data, p-i, p+j-i, i) 248 | j -= i 249 | } 250 | } 251 | swapRangeAsc(data, p-i, p, i) 252 | } 253 | 254 | // ------------- DESCENDING ------------- 255 | 256 | func heapSortDesc(data []int, a, b int) { 257 | first := a 258 | lo := 0 259 | hi := b - a 260 | for i := (hi - 1) / 2; i >= 0; i-- { 261 | siftDownDesc(data, i, hi, first) 262 | } 263 | for i := hi - 1; i >= 0; i-- { 264 | data[first], data[first+i] = data[first+i], data[first] 265 | siftDownDesc(data, lo, i, first) 266 | } 267 | } 268 | 269 | func insertionSortDesc(data []int, a, b int) { 270 | var j int 271 | for i := a + 1; i < b; i++ { 272 | for j = i; j > a && data[j] > data[j-1]; j-- { 273 | data[j], data[j-1] = data[j-1], data[j] 274 | } 275 | } 276 | } 277 | 278 | func siftDownDesc(data []int, lo, hi, first int) { 279 | root := lo 280 | for { 281 | child := 2*root + 1 282 | if child >= hi { 283 | break 284 | } 285 | if child+1 < hi && 
data[first+child] > data[first+child+1] { 286 | child++ 287 | } 288 | if data[first+root] <= data[first+child] { 289 | return 290 | } 291 | data[first+root], data[first+child] = data[first+child], data[first+root] 292 | root = child 293 | } 294 | } 295 | 296 | func medianOfThreeDesc(data []int, m1, m0, m2 int) { 297 | // bubble sort on 3 elements 298 | if data[m1] > data[m0] { 299 | data[m1], data[m0] = data[m0], data[m1] 300 | } 301 | if data[m2] > data[m1] { 302 | data[m2], data[m1] = data[m1], data[m2] 303 | } 304 | if data[m1] > data[m0] { 305 | data[m1], data[m0] = data[m0], data[m1] 306 | } 307 | } 308 | 309 | func swapRangeDesc(data []int, a, b, n int) { 310 | for i := 0; i < n; i++ { 311 | data[a], data[b] = data[b], data[a] 312 | a++ 313 | b++ 314 | } 315 | } 316 | 317 | func doPivotDesc(data []int, lo, hi int) (midlo, midhi int) { 318 | m := lo + (hi-lo)/2 319 | if hi-lo > 40 { 320 | s := (hi - lo) / 8 321 | medianOfThreeDesc(data, lo, lo+s, lo+2*s) 322 | medianOfThreeDesc(data, m, m-s, m+s) 323 | medianOfThreeDesc(data, hi-1, hi-1-s, hi-1-2*s) 324 | } 325 | medianOfThreeDesc(data, lo, m, hi-1) 326 | 327 | pivot := lo 328 | a, b, c, d := lo+1, lo+1, hi, hi 329 | for { 330 | for b < c { 331 | if data[b] > data[pivot] { 332 | b++ 333 | } else if data[pivot] <= data[b] { 334 | data[a], data[b] = data[b], data[a] 335 | a++ 336 | b++ 337 | } else { 338 | break 339 | } 340 | } 341 | for b < c { 342 | if data[pivot] > data[c-1] { 343 | c-- 344 | } else if data[c-1] <= data[pivot] { 345 | data[c-1], data[d-1] = data[d-1], data[c-1] 346 | c-- 347 | d-- 348 | } else { 349 | break 350 | } 351 | } 352 | if b >= c { 353 | break 354 | } 355 | data[b], data[c-1] = data[c-1], data[b] 356 | b++ 357 | c-- 358 | } 359 | 360 | n := min(b-a, a-lo) 361 | swapRangeDesc(data, lo, b-n, n) 362 | 363 | n = min(hi-d, d-c) 364 | swapRangeDesc(data, c, hi-n, n) 365 | 366 | return lo + b - a, hi - (d - c) 367 | } 368 | 369 | func quickSortDesc(data []int, a, b, maxDepth int) { 370 | var mlo, mhi int 371 | for b-a > 7 { 372 | if maxDepth == 0 { 373 | heapSortDesc(data, a, b) 374 | return 375 | } 376 | maxDepth-- 377 | mlo, mhi = doPivotDesc(data, a, b) 378 | if mlo-a < b-mhi { 379 | quickSortDesc(data, a, mlo, maxDepth) 380 | a = mhi 381 | } else { 382 | quickSortDesc(data, mhi, b, maxDepth) 383 | b = mlo 384 | } 385 | } 386 | if b-a > 1 { 387 | insertionSortDesc(data, a, b) 388 | } 389 | } 390 | 391 | func Desc(data []int) { 392 | maxDepth := 0 393 | for i := len(data); i > 0; i >>= 1 { 394 | maxDepth++ 395 | } 396 | maxDepth *= 2 397 | quickSortDesc(data, 0, len(data), maxDepth) 398 | } 399 | 400 | func IsSortedDesc(data []int) bool { 401 | for i := len(data) - 1; i > 0; i-- { 402 | if data[i] > data[i-1] { 403 | return false 404 | } 405 | } 406 | return true 407 | } 408 | 409 | func StableDesc(data []int) { 410 | n := len(data) 411 | blockSize := 20 412 | a, b := 0, blockSize 413 | for b <= n { 414 | insertionSortDesc(data, a, b) 415 | a = b 416 | b += blockSize 417 | } 418 | insertionSortDesc(data, a, n) 419 | 420 | for blockSize < n { 421 | a, b = 0, 2*blockSize 422 | for b <= n { 423 | symMergeDesc(data, a, a+blockSize, b) 424 | a = b 425 | b += 2 * blockSize 426 | } 427 | symMergeDesc(data, a, a+blockSize, n) 428 | blockSize *= 2 429 | } 430 | } 431 | 432 | func symMergeDesc(data []int, a, m, b int) { 433 | if a >= m || m >= b { 434 | return 435 | } 436 | mid := a + (b-a)/2 437 | n := mid + m 438 | var start, c, r, p int 439 | if m > mid { 440 | start = n - b 441 | r, p = mid, n-1 442 | for start < r { 
443 | c = start + (r-start)/2 444 | if data[p-c] < data[c] { 445 | start = c + 1 446 | } else { 447 | r = c 448 | } 449 | } 450 | } else { 451 | start = a 452 | r, p = m, n-1 453 | for start < r { 454 | c = start + (r-start)/2 455 | if data[p-c] < data[c] { 456 | start = c + 1 457 | } else { 458 | r = c 459 | } 460 | } 461 | } 462 | end := n - start 463 | rotateDesc(data, start, m, end) 464 | symMergeDesc(data, a, start, mid) 465 | symMergeDesc(data, mid, end, b) 466 | } 467 | 468 | func rotateDesc(data []int, a, m, b int) { 469 | i := m - a 470 | if i == 0 { 471 | return 472 | } 473 | j := b - m 474 | if j == 0 { 475 | return 476 | } 477 | if i == j { 478 | swapRangeDesc(data, a, m, i) 479 | return 480 | } 481 | p := a + i 482 | for i != j { 483 | if i > j { 484 | swapRangeDesc(data, p-i, p, j) 485 | i -= j 486 | } else { 487 | swapRangeDesc(data, p-i, p+j-i, i) 488 | j -= i 489 | } 490 | } 491 | swapRangeDesc(data, p-i, p, i) 492 | } 493 | -------------------------------------------------------------------------------- /testdata/sampledata.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klauspost/dedup/ebb83049c16285be6f30dd296301e32324b4eeab/testdata/sampledata.zip -------------------------------------------------------------------------------- /writer.go: -------------------------------------------------------------------------------- 1 | package dedup 2 | 3 | import ( 4 | "bytes" 5 | hasher "crypto/sha1" 6 | "encoding/binary" 7 | "errors" 8 | "fmt" 9 | "io" 10 | "math" 11 | "math/big" 12 | "runtime" 13 | "sync" 14 | 15 | "github.com/klauspost/dedup/sort" 16 | ) 17 | 18 | type Writer interface { 19 | io.WriteCloser 20 | 21 | // Split content, so a new block begins with next write. 22 | Split() 23 | 24 | // MemUse returns an approximate maximum memory use in bytes for 25 | // encoder (Writer) and decoder (Reader) for the given number of bytes. 26 | MemUse(bytes int) (encoder, decoder int64) 27 | 28 | // Returns the current number of blocks. 29 | // Blocks may still be processing. 30 | Blocks() int 31 | } 32 | 33 | // Size of the underlying hash in bytes for those interested. 34 | const HashSize = hasher.Size 35 | 36 | // The smallest "maximum" block size allowed. 37 | const MinBlockSize = 512 38 | 39 | // ErrMaxMemoryTooSmall is returned if the encoder isn't allowed to store 40 | // even 1 block. 41 | var ErrMaxMemoryTooSmall = errors.New("there must be at be space for 1 block") 42 | 43 | // Deduplication mode used to determine how input is split. 44 | type Mode int 45 | 46 | const ( 47 | // Fixed block size 48 | // 49 | // This is by far the fastest mode, and checks for duplicates 50 | // In fixed block sizes. 51 | // It can be helpful to use the "Split" function to reset offset, which 52 | // will reset duplication search at the position you are at. 53 | ModeFixed Mode = 0 54 | 55 | // Dynamic block size. 56 | // 57 | // This mode will create a deduplicator that will split the contents written 58 | // to it into dynamically sized blocks. 59 | // The size given indicates the maximum block size. Average size is usually maxSize/4. 60 | // Minimum block size is maxSize/64. 61 | ModeDynamic = 1 62 | 63 | // Dynamic block size. 64 | // 65 | // This mode will create a deduplicator that will split the contents written 66 | // to it into dynamically sized blocks. 67 | // The size given indicates the maximum block size. Average size is usually maxSize/4. 68 | // Minimum block size is maxSize/64. 
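// Unlike ModeDynamic, break points in this mode are chosen from a byte-frequency histogram of the current fragment rather than an order-1 byte predictor (see newEntropyWriter).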
69 | ModeDynamicEntropy = 2 70 | ) 71 | 72 | // Fragment is a file fragment. 73 | // It is the data returned by the NewSplitter. 74 | type Fragment struct { 75 | Hash [HashSize]byte // Hash of the fragment 76 | Payload []byte // Data of the fragment. 77 | New bool // Will be true if the data hasn't been encountered before. 78 | N uint // Sequentially incrementing number for each segment. 79 | } 80 | 81 | type writer struct { 82 | blks io.Writer // Block data writer 83 | idx io.Writer // Index writer 84 | frags chan<- Fragment // Fragment output 85 | maxSize int // Maximum block size 86 | maxBlocks int // Maximum backreference distance 87 | index map[[hasher.Size]byte]int // Known hashes and their index 88 | input chan *block // Channel containing blocks to be hashed 89 | write chan *block // Channel containing (ordered) blocks to be written 90 | exited chan struct{} // Closed when the writer exits. 91 | cur []byte // Current block being written 92 | off int // Write offset in current block 93 | buffers chan *block // Buffers ready for re-use. 94 | vari64 []byte // Temporary buffer for writing varints 95 | err error // Error state 96 | mu sync.Mutex // Mutex for error state 97 | nblocks int // Current block number. First block is 1. 98 | writer func(*writer, []byte) (int, error) // Writes are forwarded here. 99 | flush func(*writer) error // Called from Close *before* the writer is closed. 100 | close func(*writer) error // Called from Close *after* the writer is closed. 101 | split func(*writer) // Called when Split is called. 102 | } 103 | 104 | // block contains information about a single block 105 | type block struct { 106 | data []byte 107 | sha1Hash [hasher.Size]byte 108 | hashDone chan error 109 | N int 110 | } 111 | 112 | // ErrSizeTooSmall is returned if the requested block size is smaller than 113 | // MinBlockSize (512 bytes). 114 | var ErrSizeTooSmall = errors.New("maximum block size too small. must be at least 512 bytes") 115 | 116 | // NewWriter will create a deduplicator that will split the contents written 117 | // to it into blocks and de-duplicate these. 118 | // 119 | // The output is delivered as two streams, an index stream and a block stream. 120 | // 121 | // The index stream will contain information about which blocks are deduplicated 122 | // and the block stream will contain uncompressed data blocks. 123 | // 124 | // You can set the maximum memory for the decoder to use. 125 | // This limits the length a match can be made. 126 | // This is very conservative, so you can set this at the absolute limit of memory available. 127 | // If you use dynamic blocks, also note that the average size is 1/4th of the maximum block size. 128 | // Set maxMemory to 0 to disable decoder memory limit. 129 | // 130 | // This function returns data that is compatible with the NewReader function. 131 | // The returned writer must be closed to flush the remaining data. 132 | func NewWriter(index io.Writer, blocks io.Writer, mode Mode, maxSize, maxMemory uint) (Writer, error) { 133 | ncpu := runtime.GOMAXPROCS(0) 134 | // For small block sizes we need to keep a pretty big buffer to keep input fed. 135 | // Constant below appears to be sweet spot measured with 4K blocks.
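// bufmul below scales the channel capacities and the buffer pool so that roughly 256 KiB of input per CPU (and never fewer than two blocks) can be in flight regardless of block size.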
136 | var bufmul = 256 << 10 / int(maxSize) 137 | if bufmul < 2 { 138 | bufmul = 2 139 | } 140 | 141 | w := &writer{ 142 | blks: blocks, 143 | idx: index, 144 | maxSize: int(maxSize), 145 | index: make(map[[hasher.Size]byte]int), 146 | input: make(chan *block, ncpu*bufmul), 147 | write: make(chan *block, ncpu*bufmul), 148 | exited: make(chan struct{}, 0), 149 | cur: make([]byte, maxSize), 150 | vari64: make([]byte, binary.MaxVarintLen64), 151 | buffers: make(chan *block, ncpu*bufmul), 152 | nblocks: 1, 153 | maxBlocks: int(maxMemory / maxSize), 154 | } 155 | 156 | switch mode { 157 | case ModeFixed: 158 | fw := &fixedWriter{} 159 | w.writer = fw.write 160 | w.split = fw.split 161 | case ModeDynamic: 162 | zw := newZpaqWriter(maxSize) 163 | w.writer = zw.write 164 | w.split = zw.split 165 | case ModeDynamicEntropy: 166 | zw := newEntropyWriter(maxSize) 167 | w.writer = zw.write 168 | w.split = zw.split 169 | default: 170 | return nil, fmt.Errorf("dedup: unknown mode") 171 | } 172 | 173 | if w.maxSize < MinBlockSize { 174 | return nil, ErrSizeTooSmall 175 | } 176 | 177 | w.close = idxClose 178 | w.putUint64(1) // Format 179 | w.putUint64(uint64(maxSize)) // Maximum block size 180 | 181 | // Start one goroutine per core 182 | for i := 0; i < ncpu; i++ { 183 | go w.hasher() 184 | } 185 | // Insert the buffers we will use 186 | for i := 0; i < ncpu*bufmul; i++ { 187 | w.buffers <- &block{data: make([]byte, maxSize), hashDone: make(chan error, 1)} 188 | } 189 | go w.blockWriter() 190 | return w, nil 191 | } 192 | 193 | // NewStreamWriter will create a deduplicator that will split the contents written 194 | // to it into blocks and de-duplicate these. 195 | // 196 | // The output is delivered as a single stream, and memory use will remain stable for 197 | // both writing and reading the stream. 198 | // 199 | // This function returns data that is compatible with the NewStreamReader function. 200 | // 201 | // You must set the maximum memory for the decoder to use. 202 | // This limits the length a match can be made. 203 | // If you use dynamic blocks, also note that the average size is 1/4th of the maximum block size. 204 | // 205 | // The returned writer must be closed to flush the remaining data. 206 | func NewStreamWriter(out io.Writer, mode Mode, maxSize, maxMemory uint) (Writer, error) { 207 | ncpu := runtime.GOMAXPROCS(0) 208 | // For small block sizes we need to keep a pretty big buffer to keep input fed. 209 | // Constant below appears to be sweet spot measured with 4K blocks.
210 | var bufmul = 256 << 10 / int(maxSize) 211 | if bufmul < 2 { 212 | bufmul = 2 213 | } 214 | if maxMemory < maxSize { 215 | return nil, ErrMaxMemoryTooSmall 216 | } 217 | w := &writer{ 218 | idx: out, 219 | maxSize: int(maxSize), 220 | index: make(map[[hasher.Size]byte]int), 221 | input: make(chan *block, ncpu*bufmul), 222 | write: make(chan *block, ncpu*bufmul), 223 | exited: make(chan struct{}, 0), 224 | cur: make([]byte, maxSize), 225 | vari64: make([]byte, binary.MaxVarintLen64), 226 | buffers: make(chan *block, ncpu*bufmul), 227 | nblocks: 1, 228 | maxBlocks: int(maxMemory / maxSize), 229 | } 230 | 231 | switch mode { 232 | case ModeFixed: 233 | fw := &fixedWriter{} 234 | w.writer = fw.write 235 | case ModeDynamic: 236 | zw := newZpaqWriter(maxSize) 237 | w.writer = zw.write 238 | case ModeDynamicEntropy: 239 | zw := newEntropyWriter(maxSize) 240 | w.writer = zw.write 241 | /* case ModeDynamicSignatures: 242 | zw := newZpaqWriter(maxSize) 243 | w.writer = zw.writeFile 244 | case ModeSignaturesOnly: 245 | w.writer = fileSplitOnly 246 | */ 247 | default: 248 | return nil, fmt.Errorf("dedup: unknown mode") 249 | } 250 | 251 | if w.maxSize < MinBlockSize { 252 | return nil, ErrSizeTooSmall 253 | } 254 | 255 | w.close = streamClose 256 | w.putUint64(2) // Format 257 | w.putUint64(uint64(maxSize)) // Maximum block size 258 | w.putUint64(uint64(w.maxBlocks)) // Maximum backreference length 259 | 260 | // Start one goroutine per core 261 | for i := 0; i < ncpu; i++ { 262 | go w.hasher() 263 | } 264 | // Insert the buffers we will use 265 | for i := 0; i < ncpu*bufmul; i++ { 266 | w.buffers <- &block{data: make([]byte, maxSize), hashDone: make(chan error, 1)} 267 | } 268 | go w.blockStreamWriter() 269 | return w, nil 270 | } 271 | 272 | // NewSplitter will return a writer you can write data to, 273 | // and the file will be split into separate fragments. 274 | // 275 | // You must supply a fragment channel that will output fragments for 276 | // the data you have written. The channel must accept data while you 277 | // write to the splitter. 278 | // 279 | // For each fragment the SHA-1 hash of the data section is returned, 280 | // along with the raw data of this segment. 281 | // 282 | // When you call Close on the returned Writer, the final fragments 283 | // will be sent and the channel will be closed. 284 | func NewSplitter(fragments chan<- Fragment, mode Mode, maxSize uint) (Writer, error) { 285 | ncpu := runtime.GOMAXPROCS(0) 286 | // For small block sizes we need to keep a pretty big buffer to keep input fed. 287 | // Constant below appears to be sweet spot measured with 4K blocks.
288 | var bufmul = 256 << 10 / int(maxSize) 289 | if bufmul < 2 { 290 | bufmul = 2 291 | } 292 | 293 | w := &writer{ 294 | frags: fragments, 295 | maxSize: int(maxSize), 296 | index: make(map[[hasher.Size]byte]int), 297 | input: make(chan *block, ncpu*bufmul), 298 | write: make(chan *block, ncpu*bufmul), 299 | exited: make(chan struct{}, 0), 300 | cur: make([]byte, maxSize), 301 | vari64: make([]byte, binary.MaxVarintLen64), 302 | buffers: make(chan *block, ncpu*bufmul), 303 | nblocks: 1, 304 | } 305 | 306 | switch mode { 307 | case ModeFixed: 308 | fw := &fixedWriter{} 309 | w.writer = fw.write 310 | w.split = fw.split 311 | case ModeDynamic: 312 | zw := newZpaqWriter(maxSize) 313 | w.writer = zw.write 314 | w.split = zw.split 315 | case ModeDynamicEntropy: 316 | zw := newEntropyWriter(maxSize) 317 | w.writer = zw.write 318 | w.split = zw.split 319 | default: 320 | return nil, fmt.Errorf("dedup: unknown mode") 321 | } 322 | 323 | w.flush = func(w *writer) error { 324 | w.split(w) 325 | return w.err 326 | } 327 | 328 | if w.maxSize < MinBlockSize { 329 | return nil, ErrSizeTooSmall 330 | } 331 | 332 | // Start one goroutine per core 333 | for i := 0; i < ncpu; i++ { 334 | go w.hasher() 335 | } 336 | // Insert the buffers we will use 337 | for i := 0; i < ncpu*bufmul; i++ { 338 | w.buffers <- &block{data: make([]byte, maxSize), hashDone: make(chan error, 1)} 339 | } 340 | go w.fragmentWriter() 341 | return w, nil 342 | } 343 | 344 | // putUint64 will Write a uint64 value to index stream. 345 | func (w *writer) putUint64(v uint64) error { 346 | n := binary.PutUvarint(w.vari64, v) 347 | n2, err := w.idx.Write(w.vari64[:n]) 348 | if err != nil { 349 | return err 350 | } 351 | if n2 != n { 352 | return io.ErrShortWrite 353 | } 354 | return nil 355 | } 356 | 357 | // Split content, so a new block begins with next write 358 | func (w *writer) Split() { 359 | w.split(w) 360 | } 361 | 362 | func (w *writer) Blocks() int { 363 | w.mu.Lock() 364 | b := w.nblocks - 1 365 | w.mu.Unlock() 366 | return b 367 | } 368 | 369 | // Write contents to the deduplicator. 370 | func (w *writer) Write(b []byte) (n int, err error) { 371 | w.mu.Lock() 372 | err = w.err 373 | w.mu.Unlock() 374 | if err != nil { 375 | return 0, err 376 | } 377 | return w.writer(w, b) 378 | } 379 | 380 | // setErr will set the error state of the writer. 381 | func (w *writer) setErr(err error) { 382 | if err == nil { 383 | return 384 | } 385 | w.mu.Lock() 386 | w.err = err 387 | w.mu.Unlock() 388 | } 389 | 390 | // idxClose will flush the remainder of an index based stream 391 | func idxClose(w *writer) (err error) { 392 | // Insert length of remaining data into index 393 | w.putUint64(uint64(math.MaxUint64)) 394 | w.putUint64(uint64(w.maxSize - w.off)) 395 | w.putUint64(0) // Stream continuation possibility, should be 0. 
396 | 397 | buf := bytes.NewBuffer(w.cur[0:w.off]) 398 | n, err := io.Copy(w.blks, buf) 399 | if err != nil { 400 | return err 401 | } 402 | if int(n) != w.off { 403 | return errors.New("idxClose: r.cur short write") 404 | } 405 | return nil 406 | } 407 | 408 | // streamClose will flush the remainder of an single stream 409 | func streamClose(w *writer) (err error) { 410 | // Insert length of remaining data into index 411 | w.putUint64(uint64(math.MaxUint64)) 412 | w.putUint64(uint64(w.maxSize - w.off)) 413 | 414 | buf := bytes.NewBuffer(w.cur[0:w.off]) 415 | n, err := io.Copy(w.idx, buf) 416 | if err != nil { 417 | return err 418 | } 419 | if int(n) != w.off { 420 | return errors.New("streamClose: r.cur short write") 421 | } 422 | w.putUint64(0) // Stream continuation possibility, should be 0. 423 | return nil 424 | } 425 | 426 | // Close and flush the remaining data to output. 427 | func (w *writer) Close() (err error) { 428 | select { 429 | case <-w.exited: 430 | return w.err 431 | default: 432 | } 433 | if w.flush != nil { 434 | err := w.flush(w) 435 | if err != nil { 436 | return err 437 | } 438 | } 439 | close(w.input) 440 | close(w.write) 441 | <-w.exited 442 | 443 | if w.close != nil { 444 | err := w.close(w) 445 | if err != nil { 446 | return err 447 | } 448 | } 449 | return w.err 450 | } 451 | 452 | // hasher will hash incoming blocks 453 | // and signal the writer when done. 454 | func (w *writer) hasher() { 455 | h := hasher.New() 456 | for b := range w.input { 457 | buf := bytes.NewBuffer(b.data) 458 | h.Reset() 459 | n, err := io.Copy(h, buf) 460 | if err != nil { 461 | w.setErr(err) 462 | return 463 | } 464 | if int(n) != len(b.data) { 465 | w.setErr(errors.New("short copy in hasher")) 466 | return 467 | } 468 | _ = h.Sum(b.sha1Hash[:0]) 469 | b.hashDone <- nil 470 | } 471 | } 472 | 473 | // blockWriter will write hashed blocks to the output 474 | // and recycle the buffers. 475 | func (w *writer) blockWriter() { 476 | defer close(w.exited) 477 | 478 | sortA := make([]int, w.maxBlocks+1) 479 | 480 | for b := range w.write { 481 | _ = <-b.hashDone 482 | match, ok := w.index[b.sha1Hash] 483 | if !ok { 484 | buf := bytes.NewBuffer(b.data) 485 | n, err := io.Copy(w.blks, buf) 486 | if err != nil { 487 | w.setErr(err) 488 | return 489 | } 490 | if int(n) != len(b.data) { 491 | // This should not be possible with io.copy without an error, 492 | // but we test anyway. 493 | w.setErr(errors.New("error: short write on copy")) 494 | return 495 | } 496 | w.putUint64(0) 497 | w.putUint64(uint64(w.maxSize) - uint64(n)) 498 | } else { 499 | offset := b.N - match 500 | if offset <= 0 { 501 | // should be impossible, indicated an internal error 502 | w.setErr(errors.New("internal error: negative offset")) 503 | return 504 | } 505 | w.putUint64(uint64(offset)) 506 | } 507 | // Update hash to latest match 508 | w.index[b.sha1Hash] = b.N 509 | 510 | // Purge the entries with the oldest matches 511 | if w.maxBlocks > 0 && len(w.index) > w.maxBlocks { 512 | ar := sortA[0:len(w.index)] 513 | i := 0 514 | for _, v := range w.index { 515 | ar[i] = v 516 | i++ 517 | } 518 | sort.Asc(ar) 519 | // Cut the oldest quarter blocks 520 | // since this isn't free 521 | cutoff := ar[w.maxBlocks/4] 522 | for k, v := range w.index { 523 | if v < cutoff { 524 | delete(w.index, k) 525 | } 526 | } 527 | } 528 | 529 | // Done, reinsert buffer 530 | w.buffers <- b 531 | } 532 | } 533 | 534 | // blockStreamWriter will write blocks and indexes to the output stream 535 | // and recycle the buffers. 
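// For every block it emits either a back-reference (a single uvarint holding the distance to an identical earlier block) or a literal record (a zero uvarint, the difference between maxSize and the block length, and the raw block bytes appended to the same stream).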
536 | func (w *writer) blockStreamWriter() { 537 | defer close(w.exited) 538 | for b := range w.write { 539 | _ = <-b.hashDone 540 | match, ok := w.index[b.sha1Hash] 541 | if w.maxBlocks > 0 && (b.N-match) > w.maxBlocks { 542 | ok = false 543 | } 544 | if !ok { 545 | w.putUint64(0) 546 | w.putUint64(uint64(w.maxSize) - uint64(len(b.data))) 547 | buf := bytes.NewBuffer(b.data) 548 | n, err := io.Copy(w.idx, buf) 549 | if err != nil { 550 | w.setErr(err) 551 | return 552 | } 553 | if int(n) != len(b.data) { 554 | // This should not be possible with io.Copy without an error, 555 | // but we test anyway. 556 | w.setErr(errors.New("error: short write on copy")) 557 | return 558 | } 559 | } else { 560 | offset := b.N - match 561 | if offset <= 0 { 562 | // should be impossible, indicated an internal error 563 | w.setErr(errors.New("internal error: negative offset")) 564 | return 565 | } 566 | w.putUint64(uint64(offset)) 567 | } 568 | // Update hash to latest match 569 | w.index[b.sha1Hash] = b.N 570 | 571 | // Purge old entries once in a while 572 | if w.maxBlocks > 0 && b.N&65535 == 65535 { 573 | for k, v := range w.index { 574 | if (b.N - v) > w.maxBlocks { 575 | delete(w.index, k) 576 | } 577 | } 578 | } 579 | // Done, reinsert buffer 580 | w.buffers <- b 581 | } 582 | } 583 | 584 | // fragmentWriter will write hashed blocks to the output channel 585 | // and recycle the buffers. 586 | func (w *writer) fragmentWriter() { 587 | defer close(w.exited) 588 | defer close(w.frags) 589 | n := uint(0) 590 | for b := range w.write { 591 | _ = <-b.hashDone 592 | var f Fragment 593 | f.N = n 594 | copy(f.Hash[:], b.sha1Hash[:]) 595 | _, ok := w.index[b.sha1Hash] 596 | f.Payload = make([]byte, len(b.data)) 597 | copy(f.Payload, b.data) 598 | if !ok { 599 | w.index[b.sha1Hash] = 0 600 | f.New = !ok 601 | } 602 | w.frags <- f 603 | // Done, reinsert buffer 604 | w.buffers <- b 605 | n++ 606 | } 607 | } 608 | 609 | type fixedWriter struct{} 610 | 611 | // Write blocks of similar size. 612 | func (f *fixedWriter) write(w *writer, b []byte) (n int, err error) { 613 | written := 0 614 | for len(b) > 0 { 615 | n := copy(w.cur[w.off:], b) 616 | b = b[n:] 617 | w.off += n 618 | written += n 619 | // Filled the buffer? Send it off! 620 | if w.off == w.maxSize { 621 | b := <-w.buffers 622 | // Swap block with current 623 | w.cur, b.data = b.data, w.cur 624 | w.mu.Lock() 625 | b.N = w.nblocks 626 | w.nblocks++ 627 | w.mu.Unlock() 628 | 629 | w.input <- b 630 | w.write <- b 631 | w.off = 0 632 | } 633 | } 634 | return written, nil 635 | } 636 | 637 | // Split content, so a new block begins with next write 638 | func (f *fixedWriter) split(w *writer) { 639 | if w.off == 0 { 640 | return 641 | } 642 | b := <-w.buffers 643 | // Swap block with current 644 | w.cur, b.data = b.data[:w.maxSize], w.cur[:w.off] 645 | w.mu.Lock() 646 | b.N = w.nblocks 647 | w.nblocks++ 648 | w.mu.Unlock() 649 | 650 | w.input <- b 651 | w.write <- b 652 | w.off = 0 653 | } 654 | 655 | // MemUse returns an approximate maximum memory use in bytes for 656 | // encoder (Writer) and decoder (Reader) for the given number of bytes. 
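// A rough, illustrative sketch (the names idx and data are placeholders; actual
// figures depend on the block size and maxMemory given to the constructor):
//
//	w, _ := NewWriter(&idx, &data, ModeFixed, 64<<10, 0)
//	enc, dec := w.MemUse(1 << 30) // approximate worst-case bytes for a 1 GiB input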
657 | func (w *writer) MemUse(bytes int) (encoder, decoder int64) { 658 | blocks := (bytes + w.maxSize - 1) / w.maxSize 659 | if w.maxBlocks > 0 { 660 | if w.maxBlocks < blocks { 661 | blocks = w.maxBlocks 662 | } 663 | } 664 | // Data length 665 | data := big.NewInt(int64(blocks)) 666 | data = data.Mul(data, big.NewInt(int64(w.maxSize))) 667 | d := data.Int64() 668 | if data.BitLen() > 63 { 669 | d = math.MaxInt64 670 | } 671 | // Index length 672 | bl := big.NewInt(int64(blocks)) 673 | perBlock := big.NewInt(int64(HashSize + 8 /*int64*/ + 24 /* map entry*/)) 674 | total := bl.Mul(bl, perBlock) 675 | if total.BitLen() > 63 { 676 | return math.MaxInt64, d 677 | } 678 | return total.Int64(), d 679 | } 680 | 681 | // Split blocks like ZPAQ: (public domain) 682 | type zpaqWriter struct { 683 | h uint32 // rolling hash for finding fragment boundaries 684 | c1 byte // last byte 685 | maxFragment int 686 | minFragment int 687 | maxHash uint32 688 | o1 [256]byte // order 1 context -> predicted byte 689 | } 690 | 691 | // Split blocks. Typically block size will be maxSize / 4 692 | // Minimum block size is maxSize/64. 693 | // 694 | // The break point is content dependent. 695 | // Any insertions, deletions, or edits that occur before the start of the 32+ byte dependency window 696 | // don't affect the break point. 697 | // This makes it likely for two files to still have identical fragments far away from any edits. 698 | func newZpaqWriter(maxSize uint) *zpaqWriter { 699 | fragment := math.Log2(float64(maxSize) / (64 * 64)) 700 | mh := math.Exp2(22 - fragment) 701 | return &zpaqWriter{ 702 | maxFragment: int(maxSize), 703 | minFragment: int(maxSize / 64), 704 | maxHash: uint32(mh), 705 | } 706 | } 707 | 708 | // h is a 32 bit hash that depends on the last 32 bytes that were mispredicted by the order 1 model o1[]. 709 | // h < maxhash therefore occurs with probability 2^-16, giving an average fragment size of 64K. 710 | // The variable size dependency window works because one constant is odd (correct prediction, no shift), 711 | // and the other is even but not a multiple of 4 (missed prediction, 1 bit shift left). 712 | // This is different from a normal Rabin filter, which uses a large fixed-sized dependency window 713 | // and two multiply operations, one at the window entry and the inverse at the window exit. 714 | func (z *zpaqWriter) write(w *writer, b []byte) (int, error) { 715 | // Transfer to local variables ~30% faster. 716 | c1 := z.c1 717 | h := z.h 718 | off := w.off 719 | for _, c := range b { 720 | if c == z.o1[c1] { 721 | h = (h + uint32(c) + 1) * 314159265 722 | } else { 723 | h = (h + uint32(c) + 1) * 271828182 724 | } 725 | z.o1[c1] = c 726 | c1 = c 727 | w.cur[off] = c 728 | off++ 729 | 730 | // At a break point? Send it off! 
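// A break point is taken when the rolling hash falls below maxHash, which newZpaqWriter tunes so that this fires on average about every maxSize/4 bytes; it is never taken before minFragment bytes and is forced at maxFragment.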
731 | if (off >= z.minFragment && h < z.maxHash) || off >= z.maxFragment { 732 | b := <-w.buffers 733 | // Swap block with current 734 | w.cur, b.data = b.data[:w.maxSize], w.cur[:off] 735 | b.N = w.nblocks 736 | 737 | w.input <- b 738 | w.write <- b 739 | w.nblocks++ 740 | off = 0 741 | h = 0 742 | c1 = 0 743 | } 744 | } 745 | w.off = off 746 | z.h = h 747 | z.c1 = c1 748 | return len(b), nil 749 | } 750 | 751 | // Split content, so a new block begins with next write 752 | func (z *zpaqWriter) split(w *writer) { 753 | if w.off == 0 { 754 | return 755 | } 756 | b := <-w.buffers 757 | // Swap block with current 758 | w.cur, b.data = b.data[:w.maxSize], w.cur[:w.off] 759 | w.mu.Lock() 760 | b.N = w.nblocks 761 | w.nblocks++ 762 | w.mu.Unlock() 763 | 764 | w.input <- b 765 | w.write <- b 766 | w.off = 0 767 | z.h = 0 768 | z.c1 = 0 769 | } 770 | 771 | // Split blocks based on entropy distribution. 772 | type entWriter struct { 773 | h uint32 // rolling hash for finding fragment boundaries 774 | maxFragment int 775 | minFragment int 776 | maxHash uint32 777 | hist [256]uint16 // histogram of current accumulated 778 | histLen int 779 | avgHist uint16 780 | } 781 | 782 | // Split blocks. Typically block size will be maxSize / 4 783 | // Minimum block size is maxSize/32. 784 | // 785 | // The break point is content dependent. 786 | // Any insertions, deletions, or edits that occur before the start of the 32+ byte dependency window 787 | // don't affect the break point. 788 | // This makes it likely for two files to still have identical fragments far away from any edits. 789 | func newEntropyWriter(maxSize uint) *entWriter { 790 | fragment := math.Log2(float64(maxSize) / (64 * 64)) 791 | mh := math.Exp2(22 - fragment) 792 | e := &entWriter{ 793 | maxFragment: int(maxSize), 794 | minFragment: int(maxSize / 32), 795 | maxHash: uint32(mh), 796 | } 797 | if e.minFragment > 65535 { 798 | e.minFragment = 65535 799 | } 800 | if e.minFragment < 512 { 801 | e.minFragment = 512 802 | } 803 | e.avgHist = uint16(e.minFragment / 255) 804 | return e 805 | } 806 | 807 | // h is a 32 bit hash that depends on the last 32 bytes that were mispredicted by the order 1 model o1[]. 808 | // h < maxhash therefore occurs with probability 2^-16, giving an average fragment size of 64K. 809 | // The variable size dependency window works because one constant is odd (correct prediction, no shift), 810 | // and the other is even but not a multiple of 4 (missed prediction, 1 bit shift left). 811 | // This is different from a normal Rabin filter, which uses a large fixed-sized dependency window 812 | // and two multiply operations, one at the window entry and the inverse at the window exit. 813 | func (e *entWriter) write(w *writer, b []byte) (int, error) { 814 | inLen := len(b) 815 | if e.histLen < e.minFragment { 816 | b2 := b 817 | if len(b2)+e.histLen > e.minFragment { 818 | b2 = b2[:e.minFragment-e.histLen] 819 | } 820 | off := w.off 821 | for i := range b2 { 822 | v := b2[i] 823 | e.hist[v]++ 824 | w.cur[off+i] = v 825 | } 826 | e.histLen += len(b2) 827 | w.off += len(b2) 828 | b = b[len(b2):] 829 | } 830 | if len(b) == 0 { 831 | return inLen, nil 832 | } 833 | 834 | // Transfer to local variables ~30% faster. 835 | h := e.h 836 | off := w.off 837 | for _, c := range b { 838 | if e.hist[c] >= e.avgHist { 839 | h = (h + uint32(c) + 1) * 314159265 840 | } else { 841 | h = (h + uint32(c) + 1) * 271828182 842 | } 843 | w.cur[off] = c 844 | off++ 845 | 846 | // At a break point? Send it off! 
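// Same break-point rule as the ZPAQ splitter, except the hash step depends on whether each byte was frequent (at or above average) in the first minFragment bytes of the fragment, and minFragment here is maxSize/32 (clamped between 512 and 65535).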
847 | if (off >= e.minFragment && h < e.maxHash) || off >= e.maxFragment { 848 | b := <-w.buffers 849 | // Swap block with current 850 | w.cur, b.data = b.data[:w.maxSize], w.cur[:off] 851 | b.N = w.nblocks 852 | 853 | w.input <- b 854 | w.write <- b 855 | e.histLen = 0 856 | for i := range e.hist { 857 | e.hist[i] = 0 858 | } 859 | w.nblocks++ 860 | off = 0 861 | h = 0 862 | } 863 | } 864 | w.off = off 865 | e.h = h 866 | return inLen, nil 867 | } 868 | 869 | // Split content, so a new block begins with next write 870 | func (e *entWriter) split(w *writer) { 871 | if w.off == 0 { 872 | return 873 | } 874 | b := <-w.buffers 875 | // Swap block with current 876 | w.cur, b.data = b.data[:w.maxSize], w.cur[:w.off] 877 | w.mu.Lock() 878 | b.N = w.nblocks 879 | w.nblocks++ 880 | w.mu.Unlock() 881 | 882 | w.input <- b 883 | w.write <- b 884 | w.off = 0 885 | e.h = 0 886 | e.histLen = 0 887 | for i := range e.hist { 888 | e.hist[i] = 0 889 | } 890 | } 891 | -------------------------------------------------------------------------------- /writer_test.go: -------------------------------------------------------------------------------- 1 | package dedup_test 2 | 3 | import ( 4 | "bytes" 5 | "encoding/hex" 6 | "fmt" 7 | "io" 8 | "io/ioutil" 9 | "math/rand" 10 | "os" 11 | "sync" 12 | "testing" 13 | 14 | "github.com/klauspost/dedup" 15 | ) 16 | 17 | // Returns a deterministic buffer of size n 18 | func getBufferSize(n int) *bytes.Buffer { 19 | rand.Seed(0) 20 | b := make([]byte, n) 21 | for i := range b { 22 | b[i] = byte(rand.Intn(255)) 23 | } 24 | return bytes.NewBuffer(b) 25 | } 26 | 27 | func TestFixedWriter(t *testing.T) { 28 | idx := bytes.Buffer{} 29 | data := bytes.Buffer{} 30 | 31 | const totalinput = 10 << 20 32 | input := getBufferSize(totalinput) 33 | 34 | const size = 64 << 10 35 | b := input.Bytes() 36 | // Create some duplicates 37 | for i := 0; i < 50; i++ { 38 | // Read from 10 first blocks 39 | src := b[(i%10)*size : (i%10)*size+size] 40 | // Write into the following ones 41 | dst := b[(10+i)*size : (i+10)*size+size] 42 | copy(dst, src) 43 | } 44 | input = bytes.NewBuffer(b) 45 | w, err := dedup.NewWriter(&idx, &data, dedup.ModeFixed, size, size*10) 46 | if err != nil { 47 | t.Fatal(err) 48 | } 49 | io.Copy(w, input) 50 | err = w.Close() 51 | if err != nil { 52 | t.Fatal(err) 53 | } 54 | removed := ((totalinput) - data.Len()) / size 55 | 56 | t.Log(dedup.BirthdayProblem(totalinput / size)) 57 | t.Log("Index size:", idx.Len()) 58 | t.Log("Data size:", data.Len()) 59 | t.Log("Removed", removed, "blocks") 60 | // We should get at least 50 blocks 61 | if removed < 50 { 62 | t.Fatal("didn't remove at least 50 blocks") 63 | } 64 | if removed > 60 { 65 | t.Fatal("removed unreasonable high amount of blocks") 66 | } 67 | } 68 | 69 | func TestFixedWriterLimit(t *testing.T) { 70 | idx := bytes.Buffer{} 71 | data := bytes.Buffer{} 72 | 73 | const totalinput = 10 << 20 74 | const limit = 9 75 | input := getBufferSize(totalinput) 76 | 77 | const size = 64 << 10 78 | b := input.Bytes() 79 | // Create some duplicates 80 | for i := 0; i < 50; i++ { 81 | // Read from 10 first blocks 82 | src := b[(i%10)*size : (i%10)*size+size] 83 | // Write into the following ones 84 | dst := b[(10+50-i)*size : (10+50-i)*size+size] 85 | copy(dst, src) 86 | } 87 | input = bytes.NewBuffer(b) 88 | w, err := dedup.NewWriter(&idx, &data, dedup.ModeFixed, size, limit*size) 89 | if err != nil { 90 | t.Fatal(err) 91 | } 92 | io.Copy(w, input) 93 | err = w.Close() 94 | if err != nil { 95 | t.Fatal(err) 96 | } 97 | removed := 
((totalinput) - data.Len()) / size 98 | 99 | t.Log("Index size:", idx.Len()) 100 | t.Log("Data size:", data.Len()) 101 | t.Log("Removed", removed, "blocks") 102 | // We should get at least 50 blocks 103 | if removed > 10 { 104 | t.Fatal("it did not appear to respect the limit") 105 | } 106 | if removed < 8 { 107 | t.Fatal("removed too many blocks") 108 | } 109 | r, err := dedup.NewReader(&idx, &data) 110 | if err != nil { 111 | t.Fatal(err) 112 | } 113 | 114 | useBlocks := r.MaxMem() / size 115 | if useBlocks > 9 { 116 | t.Fatal("Uses too much memory, expected", limit, "got", useBlocks) 117 | } 118 | t.Log("Maximum estimated use:", r.MaxMem(), "bytes,", useBlocks, "blocks") 119 | r.Close() 120 | } 121 | 122 | func TestFixedFragmentSplitter(t *testing.T) { 123 | const totalinput = 10<<20 + 500 124 | input := getBufferSize(totalinput) 125 | 126 | const size = 64 << 10 127 | b := input.Bytes() 128 | // Create some duplicates 129 | for i := 0; i < 50; i++ { 130 | // Read from 10 first blocks 131 | src := b[(i%10)*size : (i%10)*size+size] 132 | // Write into the following ones 133 | dst := b[(10+i)*size : (i+10)*size+size] 134 | copy(dst, src) 135 | } 136 | out := make(chan dedup.Fragment, 10) 137 | count := make(chan int, 0) 138 | go func() { 139 | n := 0 140 | off := 0 141 | for f := range out { 142 | if !bytes.Equal(b[off:off+len(f.Payload)], f.Payload) { 143 | panic(fmt.Sprintf("output mismatch at offset %d", n)) 144 | } 145 | off += len(f.Payload) 146 | if f.New { 147 | n += len(f.Payload) 148 | } 149 | } 150 | count <- n 151 | count <- off 152 | }() 153 | input = bytes.NewBuffer(b) 154 | w, err := dedup.NewSplitter(out, dedup.ModeFixed, size) 155 | if err != nil { 156 | t.Fatal(err) 157 | } 158 | io.Copy(w, input) 159 | err = w.Close() 160 | if err != nil { 161 | t.Fatal(err) 162 | } 163 | datalen := <-count 164 | gotLen := <-count 165 | removed := ((totalinput) - datalen) / size 166 | 167 | if gotLen != totalinput { 168 | t.Fatalf("did not get all data, want %d, got %d", totalinput, gotLen) 169 | } 170 | t.Log("Data size:", datalen) 171 | t.Log("Removed", removed, "blocks") 172 | // We should get at least 50 blocks 173 | if removed < 50 { 174 | t.Fatal("didn't remove at least 50 blocks") 175 | } 176 | if removed > 60 { 177 | t.Fatal("removed unreasonable high amount of blocks") 178 | } 179 | } 180 | 181 | func TestDynamicFragmentSplitter(t *testing.T) { 182 | const totalinput = 10 << 20 183 | input := getBufferSize(totalinput) 184 | 185 | const size = 64 << 10 186 | b := input.Bytes() 187 | // Create some duplicates 188 | for i := 0; i < 50; i++ { 189 | // Read from 10 first blocks 190 | src := b[(i%10)*size : (i%10)*size+size] 191 | // Write into the following ones 192 | dst := b[(10+i)*size : (i+10)*size+size] 193 | copy(dst, src) 194 | } 195 | out := make(chan dedup.Fragment, 10) 196 | count := make(chan int, 0) 197 | go func() { 198 | n := 0 199 | off := 0 200 | for f := range out { 201 | if !bytes.Equal(b[off:off+len(f.Payload)], f.Payload) { 202 | panic(fmt.Sprintf("output mismatch at offset %d", n)) 203 | } 204 | off += len(f.Payload) 205 | if f.New { 206 | n += len(f.Payload) 207 | } 208 | } 209 | count <- n 210 | count <- off 211 | }() 212 | input = bytes.NewBuffer(b) 213 | w, err := dedup.NewSplitter(out, dedup.ModeDynamic, size) 214 | if err != nil { 215 | t.Fatal(err) 216 | } 217 | io.Copy(w, input) 218 | err = w.Close() 219 | if err != nil { 220 | t.Fatal(err) 221 | } 222 | datalen := <-count 223 | gotLen := <-count 224 | removed := ((totalinput) - datalen) / size 225 | 226 
| if gotLen != totalinput { 227 | t.Fatalf("did not get all data, want %d, got %d", totalinput, gotLen) 228 | } 229 | t.Log("Data size:", datalen) 230 | t.Log("Removed", removed, "blocks") 231 | // We should get at least 50 blocks 232 | if removed < 45 { 233 | t.Fatal("didn't remove at least 45 blocks") 234 | } 235 | if removed > 60 { 236 | t.Fatal("removed unreasonable high amount of blocks") 237 | } 238 | } 239 | 240 | func TestDynamicEntropySplitter(t *testing.T) { 241 | const totalinput = 10 << 20 242 | input := getBufferSize(totalinput) 243 | 244 | const size = 64 << 10 245 | b := input.Bytes() 246 | // Create some duplicates 247 | for i := 0; i < 50; i++ { 248 | // Read from 10 first blocks 249 | src := b[(i%10)*size : (i%10)*size+size] 250 | // Write into the following ones 251 | dst := b[(10+i)*size : (i+10)*size+size] 252 | copy(dst, src) 253 | } 254 | out := make(chan dedup.Fragment, 10) 255 | count := make(chan int, 0) 256 | go func() { 257 | n := 0 258 | off := 0 259 | for f := range out { 260 | if !bytes.Equal(b[off:off+len(f.Payload)], f.Payload) { 261 | panic(fmt.Sprintf("output mismatch at offset %d", n)) 262 | } 263 | off += len(f.Payload) 264 | if f.New { 265 | n += len(f.Payload) 266 | } 267 | } 268 | count <- n 269 | count <- off 270 | }() 271 | input = bytes.NewBuffer(b) 272 | w, err := dedup.NewSplitter(out, dedup.ModeDynamic, size) 273 | if err != nil { 274 | t.Fatal(err) 275 | } 276 | io.Copy(w, input) 277 | err = w.Close() 278 | if err != nil { 279 | t.Fatal(err) 280 | } 281 | datalen := <-count 282 | gotLen := <-count 283 | removed := ((totalinput) - datalen) / size 284 | 285 | if gotLen != totalinput { 286 | t.Fatalf("did not get all data, want %d, got %d", totalinput, gotLen) 287 | } 288 | t.Log("Data size:", datalen) 289 | t.Log("Removed", removed, "blocks") 290 | // We should get at least 45 blocks 291 | if removed < 45 { 292 | t.Fatal("didn't remove at least 50 blocks") 293 | } 294 | if removed > 60 { 295 | t.Fatal("removed unreasonable high amount of blocks") 296 | } 297 | } 298 | 299 | func TestDynamicWriter(t *testing.T) { 300 | idx := bytes.Buffer{} 301 | data := bytes.Buffer{} 302 | 303 | const totalinput = 10 << 20 304 | input := getBufferSize(totalinput) 305 | 306 | const size = 64 << 10 307 | b := input.Bytes() 308 | // Create some duplicates 309 | for i := 0; i < 50; i++ { 310 | // Read from 10 first blocks 311 | src := b[(i%10)*size : (i%10)*size+size] 312 | // Write into the following ones 313 | dst := b[(10+i)*size : (i+10)*size+size] 314 | copy(dst, src) 315 | } 316 | input = bytes.NewBuffer(b) 317 | w, err := dedup.NewWriter(&idx, &data, dedup.ModeDynamic, size, 10*8*size) 318 | if err != nil { 319 | t.Fatal(err) 320 | } 321 | io.Copy(w, input) 322 | err = w.Close() 323 | if err != nil { 324 | t.Fatal(err) 325 | } 326 | removed := ((totalinput) - data.Len()) / size 327 | 328 | t.Log("Dynamic Index size:", idx.Len()) 329 | t.Log("Dynamic Data size:", data.Len()) 330 | t.Log("Removed", removed, "blocks") 331 | // We don't know how many, but it should remove some blocks 332 | if removed < 40 { 333 | t.Fatal("didn't remove at least 40 blocks") 334 | } 335 | } 336 | 337 | func TestDynamicEntropyWriter(t *testing.T) { 338 | idx := bytes.Buffer{} 339 | data := bytes.Buffer{} 340 | 341 | const totalinput = 10 << 20 342 | input := getBufferSize(totalinput) 343 | 344 | const size = 64 << 10 345 | b := input.Bytes() 346 | // Create some duplicates 347 | for i := 0; i < 50; i++ { 348 | // Read from 10 first blocks 349 | src := b[(i%10)*size : 
(i%10)*size+size] 350 | // Write into the following ones 351 | dst := b[(10+i)*size : (i+10)*size+size] 352 | copy(dst, src) 353 | } 354 | input = bytes.NewBuffer(b) 355 | w, err := dedup.NewWriter(&idx, &data, dedup.ModeDynamicEntropy, size, 10*8*size) 356 | if err != nil { 357 | t.Fatal(err) 358 | } 359 | io.Copy(w, input) 360 | err = w.Close() 361 | if err != nil { 362 | t.Fatal(err) 363 | } 364 | removed := ((totalinput) - data.Len()) / size 365 | 366 | t.Log("Dynamic Index size:", idx.Len()) 367 | t.Log("Dynamic Data size:", data.Len()) 368 | t.Log("Removed", removed, "blocks") 369 | // We don't know how many, but it should remove some blocks 370 | if removed < 40 { 371 | t.Fatal("didn't remove at least 40 blocks") 372 | } 373 | } 374 | 375 | func TestFixedStreamWriter(t *testing.T) { 376 | data := bytes.Buffer{} 377 | 378 | const totalinput = 10 << 20 379 | input := getBufferSize(totalinput) 380 | 381 | const size = 64 << 10 382 | b := input.Bytes() 383 | // Create some duplicates 384 | for i := 0; i < 50; i++ { 385 | // Read from 10 first blocks 386 | src := b[(i%10)*size : (i%10)*size+size] 387 | // Write into the following ones 388 | dst := b[(10+i)*size : (i+10)*size+size] 389 | copy(dst, src) 390 | } 391 | input = bytes.NewBuffer(b) 392 | w, err := dedup.NewStreamWriter(&data, dedup.ModeFixed, size, 10*size) 393 | if err != nil { 394 | t.Fatal(err) 395 | } 396 | io.Copy(w, input) 397 | err = w.Close() 398 | if err != nil { 399 | t.Fatal(err) 400 | } 401 | removed := ((totalinput) - data.Len()) / size 402 | 403 | t.Log("Data size:", data.Len()) 404 | t.Log("Removed", removed, "blocks") 405 | // We should get at least 50 blocks, but there is a little overhead 406 | if removed < 49 { 407 | t.Fatal("didn't remove at least 49 blocks") 408 | } 409 | if removed > 60 { 410 | t.Fatal("removed unreasonable high amount of blocks") 411 | } 412 | } 413 | 414 | func TestDynamicStreamWriter(t *testing.T) { 415 | data := bytes.Buffer{} 416 | 417 | const totalinput = 10 << 20 418 | input := getBufferSize(totalinput) 419 | 420 | const size = 64 << 10 421 | b := input.Bytes() 422 | // Create some duplicates 423 | for i := 0; i < 50; i++ { 424 | // Read from 10 first blocks 425 | src := b[(i%10)*size : (i%10)*size+size] 426 | // Write into the following ones 427 | dst := b[(10+i)*size : (i+10)*size+size] 428 | copy(dst, src) 429 | } 430 | input = bytes.NewBuffer(b) 431 | w, err := dedup.NewStreamWriter(&data, dedup.ModeDynamic, size, 10*8*size) 432 | if err != nil { 433 | t.Fatal(err) 434 | } 435 | io.Copy(w, input) 436 | err = w.Close() 437 | if err != nil { 438 | t.Fatal(err) 439 | } 440 | removed := ((totalinput) - data.Len()) / size 441 | 442 | t.Log("Dynamic Data size:", data.Len()) 443 | t.Log("Removed", removed, "blocks") 444 | // We don't know how many, but it should remove some blocks 445 | if removed < 40 { 446 | t.Fatal("didn't remove at least 40 blocks") 447 | } 448 | } 449 | 450 | func BenchmarkFixedWriter64K(t *testing.B) { 451 | const totalinput = 10 << 20 452 | input := getBufferSize(totalinput) 453 | 454 | const size = 64 << 10 455 | b := input.Bytes() 456 | // Create some duplicates 457 | for i := 0; i < 50; i++ { 458 | // Read from 10 first blocks 459 | src := b[(i%10)*size : (i%10)*size+size] 460 | // Write into the following ones 461 | dst := b[(10+i)*size : (i+10)*size+size] 462 | copy(dst, src) 463 | } 464 | t.ResetTimer() 465 | t.SetBytes(totalinput) 466 | for i := 0; i < t.N; i++ { 467 | input = bytes.NewBuffer(b) 468 | w, _ := dedup.NewWriter(ioutil.Discard, 
ioutil.Discard, dedup.ModeFixed, size, 0) 469 | io.Copy(w, input) 470 | err := w.Close() 471 | if err != nil { 472 | t.Fatal(err) 473 | } 474 | } 475 | } 476 | 477 | func BenchmarkFixedWriter4K(t *testing.B) { 478 | const totalinput = 10 << 20 479 | input := getBufferSize(totalinput) 480 | 481 | const size = 4 << 10 482 | b := input.Bytes() 483 | // Create some duplicates 484 | for i := 0; i < 500; i++ { 485 | // Read from 10 first blocks 486 | src := b[(i%10)*size : (i%10)*size+size] 487 | // Write into the following ones 488 | dst := b[(10+i)*size : (i+10)*size+size] 489 | copy(dst, src) 490 | } 491 | t.ResetTimer() 492 | t.SetBytes(totalinput) 493 | for i := 0; i < t.N; i++ { 494 | input = bytes.NewBuffer(b) 495 | w, _ := dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeFixed, size, 0) 496 | io.Copy(w, input) 497 | err := w.Close() 498 | if err != nil { 499 | t.Fatal(err) 500 | } 501 | } 502 | } 503 | 504 | func BenchmarkFixedWriter1K(t *testing.B) { 505 | const totalinput = 10 << 20 506 | input := getBufferSize(totalinput) 507 | 508 | const size = 1 << 10 509 | b := input.Bytes() 510 | // Create some duplicates 511 | for i := 0; i < 500; i++ { 512 | // Read from 10 first blocks 513 | src := b[(i%10)*size : (i%10)*size+size] 514 | // Write into the following ones 515 | dst := b[(10+i)*size : (i+10)*size+size] 516 | copy(dst, src) 517 | } 518 | t.ResetTimer() 519 | t.SetBytes(totalinput) 520 | for i := 0; i < t.N; i++ { 521 | input = bytes.NewBuffer(b) 522 | w, _ := dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeFixed, size, 0) 523 | io.Copy(w, input) 524 | err := w.Close() 525 | if err != nil { 526 | t.Fatal(err) 527 | } 528 | } 529 | } 530 | 531 | // Maximum block size:64k 532 | func BenchmarkDynamicWriter64K(t *testing.B) { 533 | const totalinput = 10 << 20 534 | input := getBufferSize(totalinput) 535 | 536 | const size = 64 << 10 537 | b := input.Bytes() 538 | // Create some duplicates 539 | for i := 0; i < 50; i++ { 540 | // Read from 10 first blocks 541 | src := b[(i%10)*size : (i%10)*size+size] 542 | // Write into the following ones 543 | dst := b[(10+i)*size : (i+10)*size+size] 544 | copy(dst, src) 545 | } 546 | t.ResetTimer() 547 | t.SetBytes(totalinput) 548 | for i := 0; i < t.N; i++ { 549 | input = bytes.NewBuffer(b) 550 | w, _ := dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeDynamic, size, 0) 551 | io.Copy(w, input) 552 | err := w.Close() 553 | if err != nil { 554 | t.Fatal(err) 555 | } 556 | } 557 | } 558 | 559 | // Maximum block size:64k 560 | func BenchmarkDynamicFragments64K(t *testing.B) { 561 | const totalinput = 10 << 20 562 | input := getBufferSize(totalinput) 563 | 564 | const size = 64 << 10 565 | b := input.Bytes() 566 | // Create some duplicates 567 | for i := 0; i < 50; i++ { 568 | // Read from 10 first blocks 569 | src := b[(i%10)*size : (i%10)*size+size] 570 | // Write into the following ones 571 | dst := b[(10+i)*size : (i+10)*size+size] 572 | copy(dst, src) 573 | } 574 | t.ResetTimer() 575 | t.SetBytes(totalinput) 576 | for i := 0; i < t.N; i++ { 577 | out := make(chan dedup.Fragment, 10) 578 | go func() { 579 | for _ = range out { 580 | } 581 | }() 582 | input = bytes.NewBuffer(b) 583 | w, _ := dedup.NewSplitter(out, dedup.ModeDynamic, size) 584 | io.Copy(w, input) 585 | err := w.Close() 586 | if err != nil { 587 | t.Fatal(err) 588 | } 589 | } 590 | } 591 | 592 | // Maximum block size:64k 593 | func BenchmarkDynamicEntropyFragments64K(t *testing.B) { 594 | const totalinput = 10 << 20 595 | input := getBufferSize(totalinput) 
596 | 597 | const size = 64 << 10 598 | b := input.Bytes() 599 | // Create some duplicates 600 | for i := 0; i < 50; i++ { 601 | // Read from 10 first blocks 602 | src := b[(i%10)*size : (i%10)*size+size] 603 | // Write into the following ones 604 | dst := b[(10+i)*size : (i+10)*size+size] 605 | copy(dst, src) 606 | } 607 | t.ResetTimer() 608 | t.SetBytes(totalinput) 609 | for i := 0; i < t.N; i++ { 610 | out := make(chan dedup.Fragment, 10) 611 | go func() { 612 | for _ = range out { 613 | } 614 | }() 615 | input = bytes.NewBuffer(b) 616 | w, _ := dedup.NewSplitter(out, dedup.ModeDynamicEntropy, size) 617 | io.Copy(w, input) 618 | err := w.Close() 619 | if err != nil { 620 | t.Fatal(err) 621 | } 622 | } 623 | } 624 | 625 | // Maximum block size:4k 626 | func BenchmarkDynamicEntropyFragments4K(t *testing.B) { 627 | const totalinput = 10 << 20 628 | input := getBufferSize(totalinput) 629 | 630 | const size = 4 << 10 631 | b := input.Bytes() 632 | // Create some duplicates 633 | for i := 0; i < 50; i++ { 634 | // Read from 10 first blocks 635 | src := b[(i%10)*size : (i%10)*size+size] 636 | // Write into the following ones 637 | dst := b[(10+i)*size : (i+10)*size+size] 638 | copy(dst, src) 639 | } 640 | t.ResetTimer() 641 | t.SetBytes(totalinput) 642 | for i := 0; i < t.N; i++ { 643 | out := make(chan dedup.Fragment, 10) 644 | go func() { 645 | for _ = range out { 646 | } 647 | }() 648 | input = bytes.NewBuffer(b) 649 | w, _ := dedup.NewSplitter(out, dedup.ModeDynamicEntropy, size) 650 | io.Copy(w, input) 651 | err := w.Close() 652 | if err != nil { 653 | t.Fatal(err) 654 | } 655 | } 656 | } 657 | 658 | // Maximum block size:4k 659 | func BenchmarkDynamicWriter4K(t *testing.B) { 660 | const totalinput = 10 << 20 661 | input := getBufferSize(totalinput) 662 | 663 | const size = 4 << 10 664 | b := input.Bytes() 665 | // Create some duplicates 666 | for i := 0; i < 50; i++ { 667 | // Read from 10 first blocks 668 | src := b[(i%10)*size : (i%10)*size+size] 669 | // Write into the following ones 670 | dst := b[(10+i)*size : (i+10)*size+size] 671 | copy(dst, src) 672 | } 673 | t.ResetTimer() 674 | t.SetBytes(totalinput) 675 | for i := 0; i < t.N; i++ { 676 | input = bytes.NewBuffer(b) 677 | w, _ := dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeDynamic, size, 0) 678 | io.Copy(w, input) 679 | err := w.Close() 680 | if err != nil { 681 | t.Fatal(err) 682 | } 683 | } 684 | } 685 | 686 | func BenchmarkFixedStreamWriter4K(t *testing.B) { 687 | const totalinput = 10 << 20 688 | input := getBufferSize(totalinput) 689 | 690 | const size = 4 << 10 691 | b := input.Bytes() 692 | // Create some duplicates 693 | for i := 0; i < 500; i++ { 694 | // Read from 10 first blocks 695 | src := b[(i%10)*size : (i%10)*size+size] 696 | // Write into the following ones 697 | dst := b[(10+i)*size : (i+10)*size+size] 698 | copy(dst, src) 699 | } 700 | t.ResetTimer() 701 | t.SetBytes(totalinput) 702 | for i := 0; i < t.N; i++ { 703 | input = bytes.NewBuffer(b) 704 | w, _ := dedup.NewStreamWriter(ioutil.Discard, dedup.ModeFixed, size, 10*size) 705 | io.Copy(w, input) 706 | err := w.Close() 707 | if err != nil { 708 | t.Fatal(err) 709 | } 710 | } 711 | } 712 | 713 | // This doesn't actually test anything, but prints probabilities to log 714 | func TestBirthdayProblem(t *testing.T) { 715 | t.Log("Hash size is", dedup.HashSize*8, "bits") 716 | t.Log("1GiB, 1KiB blocks:") 717 | t.Log(dedup.BirthdayProblem((1 << 30) / (1 << 10))) 718 | w, _ := dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeFixed, 1<<10, 0) 
719 | e, _ := w.MemUse(1 << 30) 720 | t.Logf("It will use %d MiB for encoder.", e>>20) 721 | 722 | t.Log("1TiB, 4KiB blocks:") 723 | t.Log(dedup.BirthdayProblem((1 << 40) / (4 << 10))) 724 | w, _ = dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeFixed, 4<<10, 0) 725 | e, _ = w.MemUse(1 << 40) 726 | t.Logf("It will use %d MiB for encoder.", e>>20) 727 | 728 | t.Log("1PiB, 4KiB blocks:") 729 | t.Log(dedup.BirthdayProblem((1 << 50) / (4 << 10))) 730 | e, _ = w.MemUse(1 << 50) 731 | t.Logf("It will use %d MiB for encoder.", e>>20) 732 | 733 | t.Log("1EiB, 64KiB blocks:") 734 | t.Log(dedup.BirthdayProblem((1 << 60) / (64 << 10))) 735 | w, _ = dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeFixed, 64<<10, 0) 736 | e, _ = w.MemUse(1 << 60) 737 | t.Logf("It will use %d MiB for encoder.", e>>20) 738 | 739 | t.Log("1EiB, 1KiB blocks:") 740 | t.Log(dedup.BirthdayProblem((1 << 60) / (1 << 10))) 741 | w, _ = dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeFixed, 1<<10, 0) 742 | e, _ = w.MemUse(1 << 60) 743 | t.Logf("It will use %d MiB for encoder.", e>>20) 744 | } 745 | 746 | // This will deduplicate a buffer of zeros to an indexed stream 747 | func ExampleNewWriter() { 748 | // We will write to these 749 | idx := bytes.Buffer{} 750 | data := bytes.Buffer{} 751 | 752 | // This is our input: 753 | input := bytes.NewBuffer(make([]byte, 50000)) 754 | 755 | // Create a new writer, with each block being 1000 bytes 756 | w, err := dedup.NewWriter(&idx, &data, dedup.ModeFixed, 1000, 0) 757 | if err != nil { 758 | panic(err) 759 | } 760 | 761 | // Copy our input to the writer. 762 | io.Copy(w, input) 763 | 764 | // Close the writer 765 | err = w.Close() 766 | if err != nil { 767 | panic(err) 768 | } 769 | 770 | // Let us inspect what was written: 771 | fmt.Println("Blocks:", w.Blocks()) 772 | fmt.Println("Index size:", idx.Len()) 773 | fmt.Println("Data size:", data.Len()) 774 | 775 | // OUTPUT: Blocks: 50 776 | // Index size: 67 777 | // Data size: 1000 778 | } 779 | 780 | // This will deduplicate a buffer of zeros to an non-indexed stream 781 | func ExampleNewStreamWriter() { 782 | // We will write to this 783 | data := bytes.Buffer{} 784 | 785 | // This is our input: 786 | input := bytes.NewBuffer(make([]byte, 50000)) 787 | 788 | // Create a new writer, with each block being 1000 bytes, 789 | // And allow it to use 10000 bytes of memory 790 | w, err := dedup.NewStreamWriter(&data, dedup.ModeFixed, 1000, 10000) 791 | if err != nil { 792 | panic(err) 793 | } 794 | // Copy our input to the writer. 795 | io.Copy(w, input) 796 | 797 | // Close the writer 798 | err = w.Close() 799 | if err != nil { 800 | panic(err) 801 | } 802 | 803 | // Let us inspect what was written: 804 | fmt.Println("Blocks:", w.Blocks()) 805 | fmt.Println("Data size:", data.Len()) 806 | 807 | // OUTPUT: Blocks: 50 808 | // Data size: 1068 809 | } 810 | 811 | // This will deduplicate a buffer of zeros, 812 | // and return each block on a channel in order. 813 | func ExampleNewSplitter() { 814 | // We will write to this 815 | // We set a small buffer 816 | out := make(chan dedup.Fragment, 10) 817 | 818 | // This will consume our blocks as they are returned 819 | // and send information about what was received. 
820 | info := make(chan int, 0) 821 | go func() { 822 | n := 0 823 | size := 0 824 | for f := range out { 825 | n++ 826 | if f.New { 827 | size += len(f.Payload) 828 | } 829 | } 830 | info <- n 831 | info <- size 832 | }() 833 | 834 | // This is our input: 835 | input := bytes.NewBuffer(make([]byte, 50050)) 836 | 837 | // Create a new writer, with each block being 1000 bytes, 838 | w, err := dedup.NewSplitter(out, dedup.ModeFixed, 1000) 839 | if err != nil { 840 | panic(err) 841 | } 842 | // Copy our input to the writer. 843 | io.Copy(w, input) 844 | 845 | // Close the writer 846 | err = w.Close() 847 | if err != nil { 848 | panic(err) 849 | } 850 | 851 | // Let us inspect what was written: 852 | fmt.Println("Blocks:", <-info) 853 | // Size of one (repeated) block + 50 bytes for last. 854 | fmt.Println("Data size:", <-info) 855 | 856 | // OUTPUT: Blocks: 51 857 | // Data size: 1050 858 | } 859 | 860 | // This will deduplicate a file 861 | // and return each block on a channel in order. 862 | func ExampleNewSplitter_file() { 863 | // Our input 864 | f, _ := os.Open("testdata/sampledata.zip") 865 | defer f.Close() 866 | 867 | // We will receive fragments on this channel 868 | ch := make(chan dedup.Fragment, 10) 869 | 870 | var wg sync.WaitGroup 871 | wg.Add(1) 872 | 873 | // Start a goroutine that will consume the fragments 874 | go func() { 875 | defer wg.Done() 876 | for { 877 | select { 878 | case f, ok := <-ch: 879 | if !ok { 880 | return 881 | } 882 | if f.New { 883 | fmt.Printf("Got NEW fragment #%d, size %d, hash:%s\n", f.N, len(f.Payload), hex.EncodeToString(f.Hash[:])) 884 | // Insert payload into data store 885 | } else { 886 | fmt.Printf("Got OLD fragment #%d, size %d, hash:%s\n", f.N, len(f.Payload), hex.EncodeToString(f.Hash[:])) 887 | } 888 | // Add hash to list of hashes required to reconstruct the file. 889 | } 890 | } 891 | }() 892 | 893 | // Create a dynamic splitter with average size of 1024 bytes. 894 | w, _ := dedup.NewSplitter(ch, dedup.ModeDynamic, 4*1024) 895 | 896 | // Copy data to the splitter 897 | _, _ = io.Copy(w, f) 898 | 899 | // Flush the remaining fragments 900 | _ = w.Close() 901 | 902 | // Wait for input to be received. 903 | wg.Wait() 904 | 905 | // OUTPUT: 906 | // Got NEW fragment #0, size 893, hash:7f8455127e82f90ea7e97716ccaefa9317279b4b 907 | // Got NEW fragment #1, size 559, hash:b554708bbfda24f1eb8fcd75a155d23bd36939d3 908 | // Got NEW fragment #2, size 3482, hash:59bca870477e14e97ae8650e74ef52abcb6340e8 909 | // Got NEW fragment #3, size 165, hash:6fb05a63e28a1bb2e880e051940f517115e7b16c 910 | // Got NEW fragment #4, size 852, hash:6671826ffff6edd32951a0e774efccb5101ba629 911 | // Got NEW fragment #5, size 3759, hash:0fae545a20195720d8e9bb9540069418d7db0873 912 | // Got OLD fragment #6, size 3482, hash:59bca870477e14e97ae8650e74ef52abcb6340e8 913 | // Got OLD fragment #7, size 165, hash:6fb05a63e28a1bb2e880e051940f517115e7b16c 914 | // Got OLD fragment #8, size 852, hash:6671826ffff6edd32951a0e774efccb5101ba629 915 | // Got NEW fragment #9, size 2380, hash:1507aa13e215517ce982b9235a0221018128ed4e 916 | // Got NEW fragment #10, size 71, hash:f262fcf4af26ee75ff3045db2af21f2acca235cd 917 | } 918 | 919 | // This will deduplicate a file 920 | // and return each block on a channel in order. 
921 | func ExampleNewSplitter_entropy() { 922 | // Our input 923 | f, _ := os.Open("testdata/sampledata.zip") 924 | defer f.Close() 925 | 926 | // We will receive fragments on this channel 927 | ch := make(chan dedup.Fragment, 10) 928 | 929 | var wg sync.WaitGroup 930 | wg.Add(1) 931 | 932 | // Start a goroutine that will consume the fragments 933 | go func() { 934 | defer wg.Done() 935 | for { 936 | select { 937 | case f, ok := <-ch: 938 | if !ok { 939 | return 940 | } 941 | if f.New { 942 | fmt.Printf("Got NEW fragment #%d, size %d, hash:%s\n", f.N, len(f.Payload), hex.EncodeToString(f.Hash[:])) 943 | // Insert payload into data store 944 | } else { 945 | fmt.Printf("Got OLD fragment #%d, size %d, hash:%s\n", f.N, len(f.Payload), hex.EncodeToString(f.Hash[:])) 946 | } 947 | // Add hash to list of hashes required to reconstruct the file. 948 | } 949 | } 950 | }() 951 | 952 | // Create a dynamic splitter with average size of 1024 bytes. 953 | w, _ := dedup.NewSplitter(ch, dedup.ModeDynamicEntropy, 4*1024) 954 | 955 | // Copy data to the splitter 956 | _, _ = io.Copy(w, f) 957 | 958 | // Flush the remaining fragments 959 | _ = w.Close() 960 | 961 | // Wait for input to be received. 962 | wg.Wait() 963 | 964 | // OUTPUT: 965 | //Got NEW fragment #0, size 521, hash:0c5989843e85f31aed26f249bd203240dd72f77a 966 | //Got NEW fragment #1, size 1563, hash:308ff2e0b4776c2a08fe549422c7ebfbf646bb22 967 | //Got NEW fragment #2, size 919, hash:9d68759ef33ae919b656faf52bb1177e803f810b 968 | //Got NEW fragment #3, size 1326, hash:c272c26dff010417ca2120a8e82addfdadb4efeb 969 | //Got NEW fragment #4, size 1284, hash:9bbe891ccb1b141e0e122110e730e8df9743331e 970 | //Got NEW fragment #5, size 1220, hash:5019f56fa9395060fbe2e957ad518a35cd667f9b 971 | //Got NEW fragment #6, size 3509, hash:e0d7c8acfdd5b399a92b5e495a0794ffa842ee73 972 | //Got OLD fragment #7, size 919, hash:9d68759ef33ae919b656faf52bb1177e803f810b 973 | //Got OLD fragment #8, size 1326, hash:c272c26dff010417ca2120a8e82addfdadb4efeb 974 | //Got OLD fragment #9, size 1284, hash:9bbe891ccb1b141e0e122110e730e8df9743331e 975 | //Got OLD fragment #10, size 1220, hash:5019f56fa9395060fbe2e957ad518a35cd667f9b 976 | //Got NEW fragment #11, size 1569, hash:5ae2760535662c13b336d1ae4a0a7fdcba789d83 977 | } 978 | 979 | // This example will show how to write data to two files. 980 | // Running this example will deduplicate an empty byte slice 981 | // of 500000 bytes into an 'output.data' and 'output.idx' file. 982 | // 983 | // In the real world, you would likely want to add a bufio.NewWriter 984 | // to the output, but to keep it simple, we don't do that here. 985 | func ExampleNewWriter_file() { 986 | data, err := os.Create("output.data") 987 | if err != nil { 988 | panic(err) 989 | } 990 | // Close, print stats and remove it 991 | defer func() { 992 | data.Close() 993 | stat, _ := os.Stat("output.data") 994 | fmt.Println("Data size:", stat.Size()) 995 | os.Remove("output.data") 996 | }() 997 | 998 | idx, err := os.Create("output.idx") 999 | if err != nil { 1000 | panic(err) 1001 | } 1002 | // Close, print stats and remove it 1003 | defer func() { 1004 | idx.Close() 1005 | stat, _ := os.Stat("output.idx") 1006 | fmt.Println("Index size:", stat.Size()) 1007 | os.Remove("output.idx") 1008 | }() 1009 | 1010 | // This is our input: 1011 | input := bytes.NewBuffer(make([]byte, 500000)) 1012 | 1013 | // Create a new writer, with each block being 1000 bytes fixed size. 
1014 | 	w, err := dedup.NewWriter(idx, data, dedup.ModeFixed, 1000, 0)
1015 | 	if err != nil {
1016 | 		panic(err)
1017 | 	}
1018 | 	defer w.Close()
1019 | 
1020 | 	// Copy our input to the writer.
1021 | 	io.Copy(w, input)
1022 | 
1023 | 	// Print the number of blocks written
1024 | 	fmt.Println("Blocks:", w.Blocks())
1025 | 
1026 | 	// OUTPUT: Blocks: 500
1027 | 	// Index size: 517
1028 | 	// Data size: 1000
1029 | }
1030 | 
1031 | // This will deduplicate a buffer of zeros to a non-indexed stream
1032 | // written to a file.
1033 | // It is not recommended to use a single stream when you are writing to
1034 | // a file; prefer the indexed format shown above.
1035 | func ExampleNewStreamWriter_file() {
1036 | 	// We will write to this
1037 | 	data, err := os.Create("outputstream.data")
1038 | 	if err != nil {
1039 | 		panic(err)
1040 | 	}
1041 | 	// Close, print stats and remove it
1042 | 	defer func() {
1043 | 		data.Close()
1044 | 		stat, _ := os.Stat("outputstream.data")
1045 | 		fmt.Println("Stream size:", stat.Size())
1046 | 		os.Remove("outputstream.data")
1047 | 	}()
1048 | 
1049 | 	// This is our input:
1050 | 	input := bytes.NewBuffer(make([]byte, 500000))
1051 | 
1052 | 	// Create a new writer, with each block being 1000 bytes,
1053 | 	// and allow it to use 10000 bytes of memory
1054 | 	w, err := dedup.NewStreamWriter(data, dedup.ModeFixed, 1000, 10000)
1055 | 	if err != nil {
1056 | 		panic(err)
1057 | 	}
1058 | 	defer w.Close()
1059 | 
1060 | 	// Copy our input to the writer.
1061 | 	io.Copy(w, input)
1062 | 
1063 | 	// Print the number of blocks written
1064 | 	fmt.Println("Blocks:", w.Blocks())
1065 | 
1066 | 	// OUTPUT: Blocks: 500
1067 | 	// Stream size: 1518
1068 | }
1069 | 
1070 | // This shows an example of a birthday problem calculation.
1071 | // We calculate the probability of a collision of SHA-1 hashes
1072 | // on 1 Terabyte of data, using 1 Kilobyte blocks.
1073 | // With SHA-1, that gives a 1 in 2535301202817642046627252275200 chance
1074 | // of a collision occurring.
1075 | func ExampleBirthdayProblem() {
1076 | 	fmt.Println("Hash size is", dedup.HashSize*8, "bits")
1077 | 	fmt.Println("1TiB, 1KiB blocks:")
1078 | 	fmt.Println(dedup.BirthdayProblem((1 << 40) / (1 << 10)))
1079 | 	// Output: Hash size is 160 bits
1080 | 	// 1TiB, 1KiB blocks:
1081 | 	// Collision probability is ~ 1/2535301202817642046627252275200 ~ 3.944304522431639e-31
1082 | }
1083 | --------------------------------------------------------------------------------
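
For readers who want to check the figure printed by `ExampleBirthdayProblem` above, it agrees with the standard birthday-bound approximation for hashing `n` blocks into a 160-bit space. The derivation below is a sketch of that standard approximation, not a statement of the exact arithmetic inside `dedup.BirthdayProblem`:

```latex
p \approx \frac{n(n-1)}{2 \cdot 2^{160}}, \qquad
n = \frac{2^{40}}{2^{10}} = 2^{30} \text{ blocks (1 TiB in 1 KiB blocks)}
```

```latex
p \approx \frac{2^{30}\,(2^{30}-1)}{2^{161}}
  \approx 3.944 \times 10^{-31}
  \approx \frac{1}{2.535 \times 10^{30}}
```

Taking the reciprocal of this approximation reproduces the "1 in 2535301202817642046627252275200" figure in the example output to the precision shown.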
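
`TestBirthdayProblem` above also uses `Writer.MemUse` to report how much memory the encoder would need for a given input size at different block sizes. The sketch below shows how those estimates could be compared up front when picking a fixed block size for a large input. It is illustrative only: it assumes the package imports as `github.com/klauspost/dedup`, discards the second return value of `MemUse` exactly as the test does, and (like the test) assumes a 64-bit platform for the large size constants. Smaller blocks deduplicate better but cost more encoder memory.

```Go
package main

import (
	"fmt"
	"io/ioutil"

	"github.com/klauspost/dedup"
)

func main() {
	const input = 1 << 40 // planning for roughly 1 TiB of input

	// 1 KiB fixed blocks: finest deduplication, highest memory use.
	// The writer is only used to obtain the estimate, as in TestBirthdayProblem.
	w, err := dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeFixed, 1<<10, 0)
	if err != nil {
		panic(err)
	}
	enc, _ := w.MemUse(input)
	fmt.Printf("1 KiB blocks:  ~%d MiB encoder memory\n", enc>>20)

	// 4 KiB fixed blocks.
	w, err = dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeFixed, 4<<10, 0)
	if err != nil {
		panic(err)
	}
	enc, _ = w.MemUse(input)
	fmt.Printf("4 KiB blocks:  ~%d MiB encoder memory\n", enc>>20)

	// 64 KiB fixed blocks: least memory, coarsest deduplication.
	w, err = dedup.NewWriter(ioutil.Discard, ioutil.Discard, dedup.ModeFixed, 64<<10, 0)
	if err != nil {
		panic(err)
	}
	enc, _ = w.MemUse(input)
	fmt.Printf("64 KiB blocks: ~%d MiB encoder memory\n", enc>>20)
}
```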
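
The two `NewSplitter` examples above receive `dedup.Fragment` values and only hint, in comments, at what a consumer does with them ("Insert payload into data store", "Add hash to list of hashes required to reconstruct the file"). The standalone sketch below fills in that hint under stated assumptions: it assumes the import path `github.com/klauspost/dedup`, relies on `Close` closing the output channel (as the channel range in `ExampleNewSplitter` does), and uses a plain map plus a slice as stand-ins for a real data store; `store` and `order` are illustrative names, not part of the package.

```Go
package main

import (
	"bytes"
	"encoding/hex"
	"fmt"
	"io"
	"sync"

	"github.com/klauspost/dedup"
)

func main() {
	// The splitter will deliver fragments on this channel.
	ch := make(chan dedup.Fragment, 10)

	// Illustrative content-addressed store: unique payloads keyed by the hex
	// of their hash, plus the ordered hash list needed to rebuild the input.
	store := make(map[string][]byte)
	var order []string

	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		for f := range ch {
			key := hex.EncodeToString(f.Hash[:])
			if f.New {
				// Only fragments marked New carry data we have not stored yet.
				store[key] = append([]byte(nil), f.Payload...)
			}
			// Every fragment, new or deduplicated, is part of the rebuild order.
			order = append(order, key)
		}
	}()

	// Split 50000 zero bytes into fixed 1000-byte blocks.
	w, err := dedup.NewSplitter(ch, dedup.ModeFixed, 1000)
	if err != nil {
		panic(err)
	}
	io.Copy(w, bytes.NewBuffer(make([]byte, 50000)))
	if err := w.Close(); err != nil {
		panic(err)
	}
	wg.Wait()

	// Replaying the hash order against the store reconstructs the input.
	var rebuilt bytes.Buffer
	for _, key := range order {
		rebuilt.Write(store[key])
	}
	fmt.Println("unique payloads:", len(store), "fragments:", len(order), "rebuilt bytes:", rebuilt.Len())
	// With this all-zero input it should print:
	// unique payloads: 1 fragments: 50 rebuilt bytes: 50000
}
```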