├── .travis.yml
├── LICENSE
├── README.md
├── gen_test.go
└── rsync.go
/.travis.yml:
--------------------------------------------------------------------------------
1 |
2 | language: go
3 | go: 1.5.1
4 |
5 | os:
6 | - linux
7 | - osx
8 |
9 | install:
10 | - go get -d -v ./...
11 |
12 | script:
13 | - go test -v
14 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2012 Daniel Theophanes
2 | Copyright (c) 2015 Amos Wenger
3 |
4 | This software is provided 'as-is', without any express or implied
5 | warranty. In no event will the authors be held liable for any damages
6 | arising from the use of this software.
7 |
8 | Permission is granted to anyone to use this software for any purpose,
9 | including commercial applications, and to alter it and redistribute it
10 | freely, subject to the following restrictions:
11 |
12 | 1. The origin of this software must not be misrepresented; you must not
13 | claim that you wrote the original software. If you use this software
14 | in a product, an acknowledgment in the product documentation would be
15 | appreciated but is not required.
16 |
17 | 2. Altered source versions must be plainly marked as such, and must not be
18 | misrepresented as being the original software.
19 |
20 | 3. This notice may not be removed or altered from any source
21 | distribution.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # rsync-go
2 |
3 | 
4 | [](https://travis-ci.org/itchio/rsync-go)
5 | [](https://godoc.org/github.com/itchio/rsync-go)
6 |
7 | This is a pure go implementation of the rsync algorithm.
8 |
This repo in particular is a fork of Daniel Theophanes's implementation
(see the [Links](#links) section for details)
11 |
12 | ### Usage
13 |
14 | Here's a simple example (without error checking):
15 |
16 | ```go
import (
	"os"

	"gopkg.in/itchio/go-rsync.v0"
)
23 |
func main() {
	// Sign the "old" file; the signature lets CreateDelta find reusable blocks.
	targetReader, _ := os.Open("content-v1.bin")

	rs := &rsync.RSync{}

	// here we store the whole signature in a byte slice,
	// but it could just as well be sent over a network connection for example
	sig := make([]rsync.BlockHash, 0, 10)
	writeSignature := func(bl rsync.BlockHash) error {
		sig = append(sig, bl)
		return nil
	}

	rs.CreateSignature(targetReader, writeSignature)

	// Compute the operations that turn the old content into the new content.
	srcReader, _ := os.Open("content-v2.bin")
	defer srcReader.Close()

	opsOut := make(chan rsync.Operation)
	writeOperation := func(op rsync.Operation) error {
		opsOut <- op
		return nil
	}

	// A single RSync must not be used concurrently, so the delta side gets its own.
	rsDelta := &rsync.RSync{}
	go func() {
		defer close(opsOut)
		rsDelta.CreateDelta(srcReader, sig, writeOperation)
	}()

	// Rebuild the new content from the old file plus the delta operations.
	srcWriter, _ := os.Create("content-v2-reconstructed.bin")
	targetReader.Seek(0, os.SEEK_SET)

	rs.ApplyDelta(srcWriter, targetReader, opsOut)
}
58 | ```
59 |
60 | Import
61 |
62 | Creating a signature:
63 |
64 | ### Links
65 |
66 | * original repo:
* paper behind the rsync algorithm: <http://www.samba.org/~tridge/phd_thesis.pdf>
68 |
--------------------------------------------------------------------------------
/gen_test.go:
--------------------------------------------------------------------------------
1 | package rsync
2 |
3 | import (
4 | "bytes"
5 | "math/rand"
6 | "testing"
7 | )
8 |
9 | type RandReader struct {
10 | rand.Source
11 | }
12 |
13 | func (rr RandReader) Read(sink []byte) (int, error) {
14 | var tail, head int
15 | buf := make([]byte, 8)
16 | var r uint64
17 | for {
18 | head = min(tail+8, len(sink))
19 | if tail == head {
20 | return head, nil
21 | }
22 |
23 | r = (uint64)(rr.Int63())
24 | buf[0] = (byte)(r)
25 | buf[1] = (byte)(r >> 8)
26 | buf[2] = (byte)(r >> 16)
27 | buf[3] = (byte)(r >> 24)
28 | buf[4] = (byte)(r >> 32)
29 | buf[5] = (byte)(r >> 40)
30 | buf[6] = (byte)(r >> 48)
31 | buf[7] = (byte)(r >> 56)
32 |
33 | tail += copy(sink[tail:head], buf)
34 | }
35 | }
36 |
// pair describes one test scenario: a Source (the desired final content)
// and a Target (the starting content), plus a human-readable description
// used in test log and failure messages.
type pair struct {
	Source, Target content
	Description    string
}

// content specifies deterministically generated test data.
type content struct {
	Len   int    // number of bytes to generate
	Seed  int64  // seed for the deterministic generator
	Alter int    // how many randomly chosen bytes to perturb (0 = leave exact)
	Data  []byte // populated by Fill
}
47 |
48 | func (c *content) Fill() {
49 | c.Data = make([]byte, c.Len)
50 | src := rand.NewSource(c.Seed)
51 | RandReader{src}.Read(c.Data)
52 |
53 | if c.Alter > 0 {
54 | r := rand.New(src)
55 | for i := 0; i < c.Alter; i++ {
56 | at := r.Intn(len(c.Data))
57 | c.Data[at] += byte(r.Int())
58 | }
59 | }
60 | }
61 |
// Test_GenData runs the complete signature → delta → apply pipeline over a
// set of deterministic pseudo-random content pairs and checks that the
// reconstructed output is byte-for-byte identical to the source.
func Test_GenData(t *testing.T) {
	// Use a seeded generator to get consistent results.
	// This allows testing the package without bundling many test files.

	var pairs = []pair{
		pair{
			Source:      content{Len: 512*1024 + 89, Seed: 42, Alter: 0},
			Target:      content{Len: 512*1024 + 89, Seed: 42, Alter: 5},
			Description: "Same length, slightly different content.",
		},
		pair{
			Source:      content{Len: 512*1024 + 89, Seed: 9824, Alter: 0},
			Target:      content{Len: 512*1024 + 89, Seed: 2345, Alter: 0},
			Description: "Same length, very different content.",
		},
		pair{
			Source:      content{Len: 512*1024 + 89, Seed: 42, Alter: 0},
			Target:      content{Len: 256*1024 + 19, Seed: 42, Alter: 0},
			Description: "Target shorter then source, same content.",
		},
		pair{
			Source:      content{Len: 512*1024 + 89, Seed: 42, Alter: 0},
			Target:      content{Len: 256*1024 + 19, Seed: 42, Alter: 5},
			Description: "Target shorter then source, slightly different content.",
		},
		pair{
			Source:      content{Len: 256*1024 + 19, Seed: 42, Alter: 0},
			Target:      content{Len: 512*1024 + 89, Seed: 42, Alter: 0},
			Description: "Source shorter then target, same content.",
		},
		pair{
			Source:      content{Len: 512*1024 + 89, Seed: 42, Alter: 5},
			Target:      content{Len: 256*1024 + 19, Seed: 42, Alter: 0},
			Description: "Source shorter then target, slightly different content.",
		},
		pair{
			Source:      content{Len: 512*1024 + 89, Seed: 42, Alter: 0},
			Target:      content{Len: 0, Seed: 42, Alter: 0},
			Description: "Target empty and source has content.",
		},
		pair{
			Source:      content{Len: 0, Seed: 42, Alter: 0},
			Target:      content{Len: 512*1024 + 89, Seed: 42, Alter: 0},
			Description: "Source empty and target has content.",
		},
		pair{
			Source:      content{Len: 872, Seed: 9824, Alter: 0},
			Target:      content{Len: 235, Seed: 2345, Alter: 0},
			Description: "Source and target both smaller then a block size.",
		},
	}
	// Two RSync instances: CreateDelta runs concurrently with ApplyDelta
	// below, and a single RSync must not be shared across goroutines.
	rs := &RSync{}
	rsDelta := &RSync{}
	for _, p := range pairs {
		// Generate this scenario's content (p is a copy; Data is released
		// again at the bottom of the loop).
		(&p.Source).Fill()
		(&p.Target).Fill()

		sourceBuffer := bytes.NewReader(p.Source.Data)
		targetBuffer := bytes.NewReader(p.Target.Data)

		// Build the signature of the target (the "old" content).
		sig := make([]BlockHash, 0, 10)
		err := rs.CreateSignature(targetBuffer, func(bl BlockHash) error {
			sig = append(sig, bl)
			return nil
		})
		if err != nil {
			t.Errorf("Failed to create signature: %s", err)
		}
		opsOut := make(chan Operation)
		go func() {
			// NOTE: the local `bytes` counter shadows the bytes package
			// within this goroutine.
			var blockCt, blockRangeCt, dataCt, bytes int
			defer close(opsOut)
			err := rsDelta.CreateDelta(sourceBuffer, sig, func(op Operation) error {
				switch op.Type {
				case OpBlockRange:
					blockRangeCt++
				case OpBlock:
					blockCt++
				case OpData:
					// Copy data buffer so it may be reused in internal buffer.
					b := make([]byte, len(op.Data))
					copy(b, op.Data)
					op.Data = b
					dataCt++
					bytes += len(op.Data)
				}
				opsOut <- op
				return nil
			})
			t.Logf("Range Ops:%5d, Block Ops:%5d, Data Ops: %5d, Data Len: %5dKiB, For %s.", blockRangeCt, blockCt, dataCt, bytes/1024, p.Description)
			if err != nil {
				t.Errorf("Failed to create delta: %s", err)
			}
		}()

		result := new(bytes.Buffer)

		// Rewind the target and rebuild the source from target + delta ops.
		targetBuffer.Seek(0, 0)
		err = rs.ApplyDelta(result, targetBuffer, opsOut)
		if err != nil {
			t.Errorf("Failed to apply delta: %s", err)
		}

		if result.Len() != len(p.Source.Data) {
			t.Errorf("Result not same size as source: %s", p.Description)
		} else if bytes.Equal(result.Bytes(), p.Source.Data) == false {
			t.Errorf("Result is different from the source: %s", p.Description)
		}

		// Release the generated data before the next scenario.
		p.Source.Data = nil
		p.Target.Data = nil
	}
}
175 |
--------------------------------------------------------------------------------
/rsync.go:
--------------------------------------------------------------------------------
1 | // RSync/RDiff implementation.
2 | //
3 | // Algorithm found at: http://www.samba.org/~tridge/phd_thesis.pdf
4 | //
5 | // Definitions
6 | // Source: The final content.
7 | // Target: The content to be made into final content.
8 | // Signature: The sequence of hashes used to identify the content.
9 | package rsync
10 |
11 | import (
12 | "bytes"
13 | "crypto/md5"
14 | "hash"
15 | "io"
16 | )
17 |
// If no BlockSize is specified in the RSync instance, this value is used.
const DefaultBlockSize = 1024 * 6

// If no MaxDataOp is specified, this bounds the payload size of a single
// OpData operation produced by CreateDelta.
const DefaultMaxDataOp = DefaultBlockSize * 10

// Internal constant used in rolling checksum.
const _M = 1 << 16

// Operation Types.
type OpType byte

const (
	OpBlock      OpType = iota // copy one target block, identified by BlockIndex
	OpData                     // write the literal bytes in Data
	OpHash                     // reserved; not produced by CreateDelta nor handled by ApplyDelta in this file
	OpBlockRange               // copy target blocks BlockIndex through BlockIndexEnd (inclusive)
)

// Instruction to mutate target to align to source.
type Operation struct {
	Type          OpType
	BlockIndex    uint64 // first (or only) target block to copy
	BlockIndexEnd uint64 // last block of an OpBlockRange (inclusive)
	Data          []byte // literal bytes for OpData; the producer may reuse this buffer
}

// Signature hash item generated from target.
type BlockHash struct {
	Index      uint64 // position of the block within the target
	StrongHash []byte // collision-resistant hash from UniqueHasher (MD5 by default)
	WeakHash   uint32 // rolling checksum produced by βhash
}

// Write signatures as they are generated.
type SignatureWriter func(bl BlockHash) error

// Write operations as they are generated.
type OperationWriter func(op Operation) error

// Properties to use while working with the rsync algorithm.
// A single RSync should not be used concurrently as it may contain
// internal buffers and hash sums.
type RSync struct {
	BlockSize int // block granularity; DefaultBlockSize is used when <= 0
	MaxDataOp int // max bytes per OpData; DefaultMaxDataOp is used when <= 0

	// If this is nil an MD5 hash is used.
	UniqueHasher hash.Hash

	buffer []byte // scratch buffer reused across calls
}
66 |
67 | // If the target length is known the number of hashes in the
68 | // signature can be determined.
69 | func (r *RSync) BlockHashCount(targetLength int) (count int) {
70 | if r.BlockSize <= 0 {
71 | r.BlockSize = DefaultBlockSize
72 | }
73 | count = (targetLength / r.BlockSize)
74 | if targetLength%r.BlockSize != 0 {
75 | count++
76 | }
77 | return
78 | }
79 |
80 | // Calculate the signature of target.
81 | func (r *RSync) CreateSignature(target io.Reader, sw SignatureWriter) error {
82 | if r.BlockSize <= 0 {
83 | r.BlockSize = DefaultBlockSize
84 | }
85 | if r.UniqueHasher == nil {
86 | r.UniqueHasher = md5.New()
87 | }
88 | var err error
89 | var n int
90 |
91 | minBufferSize := r.BlockSize
92 | if len(r.buffer) < minBufferSize {
93 | r.buffer = make([]byte, minBufferSize)
94 | }
95 | buffer := r.buffer
96 |
97 | var block []byte
98 | loop := true
99 | var index uint64
100 | for loop {
101 | n, err = io.ReadAtLeast(target, buffer, r.BlockSize)
102 | if err != nil {
103 | // n == 0.
104 | if err == io.EOF {
105 | return nil
106 | }
107 | if err != io.ErrUnexpectedEOF {
108 | return err
109 | }
110 | // n > 0.
111 | loop = false
112 | }
113 | block = buffer[:n]
114 | weak, _, _ := βhash(block)
115 | err = sw(BlockHash{StrongHash: r.uniqueHash(block), WeakHash: weak, Index: index})
116 | if err != nil {
117 | return err
118 | }
119 | index++
120 | }
121 | return nil
122 | }
123 |
124 | // Apply the difference to the target.
125 | func (r *RSync) ApplyDelta(alignedTarget io.Writer, target io.ReadSeeker, ops chan Operation) error {
126 | if r.BlockSize <= 0 {
127 | r.BlockSize = DefaultBlockSize
128 | }
129 | var err error
130 | var n int
131 | var block []byte
132 |
133 | minBufferSize := r.BlockSize
134 | if len(r.buffer) < minBufferSize {
135 | r.buffer = make([]byte, minBufferSize)
136 | }
137 | buffer := r.buffer
138 |
139 | writeBlock := func(op Operation) error {
140 | target.Seek(int64(r.BlockSize*int(op.BlockIndex)), 0)
141 | n, err = io.ReadAtLeast(target, buffer, r.BlockSize)
142 | if err != nil {
143 | if err != io.ErrUnexpectedEOF {
144 | return err
145 | }
146 | }
147 | block = buffer[:n]
148 | _, err = alignedTarget.Write(block)
149 | if err != nil {
150 | return err
151 | }
152 | return nil
153 | }
154 |
155 | for op := range ops {
156 | switch op.Type {
157 | case OpBlockRange:
158 | for i := op.BlockIndex; i <= op.BlockIndexEnd; i++ {
159 | err = writeBlock(Operation{
160 | Type: OpBlock,
161 | BlockIndex: i,
162 | })
163 | if err != nil {
164 | if err == io.EOF {
165 | break
166 | }
167 | return err
168 | }
169 | }
170 | case OpBlock:
171 | err = writeBlock(op)
172 | if err != nil {
173 | if err == io.EOF {
174 | break
175 | }
176 | return err
177 | }
178 | case OpData:
179 | _, err = alignedTarget.Write(op.Data)
180 | if err != nil {
181 | return err
182 | }
183 | }
184 | }
185 | return nil
186 | }
187 |
// Create the operation list to mutate the target signature into the source.
// Any data operation passed to the OperationWriter must have its data copied
// out within the span of the callback: the buffer underlying the operation
// data is reused.
func (r *RSync) CreateDelta(source io.Reader, signature []BlockHash, ops OperationWriter) (err error) {
	if r.BlockSize <= 0 {
		r.BlockSize = DefaultBlockSize
	}
	if r.MaxDataOp <= 0 {
		r.MaxDataOp = DefaultMaxDataOp
	}
	if r.UniqueHasher == nil {
		r.UniqueHasher = md5.New()
	}
	// Room for two full blocks plus the largest pending data run.
	minBufferSize := (r.BlockSize * 2) + (r.MaxDataOp)
	if len(r.buffer) < minBufferSize {
		r.buffer = make([]byte, minBufferSize)
	}
	buffer := r.buffer

	// A single β hash may correlate with many unique hashes.
	hashLookup := make(map[uint32][]BlockHash, len(signature))
	for _, h := range signature {
		key := h.WeakHash
		hashLookup[key] = append(hashLookup[key], h)
	}

	// section describes a half-open [tail, head) window into buffer.
	type section struct {
		tail int
		head int
	}

	// data: pending literal bytes not yet emitted; sum: rolling-hash window.
	var data, sum section
	var n, validTo int
	var αPop, αPush, β, β1, β2 uint32
	var blockIndex uint64
	var rolling, lastRun, foundHash bool

	// Store the previous non-data operation for combining.
	var prevOp *Operation

	// Send the last operation if there is one waiting.
	// NOTE(review): this unconditionally assigns to err, so it can overwrite
	// an earlier return error with the result of ops — confirm intended.
	defer func() {
		if prevOp == nil {
			return
		}
		err = ops(*prevOp)
		prevOp = nil
	}()

	// Combine OpBlock into OpBlockRange. To do this store the previous
	// non-data operation and determine if it can be extended.
	enqueue := func(op Operation) (err error) {
		switch op.Type {
		case OpBlock:
			if prevOp != nil {
				switch prevOp.Type {
				case OpBlock:
					// Two adjacent blocks become a range.
					if prevOp.BlockIndex+1 == op.BlockIndex {
						prevOp = &Operation{
							Type:          OpBlockRange,
							BlockIndex:    prevOp.BlockIndex,
							BlockIndexEnd: op.BlockIndex,
						}
						return
					}
				case OpBlockRange:
					// Extend an existing range by one block.
					if prevOp.BlockIndexEnd+1 == op.BlockIndex {
						prevOp.BlockIndexEnd = op.BlockIndex
						return
					}
				}
				err = ops(*prevOp)
				if err != nil {
					return
				}
				prevOp = nil
			}
			prevOp = &op
		case OpData:
			// Never save a data operation, as it would corrupt the buffer.
			if prevOp != nil {
				err = ops(*prevOp)
				if err != nil {
					return
				}
			}
			err = ops(op)
			if err != nil {
				return
			}
			prevOp = nil
		}
		return
	}

	for !lastRun {
		// Determine if the buffer should be extended.
		if sum.tail+r.BlockSize > validTo {
			// Determine if the buffer should be wrapped.
			if validTo+r.BlockSize > len(buffer) {
				// Before wrapping the buffer, send any trailing data off.
				if data.tail < data.head {
					err = enqueue(Operation{Type: OpData, Data: buffer[data.tail:data.head]})
					if err != nil {
						return err
					}
				}
				// Wrap the buffer: slide the unprocessed window to the front.
				l := validTo - sum.tail
				copy(buffer[:l], buffer[sum.tail:validTo])

				// Reset indexes.
				validTo = l
				sum.tail = 0
				data.head = 0
				data.tail = 0
			}

			n, err = io.ReadAtLeast(source, buffer[validTo:validTo+r.BlockSize], r.BlockSize)
			validTo += n
			if err != nil {
				if err != io.EOF && err != io.ErrUnexpectedEOF {
					return err
				}
				// EOF / short read: this is the final pass over the buffer.
				lastRun = true

				data.head = validTo
			}
			if n == 0 {
				break
			}
		}

		// Set the hash sum window head. Must either be a block size
		// or be at the end of the buffer.
		sum.head = min(sum.tail+r.BlockSize, validTo)

		// Compute the rolling hash (full βhash once, then O(1) updates).
		if !rolling {
			β, β1, β2 = βhash(buffer[sum.tail:sum.head])
			rolling = true
		} else {
			αPush = uint32(buffer[sum.head-1])
			β1 = (β1 - αPop + αPush) % _M
			β2 = (β2 - uint32(sum.head-sum.tail)*αPop + β1) % _M
			β = β1 + _M*β2
		}

		// Determine if there is a hash match (weak hash first, then strong).
		foundHash = false
		if hh, ok := hashLookup[β]; ok && !lastRun {
			blockIndex, foundHash = findUniqueHash(hh, r.uniqueHash(buffer[sum.tail:sum.head]))
		}
		// Send data off if there is data available and a hash is found (so the buffer before it
		// must be flushed first), or the data chunk size has reached its maximum size (for buffer
		// allocation purposes) or to flush the end of the data.
		if data.tail < data.head && (foundHash || data.head-data.tail >= r.MaxDataOp || lastRun) {
			err = enqueue(Operation{Type: OpData, Data: buffer[data.tail:data.head]})
			if err != nil {
				return err
			}
			data.tail = data.head
		}

		if foundHash {
			err = enqueue(Operation{Type: OpBlock, BlockIndex: blockIndex})
			if err != nil {
				return err
			}
			// Restart the rolling hash past the matched block.
			rolling = false
			sum.tail += r.BlockSize

			// There is prior knowledge that any available data
			// buffered will have already been sent. Thus we can
			// assume data.head and data.tail are the same.
			// May trigger "data wrap".
			data.head = sum.tail
			data.tail = sum.tail
		} else {
			// The following is for the next loop iteration, so don't try to calculate if last.
			if !lastRun && rolling {
				αPop = uint32(buffer[sum.tail])
			}
			// Slide the window forward one byte; that byte becomes pending data.
			sum.tail += 1

			// May trigger "data wrap".
			data.head = sum.tail
		}
	}
	return nil
}
381 |
382 | // Use a more unique way to identify a set of bytes.
383 | func (r *RSync) uniqueHash(v []byte) []byte {
384 | r.UniqueHasher.Reset()
385 | r.UniqueHasher.Write(v)
386 | return r.UniqueHasher.Sum(nil)
387 | }
388 |
389 | // Searches for a given strong hash among all strong hashes in this bucket.
390 | func findUniqueHash(hh []BlockHash, hashValue []byte) (uint64, bool) {
391 | if len(hashValue) == 0 {
392 | return 0, false
393 | }
394 | for _, block := range hh {
395 | if bytes.Equal(block.StrongHash, hashValue) {
396 | return block.Index, true
397 | }
398 | }
399 | return 0, false
400 | }
401 |
402 | // Use a faster way to identify a set of bytes.
403 | func βhash(block []byte) (β uint32, β1 uint32, β2 uint32) {
404 | var a, b uint32
405 | for i, val := range block {
406 | a += uint32(val)
407 | b += (uint32(len(block)-1) - uint32(i) + 1) * uint32(val)
408 | }
409 | β = (a % _M) + (_M * (b % _M))
410 | β1 = a % _M
411 | β2 = b % _M
412 | return
413 | }
414 |
// min returns the smaller of a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
421 |
--------------------------------------------------------------------------------