├── .travis.yml ├── LICENSE ├── README.md ├── gen_test.go └── rsync.go /.travis.yml: -------------------------------------------------------------------------------- 1 | 2 | language: go 3 | go: 1.5.1 4 | 5 | os: 6 | - linux 7 | - osx 8 | 9 | install: 10 | - go get -d -v ./... 11 | 12 | script: 13 | - go test -v 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012 Daniel Theophanes 2 | Copyright (c) 2015 Amos Wenger 3 | 4 | This software is provided 'as-is', without any express or implied 5 | warranty. In no event will the authors be held liable for any damages 6 | arising from the use of this software. 7 | 8 | Permission is granted to anyone to use this software for any purpose, 9 | including commercial applications, and to alter it and redistribute it 10 | freely, subject to the following restrictions: 11 | 12 | 1. The origin of this software must not be misrepresented; you must not 13 | claim that you wrote the original software. If you use this software 14 | in a product, an acknowledgment in the product documentation would be 15 | appreciated but is not required. 16 | 17 | 2. Altered source versions must be plainly marked as such, and must not be 18 | misrepresented as being the original software. 19 | 20 | 3. This notice may not be removed or altered from any source 21 | distribution. 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# rsync-go

![MIT licensed](https://img.shields.io/badge/license-MIT-blue.svg)
[![Build Status](https://travis-ci.org/itchio/rsync-go.svg?branch=master)](https://travis-ci.org/itchio/rsync-go)
[![GoDoc](https://godoc.org/github.com/itchio/rsync-go?status.svg)](https://godoc.org/github.com/itchio/rsync-go)

This is a pure Go implementation of the rsync algorithm.

This repo in particular is a fork of Daniel Theophanes's implementation
(see the [Links](#links) section for details).

### Usage

Here's a simple example (without error checking):

```go
import (
	"os"
	"bytes"

	"gopkg.in/itchio/go-rsync.v0"
)

func main() {
	srcReader, _ := os.Open("content-v2.bin")
	defer srcReader.Close()

	rs := &rsync.RSync{}

	// here we store the whole signature in a byte slice,
	// but it could just as well be sent over a network connection for example
	sig := make([]rsync.BlockHash, 0, 10)
	writeSignature := func(bl rsync.BlockHash) error {
		sig = append(sig, bl)
		return nil
	}

	rs.CreateSignature(srcReader, writeSignature)

	targetReader, _ := os.Open("content-v1.bin")

	opsOut := make(chan rsync.Operation)
	writeOperation := func(op rsync.Operation) error {
		opsOut <- op
		return nil
	}

	go func() {
		defer close(opsOut)
		rs.CreateDelta(targetReader, sig, writeOperation)
	}()

	srcWriter, _ := os.Create("content-v2-reconstructed.bin")
	srcReader.Seek(0, os.SEEK_SET)

	rs.ApplyDelta(srcWriter, srcReader, opsOut)
}
```

Import

Creating a signature:

### Links

* original repo:
* paper behind the rsync algorithm: http://www.samba.org/~tridge/phd_thesis.pdf

-------------------------------------------------------------------------------- /gen_test.go: -------------------------------------------------------------------------------- 1 | package rsync 2 | 3 | import ( 4 | "bytes" 5 | "math/rand" 6 | "testing" 7 | ) 8 | 9 | type RandReader struct { 10 | rand.Source 11 | } 12 | 13 | func (rr RandReader) Read(sink []byte) (int, error) { 14 | var tail, head int 15 | buf := make([]byte, 8) 16 | var r uint64 17 | for { 18 | head = min(tail+8, len(sink)) 19 | if tail == head { 20 | return head, nil 21 | } 22 | 23 | r = (uint64)(rr.Int63()) 24 | buf[0] = (byte)(r) 25 | buf[1] = (byte)(r >> 8) 26 | buf[2] = (byte)(r >> 16) 27 | buf[3] = (byte)(r >> 24) 28 | buf[4] = (byte)(r >> 32) 29 | buf[5] = (byte)(r >> 40) 30 | buf[6] = (byte)(r >> 48) 31 | buf[7] = (byte)(r >> 56) 32 | 33 | tail += copy(sink[tail:head], buf) 34 | } 35 | } 36 | 37 | type pair struct { 38 | Source, Target content 39 | Description string 40 | } 41 | type content struct { 42 | Len int 43 | Seed int64 44 | Alter int 45 | Data []byte 46 | } 47 | 48 | func (c *content) Fill() { 49 | c.Data = make([]byte, c.Len) 50 | src := rand.NewSource(c.Seed) 51 | RandReader{src}.Read(c.Data) 52 | 53 | if c.Alter > 0 { 54 | r := rand.New(src) 55 | for i := 0; i < c.Alter; i++ { 56 | at := r.Intn(len(c.Data)) 57 | c.Data[at] += byte(r.Int()) 58 | } 59 | } 60 | } 61 | 62 | func Test_GenData(t *testing.T) { 63 | // Use a seeded generator to get consistent results. 64 | // This allows testing the package without bundling many test files. 
65 | 66 | var pairs = []pair{ 67 | pair{ 68 | Source: content{Len: 512*1024 + 89, Seed: 42, Alter: 0}, 69 | Target: content{Len: 512*1024 + 89, Seed: 42, Alter: 5}, 70 | Description: "Same length, slightly different content.", 71 | }, 72 | pair{ 73 | Source: content{Len: 512*1024 + 89, Seed: 9824, Alter: 0}, 74 | Target: content{Len: 512*1024 + 89, Seed: 2345, Alter: 0}, 75 | Description: "Same length, very different content.", 76 | }, 77 | pair{ 78 | Source: content{Len: 512*1024 + 89, Seed: 42, Alter: 0}, 79 | Target: content{Len: 256*1024 + 19, Seed: 42, Alter: 0}, 80 | Description: "Target shorter then source, same content.", 81 | }, 82 | pair{ 83 | Source: content{Len: 512*1024 + 89, Seed: 42, Alter: 0}, 84 | Target: content{Len: 256*1024 + 19, Seed: 42, Alter: 5}, 85 | Description: "Target shorter then source, slightly different content.", 86 | }, 87 | pair{ 88 | Source: content{Len: 256*1024 + 19, Seed: 42, Alter: 0}, 89 | Target: content{Len: 512*1024 + 89, Seed: 42, Alter: 0}, 90 | Description: "Source shorter then target, same content.", 91 | }, 92 | pair{ 93 | Source: content{Len: 512*1024 + 89, Seed: 42, Alter: 5}, 94 | Target: content{Len: 256*1024 + 19, Seed: 42, Alter: 0}, 95 | Description: "Source shorter then target, slightly different content.", 96 | }, 97 | pair{ 98 | Source: content{Len: 512*1024 + 89, Seed: 42, Alter: 0}, 99 | Target: content{Len: 0, Seed: 42, Alter: 0}, 100 | Description: "Target empty and source has content.", 101 | }, 102 | pair{ 103 | Source: content{Len: 0, Seed: 42, Alter: 0}, 104 | Target: content{Len: 512*1024 + 89, Seed: 42, Alter: 0}, 105 | Description: "Source empty and target has content.", 106 | }, 107 | pair{ 108 | Source: content{Len: 872, Seed: 9824, Alter: 0}, 109 | Target: content{Len: 235, Seed: 2345, Alter: 0}, 110 | Description: "Source and target both smaller then a block size.", 111 | }, 112 | } 113 | rs := &RSync{} 114 | rsDelta := &RSync{} 115 | for _, p := range pairs { 116 | (&p.Source).Fill() 117 | 
(&p.Target).Fill() 118 | 119 | sourceBuffer := bytes.NewReader(p.Source.Data) 120 | targetBuffer := bytes.NewReader(p.Target.Data) 121 | 122 | sig := make([]BlockHash, 0, 10) 123 | err := rs.CreateSignature(targetBuffer, func(bl BlockHash) error { 124 | sig = append(sig, bl) 125 | return nil 126 | }) 127 | if err != nil { 128 | t.Errorf("Failed to create signature: %s", err) 129 | } 130 | opsOut := make(chan Operation) 131 | go func() { 132 | var blockCt, blockRangeCt, dataCt, bytes int 133 | defer close(opsOut) 134 | err := rsDelta.CreateDelta(sourceBuffer, sig, func(op Operation) error { 135 | switch op.Type { 136 | case OpBlockRange: 137 | blockRangeCt++ 138 | case OpBlock: 139 | blockCt++ 140 | case OpData: 141 | // Copy data buffer so it may be reused in internal buffer. 142 | b := make([]byte, len(op.Data)) 143 | copy(b, op.Data) 144 | op.Data = b 145 | dataCt++ 146 | bytes += len(op.Data) 147 | } 148 | opsOut <- op 149 | return nil 150 | }) 151 | t.Logf("Range Ops:%5d, Block Ops:%5d, Data Ops: %5d, Data Len: %5dKiB, For %s.", blockRangeCt, blockCt, dataCt, bytes/1024, p.Description) 152 | if err != nil { 153 | t.Errorf("Failed to create delta: %s", err) 154 | } 155 | }() 156 | 157 | result := new(bytes.Buffer) 158 | 159 | targetBuffer.Seek(0, 0) 160 | err = rs.ApplyDelta(result, targetBuffer, opsOut) 161 | if err != nil { 162 | t.Errorf("Failed to apply delta: %s", err) 163 | } 164 | 165 | if result.Len() != len(p.Source.Data) { 166 | t.Errorf("Result not same size as source: %s", p.Description) 167 | } else if bytes.Equal(result.Bytes(), p.Source.Data) == false { 168 | t.Errorf("Result is different from the source: %s", p.Description) 169 | } 170 | 171 | p.Source.Data = nil 172 | p.Target.Data = nil 173 | } 174 | } 175 | -------------------------------------------------------------------------------- /rsync.go: -------------------------------------------------------------------------------- 1 | // RSync/RDiff implementation. 
2 | // 3 | // Algorithm found at: http://www.samba.org/~tridge/phd_thesis.pdf 4 | // 5 | // Definitions 6 | // Source: The final content. 7 | // Target: The content to be made into final content. 8 | // Signature: The sequence of hashes used to identify the content. 9 | package rsync 10 | 11 | import ( 12 | "bytes" 13 | "crypto/md5" 14 | "hash" 15 | "io" 16 | ) 17 | 18 | // If no BlockSize is specified in the RSync instance, this value is used. 19 | const DefaultBlockSize = 1024 * 6 20 | const DefaultMaxDataOp = DefaultBlockSize * 10 21 | 22 | // Internal constant used in rolling checksum. 23 | const _M = 1 << 16 24 | 25 | // Operation Types. 26 | type OpType byte 27 | 28 | const ( 29 | OpBlock OpType = iota 30 | OpData 31 | OpHash 32 | OpBlockRange 33 | ) 34 | 35 | // Instruction to mutate target to align to source. 36 | type Operation struct { 37 | Type OpType 38 | BlockIndex uint64 39 | BlockIndexEnd uint64 40 | Data []byte 41 | } 42 | 43 | // Signature hash item generated from target. 44 | type BlockHash struct { 45 | Index uint64 46 | StrongHash []byte 47 | WeakHash uint32 48 | } 49 | 50 | // Write signatures as they are generated. 51 | type SignatureWriter func(bl BlockHash) error 52 | type OperationWriter func(op Operation) error 53 | 54 | // Properties to use while working with the rsync algorithm. 55 | // A single RSync should not be used concurrently as it may contain 56 | // internal buffers and hash sums. 57 | type RSync struct { 58 | BlockSize int 59 | MaxDataOp int 60 | 61 | // If this is nil an MD5 hash is used. 62 | UniqueHasher hash.Hash 63 | 64 | buffer []byte 65 | } 66 | 67 | // If the target length is known the number of hashes in the 68 | // signature can be determined. 
69 | func (r *RSync) BlockHashCount(targetLength int) (count int) { 70 | if r.BlockSize <= 0 { 71 | r.BlockSize = DefaultBlockSize 72 | } 73 | count = (targetLength / r.BlockSize) 74 | if targetLength%r.BlockSize != 0 { 75 | count++ 76 | } 77 | return 78 | } 79 | 80 | // Calculate the signature of target. 81 | func (r *RSync) CreateSignature(target io.Reader, sw SignatureWriter) error { 82 | if r.BlockSize <= 0 { 83 | r.BlockSize = DefaultBlockSize 84 | } 85 | if r.UniqueHasher == nil { 86 | r.UniqueHasher = md5.New() 87 | } 88 | var err error 89 | var n int 90 | 91 | minBufferSize := r.BlockSize 92 | if len(r.buffer) < minBufferSize { 93 | r.buffer = make([]byte, minBufferSize) 94 | } 95 | buffer := r.buffer 96 | 97 | var block []byte 98 | loop := true 99 | var index uint64 100 | for loop { 101 | n, err = io.ReadAtLeast(target, buffer, r.BlockSize) 102 | if err != nil { 103 | // n == 0. 104 | if err == io.EOF { 105 | return nil 106 | } 107 | if err != io.ErrUnexpectedEOF { 108 | return err 109 | } 110 | // n > 0. 111 | loop = false 112 | } 113 | block = buffer[:n] 114 | weak, _, _ := βhash(block) 115 | err = sw(BlockHash{StrongHash: r.uniqueHash(block), WeakHash: weak, Index: index}) 116 | if err != nil { 117 | return err 118 | } 119 | index++ 120 | } 121 | return nil 122 | } 123 | 124 | // Apply the difference to the target. 
125 | func (r *RSync) ApplyDelta(alignedTarget io.Writer, target io.ReadSeeker, ops chan Operation) error { 126 | if r.BlockSize <= 0 { 127 | r.BlockSize = DefaultBlockSize 128 | } 129 | var err error 130 | var n int 131 | var block []byte 132 | 133 | minBufferSize := r.BlockSize 134 | if len(r.buffer) < minBufferSize { 135 | r.buffer = make([]byte, minBufferSize) 136 | } 137 | buffer := r.buffer 138 | 139 | writeBlock := func(op Operation) error { 140 | target.Seek(int64(r.BlockSize*int(op.BlockIndex)), 0) 141 | n, err = io.ReadAtLeast(target, buffer, r.BlockSize) 142 | if err != nil { 143 | if err != io.ErrUnexpectedEOF { 144 | return err 145 | } 146 | } 147 | block = buffer[:n] 148 | _, err = alignedTarget.Write(block) 149 | if err != nil { 150 | return err 151 | } 152 | return nil 153 | } 154 | 155 | for op := range ops { 156 | switch op.Type { 157 | case OpBlockRange: 158 | for i := op.BlockIndex; i <= op.BlockIndexEnd; i++ { 159 | err = writeBlock(Operation{ 160 | Type: OpBlock, 161 | BlockIndex: i, 162 | }) 163 | if err != nil { 164 | if err == io.EOF { 165 | break 166 | } 167 | return err 168 | } 169 | } 170 | case OpBlock: 171 | err = writeBlock(op) 172 | if err != nil { 173 | if err == io.EOF { 174 | break 175 | } 176 | return err 177 | } 178 | case OpData: 179 | _, err = alignedTarget.Write(op.Data) 180 | if err != nil { 181 | return err 182 | } 183 | } 184 | } 185 | return nil 186 | } 187 | 188 | // Create the operation list to mutate the target signature into the source. 189 | // Any data operation from the OperationWriter must have the data copied out 190 | // within the span of the function; the data buffer underlying the operation 191 | // data is reused. The sourceSum create a complete hash sum of the source if 192 | // present. 
193 | func (r *RSync) CreateDelta(source io.Reader, signature []BlockHash, ops OperationWriter) (err error) { 194 | if r.BlockSize <= 0 { 195 | r.BlockSize = DefaultBlockSize 196 | } 197 | if r.MaxDataOp <= 0 { 198 | r.MaxDataOp = DefaultMaxDataOp 199 | } 200 | if r.UniqueHasher == nil { 201 | r.UniqueHasher = md5.New() 202 | } 203 | minBufferSize := (r.BlockSize * 2) + (r.MaxDataOp) 204 | if len(r.buffer) < minBufferSize { 205 | r.buffer = make([]byte, minBufferSize) 206 | } 207 | buffer := r.buffer 208 | 209 | // A single β hashes may correlate with a many unique hashes. 210 | hashLookup := make(map[uint32][]BlockHash, len(signature)) 211 | for _, h := range signature { 212 | key := h.WeakHash 213 | hashLookup[key] = append(hashLookup[key], h) 214 | } 215 | 216 | type section struct { 217 | tail int 218 | head int 219 | } 220 | 221 | var data, sum section 222 | var n, validTo int 223 | var αPop, αPush, β, β1, β2 uint32 224 | var blockIndex uint64 225 | var rolling, lastRun, foundHash bool 226 | 227 | // Store the previous non-data operation for combining. 228 | var prevOp *Operation 229 | 230 | // Send the last operation if there is one waiting. 231 | defer func() { 232 | if prevOp == nil { 233 | return 234 | } 235 | err = ops(*prevOp) 236 | prevOp = nil 237 | }() 238 | 239 | // Combine OpBlock into OpBlockRange. To do this store the previous 240 | // non-data operation and determine if it can be extended. 
241 | enqueue := func(op Operation) (err error) { 242 | switch op.Type { 243 | case OpBlock: 244 | if prevOp != nil { 245 | switch prevOp.Type { 246 | case OpBlock: 247 | if prevOp.BlockIndex+1 == op.BlockIndex { 248 | prevOp = &Operation{ 249 | Type: OpBlockRange, 250 | BlockIndex: prevOp.BlockIndex, 251 | BlockIndexEnd: op.BlockIndex, 252 | } 253 | return 254 | } 255 | case OpBlockRange: 256 | if prevOp.BlockIndexEnd+1 == op.BlockIndex { 257 | prevOp.BlockIndexEnd = op.BlockIndex 258 | return 259 | } 260 | } 261 | err = ops(*prevOp) 262 | if err != nil { 263 | return 264 | } 265 | prevOp = nil 266 | } 267 | prevOp = &op 268 | case OpData: 269 | // Never save a data operation, as it would corrupt the buffer. 270 | if prevOp != nil { 271 | err = ops(*prevOp) 272 | if err != nil { 273 | return 274 | } 275 | } 276 | err = ops(op) 277 | if err != nil { 278 | return 279 | } 280 | prevOp = nil 281 | } 282 | return 283 | } 284 | 285 | for !lastRun { 286 | // Determine if the buffer should be extended. 287 | if sum.tail+r.BlockSize > validTo { 288 | // Determine if the buffer should be wrapped. 289 | if validTo+r.BlockSize > len(buffer) { 290 | // Before wrapping the buffer, send any trailing data off. 291 | if data.tail < data.head { 292 | err = enqueue(Operation{Type: OpData, Data: buffer[data.tail:data.head]}) 293 | if err != nil { 294 | return err 295 | } 296 | } 297 | // Wrap the buffer. 298 | l := validTo - sum.tail 299 | copy(buffer[:l], buffer[sum.tail:validTo]) 300 | 301 | // Reset indexes. 302 | validTo = l 303 | sum.tail = 0 304 | data.head = 0 305 | data.tail = 0 306 | } 307 | 308 | n, err = io.ReadAtLeast(source, buffer[validTo:validTo+r.BlockSize], r.BlockSize) 309 | validTo += n 310 | if err != nil { 311 | if err != io.EOF && err != io.ErrUnexpectedEOF { 312 | return err 313 | } 314 | lastRun = true 315 | 316 | data.head = validTo 317 | } 318 | if n == 0 { 319 | break 320 | } 321 | } 322 | 323 | // Set the hash sum window head. 
Must either be a block size 324 | // or be at the end of the buffer. 325 | sum.head = min(sum.tail+r.BlockSize, validTo) 326 | 327 | // Compute the rolling hash. 328 | if !rolling { 329 | β, β1, β2 = βhash(buffer[sum.tail:sum.head]) 330 | rolling = true 331 | } else { 332 | αPush = uint32(buffer[sum.head-1]) 333 | β1 = (β1 - αPop + αPush) % _M 334 | β2 = (β2 - uint32(sum.head-sum.tail)*αPop + β1) % _M 335 | β = β1 + _M*β2 336 | } 337 | 338 | // Determine if there is a hash match. 339 | foundHash = false 340 | if hh, ok := hashLookup[β]; ok && !lastRun { 341 | blockIndex, foundHash = findUniqueHash(hh, r.uniqueHash(buffer[sum.tail:sum.head])) 342 | } 343 | // Send data off if there is data available and a hash is found (so the buffer before it 344 | // must be flushed first), or the data chunk size has reached it's maximum size (for buffer 345 | // allocation purposes) or to flush the end of the data. 346 | if data.tail < data.head && (foundHash || data.head-data.tail >= r.MaxDataOp || lastRun) { 347 | err = enqueue(Operation{Type: OpData, Data: buffer[data.tail:data.head]}) 348 | if err != nil { 349 | return err 350 | } 351 | data.tail = data.head 352 | } 353 | 354 | if foundHash { 355 | err = enqueue(Operation{Type: OpBlock, BlockIndex: blockIndex}) 356 | if err != nil { 357 | return err 358 | } 359 | rolling = false 360 | sum.tail += r.BlockSize 361 | 362 | // There is prior knowledge that any available data 363 | // buffered will have already been sent. Thus we can 364 | // assume data.head and data.tail are the same. 365 | // May trigger "data wrap". 366 | data.head = sum.tail 367 | data.tail = sum.tail 368 | } else { 369 | // The following is for the next loop iteration, so don't try to calculate if last. 370 | if !lastRun && rolling { 371 | αPop = uint32(buffer[sum.tail]) 372 | } 373 | sum.tail += 1 374 | 375 | // May trigger "data wrap". 
376 | data.head = sum.tail 377 | } 378 | } 379 | return nil 380 | } 381 | 382 | // Use a more unique way to identify a set of bytes. 383 | func (r *RSync) uniqueHash(v []byte) []byte { 384 | r.UniqueHasher.Reset() 385 | r.UniqueHasher.Write(v) 386 | return r.UniqueHasher.Sum(nil) 387 | } 388 | 389 | // Searches for a given strong hash among all strong hashes in this bucket. 390 | func findUniqueHash(hh []BlockHash, hashValue []byte) (uint64, bool) { 391 | if len(hashValue) == 0 { 392 | return 0, false 393 | } 394 | for _, block := range hh { 395 | if bytes.Equal(block.StrongHash, hashValue) { 396 | return block.Index, true 397 | } 398 | } 399 | return 0, false 400 | } 401 | 402 | // Use a faster way to identify a set of bytes. 403 | func βhash(block []byte) (β uint32, β1 uint32, β2 uint32) { 404 | var a, b uint32 405 | for i, val := range block { 406 | a += uint32(val) 407 | b += (uint32(len(block)-1) - uint32(i) + 1) * uint32(val) 408 | } 409 | β = (a % _M) + (_M * (b % _M)) 410 | β1 = a % _M 411 | β2 = b % _M 412 | return 413 | } 414 | 415 | func min(a, b int) int { 416 | if a < b { 417 | return a 418 | } 419 | return b 420 | } 421 | --------------------------------------------------------------------------------