├── .gitignore
├── go.mod
├── go.sum
├── blockdevice.go
├── blockdevice_linux.go
├── example
    ├── ceph-import-diff
    │   ├── README.md
    │   └── main.go
    └── webserver
    │   ├── client.go
    │   └── server.go
├── sparse_fallback.go
├── LICENSE.txt
├── sparse.go
├── format
    ├── format.go
    └── rbd.go
├── sparse_unix.go
├── README.md
├── sparse_windows.go
├── cmd
    └── sparsecat
    │   └── main.go
└── copy.go


/.gitignore:
--------------------------------------------------------------------------------
1 | ./sparsecat
2 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/svenwiltink/sparsecat
2 | 
3 | go 1.17
4 | 
5 | require golang.org/x/sys v0.0.0-20210601080250-7ecdf8ef093b
6 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | golang.org/x/sys v0.0.0-20210601080250-7ecdf8ef093b h1:qh4f65QIVFjq9eBURLEYWqaEXmOyqdUyiBSgaXWccWk=
2 | golang.org/x/sys v0.0.0-20210601080250-7ecdf8ef093b/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
3 | 


--------------------------------------------------------------------------------
/blockdevice.go:
--------------------------------------------------------------------------------
 1 | //go:build !linux
 2 | // +build !linux
 3 | 
 4 | package sparsecat
 5 | 
 6 | import (
 7 | 	"errors"
 8 | 	"os"
 9 | )
10 | // getBlockDeviceSize is the variant compiled on every platform except Linux (see the build constraint above); it always fails.
11 | func getBlockDeviceSize(f *os.File) (int64, error) {
12 | 	return 0, errors.New("operation not supported")
13 | }
14 | 


--------------------------------------------------------------------------------
/blockdevice_linux.go:
--------------------------------------------------------------------------------
 1 | package sparsecat
 2 | 
 3 | import (
 4 | 	"os"
 5 | 
 6 | 	"golang.org/x/sys/unix"
 7 | )
 8 | // getBlockDeviceSize queries the size of the device behind file with the BLKGETSIZE64 ioctl.
 9 | func getBlockDeviceSize(file *os.File) (size int, err error) {
10 | 	conn, err := file.SyscallConn()
11 | 	if err != nil {
12 | 		return 0, err
13 | 	}
14 | 	// NOTE(review): the result is an int here but an int64 in the !linux stub — confirm callers convert.
15 | 	connerr := conn.Control(func(fd uintptr) { // Control passes the raw descriptor to the closure
16 | 		size, err = unix.IoctlGetInt(int(fd), unix.BLKGETSIZE64) // assigns the named results
17 | 	})
18 | 
19 | 	if connerr != nil {
20 | 		return 0, connerr
21 | 	}
22 | 
23 | 	return size, err
24 | }
25 | 


--------------------------------------------------------------------------------
/example/ceph-import-diff/README.md:
--------------------------------------------------------------------------------
 1 | ### ceph-import-diff
 2 | 
 3 | Proof of concept of sending sparse files to a ceph cluster. It creates a valid `export-format 2` stream that can be
 4 | piped to `rbd import`.
 5 | 
 6 | Example command: 
 7 | ```
 8 | ./ceph-import-diff vps.raw | pv | rbd import --export-format 2 - libvirt-pool/banaan
 9 | 7,07GiB 0:00:53 [ 134MiB/s]
10 | ```
11 | 
12 | Compared to the normal import of the same image:
13 | ```
14 | pv vps.raw | rbd import - libvirt-pool/banaan
15 | 200GiB 0:04:46 [ 714MiB/s]
16 | ```
17 | 
18 | A savings of 193GiB and nearly 4 minutes


--------------------------------------------------------------------------------
/example/webserver/client.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"github.com/svenwiltink/sparsecat"
 5 | 	"net/http"
 6 | 	"net/http/httputil"
 7 | 	"os"
 8 | )
 9 | // main posts image.raw as a sparsecat stream to the example server and prints the head of the response.
10 | func main() {
11 | 	source, err := os.Open("image.raw")
12 | 	if err != nil {
13 | 		panic(err)
14 | 	}
15 | 	defer source.Close()
16 | 
17 | 	resp, err := http.Post("http://localhost:6969/store", "application/octet-stream", sparsecat.NewEncoder(source))
18 | 	if err != nil {
19 | 		panic(err)
20 | 	}
21 | 	defer resp.Body.Close()
22 | 	dump, err := httputil.DumpResponse(resp, false) // only returns the dump; it still has to be written out
23 | 	if err != nil {
24 | 		panic(err)
25 | 	}
26 | 	os.Stdout.Write(dump)
27 | }
28 | 


--------------------------------------------------------------------------------
/sparse_fallback.go:
--------------------------------------------------------------------------------
 1 | //go:build !darwin && !dragonfly && !freebsd && !linux && !netbsd && !openbsd && !solaris && !windows
 2 | // +build !darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris,!windows
 3 | 
 4 | package sparsecat
 5 | 
 6 | import "os"
 7 | 
 8 | // detectDataSection is the fallback for operating systems without SEEK_HOLE and SEEK_DATA: everything from offset to
 9 | // the end of the file is reported as data. NOTE(review): io.EOF is never returned, even for offset >= size — confirm callers cope.
10 | func detectDataSection(file *os.File, offset int64) (start int64, end int64, err error) {
11 | 	fi, err := file.Stat()
12 | 	if err != nil {
13 | 		return 0, 0, err
14 | 	}
15 | 
16 | 	return offset, fi.Size(), nil
17 | }
18 | // supportsSeekHole unconditionally reports true in this build; the file is not probed.
19 | func supportsSeekHole(f *os.File) bool {
20 | 	return true
21 | }
22 | // SparseTruncate sets the size of file to size with a plain Truncate; builds using this file get no sparse-specific handling.
23 | func SparseTruncate(file *os.File, size int64) error {
24 | 	return file.Truncate(size)
25 | }
26 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Sven Wiltink
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/sparse.go:
--------------------------------------------------------------------------------
 1 | package sparsecat
 2 | 
 3 | import (
 4 | 	"bytes"
 5 | 	"errors"
 6 | 	"io"
 7 | 	"os"
 8 | )
 9 | 
10 | const BLK_READ_BUFFER = 4_000_000 // 4MB
11 | // isBlockDevice reports whether fi has os.ModeDevice set. NOTE(review): character devices set that bit too — confirm this is intended.
12 | func isBlockDevice(fi os.FileInfo) bool {
13 | 	return fi.Mode()&os.ModeDevice == os.ModeDevice
14 | }
15 | 
16 | // slowDetectDataSection finds the next data section by reading file one buffer at a time and skipping buffers that
17 | // hold only zero bytes. io.EOF is returned only when a read yields no bytes at all together with io.EOF.
18 | func slowDetectDataSection(file io.Reader, currentOffset int64) (start int64, end int64, reader io.Reader, err error) {
19 | 	var buf [BLK_READ_BUFFER]byte
20 | 
21 | 	for {
22 | 		read, err := file.Read(buf[:]) // this err shadows the named result inside the loop
23 | 		if err != nil && !errors.Is(err, io.EOF) {
24 | 			return 0, 0, nil, err
25 | 		}
26 | 
27 | 		if read == 0 && errors.Is(err, io.EOF) {
28 | 			return 0, 0, nil, err
29 | 		}
30 | 
31 | 		// every byte read is zero (or nothing was read): skip it, keeping the offset in step
32 | 		if isBufferEmpty(buf[:read]) {
33 | 			currentOffset += int64(read)
34 | 			continue
35 | 		}
36 | 		// the bytes were already consumed from file, so they are handed back through a reader
37 | 		return currentOffset, currentOffset + int64(read), bytes.NewReader(buf[:read]), nil
38 | 	}
39 | }
40 | // isBufferEmpty reports whether buf holds nothing but zero bytes; a buf of length 0 counts as empty.
41 | func isBufferEmpty(buf []byte) bool {
42 | 	for _, b := range buf {
43 | 		if b != 0 {
44 | 			return false
45 | 		}
46 | 	}
47 | 	return true
48 | }
49 | 


--------------------------------------------------------------------------------
/example/webserver/server.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"compress/gzip"
 5 | 	"github.com/svenwiltink/sparsecat"
 6 | 	"io"
 7 | 	"net/http"
 8 | 	"os"
 9 | )
10 | // main serves two upload endpoints on localhost:6969: /store decodes into based.raw, /store-zipped gzips the decoded stream into based.raw.gz.
11 | func main() {
12 | 	http.HandleFunc("/store", func(writer http.ResponseWriter, request *http.Request) {
13 | 		if request.Method != http.MethodPost {
14 | 			http.NotFound(writer, request)
15 | 			return
16 | 		}
17 | 
18 | 		sparseReader := sparsecat.NewDecoder(request.Body)
19 | 		target, err := os.Create("based.raw")
20 | 		if err != nil {
21 | 			panic(err)
22 | 		}
23 | 		defer target.Close() // release the file once the upload has been handled
24 | 		_, err = io.Copy(target, sparseReader)
25 | 		if err != nil {
26 | 			panic(err)
27 | 		}
28 | 	})
29 | 
30 | 	http.HandleFunc("/store-zipped", func(writer http.ResponseWriter, request *http.Request) {
31 | 		if request.Method != http.MethodPost {
32 | 			http.NotFound(writer, request)
33 | 			return
34 | 		}
35 | 
36 | 		sparseReader := sparsecat.NewDecoder(request.Body)
37 | 		target, err := os.Create("based.raw.gz")
38 | 		if err != nil {
39 | 			panic(err)
40 | 		}
41 | 		defer target.Close()
42 | 		zw := gzip.NewWriter(target)
43 | 		_, err = io.Copy(zw, sparseReader)
44 | 		if closeErr := zw.Close(); err == nil {
45 | 			err = closeErr // a failed Close leaves the gzip stream incomplete, so it must not go unnoticed
46 | 		}
47 | 		if err != nil {
48 | 			panic(err)
49 | 		}
50 | 	})
51 | 
52 | 	err := http.ListenAndServe("localhost:6969", nil)
53 | 	if err != nil {
54 | 		panic(err)
55 | 	}
56 | }
57 | 


--------------------------------------------------------------------------------
/format/format.go:
--------------------------------------------------------------------------------
 1 | package format
 2 | 
 3 | import "io"
 4 | // Section describes one data section of a file by its Offset and its Length.
 5 | type Section struct {
 6 | 	Offset, Length int64
 7 | }
 8 | 
 9 | // Format defines a wire format. ReadFileSize and ReadSectionHeader parse incoming data, whereas
10 | // GetFileSizeReader, GetSectionReader and GetEndTagReader create readers that can be handed to
11 | // io.Copy when producing a stream. The length returned by the latter three must be the number of
12 | // bytes the reader will return before reaching io.EOF.
13 | type Format interface {
14 | 	// ReadFileSize reads the file size from an incoming stream
15 | 	ReadFileSize(reader io.Reader) (int64, error)
16 | 	// ReadSectionHeader reads a data section header from the incoming stream.
17 | 	// When the incoming stream contains an end tag, an empty section must be returned
18 | 	// in combination with an io.EOF error.
19 | 	ReadSectionHeader(reader io.Reader) (Section, error)
20 | 	// The Get*Reader methods each return the encoded bytes as a reader together with their length.
21 | 	GetFileSizeReader(size uint64) (reader io.Reader, length int64)
22 | 	GetSectionReader(source io.Reader, section Section) (reader io.Reader, length int64)
23 | 	GetEndTagReader() (reader io.Reader, length int64)
24 | }
25 | // formats maps the names accepted by GetByName to their Format implementations.
26 | var formats = map[string]Format{
27 | 	"rbd-diff-v1": RbdDiffv1,
28 | 	"rbd-diff-v2": RbdDiffv2,
29 | }
30 | // GetByName looks up a Format by its name; exists is false when no format is registered under name.
31 | func GetByName(name string) (format Format, exists bool) {
32 | 	format, exists = formats[name]
33 | 	return
34 | }
35 | 


--------------------------------------------------------------------------------
/example/ceph-import-diff/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"encoding/binary"
 5 | 	"github.com/svenwiltink/sparsecat"
 6 | 	"github.com/svenwiltink/sparsecat/format"
 7 | 	"io"
 8 | 	"log"
 9 | 	"os"
10 | )
11 | 
12 | const (
13 | 	RBDImageHeader = "rbd image v2\n"
14 | 	RBDImageEndTag = "E"
15 | 
16 | 	RBDImageDiffsV2Header = "rbd image diffs v2\n"
17 | 	RBDImageDiffV2Header  = "rbd diff v2\n"
18 | )
19 | // main writes the headers declared above followed by the rbd diff v2 encoding of the file named in os.Args[1] to stdout.
20 | func main() {
21 | 	log.SetFlags(log.Llongfile)
22 | 	if len(os.Args) != 2 {
23 | 		log.Fatalln(os.Args[0], " <file to import>")
24 | 	}
25 | 
26 | 	file, err := os.Open(os.Args[1])
27 | 	if err != nil {
28 | 		panic(err)
29 | 	}
30 | 	defer file.Close()
31 | 	// the image header is immediately followed by its end tag
32 | 	_, err = os.Stdout.WriteString(RBDImageHeader)
33 | 	if err != nil {
34 | 		panic(err)
35 | 	}
36 | 
37 | 	_, err = os.Stdout.WriteString(RBDImageEndTag)
38 | 	if err != nil {
39 | 		panic(err)
40 | 	}
41 | 
42 | 	_, err = os.Stdout.WriteString(RBDImageDiffsV2Header)
43 | 	if err != nil {
44 | 		panic(err)
45 | 	}
46 | 	// a little-endian 64 bit 1 follows the diffs header (presumably the number of diffs — confirm against the rbd format)
47 | 	var buf [8]byte
48 | 	binary.LittleEndian.PutUint64(buf[:], 1)
49 | 
50 | 	_, err = os.Stdout.Write(buf[:])
51 | 	if err != nil {
52 | 		panic(err)
53 | 	}
54 | 
55 | 	_, err = os.Stdout.WriteString(RBDImageDiffV2Header)
56 | 	if err != nil {
57 | 		panic(err)
58 | 	}
59 | 	// the encoder produces the remainder of the stream in rbd diff v2 format
60 | 	encoder := sparsecat.NewEncoder(file)
61 | 	encoder.Format = format.RbdDiffv2
62 | 	encoder.MaxSectionSize = 16_000_000
63 | 
64 | 	_, err = io.Copy(os.Stdout, encoder)
65 | 	if err != nil {
66 | 		panic(err)
67 | 	}
68 | }
69 | 


--------------------------------------------------------------------------------
/sparse_unix.go:
--------------------------------------------------------------------------------
 1 | //go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris
 2 | // +build darwin dragonfly freebsd linux netbsd openbsd solaris
 3 | 
 4 | package sparsecat
 5 | 
 6 | import (
 7 | 	"errors"
 8 | 	"fmt"
 9 | 	"io"
10 | 	"os"
11 | 	"syscall"
12 | )
13 | 
14 | const (
15 | 	SEEK_DATA = 3
16 | 	SEEK_HOLE = 4
17 | )
18 | 
19 | // detectDataSection returns the start and end of the next section containing data at or after
20 | // offset, skipping sparse sections. io.EOF is returned when a seek fails with ENXIO. The semantics
21 | // and the supported filesystems are described at https://man7.org/linux/man-pages/man2/lseek.2.html
22 | func detectDataSection(file *os.File, offset int64) (start int64, end int64, err error) {
23 | 	var syserr syscall.Errno
24 | 
25 | 	startOfData, err := file.Seek(offset, SEEK_DATA)
26 | 	if errors.As(err, &syserr) {
27 | 		if syserr == syscall.ENXIO {
28 | 			return 0, 0, io.EOF
29 | 		}
30 | 		return 0, 0, fmt.Errorf("error seeking to data: %w", err)
31 | 	}
32 | 	// errors that do not carry an Errno are wrapped the same way
33 | 	if err != nil {
34 | 		return 0, 0, fmt.Errorf("error seeking to data: %w", err)
35 | 	}
36 | 	// the data section ends where the next hole begins
37 | 	endOfData, err := file.Seek(startOfData, SEEK_HOLE)
38 | 	if errors.As(err, &syserr) {
39 | 		if syserr == syscall.ENXIO {
40 | 			return 0, 0, io.EOF
41 | 		}
42 | 		return 0, 0, fmt.Errorf("error seeking to hole: %w", err)
43 | 	}
44 | 
45 | 	if err != nil {
46 | 		return 0, 0, fmt.Errorf("error seeking to hole: %w", err)
47 | 	}
48 | 
49 | 	return startOfData, endOfData, err
50 | }
51 | // supportsSeekHole probes file with a SEEK_DATA seek from offset 0 and reports whether that works. The seek moves the file offset.
52 | func supportsSeekHole(file *os.File) bool {
53 | 	_, err := file.Seek(0, SEEK_DATA)
54 | 	var syserr syscall.Errno
55 | 
56 | 	// when a file is completely empty SEEK_DATA fails with ENXIO indicating an EOF.
57 | 	if errors.As(err, &syserr) {
58 | 		if syserr == syscall.ENXIO {
59 | 			return true
60 | 		}
61 | 	}
62 | 	return err == nil
63 | }
64 | // SparseTruncate changes the size of file to size by calling file.Truncate.
65 | func SparseTruncate(file *os.File, size int64) error {
66 | 	return file.Truncate(size)
67 | }
68 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## SparseCat
 2 | 
 3 | ### Goal
 4 | Skip the holes in sparse files when transmitting large files over the network. Using the filesystem's seek capabilities,
 5 | holes can be detected. Instead of transmitting these zero bytes and wasting precious bandwidth, only the sections of the file
 6 | containing data are sent.
 7 | 
 8 | 
 9 | ### Example usage
10 | ```shell
11 | # create sparse image
12 | truncate -s150G image.raw
13 | 
14 | # add some random data to the sparse file
15 | dd if=/dev/urandom bs=4M count=10 conv=notrunc seek=30 of=image.raw
16 | 
17 | # send sparse file and reconstruct it on the other host. The amount
18 | # of data transmitted will only be 40MB instead of 150G
19 | sparsecat -if image.raw | pv | ssh GLaDOS sparsecat -r -of image.raw
20 | ```
21 | [![asciicast](https://asciinema.org/a/BMQStO5yWGWsG3xBigE2NV9Gx.svg)](https://asciinema.org/a/BMQStO5yWGWsG3xBigE2NV9Gx)
22 | 
23 | ### But how does it work?
24 | 
25 | Sparsecat uses the `SEEK_HOLE` and `SEEK_DATA` capabilities of `lseek` on linux. See [the man pages](https://man7.org/linux/man-pages/man2/lseek.2.html)
26 | for more information. Before sending the data inside a file Sparsecat creates a small header containing the size of the
27 | source file. The data sections follow this header. Each section consists of the offset in the target file, the length
28 | of the data section followed by the data itself. The wire format is identical to [ceph rbd export-diff](https://github.com/ceph/ceph/blob/aa913ced1240a366e063182cd359b562c626643d/doc/dev/rbd-diff.rst)
29 | 
30 | 
31 | When receiving a Sparsecat stream the Decoder detects if the target is an `*os.File`. When this is the case and the
32 | file is capable of seeking, a fast path is used and the sparseness of the target file is preserved. When the target
33 | is not a file, such as an `io.Copy` to a buffer, Sparsecat will pad the output with zero bytes, as if it were outputting the
34 | entire file.


--------------------------------------------------------------------------------
/sparse_windows.go:
--------------------------------------------------------------------------------
 1 | package sparsecat
 2 | 
 3 | import (
 4 | 	"errors"
 5 | 	"golang.org/x/sys/windows"
 6 | 	"io"
 7 | 	"os"
 8 | 	"syscall"
 9 | 	"unsafe"
10 | )
11 | 
12 | const (
13 | 	queryAllocRanges = 0x000940CF
14 | 	setSparse        = 0x000900c4
15 | )
16 | 
17 | // detectDataSection returns the start and end of the first allocated range reported for the query starting at
18 | // offset, skipping sparse sections. It returns io.EOF when nothing is reported and hands ioctl failures back as errors.
19 | func detectDataSection(file *os.File, offset int64) (start int64, end int64, err error) {
20 | 	// typedef struct _FILE_ALLOCATED_RANGE_BUFFER {
21 | 	//  LARGE_INTEGER FileOffset;
22 | 	//  LARGE_INTEGER Length;
23 | 	//} FILE_ALLOCATED_RANGE_BUFFER, *PFILE_ALLOCATED_RANGE_BUFFER;
24 | 	type allocRangeBuffer struct{ offset, length int64 }
25 | 
26 | 	// TODO: prevent this stat call
27 | 	s, err := file.Stat()
28 | 	if err != nil {
29 | 		return 0, 0, err
30 | 	}
31 | 
32 | 	queryRange := allocRangeBuffer{offset, s.Size()}
33 | 	allocRanges := make([]allocRangeBuffer, 1)
34 | 
35 | 	var bytesReturned uint32
36 | 	err = windows.DeviceIoControl(
37 | 		windows.Handle(file.Fd()), queryAllocRanges,
38 | 		(*byte)(unsafe.Pointer(&queryRange)), uint32(unsafe.Sizeof(queryRange)),
39 | 		(*byte)(unsafe.Pointer(&allocRanges[0])), uint32(len(allocRanges)*int(unsafe.Sizeof(allocRanges[0]))),
40 | 		&bytesReturned, nil,
41 | 	)
42 | 
43 | 	// ERROR_MORE_DATA is tolerated: the output buffer holds a single range, which is all that is used
44 | 	// below. Any other failure is returned to the caller instead of panicking inside library code.
45 | 	if err != nil && !errors.Is(err, syscall.ERROR_MORE_DATA) {
46 | 		return 0, 0, err
47 | 	}
48 | 
49 | 	// no error and nothing returned, assume EOF
50 | 	if bytesReturned == 0 {
51 | 		return 0, 0, io.EOF
52 | 	}
53 | 
54 | 	return allocRanges[0].offset, allocRanges[0].offset + allocRanges[0].length, nil
55 | }
56 | // supportsSeekHole unconditionally reports true on Windows; the file is not probed.
57 | func supportsSeekHole(f *os.File) bool {
58 | 	return true
59 | }
60 | 
61 | func SparseTruncate(file *os.File, size int64) error {
62 | 	err := windows.DeviceIoControl(
63 | 		windows.Handle(file.Fd()), setSparse,
64 | 		nil, 0,
65 | 		nil, 0,
66 | 		nil, nil,
67 | 	)
68 | 
69 | 	if err != nil {
70 | 		return err
71 | 	}
72 | 
73 | 	err = file.Truncate(size)
74 | 	if err != nil {
75 | 		return nil
76 | 	}
77 | 	return err
78 | }
79 | 


--------------------------------------------------------------------------------
/cmd/sparsecat/main.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"flag"
  5 | 	"github.com/svenwiltink/sparsecat"
  6 | 	"github.com/svenwiltink/sparsecat/format"
  7 | 	"io"
  8 | 	"log"
  9 | 	"os"
 10 | )
 11 | 
 12 | type OperationType int
 13 | 
 14 | const (
 15 | 	Send OperationType = iota
 16 | 	Receive
 17 | )
 18 | // main parses the flags, opens input and output, and then encodes (send) or decodes (receive, -r) with the chosen wire format.
 19 | func main() {
 20 | 	inputFileName := flag.String("if", "", "input file. '-' for stdin")
 21 | 	outputFileName := flag.String("of", "", "output file. '-' for stdout")
 22 | 	formatName := flag.String("format", "rbd-diff-v1", "the wire format to use. Currently either rbd-diff-v1 or rbd-diff-v2")
 23 | 	receive := flag.Bool("r", false, "receive a file instead of transmitting")
 24 | 	disableSparseTarget := flag.Bool("disable-sparse-target", false, "disable sparse writing the target file")
 25 | 	disableFileTruncate := flag.Bool("disable-file-truncate", false, "disable truncating the target file, *only use this when you know what you are doing*")
 26 | 
 27 | 	flag.Parse()
 28 | 
 29 | 	log.SetFlags(0)
 30 | 
 31 | 	f, exists := format.GetByName(*formatName)
 32 | 	if !exists {
 33 | 		log.Fatalf("Format %s doesn't exist", *formatName)
 34 | 	}
 35 | 
 36 | 	operation := Send
 37 | 	if *receive {
 38 | 		operation = Receive
 39 | 	}
 40 | 
 41 | 	// apply defaults
 42 | 	if operation == Send && *outputFileName == "" {
 43 | 		*outputFileName = "-"
 44 | 	}
 45 | 
 46 | 	if operation == Receive && *inputFileName == "" {
 47 | 		*inputFileName = "-"
 48 | 	}
 49 | 
 50 | 	inputFile, outputFile := setupFiles(operation, *inputFileName, *outputFileName)
 51 | 
 52 | 	defer inputFile.Close()
 53 | 	defer outputFile.Close()
 54 | 
 55 | 	if operation == Send {
 56 | 		encoder := sparsecat.NewEncoder(inputFile)
 57 | 		encoder.Format = f
 58 | 		_, err := io.Copy(outputFile, encoder)
 59 | 		if err != nil {
 60 | 			log.Fatal(err)
 61 | 		}
 62 | 		return
 63 | 	}
 64 | 
 65 | 	decoder := sparsecat.NewDecoder(inputFile)
 66 | 	decoder.Format = f
 67 | 	decoder.DisableSparseWriting = *disableSparseTarget
 68 | 	decoder.DisableFileTruncate = *disableFileTruncate
 69 | 
 70 | 	_, err := io.Copy(outputFile, decoder)
 71 | 	if err != nil {
 72 | 		log.Fatal(err)
 73 | 	}
 74 | }
 75 | // setupFiles opens the input and creates the output, mapping "-" to stdin and stdout. It exits the process on any failure.
 76 | func setupFiles(operation OperationType, inputFileName string, outputFileName string) (*os.File, *os.File) {
 77 | 	if inputFileName == "" {
 78 | 		flag.Usage()
 79 | 		os.Exit(1)
 80 | 	}
 81 | 
 82 | 	var inputFile *os.File
 83 | 	var outputFile *os.File
 84 | 	var err error
 85 | 
 86 | 	if inputFileName == "-" {
 87 | 		if operation == Send { // stdin is only accepted when receiving
 88 | 			log.Fatal("input must be a file when sending data")
 89 | 		}
 90 | 		inputFile = os.Stdin
 91 | 	} else {
 92 | 		inputFile, err = os.Open(inputFileName)
 93 | 		if err != nil {
 94 | 			log.Fatalf("unable to open inputFile: %s", err)
 95 | 		}
 96 | 	}
 97 | 
 98 | 	if outputFileName == "-" {
 99 | 		outputFile = os.Stdout
100 | 	} else {
101 | 		outputFile, err = os.Create(outputFileName)
102 | 		if err != nil {
103 | 			log.Fatalf("unable to create outputFile: %s", err)
104 | 		}
105 | 	}
106 | 
107 | 	return inputFile, outputFile
108 | }
109 | 


--------------------------------------------------------------------------------
/format/rbd.go:
--------------------------------------------------------------------------------
  1 | package format
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"encoding/binary"
  6 | 	"fmt"
  7 | 	"io"
  8 | )
  9 | 
 10 | const (
 11 | 	sizeIndicator byte = 's'
 12 | 	dataIndicator byte = 'w'
 13 | 	endIndicator  byte = 'e'
 14 | )
 15 | 
 16 | // RbdDiffv1 implements the rbd diff v1 wire format as described by https://github.com/ceph/ceph/blob/master/doc/dev/rbd-diff.rst#header.
 17 | // Only the Size, UpdatedData and End sections are implemented. Zero data is simply not transmitted.
 18 | var RbdDiffv1 rbdDiffv1
 19 | 
 20 | type rbdDiffv1 struct{}
 21 | // ReadFileSize reads the 9 byte size segment (the size indicator followed by a little-endian uint64) and returns the size.
 22 | func (r rbdDiffv1) ReadFileSize(reader io.Reader) (int64, error) {
 23 | 	// 1 byte for segment type. 8 bytes for int64
 24 | 	var header [1 + 8]byte
 25 | 	_, err := io.ReadFull(reader, header[:])
 26 | 	if err != nil {
 27 | 		return 0, err
 28 | 	}
 29 | 
 30 | 	if header[0] != sizeIndicator {
 31 | 		return 0, fmt.Errorf("invalid header. Expected size segment but got %s", string(header[0]))
 32 | 	}
 33 | 
 34 | 	size := binary.LittleEndian.Uint64(header[1:])
 35 | 	return int64(size), nil
 36 | }
 37 | // ReadSectionHeader reads the next segment header: an end tag yields io.EOF, a data tag yields the section's offset and length, and input that stops early yields io.ErrUnexpectedEOF.
 38 | func (r rbdDiffv1) ReadSectionHeader(reader io.Reader) (Section, error) {
 39 | 	// 8 + 8 bytes hold the offset and length of a data section. The first byte doubles as
 40 | 	// scratch space for the segment type, which is read on its own before the rest.
 41 | 	var segmentHeader [8 + 8]byte
 42 | 
 43 | 	// io.EOF is reserved for the end tag, so running out of input here is reported as truncation
 44 | 	_, err := io.ReadFull(reader, segmentHeader[0:1])
 45 | 	if err == io.EOF {
 46 | 		err = io.ErrUnexpectedEOF
 47 | 	}
 48 | 	if err != nil {
 49 | 		return Section{}, fmt.Errorf("error reading segmentHeader header: %w", err)
 50 | 	}
 51 | 	switch segmentHeader[0] {
 52 | 	case endIndicator:
 53 | 		return Section{}, io.EOF
 54 | 	case dataIndicator:
 55 | 		if _, err = io.ReadFull(reader, segmentHeader[:]); err == io.EOF {
 56 | 			err = io.ErrUnexpectedEOF
 57 | 		}
 58 | 		if err != nil {
 59 | 			return Section{}, fmt.Errorf("error reading data header: %w", err)
 60 | 		}
 61 | 
 62 | 		offset := int64(binary.LittleEndian.Uint64(segmentHeader[:8]))
 63 | 		length := int64(binary.LittleEndian.Uint64(segmentHeader[8:]))
 64 | 		return Section{Offset: offset, Length: length}, nil
 65 | 	}
 66 | 
 67 | 	return Section{}, fmt.Errorf(`invalid section type: "%d:" %x`, segmentHeader[0], segmentHeader[0])
 68 | }
 69 | // GetFileSizeReader returns a reader over the 9 byte size segment (indicator plus little-endian size) and its length.
 70 | func (r rbdDiffv1) GetFileSizeReader(size uint64) (reader io.Reader, length int64) {
 71 | 	buf := make([]byte, 1+8)
 72 | 	buf[0] = sizeIndicator
 73 | 	binary.LittleEndian.PutUint64(buf[1:], size)
 74 | 	return bytes.NewReader(buf), 1 + 8
 75 | }
 76 | // GetSectionReader returns a reader yielding the data header followed by section.Length bytes of source, plus the total length.
 77 | func (r rbdDiffv1) GetSectionReader(source io.Reader, section Section) (reader io.Reader, length int64) {
 78 | 	// char + int64 + int64
 79 | 	const headerSize = 1 + 8 + 8
 80 | 
 81 | 	buf := make([]byte, headerSize)
 82 | 	buf[0] = dataIndicator
 83 | 	// offset first, then length, both little-endian
 84 | 	binary.LittleEndian.PutUint64(buf[1:], uint64(section.Offset))
 85 | 	binary.LittleEndian.PutUint64(buf[1+8:], uint64(section.Length))
 86 | 
 87 | 	headerReader := bytes.NewReader(buf[:])
 88 | 	fileReader := io.LimitReader(source, section.Length)
 89 | 
 90 | 	return io.MultiReader(headerReader, fileReader), headerSize + section.Length
 91 | }
 92 | // GetEndTagReader returns a reader over the single end indicator byte and its length of 1.
 93 | func (r rbdDiffv1) GetEndTagReader() (reader io.Reader, length int64) {
 94 | 	return bytes.NewReader([]byte{endIndicator}), 1
 95 | }
 96 | 
 97 | // RbdDiffv2 implements the rbd diff v2 wire format as described by https://github.com/ceph/ceph/blob/master/doc/dev/rbd-diff.rst#header-1.
 98 | // Only the Size, UpdatedData and End sections are implemented. Zero data is simply not transmitted.
 99 | var RbdDiffv2 rbdDiffv2
100 | 
101 | type rbdDiffv2 struct{}
102 | // ReadFileSize reads the 17 byte size segment and returns the size stored in its last 8 bytes (little-endian).
103 | func (r rbdDiffv2) ReadFileSize(reader io.Reader) (int64, error) {
104 | 	// 1 byte for segment type, 8 bytes that are skipped when parsing, 8 bytes for the size itself
105 | 	var header [1 + 8 + 8]byte
106 | 	_, err := io.ReadFull(reader, header[:])
107 | 	if err != nil {
108 | 		return 0, err
109 | 	}
110 | 
111 | 	if header[0] != sizeIndicator {
112 | 		return 0, fmt.Errorf("invalid header. Expected size segment but got %s", string(header[0]))
113 | 	}
114 | 
115 | 	size := binary.LittleEndian.Uint64(header[9:])
116 | 	return int64(size), nil
117 | }
118 | // ReadSectionHeader reads the next segment header: an end tag yields io.EOF, a data tag yields the section's offset and length, and input that stops early yields io.ErrUnexpectedEOF.
119 | func (r rbdDiffv2) ReadSectionHeader(reader io.Reader) (Section, error) {
120 | 	// 8 + 8 + 8 bytes hold the three 64 bit fields that follow a data tag. The first byte doubles
121 | 	// as scratch space for the segment type, which is read on its own before the rest.
122 | 	var segmentHeader [8 + 8 + 8]byte
123 | 
124 | 	// io.EOF is reserved for the end tag, so running out of input here is reported as truncation
125 | 	_, err := io.ReadFull(reader, segmentHeader[0:1])
126 | 	if err == io.EOF {
127 | 		err = io.ErrUnexpectedEOF
128 | 	}
129 | 	if err != nil {
130 | 		return Section{}, fmt.Errorf("error reading segmentHeader header: %w", err)
131 | 	}
132 | 	switch segmentHeader[0] {
133 | 	case endIndicator:
134 | 		return Section{}, io.EOF
135 | 	case dataIndicator:
136 | 		if _, err = io.ReadFull(reader, segmentHeader[:]); err == io.EOF {
137 | 			err = io.ErrUnexpectedEOF
138 | 		}
139 | 		if err != nil {
140 | 			return Section{}, fmt.Errorf("error reading data header: %w", err)
141 | 		}
142 | 
143 | 		// ignore the first int64 as we don't actually need that
144 | 		offset := int64(binary.LittleEndian.Uint64(segmentHeader[8:16]))
145 | 		length := int64(binary.LittleEndian.Uint64(segmentHeader[16:]))
146 | 		return Section{Offset: offset, Length: length}, nil
147 | 	}
148 | 
149 | 	return Section{}, fmt.Errorf(`invalid section type: "%d:" %x`, segmentHeader[0], segmentHeader[0])
150 | }
151 | // GetFileSizeReader returns a reader over the 17 byte size segment (indicator, the value 8, then size) and its length.
152 | func (r rbdDiffv2) GetFileSizeReader(size uint64) (reader io.Reader, length int64) {
153 | 	buf := make([]byte, 1+8+8)
154 | 	buf[0] = sizeIndicator
155 | 	binary.LittleEndian.PutUint64(buf[1:], 8) // presumably the byte length of the size field that follows — confirm
156 | 	binary.LittleEndian.PutUint64(buf[1+8:], size)
157 | 	return bytes.NewReader(buf), 1 + 8 + 8
158 | }
159 | // GetSectionReader returns a reader yielding the data header followed by section.Length bytes of source, plus the total length.
160 | func (r rbdDiffv2) GetSectionReader(source io.Reader, section Section) (reader io.Reader, length int64) {
161 | 	// char + int64 + int64 + int64
162 | 	const headerSize = 1 + 8 + 8 + 8
163 | 
164 | 	buf := make([]byte, headerSize)
165 | 	buf[0] = dataIndicator
166 | 	// first field: 16 (the offset and length fields) plus the number of data bytes
167 | 	binary.LittleEndian.PutUint64(buf[1:], 16+uint64(section.Length))
168 | 	binary.LittleEndian.PutUint64(buf[1+8:], uint64(section.Offset))
169 | 	binary.LittleEndian.PutUint64(buf[1+8+8:], uint64(section.Length))
170 | 
171 | 	headerReader := bytes.NewReader(buf[:])
172 | 	fileReader := io.LimitReader(source, section.Length)
173 | 
174 | 	return io.MultiReader(headerReader, fileReader), headerSize + section.Length
175 | }
176 | // GetEndTagReader returns a reader over the single end indicator byte and its length of 1.
177 | func (r rbdDiffv2) GetEndTagReader() (reader io.Reader, length int64) {
178 | 	return bytes.NewReader([]byte{endIndicator}), 1
179 | }
180 | 


--------------------------------------------------------------------------------
/copy.go:
--------------------------------------------------------------------------------
  1 | package sparsecat
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"fmt"
  6 | 	"github.com/svenwiltink/sparsecat/format"
  7 | 	"io"
  8 | 	"os"
  9 | )
 10 | // onlyReader exposes just the Read method of the wrapped reader, hiding any other methods (such as WriteTo) it may have.
 11 | type onlyReader struct {
 12 | 	io.Reader
 13 | }
 14 | // zeroReader is a reader that produces zero bytes without ever ending.
 15 | type zeroReader struct{}
 16 | // Read overwrites all of p with zero bytes and reports len(p) with a nil error.
 17 | func (zeroReader) Read(p []byte) (int, error) {
 18 | 	for index := range p {
 19 | 		p[index] = 0
 20 | 	}
 21 | 
 22 | 	return len(p), nil
 23 | }
 24 | // NewDecoder returns a Decoder that reads a sparsecat stream from reader; Format defaults to format.RbdDiffv1.
 25 | func NewDecoder(reader io.Reader) *Decoder {
 26 | 	return &Decoder{reader: reader, Format: format.RbdDiffv1}
 27 | }
 28 | 
 29 | // Decoder decodes an incoming sparsecat stream. It is able to convert it to a 'normal'
 30 | // stream of data using the WriteTo method. An optimized path is used when the target of an io.Copy
 31 | // is an *os.File (not a pipe or socket).
 32 | type Decoder struct {
 33 | 	Format               format.Format // wire format used to parse the stream
 34 | 	DisableSparseWriting bool          // makes WriteTo fall back to a plain copy through Read
 35 | 	DisableFileTruncate  bool          // makes WriteTo skip truncating the target file
 36 | 
 37 | 	reader io.Reader // the encoded input
 38 | 
 39 | 	fileSize      int64 // size from the stream's size header
 40 | 	currentOffset int64 // number of bytes handed out by Read so far
 41 | 
 42 | 	currentSection       io.Reader // padding plus data of the section being read
 43 | 	currentSectionLength int64     // bytes currentSection is expected to yield
 44 | 	currentSectionRead   int       // bytes read from currentSection so far
 45 | 
 46 | 	done bool // set once the end of the stream has been parsed
 47 | }
 48 | 
 49 | // Read is the slow path of the decoder. It outputs the entire file, with the gaps between sections filled with zero bytes.
 50 | func (d *Decoder) Read(p []byte) (int, error) {
 51 | 	var err error
 52 | 	if d.currentSection == nil { // first call: neither the size header nor a section has been parsed yet
 53 | 		d.fileSize, err = d.Format.ReadFileSize(d.reader)
 54 | 		if err != nil {
 55 | 			return 0, fmt.Errorf("error determining target file size: %w", err)
 56 | 		}
 57 | 
 58 | 		err = d.parseSection()
 59 | 		if err != nil {
 60 | 			return 0, fmt.Errorf("error reading first section: %w", err)
 61 | 		}
 62 | 	}
 63 | 
 64 | 	read, err := d.currentSection.Read(p)
 65 | 	d.currentSectionRead += read
 66 | 	d.currentOffset += int64(read)
 67 | 
 68 | 	if err == nil {
 69 | 		return read, nil
 70 | 	}
 71 | 	if !errors.Is(err, io.EOF) {
 72 | 		return read, err
 73 | 	}
 74 | 
 75 | 	// current section has ended. Was it expected?
 76 | 	if d.currentSectionLength != int64(d.currentSectionRead) {
 77 | 		return read, fmt.Errorf("read size doesn't equal section size. %d vs %d. %w", d.currentSectionRead, d.currentSectionLength, io.ErrUnexpectedEOF)
 78 | 	}
 79 | 
 80 | 	// EOF was expected. Are there more sections?
 81 | 	if d.done {
 82 | 		return read, err
 83 | 	}
 84 | 
 85 | 	// there are more sections to read. Reset counter and get next section
 86 | 	d.currentSectionRead = 0
 87 | 
 88 | 	// get next section
 89 | 	err = d.parseSection()
 90 | 	return read, err
 91 | }
 92 | // parseSection reads the next section header and points currentSection at zero padding followed by that section's data.
 93 | func (d *Decoder) parseSection() error {
 94 | 	section, err := d.Format.ReadSectionHeader(d.reader)
 95 | 	if errors.Is(err, io.EOF) { // end of stream: the rest of the file, up to fileSize, is zeroes
 96 | 		d.currentSectionLength = d.fileSize - d.currentOffset
 97 | 		d.currentSection = io.LimitReader(zeroReader{}, d.currentSectionLength)
 98 | 		d.done = true
 99 | 		return nil
100 | 	}
101 | 
102 | 	if err != nil {
103 | 		return err
104 | 	}
105 | 	// the gap between the current offset and the section start is filled with zero bytes
106 | 	padding := section.Offset - d.currentOffset
107 | 	d.currentSectionLength = padding + section.Length
108 | 
109 | 	paddingReader := io.LimitReader(zeroReader{}, padding)
110 | 	dataReader := io.LimitReader(d.reader, section.Length)
111 | 	d.currentSection = io.MultiReader(paddingReader, dataReader)
112 | 
113 | 	return nil
114 | }
115 | 
116 | // WriteTo is the fast path optimisation of Decoder.Read. If the target of io.Copy is an *os.File that is
117 | // capable of seeking WriteTo will be used. It preserves the sparseness of the target file and does not need
118 | // to write the entire file. Only section of the file containing data will be written. When s.DisableSparseWriting
119 | // has been set this falls back to io.Copy with only the s.Read function exposed. When s.DisableFileTruncate has
120 | // been set the output file will not be truncated prior to writing to it
121 | func (d *Decoder) WriteTo(writer io.Writer) (int64, error) {
122 | 	if d.DisableSparseWriting {
123 | 		return io.Copy(writer, onlyReader{d})
124 | 	}
125 | 
126 | 	file, isFile := d.isSeekableFile(writer)
127 | 	if !isFile {
128 | 		return io.Copy(writer, onlyReader{d})
129 | 	}
130 | 
131 | 	size, err := d.Format.ReadFileSize(d.reader)
132 | 
133 | 	if err != nil {
134 | 		return 0, fmt.Errorf("error determining target file size: %w", err)
135 | 	}
136 | 
137 | 	if !d.DisableFileTruncate {
138 | 		err = SparseTruncate(file, size)
139 | 		if err != nil {
140 | 			return 0, fmt.Errorf("error truncating target file: %w", err)
141 | 		}
142 | 	}
143 | 
144 | 	var written int64 = 0
145 | 
146 | 	for {
147 | 		section, err := d.Format.ReadSectionHeader(d.reader)
148 | 		if errors.Is(err, io.EOF) {
149 | 			return written, nil
150 | 		}
151 | 
152 | 		if err != nil {
153 | 			return written, err
154 | 		}
155 | 
156 | 		_, err = file.Seek(section.Offset, io.SeekStart)
157 | 		if err != nil {
158 | 			return written, fmt.Errorf("error seeking to start of data section: %w", err)
159 | 		}
160 | 
161 | 		copied, err := io.Copy(writer, io.LimitReader(d.reader, section.Length))
162 | 		written += copied
163 | 		if err != nil {
164 | 			return written, fmt.Errorf("error copying data: %w", err)
165 | 		}
166 | 	}
167 | }
168 | 
169 | func (d *Decoder) isSeekableFile(writer io.Writer) (*os.File, bool) {
170 | 	file, isFile := writer.(*os.File)
171 | 	if isFile {
172 | 		// not all files are actually seekable. pipes aren't for example
173 | 		_, err := file.Seek(0, io.SeekCurrent)
174 | 		return file, err == nil
175 | 	}
176 | 	return nil, false
177 | }
178 | 
179 | func NewEncoder(file *os.File) *Encoder {
180 | 	return &Encoder{file: file, Format: format.RbdDiffv1, MaxSectionSize: 1 << 32}
181 | }
182 | 
// Encoder encodes a file to a stream of sparsecat data. Read emits, in order, the
// file size header, one section per detected data region and finally the end tag,
// each produced by Format.
type Encoder struct {
	// file is the source that is being encoded.
	file *os.File

	// Format produces the file size header, the section readers and the end tag.
	Format         format.Format
	// MaxSectionSize caps the data length of a section created through hole
	// detection; a longer data region is emitted as several sections.
	MaxSectionSize int64

	// fileSize is not assigned by the methods visible in this part of the file.
	fileSize int64

	// currentOffset is the file offset at which detection of the next section starts.
	currentOffset        int64
	// currentSection is the reader Read is currently draining.
	currentSection       io.Reader
	// currentSectionLength is the number of bytes currentSection is expected to yield.
	currentSectionLength int64
	// currentSectionEnd is the file offset at which the current data section ends;
	// it becomes currentOffset once the section has been read.
	currentSectionEnd    int64
	// currentSectionRead is the number of bytes taken from currentSection so far.
	currentSectionRead   int

	// supportsHoleDetection is decided on the first Read: false for block devices,
	// otherwise the result of supportsSeekHole. When false, sections are found
	// with slowDetectDataSection.
	supportsHoleDetection bool

	// done is set once the end tag reader has been installed as currentSection.
	done bool
}
202 | 
203 | func (e *Encoder) Read(p []byte) (int, error) {
204 | 	if e.currentSection == nil {
205 | 		info, err := e.file.Stat()
206 | 		if err != nil {
207 | 			return 0, fmt.Errorf("error running stat: %w", err)
208 | 		}
209 | 
210 | 		size := uint64(info.Size())
211 | 
212 | 		if isBlockDevice(info) {
213 | 			e.supportsHoleDetection = false
214 | 			bsize, err := getBlockDeviceSize(e.file)
215 | 			if err != nil {
216 | 				return 0, fmt.Errorf("error determining size of block device: %w", err)
217 | 			}
218 | 
219 | 			size = uint64(bsize)
220 | 		} else {
221 | 			e.supportsHoleDetection = supportsSeekHole(e.file)
222 | 		}
223 | 
224 | 		e.currentSection, e.currentSectionLength = e.Format.GetFileSizeReader(size)
225 | 	}
226 | 
227 | 	read, err := e.currentSection.Read(p)
228 | 	e.currentSectionRead += read
229 | 
230 | 	if err == nil {
231 | 		return read, err
232 | 	}
233 | 
234 | 	if !errors.Is(err, io.EOF) {
235 | 		return read, err
236 | 	}
237 | 
238 | 	// current section has ended. Was it expected?
239 | 	if e.currentSectionLength != int64(e.currentSectionRead) {
240 | 		return read, fmt.Errorf("read size doesn't equal section size. %d vs %d. %w", e.currentSectionRead, e.currentSectionLength, io.ErrUnexpectedEOF)
241 | 	}
242 | 
243 | 	// are there more sections to come?
244 | 	if e.done {
245 | 		return read, io.EOF
246 | 	}
247 | 
248 | 	e.currentOffset = e.currentSectionEnd
249 | 	e.currentSectionRead = 0
250 | 
251 | 	err = e.parseSection()
252 | 	return read, err
253 | }
254 | 
255 | func (e *Encoder) parseSection() error {
256 | 	if !e.supportsHoleDetection {
257 | 		return e.slowDetectSection()
258 | 	}
259 | 
260 | 	start, end, err := detectDataSection(e.file, e.currentOffset)
261 | 	if errors.Is(err, io.EOF) {
262 | 		e.currentSection, e.currentSectionLength = e.Format.GetEndTagReader()
263 | 		e.done = true
264 | 		return nil
265 | 	}
266 | 
267 | 	if err != nil {
268 | 		return fmt.Errorf("error detecting data section: %w", err)
269 | 	}
270 | 
271 | 	length := end - start
272 | 
273 | 	if length > e.MaxSectionSize {
274 | 		end = start + e.MaxSectionSize
275 | 		length = e.MaxSectionSize
276 | 	}
277 | 
278 | 	e.currentSectionEnd = end
279 | 
280 | 	_, err = e.file.Seek(start, io.SeekStart)
281 | 	if err != nil {
282 | 		return err
283 | 	}
284 | 
285 | 	e.currentSection, e.currentSectionLength = e.Format.GetSectionReader(e.file, format.Section{
286 | 		Offset: start,
287 | 		Length: length,
288 | 	})
289 | 
290 | 	return nil
291 | }
292 | 
293 | func (e *Encoder) slowDetectSection() error {
294 | 	start, end, reader, err := slowDetectDataSection(e.file, e.currentOffset)
295 | 	if errors.Is(err, io.EOF) {
296 | 		e.currentSection, e.currentSectionLength = e.Format.GetEndTagReader()
297 | 		e.done = true
298 | 		return nil
299 | 	}
300 | 
301 | 	if err != nil {
302 | 		return fmt.Errorf("error detecting data section for block device: %w", err)
303 | 	}
304 | 
305 | 	length := end - start
306 | 	e.currentSectionEnd = end
307 | 
308 | 	e.currentSection, e.currentSectionLength = e.Format.GetSectionReader(reader, format.Section{
309 | 		Offset: start,
310 | 		Length: length,
311 | 	})
312 | 
313 | 	return nil
314 | }
315 | 


--------------------------------------------------------------------------------