├── LICENSE ├── README.md ├── cpc.go ├── cpc ├── cpc.go └── cpc_test.go ├── go.mod └── go.sum /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2023 Tailscale Inc & AUTHORS. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cpc 2 | 3 | A copy tool for incremental copies of large files, such as databases. 4 | 5 | It's like `rsync --inplace` but a bit faster. 6 | 7 | It uses a thread per core and doesn't do writes of 4KB pages that are unchanged. 8 | 9 | ## Problem statement 10 | 11 | We had two filesystems on a machine and some large SQLite databases and other 12 | files on one filesystem that we wanted to move to the other filesystem. More 13 | specifically: two ext4 filesystems on separate AWS EBS block devices, both 14 | attached to the same Linux 64-core VM with lots of memory (larger than the data 15 | to be copied). 16 | 17 | To minimize service disruption, we wanted to do a live inconsistent copy of the 18 | data to get most of it over, then stop the service, then do another quick 19 | incremental copy, then start it up again. 20 | 21 | This tool let us do the migration with minimal downtime. 22 | 23 | -------------------------------------------------------------------------------- /cpc.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) Tailscale Inc & AUTHORS 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | 4 | // The cpc command is like cp but optimized for files like SQLite that 5 | // are only written at 4K page granularity. It writes the dest file in-place 6 | // and tries to not write a page that's identical. 
7 | package main 8 | 9 | import ( 10 | "context" 11 | "flag" 12 | "log" 13 | "os" 14 | "path/filepath" 15 | 16 | "github.com/tailscale/cpc/cpc" 17 | ) 18 | 19 | func main() { 20 | flag.Parse() 21 | n := flag.NArg() 22 | if n < 2 { 23 | log.Fatalf("usage: cpc ") 24 | } 25 | last := flag.Arg(n - 1) 26 | var lastIsDir bool 27 | if fi, err := os.Stat(last); err == nil && fi.IsDir() { 28 | lastIsDir = true 29 | } 30 | if n > 2 && !lastIsDir { 31 | log.Fatalf("with more than two arguments, final one must be a directory") 32 | } 33 | ctx := context.Background() 34 | // Directory copy mode. 35 | if lastIsDir { 36 | for _, srcName := range flag.Args()[:n-1] { 37 | dstName := filepath.Join(last, filepath.Base(srcName)) 38 | if _, err := cpc.Copy(ctx, log.Printf, srcName, dstName); err != nil { 39 | log.Fatal(err) 40 | } 41 | } 42 | return 43 | } 44 | // Single file copy mode. 45 | srcName, dstName := flag.Arg(0), flag.Arg(1) 46 | if _, err := cpc.Copy(ctx, log.Printf, srcName, dstName); err != nil { 47 | log.Fatal(err) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /cpc/cpc.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) Tailscale Inc & AUTHORS 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | 4 | // Package cpc provides a copy function optimized for files like SQLite that 5 | // are only written at 4K page granularity. It writes the dest file in-place 6 | // and tries to not write a page that's identical. 7 | package cpc 8 | 9 | import ( 10 | "bytes" 11 | "context" 12 | "fmt" 13 | "os" 14 | "runtime" 15 | "sync/atomic" 16 | "time" 17 | 18 | "golang.org/x/sync/errgroup" 19 | ) 20 | 21 | // Stats contains the stats of a copy operation. 22 | type Stats struct { 23 | Duration time.Duration 24 | PageSize int64 25 | PagesWritten int64 26 | PagesUnmodified int64 27 | } 28 | 29 | const pgSize = 4 << 10 30 | 31 | // Page is a 4K page of a file. 
32 | type Page struct { 33 | Off int64 // always 4K aligned 34 | Len int // usually 4K, except at the tail 35 | } 36 | 37 | // Logf is a logger that takes a format string and arguments. 38 | type Logf func(format string, args ...interface{}) 39 | 40 | // Copy provides a concurrent blockwise copy of srcName to dstName. 41 | func Copy(ctx context.Context, logf Logf, srcName, dstName string) (*Stats, error) { 42 | numCPU := runtime.NumCPU() 43 | 44 | t0 := time.Now() 45 | srcF, err := os.Open(srcName) 46 | if err != nil { 47 | return nil, err 48 | } 49 | fi, err := srcF.Stat() 50 | if err != nil { 51 | return nil, err 52 | } 53 | if !fi.Mode().IsRegular() { 54 | return nil, fmt.Errorf("only copies regular files; src %v is %v", srcName, fi.Mode()) 55 | } 56 | size := fi.Size() 57 | 58 | dstF, err := os.OpenFile(dstName, os.O_CREATE|os.O_RDWR, fi.Mode().Perm()) 59 | if err != nil { 60 | return nil, err 61 | } 62 | if err := dstF.Truncate(size); err != nil { 63 | return nil, err 64 | } 65 | 66 | pages := 0 67 | workc := make(chan Page, size/pgSize+1) 68 | remainSize := size 69 | off := int64(0) 70 | for remainSize > 0 { 71 | chunkSize := remainSize 72 | if chunkSize > pgSize { 73 | chunkSize = pgSize 74 | } 75 | p := Page{Off: off, Len: int(chunkSize)} 76 | remainSize -= chunkSize 77 | off += chunkSize 78 | pages++ 79 | workc <- p 80 | } 81 | close(workc) 82 | 83 | logf("file %v is %v bytes, %v pages", srcName, size, pages) 84 | logf("over %v CPUs, %v pages per CPU", numCPU, pages/numCPU) 85 | 86 | var pagesUnmodified atomicInt64 87 | var pagesWritten atomicInt64 88 | var pagesTotal atomicInt64 89 | 90 | copyPage := func(p Page, bufSrc, bufDst []byte) error { 91 | bufSrc = bufSrc[:p.Len] 92 | bufDst = bufDst[:p.Len] 93 | // Note: ReadAt doesn't do short reads like io.Reader. Also, these two 94 | // ReadAt calls could be in theory be concurrent but we're already 95 | // running NumCPUs goroutines, so it wouldn't really help. 
96 | if _, err := srcF.ReadAt(bufSrc, p.Off); err != nil { 97 | return err 98 | } 99 | if _, err := dstF.ReadAt(bufDst, p.Off); err != nil { 100 | return err 101 | } 102 | if bytes.Equal(bufSrc, bufDst) { 103 | pagesUnmodified.Add(1) 104 | return nil 105 | } 106 | if _, err := dstF.WriteAt(bufSrc, p.Off); err != nil { 107 | return err 108 | } 109 | pagesWritten.Add(1) 110 | return nil 111 | } 112 | 113 | var lastPrint atomic.Value // of time.Time 114 | printProgress := func() { 115 | logf("%0.2f%% done; %v pages written, %v unchanged", 116 | float64(pagesTotal.Load())*100/float64(pages), 117 | pagesWritten.Load(), 118 | pagesUnmodified.Load()) 119 | } 120 | 121 | grp, ctx := errgroup.WithContext(ctx) 122 | for i := 0; i < numCPU; i++ { 123 | grp.Go(func() error { 124 | bufSrc := make([]byte, pgSize) 125 | bufDst := make([]byte, pgSize) 126 | for { 127 | select { 128 | case <-ctx.Done(): 129 | return ctx.Err() 130 | case p, ok := <-workc: 131 | if !ok { 132 | return nil 133 | } 134 | if err := copyPage(p, bufSrc, bufDst); err != nil { 135 | return err 136 | } 137 | done := pagesTotal.Add(1) 138 | lastPrintTime, _ := lastPrint.Load().(time.Time) 139 | if done%100 == 0 && time.Since(lastPrintTime) > time.Second { 140 | printProgress() 141 | lastPrint.Store(time.Now()) 142 | } 143 | } 144 | } 145 | }) 146 | } 147 | if err := grp.Wait(); err != nil { 148 | return nil, err 149 | } 150 | printProgress() 151 | d := time.Since(t0) 152 | logf("Done in %v", d.Round(time.Millisecond)) 153 | if pagesWritten.Load()+pagesUnmodified.Load() != int64(pages) { 154 | return nil, fmt.Errorf("not consistent; expected %v pages total", pages) 155 | } 156 | return &Stats{ 157 | Duration: d, 158 | PageSize: pgSize, 159 | PagesWritten: pagesWritten.Load(), 160 | PagesUnmodified: pagesUnmodified.Load(), 161 | }, nil 162 | } 163 | 164 | // atomicInt64 is sync/atomic.Int64, but this package is targeting 165 | // pretty ancient Go. 
166 | type atomicInt64 int64 167 | 168 | func (x *atomicInt64) Load() int64 { return atomic.LoadInt64((*int64)(x)) } 169 | func (x *atomicInt64) Add(v int64) int64 { return atomic.AddInt64((*int64)(x), v) } 170 | -------------------------------------------------------------------------------- /cpc/cpc_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) Tailscale Inc & AUTHORS 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | 4 | package cpc 5 | 6 | import ( 7 | "bytes" 8 | "context" 9 | "crypto/rand" 10 | "fmt" 11 | "io/ioutil" 12 | mathrand "math/rand" 13 | "path/filepath" 14 | "runtime" 15 | "testing" 16 | ) 17 | 18 | func TestCopyBlockwise(t *testing.T) { 19 | sizes := []int64{0, 1} 20 | for n := 1; n < runtime.NumCPU(); n++ { 21 | for delta := -1; delta <= 1; delta++ { 22 | sizes = append(sizes, int64(n*(4<<10)+delta)) 23 | } 24 | } 25 | td := t.TempDir() 26 | for _, size := range sizes { 27 | t.Run(fmt.Sprintf("size%d", size), func(t *testing.T) { 28 | ss := fmt.Sprint(size) 29 | src := filepath.Join(td, "input-"+ss) 30 | dst := filepath.Join(td, "output-"+ss) 31 | want := randBytes(int(size)) 32 | if err := ioutil.WriteFile(src, want, 0644); err != nil { 33 | t.Fatal(err) 34 | } 35 | // 3 runs: initial copy, no-op copy, single byte dirtied copy 36 | for run := 1; run <= 3; run++ { 37 | t.Run(fmt.Sprintf("run%d", run), func(t *testing.T) { 38 | // On the 3rd run, dirty one random byte of the dst file. 39 | if run == 3 { 40 | if size == 0 { 41 | t.Skip("n/a") 42 | } 43 | dirtyCopy := append([]byte(nil), want...) 
44 | dirtyCopy[mathrand.Intn(int(size))] = byte(mathrand.Intn(256)) 45 | if err := ioutil.WriteFile(dst, dirtyCopy, 0644); err != nil { 46 | t.Fatal(err) 47 | } 48 | } 49 | st, err := Copy(context.Background(), loggerDiscard, src, dst) 50 | if err != nil { 51 | t.Fatalf("cpblockwise: %v", err) 52 | } 53 | if run == 1 && st.PagesUnmodified != 0 { 54 | t.Errorf("initial unmodified pages = %v; want 0", st.PagesUnmodified) 55 | } 56 | if run == 2 && st.PagesWritten > 0 { 57 | t.Errorf("second run written pages = %v; want 0", st.PagesWritten) 58 | } 59 | if run == 3 { 60 | if st.PagesWritten != 1 { 61 | t.Errorf("PagesWritten = %v; want 1", st.PagesWritten) 62 | } 63 | if size > 4<<10 && st.PagesUnmodified == 0 { 64 | t.Errorf("PagesUnmodified = %v; want >0", st.PagesUnmodified) 65 | } 66 | } 67 | got, err := ioutil.ReadFile(dst) 68 | if err != nil { 69 | t.Fatal(err) 70 | } 71 | if !bytes.Equal(got, want) { 72 | t.Fatalf("bytes didn't equal; dst len = %v; want len %v", len(got), size) 73 | } 74 | }) 75 | } 76 | }) 77 | } 78 | } 79 | 80 | // loggerDiscard is a Logf that throws away the logs given to it. 81 | func loggerDiscard(string, ...interface{}) {} 82 | 83 | func randBytes(n int) []byte { 84 | ret := make([]byte, n) 85 | rand.Read(ret) 86 | return ret 87 | } 88 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/tailscale/cpc 2 | 3 | go 1.15 4 | 5 | require golang.org/x/sync v0.1.0 6 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= 2 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 3 | --------------------------------------------------------------------------------