├── archive └── tar │ ├── testdata │ ├── small.txt │ ├── small2.txt │ ├── v7.tar │ ├── gnu-utf8.tar │ ├── neg-size.tar │ ├── issue10968.tar │ ├── issue12435.tar │ ├── writer-big.tar │ ├── gnu-not-utf8.tar │ ├── invalid-go17.tar │ ├── gnu-sparse-big.tar │ ├── pax-bad-hdr-large.tar.bz2 │ ├── gnu-sparse-many-zeros.tar.bz2 │ ├── issue11169.tar │ ├── nil-uid.tar │ ├── pax-path-hdr.tar │ ├── ustar-file-devs.tar │ ├── ustar-file-reg.tar │ ├── gnu-nil-sparse-hole.tar │ ├── writer-big-long.tar │ ├── ustar.tar │ ├── file-and-dir.tar │ ├── gnu-long-nul.tar │ ├── hardlink.tar │ ├── pax-bad-hdr-file.tar │ ├── pax-nul-path.tar │ ├── pax-nul-xattrs.tar │ ├── trailing-slash.tar │ ├── gnu-nil-sparse-data.tar │ ├── pax-bad-mtime-file.tar │ ├── pax-pos-size-file.tar │ ├── pax-records.tar │ ├── gnu-incremental.tar │ ├── gnu.tar │ ├── star.tar │ ├── pax-nil-sparse-hole.tar │ ├── writer.tar │ ├── pax-nil-sparse-data.tar │ ├── gnu-multi-hdrs.tar │ ├── pax-multi-hdrs.tar │ ├── xattrs.tar │ ├── pax-sparse-big.tar │ ├── pax-global-records.tar │ ├── hdr-only.tar │ └── pax.tar │ ├── stat_actime1.go │ ├── stat_actime2.go │ ├── example_test.go │ ├── stat_unix.go │ ├── strconv.go │ └── format.go ├── tar ├── asm │ ├── testdata │ │ ├── t.tar.gz │ │ ├── extranils.tar.gz │ │ ├── iso-8859.tar.gz │ │ ├── longlink.tar.gz │ │ ├── fatlonglink.tar.gz │ │ ├── notenoughnils.tar.gz │ │ └── 1c51fc286aa95d9413226599576bafa38490b1e292375c90de095855b64caea6 │ ├── doc.go │ ├── README.md │ ├── disassemble_test.go │ ├── iterate.go │ ├── assemble.go │ ├── iterate_test.go │ ├── disassemble.go │ └── assemble_test.go └── storage │ ├── doc.go │ ├── entry_test.go │ ├── getter_test.go │ ├── entry.go │ ├── packer.go │ ├── getter.go │ └── packer_test.go ├── mage.go ├── .github ├── FUNDING.yml └── workflows │ ├── lint.yml │ └── go.yml ├── mage_color.go ├── go.mod ├── cmd └── tar-split │ ├── README.md │ ├── disasm.go │ ├── asm.go │ ├── tar_benchmark_test.go │ ├── main.go │ └── checksize.go ├── LICENSE ├── concept ├── main.go 
└── DESIGN.md ├── go.sum ├── magefile.go └── README.md /archive/tar/testdata/small.txt: -------------------------------------------------------------------------------- 1 | Kilts -------------------------------------------------------------------------------- /archive/tar/testdata/small2.txt: -------------------------------------------------------------------------------- 1 | Google.com 2 | -------------------------------------------------------------------------------- /tar/asm/testdata/t.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vbatts/tar-split/HEAD/tar/asm/testdata/t.tar.gz -------------------------------------------------------------------------------- /archive/tar/testdata/v7.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vbatts/tar-split/HEAD/archive/tar/testdata/v7.tar -------------------------------------------------------------------------------- /archive/tar/testdata/gnu-utf8.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vbatts/tar-split/HEAD/archive/tar/testdata/gnu-utf8.tar -------------------------------------------------------------------------------- /archive/tar/testdata/neg-size.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vbatts/tar-split/HEAD/archive/tar/testdata/neg-size.tar -------------------------------------------------------------------------------- /tar/asm/testdata/extranils.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vbatts/tar-split/HEAD/tar/asm/testdata/extranils.tar.gz -------------------------------------------------------------------------------- /tar/asm/testdata/iso-8859.tar.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/vbatts/tar-split/HEAD/tar/asm/testdata/iso-8859.tar.gz -------------------------------------------------------------------------------- /tar/asm/testdata/longlink.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vbatts/tar-split/HEAD/tar/asm/testdata/longlink.tar.gz -------------------------------------------------------------------------------- /archive/tar/testdata/issue10968.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vbatts/tar-split/HEAD/archive/tar/testdata/issue10968.tar -------------------------------------------------------------------------------- /archive/tar/testdata/issue12435.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vbatts/tar-split/HEAD/archive/tar/testdata/issue12435.tar -------------------------------------------------------------------------------- /archive/tar/testdata/writer-big.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vbatts/tar-split/HEAD/archive/tar/testdata/writer-big.tar -------------------------------------------------------------------------------- /tar/asm/testdata/fatlonglink.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vbatts/tar-split/HEAD/tar/asm/testdata/fatlonglink.tar.gz -------------------------------------------------------------------------------- /archive/tar/testdata/gnu-not-utf8.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vbatts/tar-split/HEAD/archive/tar/testdata/gnu-not-utf8.tar 
-------------------------------------------------------------------------------- /archive/tar/testdata/invalid-go17.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vbatts/tar-split/HEAD/archive/tar/testdata/invalid-go17.tar -------------------------------------------------------------------------------- /tar/asm/testdata/notenoughnils.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vbatts/tar-split/HEAD/tar/asm/testdata/notenoughnils.tar.gz -------------------------------------------------------------------------------- /archive/tar/testdata/gnu-sparse-big.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vbatts/tar-split/HEAD/archive/tar/testdata/gnu-sparse-big.tar -------------------------------------------------------------------------------- /archive/tar/testdata/pax-bad-hdr-large.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vbatts/tar-split/HEAD/archive/tar/testdata/pax-bad-hdr-large.tar.bz2 -------------------------------------------------------------------------------- /archive/tar/testdata/gnu-sparse-many-zeros.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vbatts/tar-split/HEAD/archive/tar/testdata/gnu-sparse-many-zeros.tar.bz2 -------------------------------------------------------------------------------- /mage.go: -------------------------------------------------------------------------------- 1 | // +build ignore 2 | 3 | package main 4 | 5 | import ( 6 | "os" 7 | "github.com/magefile/mage/mage" 8 | ) 9 | 10 | func main() { os.Exit(mage.Main()) } 11 | -------------------------------------------------------------------------------- 
/tar/asm/testdata/1c51fc286aa95d9413226599576bafa38490b1e292375c90de095855b64caea6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vbatts/tar-split/HEAD/tar/asm/testdata/1c51fc286aa95d9413226599576bafa38490b1e292375c90de095855b64caea6 -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [vbatts] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | #open_collective: # Replace with a single Open Collective username 5 | -------------------------------------------------------------------------------- /tar/asm/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package asm provides the API for streaming assembly and disassembly of tar 3 | archives. 4 | 5 | Using the `github.com/vbatts/tar-split/tar/storage` for Packing/Unpacking the 6 | metadata for a stream, as well as an implementation of Getting/Putting the file 7 | entries' payload. 8 | */ 9 | package asm 10 | -------------------------------------------------------------------------------- /tar/storage/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package storage is for metadata of a tar archive. 3 | 4 | Packing and unpacking the Entries of the stream. The types of streams are 5 | either segments of raw bytes (for the raw headers and various padding) and for 6 | an entry marking a file payload. 7 | 8 | The raw bytes are stored precisely in the packed (marshalled) Entry, whereas 9 | the file payload marker include the name of the file, size, and crc64 checksum 10 | (for basic file integrity). 
11 | */ 12 | package storage 13 | -------------------------------------------------------------------------------- /archive/tar/stat_actime1.go: -------------------------------------------------------------------------------- 1 | // Copyright 2012 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // +build linux dragonfly openbsd solaris 6 | 7 | package tar 8 | 9 | import ( 10 | "syscall" 11 | "time" 12 | ) 13 | 14 | func statAtime(st *syscall.Stat_t) time.Time { 15 | return time.Unix(st.Atim.Unix()) 16 | } 17 | 18 | func statCtime(st *syscall.Stat_t) time.Time { 19 | return time.Unix(st.Ctim.Unix()) 20 | } 21 | -------------------------------------------------------------------------------- /archive/tar/stat_actime2.go: -------------------------------------------------------------------------------- 1 | // Copyright 2012 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | // +build darwin freebsd netbsd 6 | 7 | package tar 8 | 9 | import ( 10 | "syscall" 11 | "time" 12 | ) 13 | 14 | func statAtime(st *syscall.Stat_t) time.Time { 15 | return time.Unix(st.Atimespec.Unix()) 16 | } 17 | 18 | func statCtime(st *syscall.Stat_t) time.Time { 19 | return time.Unix(st.Ctimespec.Unix()) 20 | } 21 | -------------------------------------------------------------------------------- /archive/tar/testdata/issue11169.tar: -------------------------------------------------------------------------------- 1 | ./PaxHeaders.14463/aaa00006440000000000000000000000132125311453710114200xustar0030 00000=00000000000000000000030 00000=00000000000000000000030 00000=000000000000000000000 -------------------------------------------------------------------------------- /mage_color.go: -------------------------------------------------------------------------------- 1 | //go:build mage 2 | // +build mage 3 | 4 | package main 5 | 6 | import ( 7 | "io" 8 | "os" 9 | 10 | "github.com/fatih/color" 11 | ) 12 | 13 | var ( 14 | ourStdout = cw{c: color.New(color.FgGreen), o: os.Stdout} 15 | ourStderr = cw{c: color.New(color.FgRed), o: os.Stderr} 16 | ) 17 | 18 | // hack around color.Color not implementing Write() 19 | type cw struct { 20 | c *color.Color 21 | o io.Writer 22 | } 23 | 24 | func (cw cw) Write(p []byte) (int, error) { 25 | i := len(p) 26 | _, err := cw.c.Fprint(cw.o, string(p)) // discarding the number of bytes written for now... 
27 | return i, err 28 | } 29 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/vbatts/tar-split 2 | 3 | go 1.17 4 | 5 | require ( 6 | github.com/fatih/color v1.15.0 7 | github.com/magefile/mage v1.14.0 8 | github.com/sirupsen/logrus v1.9.3 9 | github.com/stretchr/testify v1.9.0 10 | github.com/urfave/cli v1.22.16 11 | ) 12 | 13 | require ( 14 | github.com/cpuguy83/go-md2man/v2 v2.0.5 // indirect 15 | github.com/davecgh/go-spew v1.1.1 // indirect 16 | github.com/mattn/go-colorable v0.1.13 // indirect 17 | github.com/mattn/go-isatty v0.0.17 // indirect 18 | github.com/pmezard/go-difflib v1.0.0 // indirect 19 | github.com/russross/blackfriday/v2 v2.1.0 // indirect 20 | golang.org/x/sys v0.26.0 // indirect 21 | gopkg.in/yaml.v3 v3.0.1 // indirect 22 | ) 23 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | 3 | on: 4 | pull_request: 5 | branches_ignore: [] 6 | 7 | jobs: 8 | lint: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | matrix: 12 | go: ['1.22'] 13 | 14 | name: Linting 15 | steps: 16 | 17 | - uses: actions/checkout@v2 18 | with: 19 | path: go/src/github.com/vbatts/tar-split 20 | 21 | - uses: actions/setup-go@v4 22 | with: 23 | go-version: ${{ matrix.go }} 24 | 25 | - name: lint 26 | env: 27 | GOPATH: /home/runner/work/tar-split/tar-split/go 28 | run: | 29 | set -x 30 | export PATH=$GOPATH/bin:$PATH 31 | cd go/src/github.com/vbatts/tar-split 32 | go run mage.go -v lint 33 | -------------------------------------------------------------------------------- /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | name: build and vet 2 | 3 | on: 4 | pull_request: 5 | branches_ignore: [] 6 | 7 | jobs: 8 | build: 9 | 
runs-on: ubuntu-latest 10 | strategy: 11 | matrix: 12 | go: ['1.18', '1.19', '1.20', '1.21', '1.22'] 13 | 14 | name: build and vet 15 | steps: 16 | 17 | - uses: actions/checkout@v2 18 | with: 19 | path: go/src/github.com/vbatts/tar-split 20 | 21 | - uses: actions/setup-go@v4 22 | with: 23 | go-version: ${{ matrix.go }} 24 | 25 | - name: vet and build 26 | env: 27 | GOPATH: /home/runner/work/tar-split/tar-split/go 28 | run: | 29 | set -x 30 | export PATH=$GOPATH/bin:$PATH 31 | cd go/src/github.com/vbatts/tar-split 32 | go run mage.go -v vet build test 33 | -------------------------------------------------------------------------------- /archive/tar/testdata/nil-uid.tar: -------------------------------------------------------------------------------- 1 | P1050238.JPG.log00006640000000001612130627766012777 0ustar eyefieyefi121304042001213062776644,44,POWERON -------------------------------------------------------------------------------- /archive/tar/testdata/pax-path-hdr.tar: -------------------------------------------------------------------------------- 1 | path/to/readme/readme/readme/readme/readme/readme/readme/readme/readme/readme/readme/readme/readme/r0000000000000000000000000000004100000000000032025 xustar000000000000000033 path=PAX1/PAX1/long-path-name 2 | -------------------------------------------------------------------------------- /cmd/tar-split/README.md: -------------------------------------------------------------------------------- 1 | # tar-split utility 2 | 3 | ## Installation 4 | 5 | go get -u github.com/vbatts/tar-split/cmd/tar-split 6 | 7 | ## Usage 8 | 9 | ### Disassembly 10 | 11 | ```bash 12 | $ sha256sum archive.tar 13 | d734a748db93ec873392470510b8a1c88929abd8fae2540dc43d5b26f7537868 archive.tar 14 | $ mkdir ./x 15 | $ tar-split disasm --output tar-data.json.gz ./archive.tar | tar -C ./x -x 16 | time="2015-07-20T15:45:04-04:00" level=info msg="created tar-data.json.gz from ./archive.tar (read 204800 bytes)" 17 | ``` 18 | 19 | ### Assembly 20 
| 21 | ```bash 22 | $ tar-split asm --output new.tar --input ./tar-data.json.gz --path ./x/ 23 | INFO[0000] created new.tar from ./x/ and ./tar-data.json.gz (wrote 204800 bytes) 24 | $ sha256sum new.tar 25 | d734a748db93ec873392470510b8a1c88929abd8fae2540dc43d5b26f7537868 new.tar 26 | ``` 27 | 28 | ### Estimating metadata size 29 | 30 | ```bash 31 | $ tar-split checksize ./archive.tar 32 | inspecting "./archive.tar" (size 200k) 33 | -- number of files: 28 34 | -- size of metadata uncompressed: 28k 35 | -- size of gzip compressed metadata: 1k 36 | ``` 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /archive/tar/testdata/ustar-file-devs.tar: -------------------------------------------------------------------------------- 1 | file0000644000000000000000000000000000000000000010037 0ustar0000000010000001 -------------------------------------------------------------------------------- /archive/tar/testdata/ustar-file-reg.tar: -------------------------------------------------------------------------------- 1 | foo0000640116074500116100000000125412575676024010640 0ustar00joetsaiengiRFmWghs3CK9/2HSvRja4TzX8HsRwzbVYl+h0HRkH9uPho2BGmrG5a0vpHsPn2W7Pn33Ux/+rkLSA3GUOX/WiPmP+h73T1r0DZIDJXtOgYWIUhsqUE0zUz1LEaO/y2H+WAe/ZlWt90N2KHka0bkXajoEAdOUrN42PKl/3mu7jiCW45hTNBDp3ArJD8QHN7l3JFMfnusPuir9+K8Oh6bEfN2bHhXjZ41ZkweCHZWUKT8NsdHeObQnXAyvkU5q1OhefE0+uvksVba2ZNyhThAAGZgiqEtTOJJLm8zgcI5avXHMVwlR6mt1jepOct4jQNlAdpkmslKW3BuiwLswGAsw7ttr/pRa/oCT4HUoBWcY3w96+TGR6uXtvbDOM9WhPXGo+1bwhAsA/RXPA1ZX+oS6t4rl/ZvkMZZN4VO5OvKph8tthdG3ocpXUw11zv6mQ7n6kyObLDCMFOtkdnhQBU/BGEK6mw4oTRa1Hd91+bUUqQh6hl3JeDk/t2KDWOEehOxgOqfVG72UuMeo2IayNK/pUXrcUXuywq9KT+bWQxdJsXzwkkyT8Ovz4oiIzHAa14e/Ib8Xxz+BHwpN3TtOXsHziuqLGMzqv867CganwsFxNEGRaTQ6C2bRK+OxetaxhQqe1G/UWwfi5a9PuJC3wfITSa0IhBot9hGAG35VVb4LsRE= -------------------------------------------------------------------------------- /archive/tar/testdata/gnu-nil-sparse-hole.tar: 
-------------------------------------------------------------------------------- 1 | sparse.db0000000000000000000000000000000000000000000014076 Sustar 00000000000000000000017500000000000000000001750 -------------------------------------------------------------------------------- /archive/tar/testdata/writer-big-long.tar: -------------------------------------------------------------------------------- 1 | longname/longname/longname/longname/longname/longname/longname/longname/longname/longname/longname/l0000000000000000000000000000025600000000000031420 xustar00154 path=longname/longname/longname/longname/longname/longname/longname/longname/longname/longname/longname/longname/longname/longname/longname/16gig.txt 2 | 20 size=17179869184 3 | longname/longname/longname/longname/longname/longname/longname/longname/longname/longname/longname/l0000644000175000017500000000000012332770507036462 0ustar00guillaumeguillaume00000000000000 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Vincent Batts, Raleigh, NC, USA 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software without 17 | specific prior written permission. 
18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /cmd/tar-split/disasm.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "compress/gzip" 5 | "io" 6 | "os" 7 | 8 | "github.com/sirupsen/logrus" 9 | "github.com/urfave/cli" 10 | "github.com/vbatts/tar-split/tar/asm" 11 | "github.com/vbatts/tar-split/tar/storage" 12 | ) 13 | 14 | func CommandDisasm(c *cli.Context) { 15 | if len(c.Args()) != 1 { 16 | logrus.Fatalf("please specify tar to be disabled ") 17 | } 18 | if len(c.String("output")) == 0 { 19 | logrus.Fatalf("--output filename must be set") 20 | } 21 | 22 | // Set up the tar input stream 23 | var inputStream io.Reader 24 | if c.Args()[0] == "-" { 25 | inputStream = os.Stdin 26 | } else { 27 | fh, err := os.Open(c.Args()[0]) 28 | if err != nil { 29 | logrus.Fatal(err) 30 | } 31 | defer fh.Close() 32 | inputStream = fh 33 | } 34 | 35 | // Set up the metadata storage 36 | mf, err := os.OpenFile(c.String("output"), os.O_CREATE|os.O_WRONLY|os.O_TRUNC, os.FileMode(0600)) 37 | if err != nil { 38 | logrus.Fatal(err) 39 | } 40 | defer mf.Close() 41 | mfz := 
gzip.NewWriter(mf) 42 | defer mfz.Close() 43 | metaPacker := storage.NewJSONPacker(mfz) 44 | 45 | // we're passing nil here for the file putter, because the ApplyDiff will 46 | // handle the extraction of the archive 47 | its, err := asm.NewInputTarStream(inputStream, metaPacker, nil) 48 | if err != nil { 49 | logrus.Fatal(err) 50 | } 51 | var out io.Writer 52 | if c.Bool("no-stdout") { 53 | out = io.Discard 54 | } else { 55 | out = os.Stdout 56 | } 57 | i, err := io.Copy(out, its) 58 | if err != nil { 59 | logrus.Fatal(err) 60 | } 61 | logrus.Infof("created %s from %s (read %d bytes)", c.String("output"), c.Args()[0], i) 62 | } 63 | -------------------------------------------------------------------------------- /tar/asm/README.md: -------------------------------------------------------------------------------- 1 | asm 2 | === 3 | 4 | This library for assembly and disassembly of tar archives, facilitated by 5 | `github.com/vbatts/tar-split/tar/storage`. 6 | 7 | 8 | Concerns 9 | -------- 10 | 11 | For completely safe assembly/disassembly, there will need to be a Content 12 | Addressable Storage (CAS) directory, that maps to a checksum in the 13 | `storage.Entity` of `storage.FileType`. 14 | 15 | This is due to the fact that tar archives _can_ allow multiple records for the 16 | same path, but the last one effectively wins. Even if the prior records had a 17 | different payload. 18 | 19 | In this way, when assembling an archive from relative paths, if the archive has 20 | multiple entries for the same path, then all payloads read in from a relative 21 | path would be identical. 22 | 23 | 24 | Thoughts 25 | -------- 26 | 27 | Have a look-aside directory or storage. This way when a clobbering record is 28 | encountered from the tar stream, then the payload of the prior/existing file is 29 | stored to the CAS. This way the clobbering record's file payload can be 30 | extracted, but we'll have preserved the payload needed to reassemble a precise 31 | tar archive. 
32 | 33 | clobbered/path/to/file.[0-N] 34 | 35 | *alternatively* 36 | 37 | We could just _not_ support tar streams that have clobbering file paths. 38 | Appending records to the archive is not incredibly common, and doesn't happen 39 | by default for most implementations. Not supporting them wouldn't be a 40 | security concern either, as if it did occur, we would reassemble an archive 41 | that doesn't validate signature/checksum, so it shouldn't be trusted anyway. 42 | 43 | Otherwise, this will allow us to defer support for appended files as a FUTURE FEATURE. 44 | 45 | -------------------------------------------------------------------------------- /archive/tar/example_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2013 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package tar_test 6 | 7 | import ( 8 | "archive/tar" 9 | "bytes" 10 | "fmt" 11 | "io" 12 | "log" 13 | "os" 14 | ) 15 | 16 | func Example_minimal() { 17 | // Create and add some files to the archive. 18 | var buf bytes.Buffer 19 | tw := tar.NewWriter(&buf) 20 | var files = []struct { 21 | Name, Body string 22 | }{ 23 | {"readme.txt", "This archive contains some text files."}, 24 | {"gopher.txt", "Gopher names:\nGeorge\nGeoffrey\nGonzo"}, 25 | {"todo.txt", "Get animal handling license."}, 26 | } 27 | for _, file := range files { 28 | hdr := &tar.Header{ 29 | Name: file.Name, 30 | Mode: 0600, 31 | Size: int64(len(file.Body)), 32 | } 33 | if err := tw.WriteHeader(hdr); err != nil { 34 | log.Fatal(err) 35 | } 36 | if _, err := tw.Write([]byte(file.Body)); err != nil { 37 | log.Fatal(err) 38 | } 39 | } 40 | if err := tw.Close(); err != nil { 41 | log.Fatal(err) 42 | } 43 | 44 | // Open and iterate through the files in the archive. 
45 | tr := tar.NewReader(&buf) 46 | for { 47 | hdr, err := tr.Next() 48 | if err == io.EOF { 49 | break // End of archive 50 | } 51 | if err != nil { 52 | log.Fatal(err) 53 | } 54 | fmt.Printf("Contents of %s:\n", hdr.Name) 55 | if _, err := io.Copy(os.Stdout, tr); err != nil { 56 | log.Fatal(err) 57 | } 58 | fmt.Println() 59 | } 60 | 61 | // Output: 62 | // Contents of readme.txt: 63 | // This archive contains some text files. 64 | // Contents of gopher.txt: 65 | // Gopher names: 66 | // George 67 | // Geoffrey 68 | // Gonzo 69 | // Contents of todo.txt: 70 | // Get animal handling license. 71 | } 72 | -------------------------------------------------------------------------------- /tar/asm/disassemble_test.go: -------------------------------------------------------------------------------- 1 | package asm 2 | 3 | import ( 4 | "archive/tar" 5 | "fmt" 6 | "io" 7 | "os" 8 | "testing" 9 | 10 | "github.com/vbatts/tar-split/tar/storage" 11 | ) 12 | 13 | // This test failing causes the binary to crash due to memory overcommitment. 14 | func TestLargeJunkPadding(t *testing.T) { 15 | pR, pW := io.Pipe() 16 | 17 | // Write a normal tar file into the pipe and then load it full of junk 18 | // bytes as padding. We have to do this in a goroutine because we can't 19 | // store 20GB of junk in-memory. 20 | go func() { 21 | // Empty archive. 22 | tw := tar.NewWriter(pW) 23 | if err := tw.Close(); err != nil { 24 | pW.CloseWithError(err) 25 | return 26 | } 27 | 28 | // Write junk. 
29 | const ( 30 | junkChunkSize = 64 * 1024 * 1024 31 | junkChunkNum = 20 * 16 32 | ) 33 | devZero, err := os.Open("/dev/zero") 34 | if err != nil { 35 | pW.CloseWithError(err) 36 | return 37 | } 38 | defer devZero.Close() 39 | for i := 0; i < junkChunkNum; i++ { 40 | if i%32 == 0 { 41 | fmt.Fprintf(os.Stderr, "[TestLargeJunkPadding] junk chunk #%d/#%d\n", i, junkChunkNum) 42 | } 43 | if _, err := io.CopyN(pW, devZero, junkChunkSize); err != nil { 44 | pW.CloseWithError(err) 45 | return 46 | } 47 | } 48 | 49 | fmt.Fprintln(os.Stderr, "[TestLargeJunkPadding] junk chunk finished") 50 | pW.Close() 51 | }() 52 | 53 | // Disassemble our junk file. 54 | nilPacker := storage.NewJSONPacker(io.Discard) 55 | rdr, err := NewInputTarStream(pR, nilPacker, nil) 56 | if err != nil { 57 | t.Fatal(err) 58 | } 59 | 60 | // Copy the entire rdr. 61 | _, err = io.Copy(io.Discard, rdr) 62 | if err != nil { 63 | t.Fatal(err) 64 | } 65 | 66 | // At this point, if we haven't crashed then we are not vulnerable to 67 | // CVE-2017-14992. 
68 | } 69 | -------------------------------------------------------------------------------- /cmd/tar-split/asm.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "compress/gzip" 5 | "io" 6 | "os" 7 | 8 | "github.com/sirupsen/logrus" 9 | "github.com/urfave/cli" 10 | "github.com/vbatts/tar-split/tar/asm" 11 | "github.com/vbatts/tar-split/tar/storage" 12 | ) 13 | 14 | func CommandAsm(c *cli.Context) { 15 | if len(c.Args()) > 0 { 16 | logrus.Warnf("%d additional arguments passed are ignored", len(c.Args())) 17 | } 18 | if len(c.String("input")) == 0 { 19 | logrus.Fatalf("--input filename must be set") 20 | } 21 | if len(c.String("output")) == 0 { 22 | logrus.Fatalf("--output filename must be set ([FILENAME|-])") 23 | } 24 | if len(c.String("path")) == 0 { 25 | logrus.Fatalf("--path must be set") 26 | } 27 | 28 | var outputStream io.Writer 29 | if c.String("output") == "-" { 30 | outputStream = os.Stdout 31 | } else { 32 | fh, err := os.Create(c.String("output")) 33 | if err != nil { 34 | logrus.Fatal(err) 35 | } 36 | defer fh.Close() 37 | outputStream = fh 38 | } 39 | 40 | if c.Bool("compress") { 41 | zipper := gzip.NewWriter(outputStream) 42 | defer zipper.Close() 43 | outputStream = zipper 44 | } 45 | 46 | // Get the tar metadata reader 47 | mf, err := os.Open(c.String("input")) 48 | if err != nil { 49 | logrus.Fatal(err) 50 | } 51 | defer mf.Close() 52 | mfz, err := gzip.NewReader(mf) 53 | if err != nil { 54 | logrus.Fatal(err) 55 | } 56 | defer mfz.Close() 57 | 58 | metaUnpacker := storage.NewJSONUnpacker(mfz) 59 | // XXX maybe get the absolute path here 60 | fileGetter := storage.NewPathFileGetter(c.String("path")) 61 | 62 | ots := asm.NewOutputTarStream(fileGetter, metaUnpacker) 63 | defer ots.Close() 64 | i, err := io.Copy(outputStream, ots) 65 | if err != nil { 66 | logrus.Fatal(err) 67 | } 68 | 69 | logrus.Infof("created %s from %s and %s (wrote %d bytes)", c.String("output"), 
c.String("path"), c.String("input"), i) 70 | } 71 | -------------------------------------------------------------------------------- /tar/asm/iterate.go: -------------------------------------------------------------------------------- 1 | package asm 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | 8 | "github.com/vbatts/tar-split/archive/tar" 9 | "github.com/vbatts/tar-split/tar/storage" 10 | ) 11 | 12 | // IterateHeaders calls handler for each tar header provided by Unpacker 13 | func IterateHeaders(unpacker storage.Unpacker, handler func(hdr *tar.Header) error) error { 14 | // We assume about NewInputTarStream: 15 | // - There is a separate SegmentType entry for every tar header, but only one SegmentType entry for the full header incl. any extensions 16 | // - (There is a FileType entry for every tar header, we ignore it) 17 | // - Trailing padding of a file, if any, is included in the next SegmentType entry 18 | // - At the end, there may be SegmentType entries just for the terminating zero blocks. 19 | 20 | var pendingPadding int64 = 0 21 | for { 22 | tsEntry, err := unpacker.Next() 23 | if err != nil { 24 | if err == io.EOF { 25 | return nil 26 | } 27 | return fmt.Errorf("reading tar-split entries: %w", err) 28 | } 29 | switch tsEntry.Type { 30 | case storage.SegmentType: 31 | payload := tsEntry.Payload 32 | if int64(len(payload)) < pendingPadding { 33 | return fmt.Errorf("expected %d bytes of padding after previous file, but next SegmentType only has %d bytes", pendingPadding, len(payload)) 34 | } 35 | payload = payload[pendingPadding:] 36 | pendingPadding = 0 37 | 38 | tr := tar.NewReader(bytes.NewReader(payload)) 39 | hdr, err := tr.Next() 40 | if err != nil { 41 | if err == io.EOF { // Probably the last entry, but let’s let the unpacker drive that. 
42 | break 43 | } 44 | return fmt.Errorf("decoding a tar header from a tar-split entry: %w", err) 45 | } 46 | if err := handler(hdr); err != nil { 47 | return err 48 | } 49 | pendingPadding = tr.ExpectedPadding() 50 | 51 | case storage.FileType: 52 | // Nothing 53 | default: 54 | return fmt.Errorf("unexpected tar-split entry type %q", tsEntry.Type) 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /archive/tar/testdata/ustar.tar: -------------------------------------------------------------------------------- 1 | file.txt0000644000076500000240000000000612104402656045134 0ustar00shanestaff00000000000000longname/longname/longname/longname/longname/longname/longname/longname/longname/longname/longname/longname/longname/longname/longnamehello 2 | -------------------------------------------------------------------------------- /cmd/tar-split/tar_benchmark_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "io" 5 | "os" 6 | "testing" 7 | 8 | upTar "archive/tar" 9 | 10 | ourTar "github.com/vbatts/tar-split/archive/tar" 11 | ) 12 | 13 | var testfile = "../../archive/tar/testdata/sparse-formats.tar" 14 | 15 | func BenchmarkUpstreamTar(b *testing.B) { 16 | for n := 0; n < b.N; n++ { 17 | fh, err := os.Open(testfile) 18 | if err != nil { 19 | b.Fatal(err) 20 | } 21 | tr := upTar.NewReader(fh) 22 | for { 23 | _, err := tr.Next() 24 | if err != nil { 25 | if err == io.EOF { 26 | break 27 | } 28 | fh.Close() 29 | b.Fatal(err) 30 | } 31 | _, err = io.Copy(io.Discard, tr) 32 | if err != nil { 33 | b.Fatal(err) 34 | } 35 | } 36 | if err := fh.Close(); err != nil { 37 | b.Fatal(err) 38 | } 39 | } 40 | } 41 | 42 | func BenchmarkOurTarNoAccounting(b *testing.B) { 43 | for n := 0; n < b.N; n++ { 44 | fh, err := os.Open(testfile) 45 | if err != nil { 46 | b.Fatal(err) 47 | } 48 | tr := ourTar.NewReader(fh) 49 | tr.RawAccounting = false // this is default, 
but explicit here 50 | for { 51 | _, err := tr.Next() 52 | if err != nil { 53 | if err == io.EOF { 54 | break 55 | } 56 | fh.Close() 57 | b.Fatal(err) 58 | } 59 | _, err = io.Copy(io.Discard, tr) 60 | if err != nil { 61 | b.Fatal(err) 62 | } 63 | } 64 | if err := fh.Close(); err != nil { 65 | b.Fatal(err) 66 | } 67 | } 68 | } 69 | func BenchmarkOurTarYesAccounting(b *testing.B) { 70 | for n := 0; n < b.N; n++ { 71 | fh, err := os.Open(testfile) 72 | if err != nil { 73 | b.Fatal(err) 74 | } 75 | tr := ourTar.NewReader(fh) 76 | tr.RawAccounting = true // This enables mechanics for collecting raw bytes 77 | for { 78 | _ = tr.RawBytes() 79 | _, err := tr.Next() 80 | _ = tr.RawBytes() 81 | if err != nil { 82 | if err == io.EOF { 83 | break 84 | } 85 | fh.Close() 86 | b.Fatal(err) 87 | } 88 | _, err = io.Copy(io.Discard, tr) 89 | if err != nil { 90 | b.Fatal(err) 91 | } 92 | _ = tr.RawBytes() 93 | } 94 | if err := fh.Close(); err != nil { 95 | b.Fatal(err) 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /tar/storage/entry_test.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "encoding/json" 5 | "sort" 6 | "testing" 7 | ) 8 | 9 | func TestEntries(t *testing.T) { 10 | e := Entries{ 11 | Entry{ 12 | Type: SegmentType, 13 | Payload: []byte("y'all"), 14 | Position: 1, 15 | }, 16 | Entry{ 17 | Type: SegmentType, 18 | Payload: []byte("doin"), 19 | Position: 3, 20 | }, 21 | Entry{ 22 | Type: FileType, 23 | Name: "./hurr.txt", 24 | Payload: []byte("deadbeef"), 25 | Position: 2, 26 | }, 27 | Entry{ 28 | Type: SegmentType, 29 | Payload: []byte("how"), 30 | Position: 0, 31 | }, 32 | } 33 | sort.Sort(e) 34 | if e[0].Position != 0 { 35 | t.Errorf("expected Position 0, but got %d", e[0].Position) 36 | } 37 | } 38 | 39 | func TestFile(t *testing.T) { 40 | f := Entry{ 41 | Type: FileType, 42 | Size: 100, 43 | Position: 2, 44 | } 45 | 
f.SetName("./hello.txt") 46 | 47 | buf, err := json.Marshal(f) 48 | if err != nil { 49 | t.Fatal(err) 50 | } 51 | 52 | f1 := Entry{} 53 | if err = json.Unmarshal(buf, &f1); err != nil { 54 | t.Fatal(err) 55 | } 56 | 57 | if f.GetName() != f1.GetName() { 58 | t.Errorf("expected Name %q, got %q", f.GetName(), f1.GetName()) 59 | } 60 | if f.Size != f1.Size { 61 | t.Errorf("expected Size %q, got %q", f.Size, f1.Size) 62 | } 63 | if f.Position != f1.Position { 64 | t.Errorf("expected Position %q, got %q", f.Position, f1.Position) 65 | } 66 | } 67 | 68 | func TestFileRaw(t *testing.T) { 69 | f := Entry{ 70 | Type: FileType, 71 | Size: 100, 72 | Position: 2, 73 | } 74 | f.SetNameBytes([]byte{0x2E, 0x2F, 0x68, 0x65, 0x6C, 0x6C, 0x6F, 0xE4, 0x2E, 0x74, 0x78, 0x74}) 75 | 76 | buf, err := json.Marshal(f) 77 | if err != nil { 78 | t.Fatal(err) 79 | } 80 | 81 | f1 := Entry{} 82 | if err = json.Unmarshal(buf, &f1); err != nil { 83 | t.Fatal(err) 84 | } 85 | 86 | if f.GetName() != f1.GetName() { 87 | t.Errorf("expected Name %q, got %q", f.GetName(), f1.GetName()) 88 | } 89 | if f.Size != f1.Size { 90 | t.Errorf("expected Size %q, got %q", f.Size, f1.Size) 91 | } 92 | if f.Position != f1.Position { 93 | t.Errorf("expected Position %q, got %q", f.Position, f1.Position) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /tar/storage/getter_test.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | "strings" 8 | "testing" 9 | ) 10 | 11 | func TestGetter(t *testing.T) { 12 | fgp := NewBufferFileGetPutter() 13 | files := map[string]map[string][]byte{ 14 | "file1.txt": {"foo": []byte{60, 60, 48, 48, 0, 0, 0, 0}}, 15 | "file2.txt": {"bar": []byte{45, 196, 22, 240, 0, 0, 0, 0}}, 16 | } 17 | for n, b := range files { 18 | for body, sum := range b { 19 | _, csum, err := fgp.Put(n, bytes.NewBufferString(body)) 20 | if err != nil { 21 | 
t.Error(err) 22 | } 23 | if !bytes.Equal(csum, sum) { 24 | t.Errorf("checksum: expected 0x%x; got 0x%x", sum, csum) 25 | } 26 | } 27 | } 28 | for n, b := range files { 29 | for body := range b { 30 | r, err := fgp.Get(n) 31 | if err != nil { 32 | t.Error(err) 33 | } 34 | buf, err := io.ReadAll(r) 35 | if err != nil { 36 | t.Error(err) 37 | } 38 | if body != string(buf) { 39 | t.Errorf("expected %q, got %q", body, string(buf)) 40 | } 41 | } 42 | } 43 | } 44 | 45 | func TestPutter(t *testing.T) { 46 | fp := NewDiscardFilePutter() 47 | // map[filename]map[body]crc64sum 48 | files := map[string]map[string][]byte{ 49 | "file1.txt": {"foo": []byte{60, 60, 48, 48, 0, 0, 0, 0}}, 50 | "file2.txt": {"bar": []byte{45, 196, 22, 240, 0, 0, 0, 0}}, 51 | "file3.txt": {"baz": []byte{32, 68, 22, 240, 0, 0, 0, 0}}, 52 | "file4.txt": {"bif": []byte{48, 9, 150, 240, 0, 0, 0, 0}}, 53 | } 54 | for n, b := range files { 55 | for body, sum := range b { 56 | _, csum, err := fp.Put(n, bytes.NewBufferString(body)) 57 | if err != nil { 58 | t.Error(err) 59 | } 60 | if !bytes.Equal(csum, sum) { 61 | t.Errorf("checksum on %q: expected %v; got %v", n, sum, csum) 62 | } 63 | } 64 | } 65 | } 66 | 67 | func BenchmarkPutter(b *testing.B) { 68 | files := []string{ 69 | strings.Repeat("foo", 1000), 70 | strings.Repeat("bar", 1000), 71 | strings.Repeat("baz", 1000), 72 | strings.Repeat("fooz", 1000), 73 | strings.Repeat("vbatts", 1000), 74 | strings.Repeat("systemd", 1000), 75 | } 76 | for i := 0; i < b.N; i++ { 77 | fgp := NewBufferFileGetPutter() 78 | for n, body := range files { 79 | if _, _, err := fgp.Put(fmt.Sprintf("%d", n), bytes.NewBufferString(body)); err != nil { 80 | b.Fatal(err) 81 | } 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /concept/main.go: -------------------------------------------------------------------------------- 1 | //go:build ignore 2 | // +build ignore 3 | 4 | package main 5 | 6 | import ( 7 | "flag" 8 | "fmt" 9 
| "io" 10 | "log" 11 | "os" 12 | 13 | "github.com/vbatts/tar-split/archive/tar" 14 | ) 15 | 16 | func main() { 17 | flag.Parse() 18 | log.SetOutput(os.Stderr) 19 | for _, arg := range flag.Args() { 20 | func() { 21 | // Open the tar archive 22 | fh, err := os.Open(arg) 23 | if err != nil { 24 | log.Fatal(err, arg) 25 | } 26 | defer fh.Close() 27 | 28 | output, err := os.Create(fmt.Sprintf("%s.out", arg)) 29 | if err != nil { 30 | log.Fatal(err) 31 | } 32 | defer output.Close() 33 | log.Printf("writing %q to %q", fh.Name(), output.Name()) 34 | 35 | fi, err := fh.Stat() 36 | if err != nil { 37 | log.Fatal(err, fh.Name()) 38 | } 39 | size := fi.Size() 40 | var sum int64 41 | tr := tar.NewReader(fh) 42 | tr.RawAccounting = true 43 | for { 44 | hdr, err := tr.Next() 45 | if err != nil { 46 | if err != io.EOF { 47 | log.Println(err) 48 | } 49 | // even when an EOF is reached, there is often 1024 null bytes on 50 | // the end of an archive. Collect them too. 51 | post := tr.RawBytes() 52 | output.Write(post) 53 | sum += int64(len(post)) 54 | 55 | fmt.Printf("EOF padding: %d\n", len(post)) 56 | break 57 | } 58 | 59 | pre := tr.RawBytes() 60 | output.Write(pre) 61 | sum += int64(len(pre)) 62 | 63 | var i int64 64 | if i, err = io.Copy(output, tr); err != nil { 65 | log.Println(err) 66 | break 67 | } 68 | sum += i 69 | 70 | fmt.Println(hdr.Name, "pre:", len(pre), "read:", i) 71 | } 72 | 73 | // it is allowable, and not uncommon that there is further padding on the 74 | // end of an archive, apart from the expected 1024 null bytes 75 | remainder, err := io.ReadAll(fh) 76 | if err != nil && err != io.EOF { 77 | log.Fatal(err, fh.Name()) 78 | } 79 | output.Write(remainder) 80 | sum += int64(len(remainder)) 81 | fmt.Printf("Remainder: %d\n", len(remainder)) 82 | 83 | if size != sum { 84 | fmt.Printf("Size: %d; Sum: %d; Diff: %d\n", size, sum, size-sum) 85 | fmt.Printf("Compare like `cmp -bl %s %s | less`\n", fh.Name(), output.Name()) 86 | } else { 87 | fmt.Printf("Size: %d; Sum: 
%d\n", size, sum) 88 | } 89 | }() 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /cmd/tar-split/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/sirupsen/logrus" 7 | "github.com/urfave/cli" 8 | ) 9 | 10 | var Version = "v0.12.1" 11 | 12 | func main() { 13 | app := cli.NewApp() 14 | app.Name = "tar-split" 15 | app.Usage = "tar assembly and disassembly utility" 16 | app.Version = Version 17 | app.Author = "Vincent Batts" 18 | app.Email = "vbatts@hashbangbash.com" 19 | app.Action = cli.ShowAppHelp 20 | app.Before = func(c *cli.Context) error { 21 | logrus.SetOutput(os.Stderr) 22 | if c.Bool("debug") { 23 | logrus.SetLevel(logrus.DebugLevel) 24 | } 25 | return nil 26 | } 27 | app.Flags = []cli.Flag{ 28 | cli.BoolFlag{ 29 | Name: "debug, D", 30 | Usage: "debug output", 31 | // defaults to false 32 | }, 33 | } 34 | app.Commands = []cli.Command{ 35 | { 36 | Name: "disasm", 37 | Aliases: []string{"d"}, 38 | Usage: "disassemble the input tar stream", 39 | Action: CommandDisasm, 40 | Flags: []cli.Flag{ 41 | cli.StringFlag{ 42 | Name: "output", 43 | Value: "tar-data.json.gz", 44 | Usage: "output of disassembled tar stream", 45 | }, 46 | cli.BoolFlag{ 47 | Name: "no-stdout", 48 | Usage: "do not throughput the stream to STDOUT", 49 | }, 50 | }, 51 | }, 52 | { 53 | Name: "asm", 54 | Aliases: []string{"a"}, 55 | Usage: "assemble tar stream", 56 | Action: CommandAsm, 57 | Flags: []cli.Flag{ 58 | cli.StringFlag{ 59 | Name: "input", 60 | Value: "tar-data.json.gz", 61 | Usage: "input of disassembled tar stream", 62 | }, 63 | cli.StringFlag{ 64 | Name: "output", 65 | Value: "-", 66 | Usage: "reassembled tar archive", 67 | }, 68 | cli.StringFlag{ 69 | Name: "path", 70 | Value: "", 71 | Usage: "relative path of extracted tar", 72 | }, 73 | cli.BoolFlag{ 74 | Name: "compress", 75 | Usage: "gzip compress the output", 76 | // defaults 
to false 77 | }, 78 | }, 79 | }, 80 | { 81 | Name: "checksize", 82 | Usage: "displays size estimates for metadata storage of a Tar archive", 83 | Action: CommandChecksize, 84 | Flags: []cli.Flag{ 85 | cli.BoolFlag{ 86 | Name: "work", 87 | Usage: "do not delete the working directory", 88 | // defaults to false 89 | }, 90 | }, 91 | }, 92 | } 93 | 94 | if err := app.Run(os.Args); err != nil { 95 | logrus.Fatal(err) 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /archive/tar/testdata/file-and-dir.tar: -------------------------------------------------------------------------------- 1 | small.txt0000000000000000000000000000000500000000000011033 0ustar0000000000000000Kiltsdir/0000000000000000000000000000000000000000000007742 5ustar0000000000000000 -------------------------------------------------------------------------------- /archive/tar/testdata/gnu-long-nul.tar: -------------------------------------------------------------------------------- 1 | ././@LongLink0000644000000000000000000000024100000000000011600 Lustar rootroot01234567891234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890000644000175000017500000000000013044750217022125 0ustar rawrdsnet -------------------------------------------------------------------------------- /archive/tar/testdata/hardlink.tar: -------------------------------------------------------------------------------- 1 | file.txt0000644000175000001440000000001712475625017013267 0ustar00vbattsusers00000000000000Slartibartfast 2 | hard.txt0000644000175000001440000000000012475625017014735 1file.txtustar00vbattsusers00000000000000 -------------------------------------------------------------------------------- /archive/tar/testdata/pax-bad-hdr-file.tar: 
-------------------------------------------------------------------------------- 1 | path/to/readme/readme/readme/readme/readme/readme/readme/readme/readme/readme/readme/readme/readme/r0000000000000000000000000000004100000000000032025 xustar000000000000000033 path=PAX1/PAX1/long-path-namefoo0000640116074500116100000000125412575676024010640 0ustar00joetsaiengiRFmWghs3CK9/2HSvRja4TzX8HsRwzbVYl+h0HRkH9uPho2BGmrG5a0vpHsPn2W7Pn33Ux/+rkLSA3GUOX/WiPmP+h73T1r0DZIDJXtOgYWIUhsqUE0zUz1LEaO/y2H+WAe/ZlWt90N2KHka0bkXajoEAdOUrN42PKl/3mu7jiCW45hTNBDp3ArJD8QHN7l3JFMfnusPuir9+K8Oh6bEfN2bHhXjZ41ZkweCHZWUKT8NsdHeObQnXAyvkU5q1OhefE0+uvksVba2ZNyhThAAGZgiqEtTOJJLm8zgcI5avXHMVwlR6mt1jepOct4jQNlAdpkmslKW3BuiwLswGAsw7ttr/pRa/oCT4HUoBWcY3w96+TGR6uXtvbDOM9WhPXGo+1bwhAsA/RXPA1ZX+oS6t4rl/ZvkMZZN4VO5OvKph8tthdG3ocpXUw11zv6mQ7n6kyObLDCMFOtkdnhQBU/BGEK6mw4oTRa1Hd91+bUUqQh6hl3JeDk/t2KDWOEehOxgOqfVG72UuMeo2IayNK/pUXrcUXuywq9KT+bWQxdJsXzwkkyT8Ovz4oiIzHAa14e/Ib8Xxz+BHwpN3TtOXsHziuqLGMzqv867CganwsFxNEGRaTQ6C2bRK+OxetaxhQqe1G/UWwfi5a9PuJC3wfITSa0IhBot9hGAG35VVb4LsRE= -------------------------------------------------------------------------------- /archive/tar/testdata/pax-nul-path.tar: -------------------------------------------------------------------------------- 1 | PaxHeaders.0/0123456789012345678901234567890123456789012345678901234567890123456789012345678901234560000000000000000000000000000032300000000000022376 xustar0000000000000000211 path=01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789 2 | 01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890000000000000000000000000000000000000000000021361 0ustar0000000000000000 -------------------------------------------------------------------------------- /archive/tar/testdata/pax-nul-xattrs.tar: 
-------------------------------------------------------------------------------- 1 | PaxHeaders.0/bad-null.txt0000000000000000000000000000003700000000000013720 xustar000000000000000031 SCHILY.xattr.null=fizzbuzz 2 | bad-null.txt0000000000000000000000000000000000000000000011414 0ustar0000000000000000 -------------------------------------------------------------------------------- /archive/tar/testdata/trailing-slash.tar: -------------------------------------------------------------------------------- 1 | 123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/1234567890000000000000000000000000000046600000000000020160 xustar00310 path=123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/ 2 | 123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/123456789/1234567890000000000000000000000000000000000000000000021275 5ustar0000000000000000 -------------------------------------------------------------------------------- /archive/tar/testdata/gnu-nil-sparse-data.tar: -------------------------------------------------------------------------------- 1 | sparse.db0000000000000000000000000000175000000000000014113 Sustar 
000000000000000000000000000000001750000000017500123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789 -------------------------------------------------------------------------------- /archive/tar/testdata/pax-bad-mtime-file.tar: -------------------------------------------------------------------------------- 1 | path/to/readme/readme/readme/readme/readme/readme/readme/readme/readme/readme/readme/readme/readme/r0000000000000000000000000000004100000000000032025 xustar000000000000000033 mtime=999xxx9324.432432444444 2 | foo0000640116074500116100000000125412575676024010640 
0ustar00joetsaiengiRFmWghs3CK9/2HSvRja4TzX8HsRwzbVYl+h0HRkH9uPho2BGmrG5a0vpHsPn2W7Pn33Ux/+rkLSA3GUOX/WiPmP+h73T1r0DZIDJXtOgYWIUhsqUE0zUz1LEaO/y2H+WAe/ZlWt90N2KHka0bkXajoEAdOUrN42PKl/3mu7jiCW45hTNBDp3ArJD8QHN7l3JFMfnusPuir9+K8Oh6bEfN2bHhXjZ41ZkweCHZWUKT8NsdHeObQnXAyvkU5q1OhefE0+uvksVba2ZNyhThAAGZgiqEtTOJJLm8zgcI5avXHMVwlR6mt1jepOct4jQNlAdpkmslKW3BuiwLswGAsw7ttr/pRa/oCT4HUoBWcY3w96+TGR6uXtvbDOM9WhPXGo+1bwhAsA/RXPA1ZX+oS6t4rl/ZvkMZZN4VO5OvKph8tthdG3ocpXUw11zv6mQ7n6kyObLDCMFOtkdnhQBU/BGEK6mw4oTRa1Hd91+bUUqQh6hl3JeDk/t2KDWOEehOxgOqfVG72UuMeo2IayNK/pUXrcUXuywq9KT+bWQxdJsXzwkkyT8Ovz4oiIzHAa14e/Ib8Xxz+BHwpN3TtOXsHziuqLGMzqv867CganwsFxNEGRaTQ6C2bRK+OxetaxhQqe1G/UWwfi5a9PuJC3wfITSa0IhBot9hGAG35VVb4LsRE= -------------------------------------------------------------------------------- /archive/tar/testdata/pax-pos-size-file.tar: -------------------------------------------------------------------------------- 1 | path/to/readme/readme/readme/readme/readme/readme/readme/readme/readme/readme/readme/readme/readme/r0000000000000000000000000000004100000000000032025 xustar000000000000000033 size=000000000000000000000999 2 | foo0000640116074500116100000000125412575676024010640 0ustar00joetsaiengiRFmWghs3CK9/2HSvRja4TzX8HsRwzbVYl+h0HRkH9uPho2BGmrG5a0vpHsPn2W7Pn33Ux/+rkLSA3GUOX/WiPmP+h73T1r0DZIDJXtOgYWIUhsqUE0zUz1LEaO/y2H+WAe/ZlWt90N2KHka0bkXajoEAdOUrN42PKl/3mu7jiCW45hTNBDp3ArJD8QHN7l3JFMfnusPuir9+K8Oh6bEfN2bHhXjZ41ZkweCHZWUKT8NsdHeObQnXAyvkU5q1OhefE0+uvksVba2ZNyhThAAGZgiqEtTOJJLm8zgcI5avXHMVwlR6mt1jepOct4jQNlAdpkmslKW3BuiwLswGAsw7ttr/pRa/oCT4HUoBWcY3w96+TGR6uXtvbDOM9WhPXGo+1bwhAsA/RXPA1ZX+oS6t4rl/ZvkMZZN4VO5OvKph8tthdG3ocpXUw11zv6mQ7n6kyObLDCMFOtkdnhQBU/BGEK6mw4oTRa1Hd91+bUUqQh6hl3JeDk/t2KDWOEehOxgOqfVG72UuMeo2IayNK/pUXrcUXuywq9KT+bWQxdJsXzwkkyT8Ovz4oiIzHAa14e/Ib8Xxz+BHwpN3TtOXsHziuqLGMzqv867CganwsFxNEGRaTQ6C2bRK+OxetaxhQqe1G/UWwfi5a9PuJC3wfITSa0IhBot9hGAG35VVb4LsRE= -------------------------------------------------------------------------------- /archive/tar/testdata/pax-records.tar: 
// Entries is for sorting by Position
type Entries []Entry

func (e Entries) Len() int           { return len(e) }
func (e Entries) Swap(i, j int)      { e[i], e[j] = e[j], e[i] }
func (e Entries) Less(i, j int) bool { return e[i].Position < e[j].Position }

// Type of Entry
type Type int

const (
	// FileType represents a file payload from the tar stream.
	//
	// This will be used to map to relative paths on disk. Only Size > 0 will get
	// read into a resulting output stream (due to hardlinks).
	FileType Type = 1 + iota
	// SegmentType represents a raw bytes segment from the archive stream. These raw
	// byte segments consist of the raw headers and various padding.
	//
	// Its payload is to be marshalled base64 encoded.
	SegmentType
)

// Entry is the structure for packing and unpacking the information read from
// the Tar archive.
//
// FileType Payload checksum is using `hash/crc64` for basic file integrity,
// _not_ for cryptography.
// From http://www.backplane.com/matt/crc64.html, CRC32 has almost 40,000
// collisions in a sample of 18.2 million, CRC64 had none.
type Entry struct {
	Type     Type   `json:"type"`
	Name     string `json:"name,omitempty"`
	NameRaw  []byte `json:"name_raw,omitempty"`
	Size     int64  `json:"size,omitempty"`
	Payload  []byte `json:"payload"` // SegmentType stores payload here; FileType stores crc64 checksum here;
	Position int    `json:"position"`
}

// SetName stores name in Name when it is valid UTF-8, and in NameRaw
// otherwise. See https://github.com/vbatts/tar-split/issues/17
func (e *Entry) SetName(name string) {
	if !utf8.ValidString(name) {
		e.NameRaw = []byte(name)
		return
	}
	e.Name = name
}

// SetNameBytes stores name in Name when it is valid UTF-8, and in NameRaw
// otherwise.
func (e *Entry) SetNameBytes(name []byte) {
	if !utf8.Valid(name) {
		e.NameRaw = name
		return
	}
	e.Name = string(name)
}

// GetName returns the string for the entry's name, regardless of the field stored in
func (e *Entry) GetName() string {
	if len(e.NameRaw) == 0 {
		return e.Name
	}
	return string(e.NameRaw)
}

// GetNameBytes returns the bytes for the entry's name, regardless of the field stored in
func (e *Entry) GetNameBytes() []byte {
	if len(e.NameRaw) == 0 {
		return []byte(e.Name)
	}
	return e.NameRaw
}
-------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "archive/tar" 5 | "compress/gzip" 6 | "fmt" 7 | "io" 8 | "log" 9 | "os" 10 | 11 | "github.com/sirupsen/logrus" 12 | "github.com/urfave/cli" 13 | "github.com/vbatts/tar-split/tar/asm" 14 | "github.com/vbatts/tar-split/tar/storage" 15 | ) 16 | 17 | func CommandChecksize(c *cli.Context) { 18 | if len(c.Args()) == 0 { 19 | logrus.Fatalf("please specify tar archives to check ('-' will check stdin)") 20 | } 21 | for _, arg := range c.Args() { 22 | fh, err := os.Open(arg) 23 | if err != nil { 24 | log.Fatal(err) 25 | } 26 | defer fh.Close() 27 | fi, err := fh.Stat() 28 | if err != nil { 29 | log.Fatal(err) 30 | } 31 | fmt.Printf("inspecting %q (size %dk)\n", fh.Name(), fi.Size()/1024) 32 | 33 | packFh, err := os.CreateTemp("", "packed.") 34 | if err != nil { 35 | log.Fatal(err) 36 | } 37 | defer packFh.Close() 38 | if !c.Bool("work") { 39 | defer os.Remove(packFh.Name()) 40 | } else { 41 | fmt.Printf(" -- working file preserved: %s\n", packFh.Name()) 42 | } 43 | 44 | sp := storage.NewJSONPacker(packFh) 45 | fp := storage.NewDiscardFilePutter() 46 | dissam, err := asm.NewInputTarStream(fh, sp, fp) 47 | if err != nil { 48 | log.Fatal(err) 49 | } 50 | 51 | var num int 52 | tr := tar.NewReader(dissam) 53 | for { 54 | _, err = tr.Next() 55 | if err != nil { 56 | if err == io.EOF { 57 | break 58 | } 59 | log.Fatal(err) 60 | } 61 | num++ 62 | if _, err := io.Copy(io.Discard, tr); err != nil { 63 | log.Fatal(err) 64 | } 65 | } 66 | fmt.Printf(" -- number of files: %d\n", num) 67 | 68 | if err := packFh.Sync(); err != nil { 69 | log.Fatal(err) 70 | } 71 | 72 | fi, err = packFh.Stat() 73 | if err != nil { 74 | log.Fatal(err) 75 | } 76 | fmt.Printf(" -- size of metadata uncompressed: %dk\n", fi.Size()/1024) 77 | 78 | gzPackFh, err := os.CreateTemp("", "packed.gz.") 79 | if err != nil { 80 | log.Fatal(err) 81 | } 82 | defer gzPackFh.Close() 83 | if !c.Bool("work") { 
84 | defer os.Remove(gzPackFh.Name()) 85 | } 86 | 87 | gzWrtr := gzip.NewWriter(gzPackFh) 88 | 89 | if _, err := packFh.Seek(0, 0); err != nil { 90 | log.Fatal(err) 91 | } 92 | 93 | if _, err := io.Copy(gzWrtr, packFh); err != nil { 94 | log.Fatal(err) 95 | } 96 | gzWrtr.Close() 97 | 98 | if err := gzPackFh.Sync(); err != nil { 99 | log.Fatal(err) 100 | } 101 | 102 | fi, err = gzPackFh.Stat() 103 | if err != nil { 104 | log.Fatal(err) 105 | } 106 | fmt.Printf(" -- size of gzip compressed metadata: %dk\n", fi.Size()/1024) 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /tar/storage/packer.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "encoding/json" 5 | "errors" 6 | "io" 7 | "path/filepath" 8 | "unicode/utf8" 9 | ) 10 | 11 | // ErrDuplicatePath occurs when a tar archive has more than one entry for the 12 | // same file path 13 | var ErrDuplicatePath = errors.New("duplicates of file paths not supported") 14 | 15 | // Packer describes the methods to pack Entries to a storage destination 16 | type Packer interface { 17 | // AddEntry packs the Entry and returns its position 18 | AddEntry(e Entry) (int, error) 19 | } 20 | 21 | // Unpacker describes the methods to read Entries from a source 22 | type Unpacker interface { 23 | // Next returns the next Entry being unpacked, or error, until io.EOF 24 | Next() (*Entry, error) 25 | } 26 | 27 | type jsonUnpacker struct { 28 | seen seenNames 29 | dec *json.Decoder 30 | } 31 | 32 | func (jup *jsonUnpacker) Next() (*Entry, error) { 33 | var e Entry 34 | err := jup.dec.Decode(&e) 35 | if err != nil { 36 | return nil, err 37 | } 38 | 39 | // check for dup name 40 | if e.Type == FileType { 41 | cName := filepath.Clean(e.GetName()) 42 | if _, ok := jup.seen[cName]; ok { 43 | return nil, ErrDuplicatePath 44 | } 45 | jup.seen[cName] = struct{}{} 46 | } 47 | 48 | return &e, err 49 | } 50 | 51 | // 
NewJSONUnpacker provides an Unpacker that reads Entries (SegmentType and 52 | // FileType) as a json document. 53 | // 54 | // Each Entry read are expected to be delimited by new line. 55 | func NewJSONUnpacker(r io.Reader) Unpacker { 56 | return &jsonUnpacker{ 57 | dec: json.NewDecoder(r), 58 | seen: seenNames{}, 59 | } 60 | } 61 | 62 | type jsonPacker struct { 63 | w io.Writer 64 | e *json.Encoder 65 | pos int 66 | seen seenNames 67 | } 68 | 69 | type seenNames map[string]struct{} 70 | 71 | func (jp *jsonPacker) AddEntry(e Entry) (int, error) { 72 | // if Name is not valid utf8, switch it to raw first. 73 | if e.Name != "" { 74 | if !utf8.ValidString(e.Name) { 75 | e.NameRaw = []byte(e.Name) 76 | e.Name = "" 77 | } 78 | } 79 | 80 | // check early for dup name 81 | if e.Type == FileType { 82 | cName := filepath.Clean(e.GetName()) 83 | if _, ok := jp.seen[cName]; ok { 84 | return -1, ErrDuplicatePath 85 | } 86 | jp.seen[cName] = struct{}{} 87 | } 88 | 89 | e.Position = jp.pos 90 | err := jp.e.Encode(e) 91 | if err != nil { 92 | return -1, err 93 | } 94 | 95 | // made it this far, increment now 96 | jp.pos++ 97 | return e.Position, nil 98 | } 99 | 100 | // NewJSONPacker provides a Packer that writes each Entry (SegmentType and 101 | // FileType) as a json document. 102 | // 103 | // The Entries are delimited by new line. 
104 | func NewJSONPacker(w io.Writer) Packer { 105 | return &jsonPacker{ 106 | w: w, 107 | e: json.NewEncoder(w), 108 | seen: seenNames{}, 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /archive/tar/testdata/gnu.tar: -------------------------------------------------------------------------------- 1 | small.txt0000640021650100116100000000000511213074064012105 0ustar dsymondsengKiltssmall2.txt0000640021650100116100000000001311213113114012154 0ustar dsymondsengGoogle.com 2 | -------------------------------------------------------------------------------- /archive/tar/testdata/star.tar: -------------------------------------------------------------------------------- 1 | small.txt0000640 0216501 0011610 00000000005 11213575217 0016730 0ustar00dsymondseng0000000 0000000 11213575217 11213575217 tarKiltssmall2.txt0000640 0216501 0011610 00000000013 11213575217 0017011 0ustar00dsymondseng0000000 0000000 11213575217 11213575217 tarGoogle.com 2 | -------------------------------------------------------------------------------- /archive/tar/testdata/pax-nil-sparse-hole.tar: -------------------------------------------------------------------------------- 1 | PaxHeaders.0/sparse.db0000000000000000000000000000016100000000000012023 xustar0022 GNU.sparse.major=1 2 | 22 GNU.sparse.minor=0 3 | 29 GNU.sparse.name=sparse.db 4 | 28 GNU.sparse.realsize=1000 5 | 12 size=512 6 | GNUSparseFile.0/sparse.db0000000000000000000000000000100000000000000013527 0ustar00000000000000001 7 | 1000 8 | 0 9 | -------------------------------------------------------------------------------- /tar/storage/getter.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "bytes" 5 | "errors" 6 | "hash/crc64" 7 | "io" 8 | "os" 9 | "path/filepath" 10 | ) 11 | 12 | // FileGetter is the interface for getting a stream of a file payload, 13 | // addressed by name/filename. 
Presumably, the names will be scoped to relative 14 | // file paths. 15 | type FileGetter interface { 16 | // Get returns a stream for the provided file path 17 | Get(filename string) (output io.ReadCloser, err error) 18 | } 19 | 20 | // FilePutter is the interface for storing a stream of a file payload, 21 | // addressed by name/filename. 22 | type FilePutter interface { 23 | // Put returns the size of the stream received, and the crc64 checksum for 24 | // the provided stream 25 | Put(filename string, input io.Reader) (size int64, checksum []byte, err error) 26 | } 27 | 28 | // FileGetPutter is the interface that groups both Getting and Putting file 29 | // payloads. 30 | type FileGetPutter interface { 31 | FileGetter 32 | FilePutter 33 | } 34 | 35 | // NewPathFileGetter returns a FileGetter that is for files relative to path 36 | // relpath. 37 | func NewPathFileGetter(relpath string) FileGetter { 38 | return &pathFileGetter{root: relpath} 39 | } 40 | 41 | type pathFileGetter struct { 42 | root string 43 | } 44 | 45 | func (pfg pathFileGetter) Get(filename string) (io.ReadCloser, error) { 46 | return os.Open(filepath.Join(pfg.root, filename)) 47 | } 48 | 49 | type bufferFileGetPutter struct { 50 | files map[string][]byte 51 | } 52 | 53 | func (bfgp bufferFileGetPutter) Get(name string) (io.ReadCloser, error) { 54 | if _, ok := bfgp.files[name]; !ok { 55 | return nil, errors.New("no such file") 56 | } 57 | b := bytes.NewBuffer(bfgp.files[name]) 58 | return &readCloserWrapper{b}, nil 59 | } 60 | 61 | func (bfgp *bufferFileGetPutter) Put(name string, r io.Reader) (int64, []byte, error) { 62 | crc := crc64.New(CRCTable) 63 | buf := bytes.NewBuffer(nil) 64 | cw := io.MultiWriter(crc, buf) 65 | i, err := io.Copy(cw, r) 66 | if err != nil { 67 | return 0, nil, err 68 | } 69 | bfgp.files[name] = buf.Bytes() 70 | return i, crc.Sum(nil), nil 71 | } 72 | 73 | type readCloserWrapper struct { 74 | io.Reader 75 | } 76 | 77 | func (w *readCloserWrapper) Close() error { return 
nil } 78 | 79 | // NewBufferFileGetPutter is a simple in-memory FileGetPutter 80 | // 81 | // Implication is this is memory intensive... 82 | // Probably best for testing or light weight cases. 83 | func NewBufferFileGetPutter() FileGetPutter { 84 | return &bufferFileGetPutter{ 85 | files: map[string][]byte{}, 86 | } 87 | } 88 | 89 | // NewDiscardFilePutter is a bit bucket FilePutter 90 | func NewDiscardFilePutter() FilePutter { 91 | return &bitBucketFilePutter{} 92 | } 93 | 94 | type bitBucketFilePutter struct { 95 | buffer [32 * 1024]byte // 32 kB is the buffer size currently used by io.Copy, as of August 2021. 96 | } 97 | 98 | func (bbfp *bitBucketFilePutter) Put(name string, r io.Reader) (int64, []byte, error) { 99 | c := crc64.New(CRCTable) 100 | i, err := io.CopyBuffer(c, r, bbfp.buffer[:]) 101 | return i, c.Sum(nil), err 102 | } 103 | 104 | // CRCTable is the default table used for crc64 sum calculations 105 | var CRCTable = crc64.MakeTable(crc64.ISO) 106 | -------------------------------------------------------------------------------- /archive/tar/stat_unix.go: -------------------------------------------------------------------------------- 1 | // Copyright 2012 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // +build linux darwin dragonfly freebsd openbsd netbsd solaris 6 | 7 | package tar 8 | 9 | import ( 10 | "os" 11 | "os/user" 12 | "runtime" 13 | "strconv" 14 | "sync" 15 | "syscall" 16 | ) 17 | 18 | func init() { 19 | sysStat = statUnix 20 | } 21 | 22 | // userMap and groupMap caches UID and GID lookups for performance reasons. 23 | // The downside is that renaming uname or gname by the OS never takes effect. 
var userMap, groupMap sync.Map // map[int]string

// statUnix populates h with the platform-specific fields carried by
// fi.Sys(): numeric and symbolic owner/group, access/change timestamps, and
// (for character/block devices) major/minor device numbers. It is installed
// as sysStat by init above. It is best-effort: when fi.Sys() is not a
// *syscall.Stat_t it returns nil without modifying h.
func statUnix(fi os.FileInfo, h *Header) error {
	sys, ok := fi.Sys().(*syscall.Stat_t)
	if !ok {
		return nil
	}
	h.Uid = int(sys.Uid)
	h.Gid = int(sys.Gid)

	// Best effort at populating Uname and Gname.
	// The os/user functions may fail for any number of reasons
	// (not implemented on that platform, cgo not enabled, etc).
	// Hit the process-lifetime cache first; otherwise do an os/user lookup
	// and memoize the result in the sync.Map.
	if u, ok := userMap.Load(h.Uid); ok {
		h.Uname = u.(string)
	} else if u, err := user.LookupId(strconv.Itoa(h.Uid)); err == nil {
		h.Uname = u.Username
		userMap.Store(h.Uid, h.Uname)
	}
	if g, ok := groupMap.Load(h.Gid); ok {
		h.Gname = g.(string)
	} else if g, err := user.LookupGroupId(strconv.Itoa(h.Gid)); err == nil {
		h.Gname = g.Name
		groupMap.Store(h.Gid, h.Gname)
	}

	h.AccessTime = statAtime(sys)
	h.ChangeTime = statCtime(sys)

	// Best effort at populating Devmajor and Devminor.
	if h.Typeflag == TypeChar || h.Typeflag == TypeBlock {
		dev := uint64(sys.Rdev) // May be int32 or uint32

		// Each GOOS packs major/minor into Rdev differently; the decodings
		// below are copied verbatim from golang.org/x/sys/unix.
		switch runtime.GOOS {
		case "linux":
			// Copied from golang.org/x/sys/unix/dev_linux.go.
			major := uint32((dev & 0x00000000000fff00) >> 8)
			major |= uint32((dev & 0xfffff00000000000) >> 32)
			minor := uint32((dev & 0x00000000000000ff) >> 0)
			minor |= uint32((dev & 0x00000ffffff00000) >> 12)
			h.Devmajor, h.Devminor = int64(major), int64(minor)
		case "darwin":
			// Copied from golang.org/x/sys/unix/dev_darwin.go.
			major := uint32((dev >> 24) & 0xff)
			minor := uint32(dev & 0xffffff)
			h.Devmajor, h.Devminor = int64(major), int64(minor)
		case "dragonfly":
			// Copied from golang.org/x/sys/unix/dev_dragonfly.go.
			major := uint32((dev >> 8) & 0xff)
			minor := uint32(dev & 0xffff00ff)
			h.Devmajor, h.Devminor = int64(major), int64(minor)
		case "freebsd":
			// Copied from golang.org/x/sys/unix/dev_freebsd.go.
			major := uint32((dev >> 8) & 0xff)
			minor := uint32(dev & 0xffff00ff)
			h.Devmajor, h.Devminor = int64(major), int64(minor)
		case "netbsd":
			// Copied from golang.org/x/sys/unix/dev_netbsd.go.
			major := uint32((dev & 0x000fff00) >> 8)
			minor := uint32((dev & 0x000000ff) >> 0)
			minor |= uint32((dev & 0xfff00000) >> 12)
			h.Devmajor, h.Devminor = int64(major), int64(minor)
		case "openbsd":
			// Copied from golang.org/x/sys/unix/dev_openbsd.go.
			major := uint32((dev & 0x0000ff00) >> 8)
			minor := uint32((dev & 0x000000ff) >> 0)
			minor |= uint32((dev & 0xffff0000) >> 8)
			h.Devmajor, h.Devminor = int64(major), int64(minor)
		default:
			// TODO: Implement solaris (see https://golang.org/issue/8106)
		}
	}
	return nil
}
7 | 8 | ## Packer interface 9 | 10 | For ease of storage and usage of the raw bytes, there will be a storage 11 | interface, that accepts an io.Writer (This way you could pass it an in memory 12 | buffer or a file handle). 13 | 14 | Having a Packer interface can allow configuration of hash.Hash for file payloads 15 | and providing your own io.Writer. 16 | 17 | Instead of having a state directory to store all the header information for all 18 | Readers, we will leave that up to user of Reader. Because we can not assume an 19 | ID for each Reader, and keeping that information differentiated. 20 | 21 | ## State Directory 22 | 23 | Perhaps we could deduplicate the header info, by hashing the rawbytes and 24 | storing them in a directory tree like: 25 | 26 | ./ac/dc/beef 27 | 28 | Then reference the hash of the header info, in the positional records for the 29 | tar stream. Though this could be a future feature, and not required for an 30 | initial implementation. Also, this would imply an owned state directory, rather 31 | than just writing storage info to an io.Writer. 32 | 33 | ## Concept Example 34 | 35 | First we'll get an archive to work with. For repeatability, we'll make an 36 | archive from what you've just cloned: 37 | 38 | ``` 39 | git archive --format=tar -o tar-split.tar HEAD . 40 | ``` 41 | 42 | Then build the example main.go: 43 | 44 | ``` 45 | go build ./main.go 46 | ``` 47 | 48 | Now run the example over the archive: 49 | 50 | ``` 51 | $ ./main tar-split.tar 52 | 2015/02/20 15:00:58 writing "tar-split.tar" to "tar-split.tar.out" 53 | pax_global_header pre: 512 read: 52 54 | .travis.yml pre: 972 read: 374 55 | DESIGN.md pre: 650 read: 1131 56 | LICENSE pre: 917 read: 1075 57 | README.md pre: 973 read: 4289 58 | archive/ pre: 831 read: 0 59 | archive/tar/ pre: 512 read: 0 60 | archive/tar/common.go pre: 512 read: 7790 61 | [...] 
62 | tar/storage/entry_test.go pre: 667 read: 1137 63 | tar/storage/getter.go pre: 911 read: 2741 64 | tar/storage/getter_test.go pre: 843 read: 1491 65 | tar/storage/packer.go pre: 557 read: 3141 66 | tar/storage/packer_test.go pre: 955 read: 3096 67 | EOF padding: 1512 68 | Remainder: 512 69 | Size: 215040; Sum: 215040 70 | ``` 71 | 72 | *What are we seeing here?* 73 | 74 | * `pre` is the header of a file entry, and potentially the padding from the 75 | end of the prior file's payload. Also with particular tar extensions and pax 76 | attributes, the header can exceed 512 bytes. 77 | * `read` is the size of the file payload from the entry 78 | * `EOF padding` is the expected 1024 null bytes on the end of a tar archive, 79 | plus potential padding from the end of the prior file entry's payload 80 | * `Remainder` is the remaining bytes of an archive. This is typically deadspace 81 | as most tar implmentations will return after having reached the end of the 82 | 1024 null bytes. Though various implementations will include some amount of 83 | bytes here, which will affect the checksum of the resulting tar archive, 84 | therefore this must be accounted for as well. 85 | 86 | Ideally the input tar and output `*.out`, will match: 87 | 88 | ``` 89 | $ sha1sum tar-split.tar* 90 | ca9e19966b892d9ad5960414abac01ef585a1e22 tar-split.tar 91 | ca9e19966b892d9ad5960414abac01ef585a1e22 tar-split.tar.out 92 | ``` 93 | 94 | 95 | -------------------------------------------------------------------------------- /tar/asm/assemble.go: -------------------------------------------------------------------------------- 1 | package asm 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "hash" 7 | "hash/crc64" 8 | "io" 9 | "sync" 10 | 11 | "github.com/vbatts/tar-split/tar/storage" 12 | ) 13 | 14 | // NewOutputTarStream returns an io.ReadCloser that is an assembled tar archive 15 | // stream. 
16 | // 17 | // It takes a storage.FileGetter, for mapping the file payloads that are to be read in, 18 | // and a storage.Unpacker, which has access to the rawbytes and file order 19 | // metadata. With the combination of these two items, a precise assembled Tar 20 | // archive is possible. 21 | func NewOutputTarStream(fg storage.FileGetter, up storage.Unpacker) io.ReadCloser { 22 | // ... Since these are interfaces, this is possible, so let's not have a nil pointer 23 | if fg == nil || up == nil { 24 | return nil 25 | } 26 | pr, pw := io.Pipe() 27 | go func() { 28 | err := WriteOutputTarStream(fg, up, pw) 29 | if err != nil { 30 | pw.CloseWithError(err) 31 | } else { 32 | pw.Close() 33 | } 34 | }() 35 | return pr 36 | } 37 | 38 | // WriteOutputTarStream writes assembled tar archive to a writer. 39 | func WriteOutputTarStream(fg storage.FileGetter, up storage.Unpacker, w io.Writer) error { 40 | // ... Since these are interfaces, this is possible, so let's not have a nil pointer 41 | if fg == nil || up == nil { 42 | return nil 43 | } 44 | var copyBuffer []byte 45 | var crcHash hash.Hash 46 | var crcSum []byte 47 | var multiWriter io.Writer 48 | for { 49 | entry, err := up.Next() 50 | if err != nil { 51 | if err == io.EOF { 52 | return nil 53 | } 54 | return err 55 | } 56 | switch entry.Type { 57 | case storage.SegmentType: 58 | if _, err := w.Write(entry.Payload); err != nil { 59 | return err 60 | } 61 | case storage.FileType: 62 | if entry.Size == 0 { 63 | continue 64 | } 65 | fh, err := fg.Get(entry.GetName()) 66 | if err != nil { 67 | return err 68 | } 69 | if crcHash == nil { 70 | crcHash = crc64.New(storage.CRCTable) 71 | crcSum = make([]byte, 8) 72 | multiWriter = io.MultiWriter(w, crcHash) 73 | copyBuffer = byteBufferPool.Get().([]byte) 74 | // TODO once we have some benchmark or memory profile then we can experiment with using *bytes.Buffer 75 | //nolint:staticcheck // SA6002 not going to do a pointer here 76 | defer byteBufferPool.Put(copyBuffer) 77 | } 
else { 78 | crcHash.Reset() 79 | } 80 | 81 | if _, err := copyWithBuffer(multiWriter, fh, copyBuffer); err != nil { 82 | fh.Close() 83 | return err 84 | } 85 | 86 | if !bytes.Equal(crcHash.Sum(crcSum[:0]), entry.Payload) { 87 | // I would rather this be a comparable ErrInvalidChecksum or such, 88 | // but since it's coming through the PipeReader, the context of 89 | // _which_ file would be lost... 90 | fh.Close() 91 | return fmt.Errorf("file integrity checksum failed for %q", entry.GetName()) 92 | } 93 | fh.Close() 94 | } 95 | } 96 | } 97 | 98 | var byteBufferPool = &sync.Pool{ 99 | New: func() interface{} { 100 | return make([]byte, 32*1024) 101 | }, 102 | } 103 | 104 | // copyWithBuffer is taken from stdlib io.Copy implementation 105 | // https://github.com/golang/go/blob/go1.5.1/src/io/io.go#L367 106 | func copyWithBuffer(dst io.Writer, src io.Reader, buf []byte) (written int64, err error) { 107 | for { 108 | nr, er := src.Read(buf) 109 | if nr > 0 { 110 | nw, ew := dst.Write(buf[0:nr]) 111 | if nw > 0 { 112 | written += int64(nw) 113 | } 114 | if ew != nil { 115 | err = ew 116 | break 117 | } 118 | if nr != nw { 119 | err = io.ErrShortWrite 120 | break 121 | } 122 | } 123 | if er == io.EOF { 124 | break 125 | } 126 | if er != nil { 127 | err = er 128 | break 129 | } 130 | } 131 | return written, err 132 | } 133 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/BurntSushi/toml v1.4.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= 2 | github.com/cpuguy83/go-md2man/v2 v2.0.5 h1:ZtcqGrnekaHpVLArFSe4HK5DoKx1T0rq2DwVB0alcyc= 3 | github.com/cpuguy83/go-md2man/v2 v2.0.5/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= 4 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 5 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 6 | 
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 7 | github.com/fatih/color v1.15.0 h1:kOqh6YHBtK8aywxGerMG2Eq3H6Qgoqeo13Bk2Mv/nBs= 8 | github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw= 9 | github.com/magefile/mage v1.14.0 h1:6QDX3g6z1YvJ4olPhT1wksUcSa/V0a1B+pJb73fBjyo= 10 | github.com/magefile/mage v1.14.0/go.mod h1:z5UZb/iS3GoOSn0JgWuiw7dxlurVYTu+/jHXqQg881A= 11 | github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= 12 | github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= 13 | github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= 14 | github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng= 15 | github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= 16 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 17 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 18 | github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= 19 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 20 | github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= 21 | github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= 22 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 23 | github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= 24 | github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= 25 | github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= 26 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 27 | github.com/stretchr/testify v1.7.1/go.mod 
h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 28 | github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= 29 | github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= 30 | github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= 31 | github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 32 | github.com/urfave/cli v1.22.16 h1:MH0k6uJxdwdeWQTwhSO42Pwr4YLrNLwBtg1MRgTqPdQ= 33 | github.com/urfave/cli v1.22.16/go.mod h1:EeJR6BKodywf4zciqrdw6hpCPk68JO9z5LazXZMn5Po= 34 | golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 35 | golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 36 | golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 37 | golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= 38 | golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 39 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= 40 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 41 | gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= 42 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 43 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 44 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 45 | -------------------------------------------------------------------------------- /archive/tar/testdata/pax-nil-sparse-data.tar: -------------------------------------------------------------------------------- 1 | PaxHeaders.0/sparse.db0000000000000000000000000000016200000000000012024 xustar0022 GNU.sparse.major=1 2 | 22 GNU.sparse.minor=0 3 | 29 
GNU.sparse.name=sparse.db 4 | 28 GNU.sparse.realsize=1000 5 | 13 size=1512 6 | GNUSparseFile.0/sparse.db0000000000000000000000000000275000000000000013544 0ustar00000000000000001 7 | 0 8 | 1000 9 | 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789 -------------------------------------------------------------------------------- /magefile.go: -------------------------------------------------------------------------------- 1 | //go:build mage 2 | // +build mage 3 | 4 | package main 5 | 6 | import ( 7 | "errors" 8 | "fmt" 9 | "os" 10 | "os/exec" 11 | "time" 12 | 13 | "github.com/magefile/mage/mg" // mg contains helpful utility functions, like Deps 14 | ) 15 | 16 | var ( 17 | // Default target to run when none is specified 18 | // If not set, running mage will list available targets 19 | Default = Build 20 | app string = "tar-split" 21 | Stdout = ourStdout 22 | Stderr = ourStderr 23 | 24 | golangcilintVersion = "v1.61.0" 25 | 26 | cleanFiles = []string{} 27 | ) 28 | 29 | // Run all-the-things 30 | func All() error { 31 | mg.Deps(Vet) 32 | 
mg.Deps(Test) 33 | mg.Deps(Build) 34 | mg.Deps(Lint) 35 | return nil 36 | } 37 | 38 | // A build step that requires additional params, or platform specific steps for example 39 | func Build() error { 40 | mg.Deps(InstallDeps) 41 | fmt.Println("Building...") 42 | cmd := exec.Command("go", "build", "-v", "-o", app, "./cmd/tar-split") 43 | cmd.Stdout = Stdout 44 | cmd.Stderr = Stderr 45 | return cmd.Run() 46 | } 47 | 48 | // Vet the codes 49 | func Vet() error { 50 | fmt.Println("go vet...") 51 | cmd := exec.Command("go", "vet", "./...") 52 | cmd.Stdout = Stdout 53 | cmd.Stderr = Stderr 54 | return cmd.Run() 55 | } 56 | 57 | // Run the Linters 58 | func Lint() error { 59 | mg.Deps(InstallToolsLint) 60 | fmt.Println("Linting...") 61 | cmd := exec.Command("golangci-lint", "run") 62 | cmd.Stdout = Stdout 63 | cmd.Stderr = Stderr 64 | return cmd.Run() 65 | } 66 | 67 | // Run the tests available 68 | func Test() error { 69 | fmt.Println("Testing...") 70 | cmd := exec.Command("go", "test", "-cover", "-v", "-bench", "'.'", "-benchmem", "./...") 71 | cmd.Stdout = Stdout 72 | cmd.Stderr = Stderr 73 | return cmd.Run() 74 | } 75 | 76 | // A custom install step if you need your bin someplace other than go/bin 77 | func Install() error { 78 | mg.Deps(Build) 79 | fmt.Println("Installing...") 80 | return os.Rename(app, "/usr/local/bin/"+app) 81 | } 82 | 83 | func init() { 84 | cleanFiles = append(cleanFiles, ".install.deps") // sloppy 85 | } 86 | 87 | // Manage your deps, or running package managers. 
88 | func InstallDeps() error { 89 | const fpath = ".install.deps" 90 | success := false 91 | defer func() { 92 | if success { 93 | fd, err := os.Create(fpath) 94 | if err != nil { 95 | fmt.Fprintln(os.Stderr, err) 96 | } 97 | fd.Close() 98 | } 99 | }() 100 | if IsFresh(fpath, time.Now()) { 101 | return nil 102 | } 103 | 104 | mg.Deps(Tidy) 105 | fmt.Println("Installing Deps...") 106 | cmd := exec.Command("go", "get", "./...") 107 | cmd.Stdout = Stdout 108 | cmd.Stderr = Stderr 109 | err := cmd.Run() 110 | if err != nil { 111 | return err 112 | } 113 | success = true 114 | return nil 115 | } 116 | 117 | // Tools used during build/dev/test 118 | func InstallTools() error { 119 | mg.Deps(InstallToolsLint) 120 | return nil 121 | } 122 | 123 | func InstallToolsLint() error { 124 | fmt.Println("Installing Deps...") 125 | cmd := exec.Command("go", "install", "github.com/golangci/golangci-lint/cmd/golangci-lint@"+golangcilintVersion) 126 | cmd.Stdout = Stdout 127 | cmd.Stderr = Stderr 128 | return cmd.Run() 129 | } 130 | 131 | // Tidy go modules 132 | func Tidy() error { 133 | fmt.Println("Tidy up...") 134 | cmd := exec.Command("go", "mod", "tidy") 135 | cmd.Stdout = Stdout 136 | cmd.Stderr = Stderr 137 | return cmd.Run() 138 | } 139 | 140 | // Clean up after yourself 141 | func Clean() { 142 | fmt.Println("Cleaning...") 143 | os.RemoveAll(app) 144 | for _, fpath := range cleanFiles { 145 | os.RemoveAll(fpath) 146 | } 147 | } 148 | 149 | // IsFresh checks if `fpath` exists (therefore `false`, it is not fresh) or if 150 | // `fpath` is _newer_ than `t` (true, as in it's freshly built) 151 | func IsFresh(fpath string, t time.Time) bool { 152 | fi, err := os.Stat(fpath) 153 | if err != nil && errors.Is(err, os.ErrNotExist) { 154 | return false 155 | } 156 | return fi.ModTime().Before(t) 157 | } 158 | -------------------------------------------------------------------------------- /tar/asm/iterate_test.go: 
// createTestTarheader builds a deterministic tar.Header for entry `index`.
// All numeric and string fields are derived from n = (index+1)*100 so each
// header is distinct yet predictable, letting the round-trip test below
// compare field-by-field. Link targets and device numbers are only set for
// the type flags that use them.
func createTestTarheader(index int, typeFlag byte, size int64) tar.Header {
	n := (index + 1) * 100 // Use predictable, but distinct, values for all headers

	res := tar.Header{
		Typeflag:   typeFlag,
		Name:       fmt.Sprintf("name%d", n),
		Size:       size,
		Mode:       int64(n + 1),
		Uid:        n + 2,
		Gid:        n + 3,
		Uname:      fmt.Sprintf("user%d", n),
		Gname:      fmt.Sprintf("group%d", n),
		ModTime:    time.Unix(int64(n+4), 0),
		AccessTime: time.Unix(int64(n+5), 0),
		ChangeTime: time.Unix(int64(n+6), 0),
		PAXRecords: map[string]string{fmt.Sprintf("key%d", n): fmt.Sprintf("value%d", n)},
		Format:     tar.FormatPAX, // We must set a format, in the default one AccessTime and ChangeTime are discarded.
	}
	switch res.Typeflag {
	case tar.TypeLink, tar.TypeSymlink:
		res.Linkname = fmt.Sprintf("link%d", n)
	case tar.TypeChar, tar.TypeBlock:
		res.Devmajor = int64(n + 7)
		res.Devminor = int64(n + 8)
	}
	return res
}

// TestIterateHeaders round-trips a tarball through NewInputTarStream into
// tar-split metadata, then verifies that IterateHeaders reproduces every
// header (including sizes straddling the 512-byte block boundary and all
// special entry types).
func TestIterateHeaders(t *testing.T) {
	// Entry sizes deliberately bracket the 512-byte tar block size.
	entries := []struct {
		typeFlag byte
		size     int64
	}{
		{tar.TypeReg, 0},
		{tar.TypeReg, 1},
		{tar.TypeReg, 511},
		{tar.TypeReg, 512},
		{tar.TypeReg, 513},
		{tar.TypeLink, 0},
		{tar.TypeSymlink, 0},
		{tar.TypeChar, 0},
		{tar.TypeBlock, 0},
		{tar.TypeDir, 0},
		{tar.TypeFifo, 0},
	}

	// Build the input tarball and remember the headers we wrote.
	var tarball bytes.Buffer
	var expected []tar.Header
	w := tar.NewWriter(&tarball)
	for i, e := range entries {
		hdr := createTestTarheader(i, e.typeFlag, e.size)
		err := w.WriteHeader(&hdr)
		require.NoError(t, err)
		data := make([]byte, e.size)
		_, err = w.Write(data)
		require.NoError(t, err)
		expected = append(expected, hdr)
	}
	err := w.Close()
	require.NoError(t, err)

	// Disassemble the tarball; draining tsReader drives the packing goroutine.
	var tarSplit bytes.Buffer
	tsReader, err := NewInputTarStream(&tarball, storage.NewJSONPacker(&tarSplit), storage.NewDiscardFilePutter())
	require.NoError(t, err)
	_, err = io.Copy(io.Discard, tsReader)
	require.NoError(t, err)

	// Iterate the recorded metadata and collect the reconstructed headers.
	unpacker := storage.NewJSONUnpacker(&tarSplit)
	var actual []tar.Header
	err = IterateHeaders(unpacker, func(hdr *tar.Header) error {
		actual = append(actual, *hdr)
		return nil
	})
	require.NoError(t, err)

	assert.Equal(t, len(expected), len(actual))
	for i := range expected {
		expected := &expected[i]
		actual := &actual[i]

		assert.Equal(t, expected.Typeflag, actual.Typeflag)
		assert.Equal(t, expected.Name, actual.Name)
		assert.Equal(t, expected.Linkname, actual.Linkname)
		assert.Equal(t, expected.Size, actual.Size)
		assert.Equal(t, expected.Mode, actual.Mode)
		assert.Equal(t, expected.Uid, actual.Uid)
		assert.Equal(t, expected.Gid, actual.Gid)
		assert.Equal(t, expected.Uname, actual.Uname)
		assert.Equal(t, expected.Gname, actual.Gname)
		assert.True(t, actual.ModTime.Equal(expected.ModTime))
		assert.True(t, actual.AccessTime.Equal(expected.AccessTime))
		assert.True(t, actual.ChangeTime.Equal(expected.ChangeTime))
		assert.Equal(t, expected.Devmajor, actual.Devmajor)
		assert.Equal(t, expected.Devminor, actual.Devminor)
		assert.Equal(t, expected.Xattrs, actual.Xattrs) //nolint:staticcheck // We do want a comprehensive coverage in this test.
		// We can’t compare PAXRecords for complete equality, because tar.Writer adds atime and ctime entries. So ensure all expected records are present.
		for k, v := range expected.PAXRecords {
			v2, ok := actual.PAXRecords[k]
			assert.True(t, ok, k)
			assert.Equal(t, v, v2)
		}
		assert.Equal(t, expected.Format, actual.Format)
	}
}
// NewInputTarStream wraps the Reader stream of a tar archive and provides a
// Reader stream of the same.
//
// In the middle it will pack the segments and file metadata to storage.Packer
// `p`.
//
// The storage.FilePutter is where payload of files in the stream are
// stashed. If this stashing is not needed, you can provide a nil
// storage.FilePutter. Since the checksumming is still needed, then a default
// of NewDiscardFilePutter will be used internally
func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io.Reader, error) {
	// What to do here... folks will want their own access to the Reader that is
	// their tar archive stream, but we'll need that same stream to use our
	// forked 'archive/tar'.
	// Perhaps do an io.TeeReader that hands back an io.Reader for them to read
	// from, and we'll MITM the stream to store metadata.
	// We'll need a storage.FilePutter too ...

	// Another concern, whether to do any storage.FilePutter operations, such that we
	// don't extract any amount of the archive. But then again, we're not making
	// files/directories, hardlinks, etc. Just writing the io to the storage.FilePutter.
	// Perhaps we have a DiscardFilePutter that is a bit bucket.

	// we'll return the pipe reader, since TeeReader does not buffer and will
	// only read what the outputRdr Read's. Since Tar archives have padding on
	// the end, we want to be the one reading the padding, even if the user's
	// `archive/tar` doesn't care.
	pR, pW := io.Pipe()
	outputRdr := io.TeeReader(r, pW)

	// we need a putter that will generate the crc64 sums of file payloads
	if fp == nil {
		fp = storage.NewDiscardFilePutter()
	}

	go func() {
		// RawAccounting makes the forked tar.Reader retain the raw header and
		// padding bytes so they can be replayed exactly on reassembly.
		tr := tar.NewReader(outputRdr)
		tr.RawAccounting = true
		for {
			hdr, err := tr.Next()
			if err != nil {
				if err != io.EOF {
					pW.CloseWithError(err)
					return
				}
				// even when an EOF is reached, there is often 1024 null bytes on
				// the end of an archive. Collect them too.
				if b := tr.RawBytes(); len(b) > 0 {
					_, err := p.AddEntry(storage.Entry{
						Type:    storage.SegmentType,
						Payload: b,
					})
					if err != nil {
						pW.CloseWithError(err)
						return
					}
				}
				break // not return. We need the end of the reader.
			}
			if hdr == nil {
				break // not return. We need the end of the reader.
			}

			// Raw header bytes (and any padding before them) become a segment.
			if b := tr.RawBytes(); len(b) > 0 {
				_, err := p.AddEntry(storage.Entry{
					Type:    storage.SegmentType,
					Payload: b,
				})
				if err != nil {
					pW.CloseWithError(err)
					return
				}
			}

			// Drain the file payload through the putter to obtain its crc64
			// checksum (the payload itself is not stored in the metadata).
			var csum []byte
			if hdr.Size > 0 {
				var err error
				_, csum, err = fp.Put(hdr.Name, tr)
				if err != nil {
					pW.CloseWithError(err)
					return
				}
			}

			entry := storage.Entry{
				Type:    storage.FileType,
				Size:    hdr.Size,
				Payload: csum,
			}
			// For proper marshalling of non-utf8 characters
			entry.SetName(hdr.Name)

			// File entries added, regardless of size
			_, err = p.AddEntry(entry)
			if err != nil {
				pW.CloseWithError(err)
				return
			}

			// Any trailing padding after the payload is another raw segment.
			if b := tr.RawBytes(); len(b) > 0 {
				_, err = p.AddEntry(storage.Entry{
					Type:    storage.SegmentType,
					Payload: b,
				})
				if err != nil {
					pW.CloseWithError(err)
					return
				}
			}
		}

		// It is allowable, and not uncommon that there is further padding on
		// the end of an archive, apart from the expected 1024 null bytes. We
		// do this in chunks rather than in one go to avoid cases where a
		// maliciously crafted tar file tries to trick us into reading many GBs
		// into memory.
		const paddingChunkSize = 1024 * 1024
		var paddingChunk [paddingChunkSize]byte
		for {
			var isEOF bool
			n, err := outputRdr.Read(paddingChunk[:])
			if err != nil {
				if err != io.EOF {
					pW.CloseWithError(err)
					return
				}
				isEOF = true
			}
			if n != 0 {
				_, err = p.AddEntry(storage.Entry{
					Type:    storage.SegmentType,
					Payload: paddingChunk[:n],
				})
				if err != nil {
					pW.CloseWithError(err)
					return
				}
			}
			if isEOF {
				break
			}
		}
		pW.Close()
	}()

	return pR, nil
}
[]Entry{ 13 | { 14 | Type: FileType, 15 | Name: "./hurr.txt", 16 | Payload: []byte("abcde"), 17 | }, 18 | { 19 | Type: FileType, 20 | Name: "./hurr.txt", 21 | Payload: []byte("deadbeef"), 22 | }, 23 | { 24 | Type: FileType, 25 | Name: "hurr.txt", // slightly different path, same file though 26 | Payload: []byte("deadbeef"), 27 | }, 28 | } 29 | buf := []byte{} 30 | b := bytes.NewBuffer(buf) 31 | 32 | jp := NewJSONPacker(b) 33 | if _, err := jp.AddEntry(e[0]); err != nil { 34 | t.Error(err) 35 | } 36 | if _, err := jp.AddEntry(e[1]); err != ErrDuplicatePath { 37 | t.Errorf("expected failure on duplicate path") 38 | } 39 | if _, err := jp.AddEntry(e[2]); err != ErrDuplicatePath { 40 | t.Errorf("expected failure on duplicate path") 41 | } 42 | } 43 | 44 | func TestJSONPackerUnpacker(t *testing.T) { 45 | e := []Entry{ 46 | { 47 | Type: SegmentType, 48 | Payload: []byte("how"), 49 | }, 50 | { 51 | Type: SegmentType, 52 | Payload: []byte("y'all"), 53 | }, 54 | { 55 | Type: FileType, 56 | Name: "./hurr.txt", 57 | Payload: []byte("deadbeef"), 58 | }, 59 | { 60 | Type: SegmentType, 61 | Payload: []byte("doin"), 62 | }, 63 | } 64 | 65 | buf := []byte{} 66 | b := bytes.NewBuffer(buf) 67 | 68 | func() { 69 | jp := NewJSONPacker(b) 70 | for i := range e { 71 | if _, err := jp.AddEntry(e[i]); err != nil { 72 | t.Error(err) 73 | } 74 | } 75 | }() 76 | 77 | // >> packer_test.go:43: uncompressed: 266 78 | //t.Errorf("uncompressed: %d", len(b.Bytes())) 79 | 80 | b = bytes.NewBuffer(b.Bytes()) 81 | entries := Entries{} 82 | func() { 83 | jup := NewJSONUnpacker(b) 84 | for { 85 | entry, err := jup.Next() 86 | if err != nil { 87 | if err == io.EOF { 88 | break 89 | } 90 | t.Error(err) 91 | } 92 | entries = append(entries, *entry) 93 | t.Logf("got %#v", entry) 94 | } 95 | }() 96 | if len(entries) != len(e) { 97 | t.Errorf("expected %d entries, got %d", len(e), len(entries)) 98 | } 99 | } 100 | 101 | // you can use a compress Reader/Writer and make nice savings. 
102 | // 103 | // For these two tests that are using the same set, it the difference of 266 104 | // bytes uncompressed vs 138 bytes compressed. 105 | func TestGzip(t *testing.T) { 106 | e := []Entry{ 107 | { 108 | Type: SegmentType, 109 | Payload: []byte("how"), 110 | }, 111 | { 112 | Type: SegmentType, 113 | Payload: []byte("y'all"), 114 | }, 115 | { 116 | Type: FileType, 117 | Name: "./hurr.txt", 118 | Payload: []byte("deadbeef"), 119 | }, 120 | { 121 | Type: SegmentType, 122 | Payload: []byte("doin"), 123 | }, 124 | } 125 | 126 | buf := []byte{} 127 | b := bytes.NewBuffer(buf) 128 | gzW := gzip.NewWriter(b) 129 | jp := NewJSONPacker(gzW) 130 | for i := range e { 131 | if _, err := jp.AddEntry(e[i]); err != nil { 132 | t.Error(err) 133 | } 134 | } 135 | gzW.Close() 136 | 137 | // >> packer_test.go:99: compressed: 138 138 | //t.Errorf("compressed: %d", len(b.Bytes())) 139 | 140 | b = bytes.NewBuffer(b.Bytes()) 141 | gzR, err := gzip.NewReader(b) 142 | if err != nil { 143 | t.Fatal(err) 144 | } 145 | entries := Entries{} 146 | func() { 147 | jup := NewJSONUnpacker(gzR) 148 | for { 149 | entry, err := jup.Next() 150 | if err != nil { 151 | if err == io.EOF { 152 | break 153 | } 154 | t.Error(err) 155 | } 156 | entries = append(entries, *entry) 157 | t.Logf("got %#v", entry) 158 | } 159 | }() 160 | if len(entries) != len(e) { 161 | t.Errorf("expected %d entries, got %d", len(e), len(entries)) 162 | } 163 | } 164 | 165 | func BenchmarkGetPut(b *testing.B) { 166 | e := []Entry{ 167 | { 168 | Type: SegmentType, 169 | Payload: []byte("how"), 170 | }, 171 | { 172 | Type: SegmentType, 173 | Payload: []byte("y'all"), 174 | }, 175 | { 176 | Type: FileType, 177 | Name: "./hurr.txt", 178 | Payload: []byte("deadbeef"), 179 | }, 180 | { 181 | Type: SegmentType, 182 | Payload: []byte("doin"), 183 | }, 184 | } 185 | b.RunParallel(func(pb *testing.PB) { 186 | for pb.Next() { 187 | func() { 188 | fh, err := os.CreateTemp("", "tar-split.") 189 | if err != nil { 190 | b.Fatal(err) 
191 | } 192 | defer os.Remove(fh.Name()) 193 | defer fh.Close() 194 | 195 | jp := NewJSONPacker(fh) 196 | for i := range e { 197 | if _, err := jp.AddEntry(e[i]); err != nil { 198 | b.Fatal(err) 199 | } 200 | } 201 | if err := fh.Sync(); err != nil { 202 | b.Fatal(err) 203 | } 204 | 205 | up := NewJSONUnpacker(fh) 206 | for { 207 | _, err := up.Next() 208 | if err != nil { 209 | if err == io.EOF { 210 | break 211 | } 212 | b.Fatal(err) 213 | } 214 | } 215 | 216 | }() 217 | } 218 | }) 219 | } 220 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tar-split 2 | 3 | ![Build Status](https://github.com/vbatts/tar-split/actions/workflows/go.yml/badge.svg) 4 | ![Lint](https://github.com/vbatts/tar-split/actions/workflows/lint.yml/badge.svg) 5 | [![Go Report Card](https://goreportcard.com/badge/github.com/vbatts/tar-split)](https://goreportcard.com/report/github.com/vbatts/tar-split) 6 | 7 | Pristinely disassembling a tar archive, and stashing needed raw bytes and offsets to reassemble a validating original archive. 8 | 9 | ## Docs 10 | 11 | Code API for libraries provided by `tar-split`: 12 | 13 | * [github.com/vbatts/tar-split/tar/asm](https://pkg.go.dev/github.com/vbatts/tar-split/tar/asm) 14 | * [github.com/vbatts/tar-split/tar/storage](https://pkg.go.dev/github.com/vbatts/tar-split/tar/storage) 15 | * [github.com/vbatts/tar-split/archive/tar](https://pkg.go.dev/github.com/vbatts/tar-split/archive/tar) 16 | 17 | ## Install 18 | 19 | The command line utility is installable via: 20 | 21 | ```bash 22 | go get github.com/vbatts/tar-split/cmd/tar-split 23 | ``` 24 | 25 | ## Usage 26 | 27 | For cli usage, see its [README.md](cmd/tar-split/README.md). 
28 | For the library see the [docs](#docs) 29 | 30 | ## Demo 31 | 32 | ### Basic disassembly and assembly 33 | 34 | This demonstrates the `tar-split` command and how to assemble a tar archive from the `tar-data.json.gz` 35 | 36 | 37 | ![basic cmd demo thumbnail](https://i.ytimg.com/vi/vh5wyjIOBtc/2.jpg?time=1445027151805) 38 | [youtube video of basic command demo](https://youtu.be/vh5wyjIOBtc) 39 | 40 | ### Docker layer preservation 41 | 42 | This demonstrates the tar-split integration for docker-1.8. Providing consistent tar archives for the image layer content. 43 | 44 | ![docker tar-split demo](https://i.ytimg.com/vi_webp/vh5wyjIOBtc/default.webp) 45 | [youtube video of docker layer checksums](https://youtu.be/tV_Dia8E8xw) 46 | 47 | ## Caveat 48 | 49 | Eventually this should detect TARs that this is not possible with. 50 | 51 | For example stored sparse files that have "holes" in them, will be read as a 52 | contiguous file, though the archive contents may be recorded in sparse format. 53 | Therefore when adding the file payload to a reassembled tar, to achieve 54 | identical output, the file payload would need to be precisely re-sparsified. This 55 | is not something I seek to fix immediately, but would rather have an alert that 56 | precise reassembly is not possible. 57 | (see more http://www.gnu.org/software/tar/manual/html_node/Sparse-Formats.html) 58 | 59 | 60 | Other caveat, while tar archives support having multiple file entries for the 61 | same path, we will not support this feature. If there is more than one entry 62 | with the same path, expect an err (like `ErrDuplicatePath`) or a resulting tar 63 | stream that does not validate your original checksum/signature. 64 | 65 | ## Contract 66 | 67 | Do not break the API of stdlib `archive/tar` in our fork (ideally find an upstream mergeable solution).
68 | 69 | ## Std Version 70 | 71 | The version of golang stdlib `archive/tar` is from go1.11 72 | It is minimally extended to expose the raw bytes of the TAR, rather than just the marshalled headers and file stream. 73 | 74 | 75 | ## Design 76 | 77 | See the [design](concept/DESIGN.md). 78 | 79 | ## Stored Metadata 80 | 81 | Since the raw bytes of the headers and padding are stored, you may be wondering 82 | what the size implications are. The headers are at least 512 bytes per 83 | file (sometimes more), at least 1024 null bytes on the end, and then various 84 | padding. This makes for a constant linear growth in the stored metadata, with a 85 | naive storage implementation. 86 | 87 | First we'll get an archive to work with. For repeatability, we'll make an 88 | archive from what you've just cloned: 89 | 90 | ```bash 91 | git archive --format=tar -o tar-split.tar HEAD . 92 | ``` 93 | 94 | ```bash 95 | $ go get github.com/vbatts/tar-split/cmd/tar-split 96 | $ tar-split checksize ./tar-split.tar 97 | inspecting "tar-split.tar" (size 210k) 98 | -- number of files: 50 99 | -- size of metadata uncompressed: 53k 100 | -- size of gzip compressed metadata: 3k 101 | ``` 102 | 103 | So assuming you've managed the extraction of the archive yourself, for reuse of 104 | the file payloads from a relative path, then the only additional storage 105 | implications are as little as 3kb. 106 | 107 | But let's look at a larger archive, with many files. 108 | 109 | ```bash 110 | $ ls -sh ./d.tar 111 | 1.4G ./d.tar 112 | $ tar-split checksize ~/d.tar 113 | inspecting "/home/vbatts/d.tar" (size 1420749k) 114 | -- number of files: 38718 115 | -- size of metadata uncompressed: 43261k 116 | -- size of gzip compressed metadata: 2251k 117 | ``` 118 | 119 | Here, an archive with 38,718 files has a compressed footprint of about 2mb. 120 | 121 | Rolling the null bytes on the end of the archive, we will assume a 122 | bytes-per-file rate for the storage implications. 
123 | 124 | | uncompressed | compressed | 125 | | :----------: | :--------: | 126 | | ~ 1kb per/file | 0.06kb per/file | 127 | 128 | 129 | ## What's Next? 130 | 131 | * More implementations of storage Packer and Unpacker 132 | * More implementations of FileGetter and FilePutter 133 | * would be interesting to have an assembler stream that implements `io.Seeker` 134 | 135 | 136 | ## License 137 | 138 | See [LICENSE](LICENSE) 139 | -------------------------------------------------------------------------------- /archive/tar/testdata/pax-sparse-big.tar: -------------------------------------------------------------------------------- 1 | PaxHeaders.0/pax-sparse0000000000000000000000000000017200000000000012227 xustar0022 GNU.sparse.major=1 2 | 22 GNU.sparse.minor=0 3 | 30 GNU.sparse.name=pax-sparse 4 | 35 GNU.sparse.realsize=60000000000 5 | 13 size=3584 6 | GNUSparseFile.0/pax-sparse0000000000000000000000000000700000000000000013737 0ustar00000000000000006 7 | 9999999488 8 | 512 9 | 19999999488 10 | 512 11 | 29999999488 12 | 512 13 | 39999999488 14 | 512 15 | 49999999488 16 | 512 17 | 59999999488 18 | 512 19 | 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789 -------------------------------------------------------------------------------- /archive/tar/testdata/pax-global-records.tar: -------------------------------------------------------------------------------- 1 | GlobalHead.0.00000000000000000000000000000004600000000000010217 
gustar0022 mtime=1500000000.0 2 | 16 path=global1 3 | file10000000000000000000000000000000000000000000010100 0ustar0000000000000000PaxHeaders.0/file20000000000000000000000000000001600000000000011142 xustar0014 path=file2 4 | file20000000000000000000000000000000000000000000010101 0ustar0000000000000000GlobalHead.0.00000000000000000000000000000001000000000000010206 gustar008 path= 5 | file30000000000000000000000000000000000000000000010102 0ustar0000000000000000PaxHeaders.0/file40000000000000000000000000000002400000000000011143 xustar0020 mtime=1400000000 6 | file40000000000000000000000000000000012334447000010137 0ustar0000000000000000 -------------------------------------------------------------------------------- /tar/asm/assemble_test.go: -------------------------------------------------------------------------------- 1 | package asm 2 | 3 | import ( 4 | "bytes" 5 | "compress/gzip" 6 | "crypto/sha1" 7 | "fmt" 8 | "hash/crc64" 9 | "io" 10 | "os" 11 | "testing" 12 | 13 | "github.com/vbatts/tar-split/tar/storage" 14 | ) 15 | 16 | var entries = []struct { 17 | Entry storage.Entry 18 | Body []byte 19 | }{ 20 | { 21 | Entry: storage.Entry{ 22 | Type: storage.FileType, 23 | Name: "./hurr.txt", 24 | Payload: []byte{2, 116, 164, 177, 171, 236, 107, 78}, 25 | Size: 20, 26 | }, 27 | Body: []byte("imma hurr til I derp"), 28 | }, 29 | { 30 | Entry: storage.Entry{ 31 | Type: storage.FileType, 32 | Name: "./ermahgerd.txt", 33 | Payload: []byte{126, 72, 89, 239, 230, 252, 160, 187}, 34 | Size: 26, 35 | }, 36 | Body: []byte("café con leche, por favor"), 37 | }, 38 | { 39 | Entry: storage.Entry{ 40 | Type: storage.FileType, 41 | NameRaw: []byte{0x66, 0x69, 0x6c, 0x65, 0x2d, 0xe4}, // this is invalid UTF-8. Just checking the round trip. 
42 | Payload: []byte{126, 72, 89, 239, 230, 252, 160, 187}, 43 | Size: 26, 44 | }, 45 | Body: []byte("café con leche, por favor"), 46 | }, 47 | } 48 | var entriesMangled = []struct { 49 | Entry storage.Entry 50 | Body []byte 51 | }{ 52 | { 53 | Entry: storage.Entry{ 54 | Type: storage.FileType, 55 | Name: "./hurr.txt", 56 | Payload: []byte{3, 116, 164, 177, 171, 236, 107, 78}, 57 | Size: 20, 58 | }, 59 | // switch 60 | Body: []byte("imma derp til I hurr"), 61 | }, 62 | { 63 | Entry: storage.Entry{ 64 | Type: storage.FileType, 65 | Name: "./ermahgerd.txt", 66 | Payload: []byte{127, 72, 89, 239, 230, 252, 160, 187}, 67 | Size: 26, 68 | }, 69 | // san not con 70 | Body: []byte("café sans leche, por favor"), 71 | }, 72 | { 73 | Entry: storage.Entry{ 74 | Type: storage.FileType, 75 | NameRaw: []byte{0x66, 0x69, 0x6c, 0x65, 0x2d, 0xe4}, 76 | Payload: []byte{127, 72, 89, 239, 230, 252, 160, 187}, 77 | Size: 26, 78 | }, 79 | Body: []byte("café con leche, por favor"), 80 | }, 81 | } 82 | 83 | func TestTarStreamMangledGetterPutter(t *testing.T) { 84 | fgp := storage.NewBufferFileGetPutter() 85 | 86 | // first lets prep a GetPutter and Packer 87 | for i := range entries { 88 | if entries[i].Entry.Type == storage.FileType { 89 | j, csum, err := fgp.Put(entries[i].Entry.GetName(), bytes.NewBuffer(entries[i].Body)) 90 | if err != nil { 91 | t.Error(err) 92 | } 93 | if j != entries[i].Entry.Size { 94 | t.Errorf("size %q: expected %d; got %d", 95 | entries[i].Entry.GetName(), 96 | entries[i].Entry.Size, 97 | j) 98 | } 99 | if !bytes.Equal(csum, entries[i].Entry.Payload) { 100 | t.Errorf("checksum %q: expected %v; got %v", 101 | entries[i].Entry.GetName(), 102 | entries[i].Entry.Payload, 103 | csum) 104 | } 105 | } 106 | } 107 | 108 | for _, e := range entriesMangled { 109 | if e.Entry.Type == storage.FileType { 110 | rdr, err := fgp.Get(e.Entry.GetName()) 111 | if err != nil { 112 | t.Error(err) 113 | } 114 | c := crc64.New(storage.CRCTable) 115 | i, err := io.Copy(c, rdr) 116 | 
if err != nil { 117 | t.Fatal(err) 118 | } 119 | rdr.Close() 120 | 121 | csum := c.Sum(nil) 122 | if bytes.Equal(csum, e.Entry.Payload) { 123 | t.Errorf("wrote %d bytes. checksum for %q should not have matched! %v", 124 | i, 125 | e.Entry.GetName(), 126 | csum) 127 | } 128 | } 129 | } 130 | } 131 | 132 | var testCases = []struct { 133 | path string 134 | expectedSHA1Sum string 135 | expectedSize int64 136 | }{ 137 | {"./testdata/t.tar.gz", "1eb237ff69bca6e22789ecb05b45d35ca307adbd", 10240}, 138 | {"./testdata/longlink.tar.gz", "d9f6babe107b7247953dff6b5b5ae31a3a880add", 20480}, 139 | {"./testdata/fatlonglink.tar.gz", "8537f03f89aeef537382f8b0bb065d93e03b0be8", 26234880}, 140 | {"./testdata/iso-8859.tar.gz", "ddafa51cb03c74ec117ab366ee2240d13bba1ec3", 10240}, 141 | {"./testdata/extranils.tar.gz", "e187b4b3e739deaccc257342f4940f34403dc588", 10648}, 142 | {"./testdata/notenoughnils.tar.gz", "72f93f41efd95290baa5c174c234f5d4c22ce601", 512}, 143 | {"./testdata/1c51fc286aa95d9413226599576bafa38490b1e292375c90de095855b64caea6", "946caa03167a8cc707db6ff9785608b652e631dc", 1024}, 144 | } 145 | 146 | func TestTarStream(t *testing.T) { 147 | 148 | for _, tc := range testCases { 149 | fh, err := os.Open(tc.path) 150 | if err != nil { 151 | t.Fatal(err) 152 | } 153 | defer fh.Close() 154 | gzRdr, err := gzip.NewReader(fh) 155 | if err != nil { 156 | t.Fatal(err) 157 | } 158 | defer gzRdr.Close() 159 | 160 | // Setup where we'll store the metadata 161 | w := bytes.NewBuffer([]byte{}) 162 | sp := storage.NewJSONPacker(w) 163 | fgp := storage.NewBufferFileGetPutter() 164 | 165 | // wrap the disassembly stream 166 | tarStream, err := NewInputTarStream(gzRdr, sp, fgp) 167 | if err != nil { 168 | t.Fatal(err) 169 | } 170 | 171 | // get a sum of the stream after it has passed through to ensure it's the same. 
172 | h0 := sha1.New() 173 | i, err := io.Copy(h0, tarStream) 174 | if err != nil { 175 | t.Fatal(err) 176 | } 177 | 178 | if i != tc.expectedSize { 179 | t.Errorf("size of tar: expected %d; got %d", tc.expectedSize, i) 180 | } 181 | if fmt.Sprintf("%x", h0.Sum(nil)) != tc.expectedSHA1Sum { 182 | t.Fatalf("checksum of tar: expected %s; got %x", tc.expectedSHA1Sum, h0.Sum(nil)) 183 | } 184 | 185 | //t.Logf("%s", w.String()) // if we fail, then show the packed info 186 | 187 | // If we've made it this far, then we'll turn it around and create a tar 188 | // stream from the packed metadata and buffered file contents. 189 | r := bytes.NewBuffer(w.Bytes()) 190 | sup := storage.NewJSONUnpacker(r) 191 | // and reuse the fgp that we Put the payloads to. 192 | 193 | rc := NewOutputTarStream(fgp, sup) 194 | h1 := sha1.New() 195 | i, err = io.Copy(h1, rc) 196 | if err != nil { 197 | t.Fatal(err) 198 | } 199 | 200 | if i != tc.expectedSize { 201 | t.Errorf("size of output tar: expected %d; got %d", tc.expectedSize, i) 202 | } 203 | if fmt.Sprintf("%x", h1.Sum(nil)) != tc.expectedSHA1Sum { 204 | t.Fatalf("checksum of output tar: expected %s; got %x", tc.expectedSHA1Sum, h1.Sum(nil)) 205 | } 206 | } 207 | } 208 | 209 | func BenchmarkAsm(b *testing.B) { 210 | for i := 0; i < b.N; i++ { 211 | for _, tc := range testCases { 212 | func() { 213 | fh, err := os.Open(tc.path) 214 | if err != nil { 215 | b.Fatal(err) 216 | } 217 | defer fh.Close() 218 | gzRdr, err := gzip.NewReader(fh) 219 | if err != nil { 220 | b.Fatal(err) 221 | } 222 | defer gzRdr.Close() 223 | 224 | // Setup where we'll store the metadata 225 | w := bytes.NewBuffer([]byte{}) 226 | sp := storage.NewJSONPacker(w) 227 | fgp := storage.NewBufferFileGetPutter() 228 | 229 | // wrap the disassembly stream 230 | tarStream, err := NewInputTarStream(gzRdr, sp, fgp) 231 | if err != nil { 232 | b.Fatal(err) 233 | } 234 | // read it all to the bit bucket 235 | i1, err := io.Copy(io.Discard, tarStream) 236 | if err != nil { 237 
| b.Fatal(err) 238 | } 239 | 240 | r := bytes.NewBuffer(w.Bytes()) 241 | sup := storage.NewJSONUnpacker(r) 242 | // and reuse the fgp that we Put the payloads to. 243 | 244 | rc := NewOutputTarStream(fgp, sup) 245 | 246 | i2, err := io.Copy(io.Discard, rc) 247 | if err != nil { 248 | b.Fatal(err) 249 | } 250 | if i1 != i2 { 251 | b.Errorf("%s: input(%d) and ouput(%d) byte count didn't match", tc.path, i1, i2) 252 | } 253 | }() 254 | } 255 | } 256 | } 257 | -------------------------------------------------------------------------------- /archive/tar/testdata/hdr-only.tar: -------------------------------------------------------------------------------- 1 | dir/0000750116074500116100000000000012575654704010646 5ustar joetsaiengfifo0000640116074500116100000000000012575655016010730 6ustar joetsaiengfile0000640116074500116100000000005612575654723010735 0ustar joetsaiengThe quick brown fox jumped over the lazy dog! 2 | hardlink0000640116074500116100000000000012575654723012440 1fileustar joetsaiengnull0000666116074500116100000000000012575632775012237 3ustar joetsaieng00000010000003sda0000660116074500116100000000000012575632775012024 4ustar joetsaieng00000100000000symlink0000777116074500116100000000000012575654734012350 2fileustar joetsaiengbadlink0000777116074500116100000000000012575655374013021 2missingustar joetsaiengdir/0000750116074500116100000000000512575654704010653 5ustar joetsaiengfifo0000640116074500116100000000000512575655016010735 6ustar joetsaiengfile0000640116074500116100000000005612575654723010735 0ustar joetsaiengThe quick brown fox jumped over the lazy dog! 
3 | hardlink0000640116074500116100000000000512575654723012445 1fileustar joetsaiengnull0000666116074500116100000000000512575632775012244 3ustar joetsaieng00000010000003sda0000660116074500116100000000000512575632775012031 4ustar joetsaieng00000100000000symlink0000777116074500116100000000000512575654734012355 2fileustar joetsaiengbadlink0000777116074500116100000000000512575655374013026 2missingustar joetsaieng -------------------------------------------------------------------------------- /archive/tar/testdata/pax.tar: -------------------------------------------------------------------------------- 1 | a/PaxHeaders.6887/12345678910111213141516171819202122232425262728293031323334353637383940414243444540000644000175000017500000000044612036615200022461 xustar0000000000000000204 path=a/123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100 2 | 30 mtime=1350244992.023960108 3 | 30 atime=1350244992.023960108 4 | 30 ctime=1350244992.023960108 5 | a/123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525350000664000175000017500000000000712036615200023454 0ustar00shaneshane00000000000000shaner 6 | a/PaxHeaders.6887/b0000644000175000017500000000045012036666720012440 xustar0000000000000000206 linkpath=123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100 7 | 30 mtime=1350266320.910238425 8 | 30 atime=1350266320.910238425 9 | 30 ctime=1350266320.910238425 10 | a/b0000777000175000017500000000000012036666720024004 21234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545ustar00shaneshane00000000000000 -------------------------------------------------------------------------------- /archive/tar/strconv.go: 
-------------------------------------------------------------------------------- 1 | // Copyright 2016 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package tar 6 | 7 | import ( 8 | "bytes" 9 | "fmt" 10 | "strconv" 11 | "strings" 12 | "time" 13 | ) 14 | 15 | // hasNUL reports whether the NUL character exists within s. 16 | func hasNUL(s string) bool { 17 | return strings.IndexByte(s, 0) >= 0 18 | } 19 | 20 | // isASCII reports whether the input is an ASCII C-style string. 21 | func isASCII(s string) bool { 22 | for _, c := range s { 23 | if c >= 0x80 || c == 0x00 { 24 | return false 25 | } 26 | } 27 | return true 28 | } 29 | 30 | // toASCII converts the input to an ASCII C-style string. 31 | // This a best effort conversion, so invalid characters are dropped. 32 | func toASCII(s string) string { 33 | if isASCII(s) { 34 | return s 35 | } 36 | b := make([]byte, 0, len(s)) 37 | for _, c := range s { 38 | if c < 0x80 && c != 0x00 { 39 | b = append(b, byte(c)) 40 | } 41 | } 42 | return string(b) 43 | } 44 | 45 | type parser struct { 46 | err error // Last error seen 47 | } 48 | 49 | type formatter struct { 50 | err error // Last error seen 51 | } 52 | 53 | // parseString parses bytes as a NUL-terminated C-style string. 54 | // If a NUL byte is not found then the whole slice is returned as a string. 55 | func (*parser) parseString(b []byte) string { 56 | if i := bytes.IndexByte(b, 0); i >= 0 { 57 | return string(b[:i]) 58 | } 59 | return string(b) 60 | } 61 | 62 | // formatString copies s into b, NUL-terminating if possible. 
63 | func (f *formatter) formatString(b []byte, s string) { 64 | if len(s) > len(b) { 65 | f.err = ErrFieldTooLong 66 | } 67 | copy(b, s) 68 | if len(s) < len(b) { 69 | b[len(s)] = 0 70 | } 71 | 72 | // Some buggy readers treat regular files with a trailing slash 73 | // in the V7 path field as a directory even though the full path 74 | // recorded elsewhere (e.g., via PAX record) contains no trailing slash. 75 | if len(s) > len(b) && b[len(b)-1] == '/' { 76 | n := len(strings.TrimRight(s[:len(b)], "/")) 77 | b[n] = 0 // Replace trailing slash with NUL terminator 78 | } 79 | } 80 | 81 | // fitsInBase256 reports whether x can be encoded into n bytes using base-256 82 | // encoding. Unlike octal encoding, base-256 encoding does not require that the 83 | // string ends with a NUL character. Thus, all n bytes are available for output. 84 | // 85 | // If operating in binary mode, this assumes strict GNU binary mode; which means 86 | // that the first byte can only be either 0x80 or 0xff. Thus, the first byte is 87 | // equivalent to the sign bit in two's complement form. 88 | func fitsInBase256(n int, x int64) bool { 89 | binBits := uint(n-1) * 8 90 | return n >= 9 || (x >= -1< 0 && b[0]&0x80 != 0 { 101 | // Handling negative numbers relies on the following identity: 102 | // -a-1 == ^a 103 | // 104 | // If the number is negative, we use an inversion mask to invert the 105 | // data bytes and treat the value as an unsigned number. 
106 | var inv byte // 0x00 if positive or zero, 0xff if negative 107 | if b[0]&0x40 != 0 { 108 | inv = 0xff 109 | } 110 | 111 | var x uint64 112 | for i, c := range b { 113 | c ^= inv // Inverts c only if inv is 0xff, otherwise does nothing 114 | if i == 0 { 115 | c &= 0x7f // Ignore signal bit in first byte 116 | } 117 | if (x >> 56) > 0 { 118 | p.err = ErrHeader // Integer overflow 119 | return 0 120 | } 121 | x = x<<8 | uint64(c) 122 | } 123 | if (x >> 63) > 0 { 124 | p.err = ErrHeader // Integer overflow 125 | return 0 126 | } 127 | if inv == 0xff { 128 | return ^int64(x) 129 | } 130 | return int64(x) 131 | } 132 | 133 | // Normal case is base-8 (octal) format. 134 | return p.parseOctal(b) 135 | } 136 | 137 | // formatNumeric encodes x into b using base-8 (octal) encoding if possible. 138 | // Otherwise it will attempt to use base-256 (binary) encoding. 139 | func (f *formatter) formatNumeric(b []byte, x int64) { 140 | if fitsInOctal(len(b), x) { 141 | f.formatOctal(b, x) 142 | return 143 | } 144 | 145 | if fitsInBase256(len(b), x) { 146 | for i := len(b) - 1; i >= 0; i-- { 147 | b[i] = byte(x) 148 | x >>= 8 149 | } 150 | b[0] |= 0x80 // Highest bit indicates binary format 151 | return 152 | } 153 | 154 | f.formatOctal(b, 0) // Last resort, just write zero 155 | f.err = ErrFieldTooLong 156 | } 157 | 158 | func (p *parser) parseOctal(b []byte) int64 { 159 | // Because unused fields are filled with NULs, we need 160 | // to skip leading NULs. Fields may also be padded with 161 | // spaces or NULs. 162 | // So we remove leading and trailing NULs and spaces to 163 | // be sure. 
164 | b = bytes.Trim(b, " \x00") 165 | 166 | if len(b) == 0 { 167 | return 0 168 | } 169 | x, perr := strconv.ParseUint(p.parseString(b), 8, 64) 170 | if perr != nil { 171 | p.err = ErrHeader 172 | } 173 | return int64(x) 174 | } 175 | 176 | func (f *formatter) formatOctal(b []byte, x int64) { 177 | if !fitsInOctal(len(b), x) { 178 | x = 0 // Last resort, just write zero 179 | f.err = ErrFieldTooLong 180 | } 181 | 182 | s := strconv.FormatInt(x, 8) 183 | // Add leading zeros, but leave room for a NUL. 184 | if n := len(b) - len(s) - 1; n > 0 { 185 | s = strings.Repeat("0", n) + s 186 | } 187 | f.formatString(b, s) 188 | } 189 | 190 | // fitsInOctal reports whether the integer x fits in a field n-bytes long 191 | // using octal encoding with the appropriate NUL terminator. 192 | func fitsInOctal(n int, x int64) bool { 193 | octBits := uint(n-1) * 3 194 | return x >= 0 && (n >= 22 || x < 1<= 0 { 206 | ss, sn = s[:pos], s[pos+1:] 207 | } 208 | 209 | // Parse the seconds. 210 | secs, err := strconv.ParseInt(ss, 10, 64) 211 | if err != nil { 212 | return time.Time{}, ErrHeader 213 | } 214 | if len(sn) == 0 { 215 | return time.Unix(secs, 0), nil // No sub-second values 216 | } 217 | 218 | // Parse the nanoseconds. 219 | if strings.Trim(sn, "0123456789") != "" { 220 | return time.Time{}, ErrHeader 221 | } 222 | if len(sn) < maxNanoSecondDigits { 223 | sn += strings.Repeat("0", maxNanoSecondDigits-len(sn)) // Right pad 224 | } else { 225 | sn = sn[:maxNanoSecondDigits] // Right truncate 226 | } 227 | nsecs, _ := strconv.ParseInt(sn, 10, 64) // Must succeed 228 | if len(ss) > 0 && ss[0] == '-' { 229 | return time.Unix(secs, -1*nsecs), nil // Negative correction 230 | } 231 | return time.Unix(secs, nsecs), nil 232 | } 233 | 234 | // formatPAXTime converts ts into a time of the form %d.%d as described in the 235 | // PAX specification. This function is capable of negative timestamps. 
236 | func formatPAXTime(ts time.Time) (s string) { 237 | secs, nsecs := ts.Unix(), ts.Nanosecond() 238 | if nsecs == 0 { 239 | return strconv.FormatInt(secs, 10) 240 | } 241 | 242 | // If seconds is negative, then perform correction. 243 | sign := "" 244 | if secs < 0 { 245 | sign = "-" // Remember sign 246 | secs = -(secs + 1) // Add a second to secs 247 | nsecs = -(nsecs - 1E9) // Take that second away from nsecs 248 | } 249 | return strings.TrimRight(fmt.Sprintf("%s%d.%09d", sign, secs, nsecs), "0") 250 | } 251 | 252 | // parsePAXRecord parses the input PAX record string into a key-value pair. 253 | // If parsing is successful, it will slice off the currently read record and 254 | // return the remainder as r. 255 | func parsePAXRecord(s string) (k, v, r string, err error) { 256 | // The size field ends at the first space. 257 | sp := strings.IndexByte(s, ' ') 258 | if sp == -1 { 259 | return "", "", s, ErrHeader 260 | } 261 | 262 | // Parse the first token as a decimal integer. 263 | n, perr := strconv.ParseInt(s[:sp], 10, 0) // Intentionally parse as native int 264 | if perr != nil || n < 5 || int64(len(s)) < n { 265 | return "", "", s, ErrHeader 266 | } 267 | 268 | // Extract everything between the space and the final newline. 269 | rec, nl, rem := s[sp+1:n-1], s[n-1:n], s[n:] 270 | if nl != "\n" { 271 | return "", "", s, ErrHeader 272 | } 273 | 274 | // The first equals separates the key from the value. 275 | eq := strings.IndexByte(rec, '=') 276 | if eq == -1 { 277 | return "", "", s, ErrHeader 278 | } 279 | k, v = rec[:eq], rec[eq+1:] 280 | 281 | if !validPAXRecord(k, v) { 282 | return "", "", s, ErrHeader 283 | } 284 | return k, v, rem, nil 285 | } 286 | 287 | // formatPAXRecord formats a single PAX record, prefixing it with the 288 | // appropriate length. 
func formatPAXRecord(k, v string) (string, error) {
	if !validPAXRecord(k, v) {
		return "", ErrHeader
	}

	const padding = 3 // Extra padding for ' ', '=', and '\n'
	size := len(k) + len(v) + padding
	size += len(strconv.Itoa(size))
	record := strconv.Itoa(size) + " " + k + "=" + v + "\n"

	// Final adjustment if adding size field increased the record size.
	// This can happen when size crosses a power-of-ten boundary
	// (e.g. a payload of 96 bytes plus a 2-digit length makes 98,
	// but "98" has the same digit count, so one pass usually suffices;
	// the second pass handles the boundary case).
	if len(record) != size {
		size = len(record)
		record = strconv.Itoa(size) + " " + k + "=" + v + "\n"
	}
	return record, nil
}

// validPAXRecord reports whether the key-value pair is valid where each
// record is formatted as:
//	"%d %s=%s\n" % (size, key, value)
//
// Keys and values should be UTF-8, but the number of bad writers out there
// forces us to be more liberal.
// Thus, we only reject all keys with NUL, and only reject NULs in values
// for the PAX version of the USTAR string fields.
// The key must not contain an '=' character.
func validPAXRecord(k, v string) bool {
	if k == "" || strings.IndexByte(k, '=') >= 0 {
		return false
	}
	switch k {
	case paxPath, paxLinkpath, paxUname, paxGname:
		// These values are copied into fixed USTAR string fields,
		// which are NUL-terminated and so cannot contain NULs.
		return !hasNUL(v)
	default:
		return !hasNUL(k)
	}
}
--------------------------------------------------------------------------------
/archive/tar/format.go:
--------------------------------------------------------------------------------
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package tar

import "strings"

// Format represents the tar archive format.
//
// The original tar format was introduced in Unix V7.
// Since then, there have been multiple competing formats attempting to
// standardize or extend the V7 format to overcome its limitations.
// The most common formats are the USTAR, PAX, and GNU formats,
// each with their own advantages and limitations.
//
// The following table captures the capabilities of each format:
//
//	                  |  USTAR |       PAX |       GNU
//	------------------+--------+-----------+----------
//	Name              |   256B | unlimited | unlimited
//	Linkname          |   100B | unlimited | unlimited
//	Size              | uint33 | unlimited |    uint89
//	Mode              | uint21 |    uint21 |    uint57
//	Uid/Gid           | uint21 | unlimited |    uint57
//	Uname/Gname       |    32B | unlimited |       32B
//	ModTime           | uint33 | unlimited |     int89
//	AccessTime        |    n/a | unlimited |     int89
//	ChangeTime        |    n/a | unlimited |     int89
//	Devmajor/Devminor | uint21 |    uint21 |    uint57
//	------------------+--------+-----------+----------
//	string encoding   |  ASCII |     UTF-8 |    binary
//	sub-second times  |     no |       yes |        no
//	sparse files      |     no |       yes |       yes
//
// The table's upper portion shows the Header fields, where each format reports
// the maximum number of bytes allowed for each string field and
// the integer type used to store each numeric field
// (where timestamps are stored as the number of seconds since the Unix epoch).
//
// The table's lower portion shows specialized features of each format,
// such as supported string encodings, support for sub-second timestamps,
// or support for sparse files.
//
// The Writer currently provides no support for sparse files.
type Format int

// Constants to identify various tar formats.
const (
	// Deliberately hide the meaning of constants from public API.
	// The (1 << iota) / 4 trick yields 0, 0, 1, 2, 4, 8, etc...,
	// so FormatUnknown is 0 and every real format is a distinct bit,
	// allowing a Format value to act as a bit set of candidate formats.
	_ Format = (1 << iota) / 4 // Sequence of 0, 0, 1, 2, 4, 8, etc...

	// FormatUnknown indicates that the format is unknown.
	FormatUnknown

	// The format of the original Unix V7 tar tool prior to standardization.
	formatV7

	// FormatUSTAR represents the USTAR header format defined in POSIX.1-1988.
	//
	// While this format is compatible with most tar readers,
	// the format has several limitations making it unsuitable for some usages.
	// Most notably, it cannot support sparse files, files larger than 8GiB,
	// filenames larger than 256 characters, and non-ASCII filenames.
	//
	// Reference:
	//	http://pubs.opengroup.org/onlinepubs/9699919799/utilities/pax.html#tag_20_92_13_06
	FormatUSTAR

	// FormatPAX represents the PAX header format defined in POSIX.1-2001.
	//
	// PAX extends USTAR by writing a special file with Typeflag TypeXHeader
	// preceding the original header. This file contains a set of key-value
	// records, which are used to overcome USTAR's shortcomings, in addition to
	// providing the ability to have sub-second resolution for timestamps.
	//
	// Some newer formats add their own extensions to PAX by defining their
	// own keys and assigning certain semantic meaning to the associated values.
	// For example, sparse file support in PAX is implemented using keys
	// defined by the GNU manual (e.g., "GNU.sparse.map").
	//
	// Reference:
	//	http://pubs.opengroup.org/onlinepubs/009695399/utilities/pax.html
	FormatPAX

	// FormatGNU represents the GNU header format.
	//
	// The GNU header format is older than the USTAR and PAX standards and
	// is not compatible with them. The GNU format supports
	// arbitrary file sizes, filenames of arbitrary encoding and length,
	// sparse files, and other features.
	//
	// It is recommended that PAX be chosen over GNU unless the target
	// application can only parse GNU formatted archives.
	//
	// Reference:
	//	https://www.gnu.org/software/tar/manual/html_node/Standard.html
	FormatGNU

	// Schily's tar format, which is incompatible with USTAR.
	// This does not cover STAR extensions to the PAX format; these fall under
	// the PAX format.
	formatSTAR

	// formatMax is one bit past the highest format; used as a loop bound.
	formatMax
)

// Set operations on a Format treated as a bit set of candidate formats.
func (f Format) has(f2 Format) bool   { return f&f2 != 0 }
func (f *Format) mayBe(f2 Format)     { *f |= f2 }
func (f *Format) mayOnlyBe(f2 Format) { *f &= f2 }
func (f *Format) mustNotBe(f2 Format) { *f &^= f2 }

var formatNames = map[Format]string{
	formatV7: "V7", FormatUSTAR: "USTAR", FormatPAX: "PAX", FormatGNU: "GNU", formatSTAR: "STAR",
}

// String returns the names of all formats in the set, joined with " | "
// and parenthesized when there is more than one.
func (f Format) String() string {
	var ss []string
	for f2 := Format(1); f2 < formatMax; f2 <<= 1 {
		if f.has(f2) {
			ss = append(ss, formatNames[f2])
		}
	}
	switch len(ss) {
	case 0:
		return "<unknown>"
	case 1:
		return ss[0]
	default:
		return "(" + strings.Join(ss, " | ") + ")"
	}
}

// Magics used to identify various formats.
const (
	magicGNU, versionGNU     = "ustar ", " \x00"
	magicUSTAR, versionUSTAR = "ustar\x00", "00"
	trailerSTAR              = "tar\x00"
)

// Size constants from various tar specifications.
const (
	blockSize  = 512 // Size of each block in a tar stream
	nameSize   = 100 // Max length of the name field in USTAR format
	prefixSize = 155 // Max length of the prefix field in USTAR format

	// Max length of a special file (PAX header, GNU long name or link).
	// This matches the limit used by libarchive.
	maxSpecialFileSize = 1 << 20
)

// blockPadding computes the number of bytes needed to pad offset up to the
// nearest block edge where 0 <= n < blockSize.
func blockPadding(offset int64) (n int64) {
	// Works because blockSize is a power of two.
	return -offset & (blockSize - 1)
}

// zeroBlock is a block of all zeros, used for padding and to detect the
// end-of-archive marker.
var zeroBlock block

type block [blockSize]byte

// Convert block to any number of formats.
// These are zero-cost reinterpretations of the same 512 bytes; the header
// types below only differ in which byte ranges their accessors expose.
func (b *block) V7() *headerV7       { return (*headerV7)(b) }
func (b *block) GNU() *headerGNU     { return (*headerGNU)(b) }
func (b *block) STAR() *headerSTAR   { return (*headerSTAR)(b) }
func (b *block) USTAR() *headerUSTAR { return (*headerUSTAR)(b) }
func (b *block) Sparse() sparseArray { return (sparseArray)(b[:]) }

// GetFormat checks that the block is a valid tar header based on the checksum.
// It then attempts to guess the specific format based on magic values.
// If the checksum fails, then FormatUnknown is returned.
func (b *block) GetFormat() Format {
	// Verify checksum.
	var p parser
	value := p.parseOctal(b.V7().Chksum())
	chksum1, chksum2 := b.ComputeChecksum()
	// Accept either the POSIX (unsigned) or Sun (signed) checksum.
	if p.err != nil || (value != chksum1 && value != chksum2) {
		return FormatUnknown
	}

	// Guess the magic values.
	magic := string(b.USTAR().Magic())
	version := string(b.USTAR().Version())
	trailer := string(b.STAR().Trailer())
	switch {
	case magic == magicUSTAR && trailer == trailerSTAR:
		return formatSTAR
	case magic == magicUSTAR:
		// USTAR and PAX share the same magic; cannot distinguish them
		// from the header block alone.
		return FormatUSTAR | FormatPAX
	case magic == magicGNU && version == versionGNU:
		return FormatGNU
	default:
		return formatV7
	}
}

// SetFormat writes the magic values necessary for specified format
// and then updates the checksum accordingly.
func (b *block) SetFormat(format Format) {
	// Set the magic values.
	switch {
	case format.has(formatV7):
		// Do nothing.
	case format.has(FormatGNU):
		copy(b.GNU().Magic(), magicGNU)
		copy(b.GNU().Version(), versionGNU)
	case format.has(formatSTAR):
		copy(b.STAR().Magic(), magicUSTAR)
		copy(b.STAR().Version(), versionUSTAR)
		copy(b.STAR().Trailer(), trailerSTAR)
	case format.has(FormatUSTAR | FormatPAX):
		copy(b.USTAR().Magic(), magicUSTAR)
		copy(b.USTAR().Version(), versionUSTAR)
	default:
		panic("invalid format")
	}

	// Update checksum.
	// This field is special in that it is terminated by a NULL then space.
	var f formatter
	field := b.V7().Chksum()
	chksum, _ := b.ComputeChecksum() // Possible values are 256..128776
	f.formatOctal(field[:7], chksum) // Never fails since 128776 < 262143
	field[7] = ' '
}

// ComputeChecksum computes the checksum for the header block.
// POSIX specifies a sum of the unsigned byte values, but the Sun tar used
// signed byte values.
// We compute and return both.
func (b *block) ComputeChecksum() (unsigned, signed int64) {
	for i, c := range b {
		// Bytes 148..155 are the checksum field itself (see headerV7.Chksum);
		// the spec says to treat it as all spaces when summing.
		if 148 <= i && i < 156 {
			c = ' ' // Treat the checksum field itself as all spaces.
		}
		unsigned += int64(c)
		signed += int64(int8(c))
	}
	return unsigned, signed
}

// Reset clears the block with all zeros.
func (b *block) Reset() {
	*b = block{}
}

// Field offsets below are fixed by the respective tar specifications;
// each accessor returns a sub-slice aliasing the header's backing array.

type headerV7 [blockSize]byte

func (h *headerV7) Name() []byte     { return h[000:][:100] }
func (h *headerV7) Mode() []byte     { return h[100:][:8] }
func (h *headerV7) UID() []byte      { return h[108:][:8] }
func (h *headerV7) GID() []byte      { return h[116:][:8] }
func (h *headerV7) Size() []byte     { return h[124:][:12] }
func (h *headerV7) ModTime() []byte  { return h[136:][:12] }
func (h *headerV7) Chksum() []byte   { return h[148:][:8] }
func (h *headerV7) TypeFlag() []byte { return h[156:][:1] }
func (h *headerV7) LinkName() []byte { return h[157:][:100] }

type headerGNU [blockSize]byte

func (h *headerGNU) V7() *headerV7       { return (*headerV7)(h) }
func (h *headerGNU) Magic() []byte       { return h[257:][:6] }
func (h *headerGNU) Version() []byte     { return h[263:][:2] }
func (h *headerGNU) UserName() []byte    { return h[265:][:32] }
func (h *headerGNU) GroupName() []byte   { return h[297:][:32] }
func (h *headerGNU) DevMajor() []byte    { return h[329:][:8] }
func (h *headerGNU) DevMinor() []byte    { return h[337:][:8] }
func (h *headerGNU) AccessTime() []byte  { return h[345:][:12] }
func (h *headerGNU) ChangeTime() []byte  { return h[357:][:12] }
func (h *headerGNU) Sparse() sparseArray { return (sparseArray)(h[386:][:24*4+1]) }
func (h *headerGNU) RealSize() []byte    { return h[483:][:12] }

type headerSTAR [blockSize]byte

func (h *headerSTAR) V7() *headerV7      { return (*headerV7)(h) }
func (h *headerSTAR) Magic() []byte      { return h[257:][:6] }
func (h *headerSTAR) Version() []byte    { return h[263:][:2] }
func (h *headerSTAR) UserName() []byte   { return h[265:][:32] }
func (h *headerSTAR) GroupName() []byte  { return h[297:][:32] }
func (h *headerSTAR) DevMajor() []byte   { return h[329:][:8] }
func (h *headerSTAR) DevMinor() []byte   { return h[337:][:8] }
func (h *headerSTAR) Prefix() []byte     { return h[345:][:131] }
func (h *headerSTAR) AccessTime() []byte { return h[476:][:12] }
func (h *headerSTAR) ChangeTime() []byte { return h[488:][:12] }
func (h *headerSTAR) Trailer() []byte    { return h[508:][:4] }

type headerUSTAR [blockSize]byte

func (h *headerUSTAR) V7() *headerV7    { return (*headerV7)(h) }
func (h *headerUSTAR) Magic() []byte    { return h[257:][:6] }
func (h *headerUSTAR) Version() []byte  { return h[263:][:2] }
func (h *headerUSTAR) UserName() []byte { return h[265:][:32] }
func (h *headerUSTAR) GroupName() []byte { return h[297:][:32] }
func (h *headerUSTAR) DevMajor() []byte { return h[329:][:8] }
func (h *headerUSTAR) DevMinor() []byte { return h[337:][:8] }
func (h *headerUSTAR) Prefix() []byte   { return h[345:][:155] }

// sparseArray is a view over the GNU sparse-entry region of a header block:
// a sequence of 24-byte (offset, length) pairs followed by a 1-byte
// "is extended" flag.
type sparseArray []byte

func (s sparseArray) Entry(i int) sparseElem { return (sparseElem)(s[i*24:]) }
func (s sparseArray) IsExtended() []byte     { return s[24*s.MaxEntries():][:1] }
func (s sparseArray) MaxEntries() int        { return len(s) / 24 }

// sparseElem is one 24-byte GNU sparse entry: a 12-byte octal offset
// followed by a 12-byte octal length.
type sparseElem []byte

func (s sparseElem) Offset() []byte { return s[00:][:12] }
func (s sparseElem) Length() []byte { return s[12:][:12] }
--------------------------------------------------------------------------------