├── .github └── workflows │ ├── ci.yml │ └── release.yml ├── .gitignore ├── .goreleaser.yaml ├── LICENSE ├── Makefile ├── README.md ├── adapters └── cli │ ├── build_binary.go │ └── driver.go ├── archiver.go ├── archiver_options.go ├── archiver_test.go ├── cli.go ├── cli_test.go ├── cmd ├── punzip │ ├── main.go │ └── punzip_test.go └── pzip │ ├── main.go │ └── pzip_test.go ├── extra.go ├── extractor.go ├── extractor_options.go ├── extractor_test.go ├── go.mod ├── go.sum ├── internal └── testutils │ └── archiver.go ├── pool ├── file.go ├── file_test.go ├── file_worker_pool.go ├── file_worker_pool_test.go └── worker_pool.go ├── specifications ├── archive.go └── extract.go └── testdata ├── hello.md ├── hello.txt ├── hello ├── hello.txt └── nested │ └── hello.md └── test.zip /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*.*.*' 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | test: 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | os: 17 | - ubuntu-latest 18 | - macos-latest 19 | go: 20 | - '1.21' 21 | include: 22 | - go: '1.21' 23 | GO_SEMVER: '~1.21.0' 24 | 25 | runs-on: ${{ matrix.os }} 26 | 27 | steps: 28 | - name: Checkout code 29 | uses: actions/checkout@v4 30 | 31 | - name: Install Go 32 | uses: actions/setup-go@v4 33 | with: 34 | go-version: ${{ matrix.GO_SEMVER }} 35 | check-latest: true 36 | 37 | - name: Get dependencies 38 | run: | 39 | go get -v -t -d ./... 40 | 41 | - name: Build pzip 42 | working-directory: ./cmd/pzip 43 | env: 44 | CGO_ENABLED: 0 45 | run: | 46 | go build -v 47 | 48 | - name: Build punzip 49 | working-directory: ./cmd/punzip 50 | env: 51 | CGO_ENABLED: 0 52 | run: | 53 | go build -v 54 | 55 | 56 | - name: Run tests 57 | run: | 58 | go test -v -race ./... 
59 | 60 | 61 | goreleaser-check: 62 | runs-on: ubuntu-latest 63 | steps: 64 | - name: Checkout code 65 | uses: actions/checkout@v4 66 | 67 | - uses: goreleaser/goreleaser-action@v4 68 | with: 69 | version: latest 70 | args: check 71 | env: 72 | TAG: ${{ steps.vars.outputs.version_tag }} 73 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*.*.*' 7 | 8 | jobs: 9 | release: 10 | name: Release 11 | strategy: 12 | matrix: 13 | os: 14 | - ubuntu-latest 15 | go: 16 | - '1.21' 17 | include: 18 | - go: '1.21' 19 | GO_SEMVER: '~1.21.0' 20 | 21 | runs-on: ${{ matrix.os }} 22 | steps: 23 | - name: Checkout code 24 | uses: actions/checkout@v4 25 | with: 26 | fetch-depth: 0 27 | - name: Install Go 28 | uses: actions/setup-go@v4 29 | with: 30 | go-version: ${{ matrix.GO_SEMVER }} 31 | check-latest: true 32 | - name: Install Cloudsmith CLI 33 | run: pip install --upgrade cloudsmith-cli 34 | - name: Run GoReleaser 35 | uses: goreleaser/goreleaser-action@v4 36 | with: 37 | version: latest 38 | args: release --clean --timeout 30m 39 | env: 40 | GITHUB_TOKEN: ${{ secrets.GH_PAT }} 41 | TAG: ${{ steps.vars.outputs.version_tag }} 42 | - name: Publish .deb to Cloudsmith 43 | if: ${{ steps.vars.outputs.tag_special == '' }} 44 | env: 45 | CLOUDSMITH_API_KEY: ${{ secrets.CLOUDSMITH_API_KEY }} 46 | run: | 47 | for filename in dist/*.deb; do 48 | echo "Pushing $filename to 'stable'" 49 | cloudsmith push deb pzip/stable/any-distro/any-version $filename 50 | done 51 | 52 | 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | todos.txt 2 | 3 | # internal benchmarking and profiling artifacts 4 | benchmarks.txt 5 | benchmark 6 | profiles 7 | 8 | # mac specific 9 |
.DS_Store 10 | 11 | # goreleaser artifacts 12 | dist 13 | pzip-build 14 | pzip-dist 15 | -------------------------------------------------------------------------------- /.goreleaser.yaml: -------------------------------------------------------------------------------- 1 | project_name: pzip 2 | 3 | before: 4 | hooks: 5 | - go mod tidy 6 | 7 | builds: 8 | - id: pzip 9 | env: 10 | - CGO_ENABLED=0 11 | - GO111MODULE=on 12 | main: ./cmd/pzip/ 13 | binary: pzip 14 | goos: 15 | - linux 16 | - darwin 17 | - freebsd 18 | goarch: 19 | - amd64 20 | - arm64 21 | - id: punzip 22 | env: 23 | - CGO_ENABLED=0 24 | - GO111MODULE=on 25 | main: ./cmd/punzip/ 26 | binary: punzip 27 | goos: 28 | - linux 29 | - darwin 30 | - freebsd 31 | goarch: 32 | - amd64 33 | - arm64 34 | 35 | archives: 36 | - id: pzip-archive 37 | format: tar.gz 38 | builds: 39 | - pzip 40 | name_template: >- 41 | pzip_ 42 | {{- title .Os }}_ 43 | {{- if eq .Arch "amd64" }}x86_64 44 | {{- else if eq .Arch "386" }}i386 45 | {{- else }}{{ .Arch }}{{ end }} 46 | {{- if .Arm }}v{{ .Arm }}{{ end }} 47 | format_overrides: 48 | - goos: windows 49 | format: zip 50 | - id: punzip-archive 51 | format: tar.gz 52 | builds: 53 | - punzip 54 | name_template: >- 55 | punzip_ 56 | {{- title .Os }}_ 57 | {{- if eq .Arch "amd64" }}x86_64 58 | {{- else if eq .Arch "386" }}i386 59 | {{- else }}{{ .Arch }}{{ end }} 60 | {{- if .Arm }}v{{ .Arm }}{{ end }} 61 | format_overrides: 62 | - goos: windows 63 | format: zip 64 | 65 | snapshot: 66 | name_template: "{{ incpatch .Version }}-next" 67 | 68 | changelog: 69 | sort: asc 70 | filters: 71 | exclude: 72 | - '^docs?:' 73 | - '^tests?:' 74 | - '^readme:' 75 | 76 | nfpms: 77 | - id: pzip-package 78 | builds: 79 | - pzip 80 | package_name: pzip 81 | maintainer: Yusuf Birader 82 | homepage: https://github.com/ybirader/pzip 83 | description: | 84 | pzip, short for parallel-zip, is a blazing fast concurrent zip archiver. 
85 | license: Apache 2.0 86 | formats: 87 | - deb 88 | bindir: /usr/bin 89 | - id: punzip-package 90 | builds: 91 | - punzip 92 | package_name: punzip 93 | maintainer: Yusuf Birader 94 | homepage: https://github.com/ybirader/pzip 95 | description: | 96 | punzip, short for parallel-unzip, is a blazing fast concurrent zip extractor. 97 | license: Apache 2.0 98 | formats: 99 | - deb 100 | bindir: /usr/bin 101 | 102 | release: 103 | github: 104 | owner: ybirader 105 | name: pzip 106 | draft: true 107 | prerelease: auto 108 | header: | 109 | ## Features 110 | 111 | List of newly introduced features: 112 | 113 | - Item 1 114 | - Item 2 115 | 116 | ## Bug fixes 117 | 118 | List of fixed issues: 119 | 120 | - Item 1 121 | - Item 2 122 | 123 | brews: 124 | - name: pzip 125 | description: "pzip, short for parallel-zip, is a blazing fast concurrent zip archiver." 126 | license: Apache 2.0 127 | homepage: https://github.com/ybirader/pzip 128 | ids: 129 | - pzip-archive 130 | repository: 131 | name: homebrew-pzip 132 | owner: ybirader 133 | - name: punzip 134 | description: "punzip, short for parallel-unzip, is a blazing fast concurrent zip extractor." 135 | license: Apache 2.0 136 | homepage: https://github.com/ybirader/pzip 137 | ids: 138 | - punzip-archive 139 | repository: 140 | name: homebrew-pzip 141 | owner: ybirader 142 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2023 Yusuf Birader 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | go test ./... 3 | 4 | test-short: 5 | go test -short ./... 6 | 7 | build: 8 | go build -o ./cmd/pzip ./cmd/pzip && go build -o ./cmd/punzip ./cmd/punzip 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![logo-5](https://github.com/ybirader/pzip/assets/68111562/0b3cee2c-1af0-4753-b088-8a488f8ff642) 2 | 3 | # pzip 4 | pzip, short for parallel-zip, is a blazing fast concurrent zip archiver and extractor. 5 | 6 | ## Features 7 | 8 | - Archives files and directories into a valid zip archive, using DEFLATE. 9 | - Preserves modification times of files. 
10 | - Files are read and compressed concurrently 11 | 12 | ## Installation 13 | 14 | ### Command Line 15 | 16 | For command-line usage, we provide two binaries which can be installed separately: 17 | - **pzip-** concurrent zip archiving 18 | - **punzip-** concurrent zip extraction 19 | 20 | To install, run: 21 | 22 | ### macOS 23 | 24 | For zip archiving: `brew install ybirader/pzip/pzip` 25 | 26 | For zip extraction: `brew install ybirader/pzip/punzip` 27 | 28 | #### Debian, Ubuntu, Raspbian 29 | 30 | For the latest stable release: 31 | 32 | ``` 33 | curl -1sLf 'https://dl.cloudsmith.io/public/pzip/stable/setup.deb.sh' | sudo -E bash 34 | sudo apt update 35 | sudo apt install pzip 36 | ``` 37 | 38 | ``` 39 | curl -1sLf 'https://dl.cloudsmith.io/public/pzip/stable/setup.deb.sh' | sudo -E bash 40 | sudo apt update 41 | sudo apt install punzip 42 | ``` 43 | 44 | ### Go 45 | 46 | Alternatively, if you have Go installed: 47 | ``` 48 | go install github.com/ybirader/pzip 49 | ``` 50 | 51 | ### Build from source 52 | 53 | To build from source, we require Go 1.21 or newer. 54 | 55 | 1. Clone the repository by running `git clone "https://github.com/ybirader/pzip.git"` 56 | 2. Build both pzip and punzip by running `make build` or build separately via `cd cmd/pzip && go build` and `cd cmd/punzip && go build` 57 | 58 | ## Usage 59 | 60 | ### Archiving 61 | 62 | `pzip`'s API is similar to that of the standard zip utlity found on most *-nix systems. 63 | 64 | ``` 65 | pzip /path/to/compressed.zip path/to/file_or_directory1 path/to/file_or_directory2 ... 
path/to/file_or_directoryN 66 | ``` 67 | 68 | Alternatively, pzip can be imported as a package 69 | 70 | ```go 71 | archive, err := os.Create("archive.zip") 72 | if err != nil { 73 | log.Fatal(err) 74 | } 75 | 76 | archiver, err := pzip.NewArchiver(archive) 77 | if err != nil { 78 | log.Fatal(err) 79 | } 80 | defer archiver.Close() 81 | 82 | files := []string{ "./hello", "./hello.txt", "./bye.md" } 83 | 84 | err = archiver.Archive(context.Background(), files) 85 | if err != nil { 86 | log.Fatal(err) 87 | } 88 | ``` 89 | 90 | The concurrency of the archiver can be configured using the corresponding flag: 91 | ``` 92 | pzip --concurrency 2 /path/to/compressed.zip path/to/file_or_directory1 path/to/file_or_directory2 ... path/to/file_or_directoryN 93 | 94 | ``` 95 | or by passing the `ArchiverConcurrency` option: 96 | ```go 97 | archiver, err := pzip.NewArchiver(archive, ArchiverConcurrency(2)) 98 | ``` 99 | 100 | ### Extraction 101 | 102 | `punzip`'s API is similar to that of the standard unzip utility found on most *-nix systems. 103 | 104 | ``` 105 | punzip /path/to/compressed.zip 106 | ``` 107 | 108 | By default, `punzip` extracts into the current directory.
We can extract to a particular path by: 109 | ``` 110 | punzip -d /path/to/output /path/to/compressed.zip 111 | ``` 112 | 113 | Using the Go package, we have: 114 | ```go 115 | outputDirPath := "./output" 116 | archivePath := "./archive.zip" 117 | 118 | extractor, err := pzip.NewExtractor(outputDirPath) 119 | if err != nil { 120 | log.Fatal(err) 121 | } 122 | defer extractor.Close() 123 | 124 | err = extractor.Extract(context.Background(), archivePath) 125 | if err != nil { 126 | log.Fatal(err) 127 | } 128 | ``` 129 | 130 | As with pzip, we can configure the concurrency of the extractor using: 131 | 132 | ``` 133 | punzip --concurrency 2 /path/to/compressed.zip 134 | ``` 135 | 136 | Similarly, with the Go package, we pass in the `ExtractorConcurrency` option: 137 | ```go 138 | extractor, err := pzip.NewExtractor(outputDirPath, ExtractorConcurrency(2)) 139 | ``` 140 | 141 | 142 | ### Benchmarks 143 | 144 | pzip was benchmarked using Matt Mahoney's [sample directory](https://mattmahoney.net/dc/10gb.html). 145 | 146 | Using the standard `zip` utlity, we get the following time to archive: 147 | ``` 148 | real 14m31.809s 149 | user 13m12.833s 150 | sys 0m24.193s 151 | ``` 152 | 153 | Running the same benchmark with pzip, we find that: 154 | 155 | ``` 156 | real 0m56.851s 157 | user 3m32.619s 158 | sys 1m25.040s 159 | ``` 160 | 161 | ## Contributing 162 | 163 | To contribute to pzip, first submit or comment in an issue to discuss your contribution, then open a pull request (PR). 164 | 165 | ## License 166 | 167 | pzip is released under the [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) license. 168 | 169 | ## Acknowledgements 170 | 171 | Many thanks to the folks at [Cloudsmith](https://cloudsmith.com) for graciously providing Debian package hosting. Cloudsmith is the only fully hosted, cloud-native, universal package management solution, that enables your organization to create, store and share packages in any format, to any place, with total confidence. 
172 | 173 | -------------------------------------------------------------------------------- /adapters/cli/build_binary.go: -------------------------------------------------------------------------------- 1 | package cli 2 | 3 | import ( 4 | "os" 5 | "os/exec" 6 | "path/filepath" 7 | "runtime" 8 | ) 9 | 10 | func BuildBinary() (binPath string, cleanup func(), err error) { 11 | binName := "pzip-test" 12 | 13 | if runtime.GOOS == "windows" { 14 | binName += ".exe" 15 | } 16 | 17 | build := exec.Command("go", "build", "-o", binName) 18 | 19 | if err := build.Run(); err != nil { 20 | return "", nil, err 21 | } 22 | 23 | dir, err := os.Getwd() 24 | if err != nil { 25 | return "", nil, err 26 | } 27 | 28 | binPath = filepath.Join(dir, binName) 29 | 30 | cleanup = func() { 31 | os.Remove(binPath) 32 | } 33 | 34 | return 35 | } 36 | -------------------------------------------------------------------------------- /adapters/cli/driver.go: -------------------------------------------------------------------------------- 1 | package cli 2 | 3 | import ( 4 | "log" 5 | "os/exec" 6 | ) 7 | 8 | type Driver struct { 9 | binPath string 10 | archivePath string 11 | dirPath string 12 | } 13 | 14 | func NewDriver(binPath, archivePath, dirPath string) *Driver { 15 | return &Driver{binPath, archivePath, dirPath} 16 | } 17 | 18 | func (d *Driver) DirPath() string { 19 | return d.dirPath 20 | } 21 | 22 | func (d *Driver) ArchivePath() string { 23 | return d.archivePath 24 | } 25 | 26 | func (d *Driver) Archive() { 27 | pzip := exec.Command(d.binPath, d.ArchivePath(), d.DirPath()) 28 | 29 | if err := pzip.Run(); err != nil { 30 | log.Fatal("ERROR: could not run pzip binary", err) 31 | } 32 | } 33 | 34 | func (d *Driver) Extract() { 35 | punzip := exec.Command(d.binPath, d.ArchivePath()) 36 | 37 | if err := punzip.Run(); err != nil { 38 | log.Fatal("ERROR: could not run punzip binary", err) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- 
/archiver.go: -------------------------------------------------------------------------------- 1 | package pzip 2 | 3 | import ( 4 | "archive/zip" 5 | "bufio" 6 | "context" 7 | "fmt" 8 | "hash/crc32" 9 | "io" 10 | "io/fs" 11 | "os" 12 | "path/filepath" 13 | "runtime" 14 | "sync" 15 | "unicode/utf8" 16 | 17 | "github.com/ybirader/pzip/pool" 18 | ) 19 | 20 | const ( 21 | defaultCompression = -1 22 | zipVersion20 = 20 23 | sequentialWrites = 1 24 | ) 25 | 26 | const bufferSize = 32 * 1024 27 | 28 | var bufferPool = sync.Pool{ 29 | New: func() any { 30 | return bufio.NewReaderSize(nil, bufferSize) 31 | }, 32 | } 33 | 34 | type archiver struct { 35 | xArchive *os.File 36 | concurrency int 37 | w *zip.Writer 38 | fileProcessPool pool.WorkerPool[pool.File] 39 | fileWriterPool pool.WorkerPool[pool.File] 40 | chroot string 41 | absoluteArchivePath string 42 | } 43 | 44 | // NewArchiver returns a new pzip archiver. The archiver can be configured by passing in a number of options. 45 | // Available options include ArchiverConcurrency(n int). 
It returns an error if the archiver can't be created 46 | // Close() should be called on the returned archiver when done 47 | func NewArchiver(archive *os.File, options ...archiverOption) (*archiver, error) { 48 | a := &archiver{ 49 | xArchive: archive, 50 | w: zip.NewWriter(archive), 51 | concurrency: runtime.GOMAXPROCS(0), 52 | } 53 | 54 | var err error 55 | a.absoluteArchivePath, err = filepath.Abs(archive.Name()) 56 | if err != nil { 57 | return nil, fmt.Errorf("absolute archive path %q: %w", archive.Name(), err) 58 | } 59 | 60 | fileProcessExecutor := func(file *pool.File) error { 61 | err := a.compress(file) 62 | if err != nil { 63 | return fmt.Errorf("compress file %q: %w", file.Path, err) 64 | } 65 | 66 | a.fileWriterPool.Enqueue(file) 67 | 68 | return nil 69 | } 70 | 71 | fileProcessPool, err := pool.NewFileWorkerPool(fileProcessExecutor, &pool.Config{Concurrency: a.concurrency, Capacity: 1}) 72 | if err != nil { 73 | return nil, fmt.Errorf("new file process pool: %w", err) 74 | } 75 | a.fileProcessPool = fileProcessPool 76 | 77 | fileWriterExecutor := func(file *pool.File) error { 78 | err := a.archive(file) 79 | if err != nil { 80 | return fmt.Errorf("archive %q: %w", file.Path, err) 81 | } 82 | 83 | return nil 84 | } 85 | 86 | fileWriterPool, err := pool.NewFileWorkerPool(fileWriterExecutor, &pool.Config{Concurrency: sequentialWrites, Capacity: 1}) 87 | if err != nil { 88 | return nil, fmt.Errorf("new file writer pool: %w", err) 89 | } 90 | a.fileWriterPool = fileWriterPool 91 | 92 | for _, option := range options { 93 | err = option(a) 94 | if err != nil { 95 | return nil, err 96 | } 97 | } 98 | 99 | return a, nil 100 | } 101 | 102 | // Archive compresses and stores (archives) the files at the provides filePaths to 103 | // the corresponding archive registered with the archiver. Archiving is canceled when the 104 | // associated ctx is canceled. The first error that arises during archiving is returned. 
105 | func (a *archiver) Archive(ctx context.Context, filePaths []string) error { 106 | a.fileProcessPool.Start(ctx) 107 | a.fileWriterPool.Start(ctx) 108 | 109 | for _, path := range filePaths { 110 | info, err := os.Lstat(path) 111 | if err != nil { 112 | return fmt.Errorf("lstat %q: %w", path, err) 113 | } 114 | 115 | if info.IsDir() { 116 | if err = a.archiveDir(path); err != nil { 117 | return fmt.Errorf("archive dir %q: %w", path, err) 118 | } 119 | } else { 120 | a.chroot = "" 121 | file, err := pool.NewFile(path, info, "") 122 | if err != nil { 123 | return fmt.Errorf("new file %q: %w", path, err) 124 | } 125 | 126 | a.archiveFile(file) 127 | } 128 | } 129 | 130 | if err := a.fileProcessPool.Close(); err != nil { 131 | return fmt.Errorf("close file process pool: %w", err) 132 | } 133 | 134 | if err := a.fileWriterPool.Close(); err != nil { 135 | return fmt.Errorf("close file writer pool: %w", err) 136 | } 137 | 138 | return nil 139 | } 140 | 141 | func (a *archiver) Close() error { 142 | if err := a.w.Close(); err != nil { 143 | return fmt.Errorf("close zip writer: %w", err) 144 | } 145 | 146 | return nil 147 | } 148 | 149 | func (a *archiver) archiveDir(root string) error { 150 | if err := a.changeRoot(root); err != nil { 151 | return fmt.Errorf("change root to %q: %w", root, err) 152 | } 153 | 154 | if err := a.walkDir(); err != nil { 155 | return fmt.Errorf("walk directory: %w", err) 156 | } 157 | 158 | return nil 159 | } 160 | 161 | // archiveFile enqueues file for archiving if it doesn't match 162 | // our output file. 163 | func (a *archiver) archiveFile(file *pool.File) { 164 | if file.Path == a.absoluteArchivePath { 165 | // Don't archive the output file. 
166 | return 167 | } 168 | 169 | a.fileProcessPool.Enqueue(file) 170 | } 171 | 172 | func (a *archiver) changeRoot(root string) error { 173 | absRoot, err := filepath.Abs(root) 174 | if err != nil { 175 | return fmt.Errorf("get absolute path of %q: %w", root, err) 176 | } 177 | 178 | a.chroot = absRoot 179 | return nil 180 | } 181 | 182 | func (a *archiver) walkDir() error { 183 | if err := filepath.Walk(a.chroot, func(path string, info fs.FileInfo, err error) error { 184 | if err != nil { 185 | return err 186 | } 187 | 188 | file, err := pool.NewFile(path, info, a.chroot) 189 | if err != nil { 190 | return fmt.Errorf("new file %q: %w", path, err) 191 | } 192 | a.archiveFile(file) 193 | 194 | return nil 195 | }); err != nil { 196 | return fmt.Errorf("walk directory %q: %w", a.chroot, err) 197 | } 198 | 199 | return nil 200 | } 201 | 202 | func (a *archiver) compress(file *pool.File) error { 203 | if file.Info.IsDir() { 204 | if err := a.populateHeader(file); err != nil { 205 | return fmt.Errorf("populate header for %q: %w", file.Path, err) 206 | } 207 | return nil 208 | } 209 | 210 | hasher := crc32.NewIEEE() 211 | 212 | if err := a.copy(io.MultiWriter(file.Compressor, hasher), file); err != nil { 213 | return fmt.Errorf("copy %q: %w", file.Path, err) 214 | } 215 | 216 | if err := file.Compressor.Close(); err != nil { 217 | return fmt.Errorf("close compressor for %q: %w", file.Path, err) 218 | } 219 | 220 | if err := a.populateHeader(file); err != nil { 221 | return fmt.Errorf("populate header for %q: %w", file.Path, err) 222 | } 223 | 224 | file.Header.CRC32 = hasher.Sum32() 225 | return nil 226 | } 227 | 228 | func (a *archiver) copy(w io.Writer, file *pool.File) error { 229 | f, err := os.Open(file.Path) 230 | if err != nil { 231 | return fmt.Errorf("open %q: %w", file.Path, err) 232 | } 233 | defer f.Close() 234 | 235 | buf := bufferPool.Get().(*bufio.Reader) 236 | buf.Reset(f) 237 | 238 | _, err = io.Copy(w, buf) 239 | bufferPool.Put(buf) 240 | if err != nil { 
241 | return fmt.Errorf("copy %q: %w", file.Path, err) 242 | } 243 | 244 | return nil 245 | } 246 | 247 | func (a *archiver) populateHeader(file *pool.File) error { 248 | header := file.Header 249 | 250 | utf8ValidName, utf8RequireName := detectUTF8(header.Name) 251 | utf8ValidComment, utf8RequireComment := detectUTF8(header.Comment) 252 | switch { 253 | case header.NonUTF8: 254 | header.Flags &^= 0x800 255 | case (utf8RequireName || utf8RequireComment) && (utf8ValidName && utf8ValidComment): 256 | header.Flags |= 0x800 257 | } 258 | 259 | header.CreatorVersion = header.CreatorVersion&0xff00 | zipVersion20 260 | header.ReaderVersion = zipVersion20 261 | 262 | // we store local times in header.Modified- other zip readers expect this 263 | // we set extended timestamp (UTC) info as an Extra for compatibility 264 | // we only set mod time, not time of last access or time of original creation 265 | // https://libzip.org/specifications/extrafld.txt 266 | 267 | if !header.Modified.IsZero() { 268 | header.Extra = append(header.Extra, NewExtendedTimestampExtraField(header.Modified).Encode()...) 
269 | } 270 | 271 | if file.Info.IsDir() { 272 | header.Name += "/" 273 | header.Method = zip.Store 274 | header.Flags &^= 0x8 // won't write data descriptor (crc32, comp, uncomp) 275 | header.UncompressedSize64 = 0 276 | } else { 277 | header.Method = zip.Deflate 278 | header.Flags |= 0x8 // will write data descriptor (crc32, comp, uncomp) 279 | header.CompressedSize64 = uint64(file.Written()) 280 | } 281 | 282 | file.Header = header 283 | 284 | return nil 285 | } 286 | 287 | func (a *archiver) archive(file *pool.File) error { 288 | fileWriter, err := a.w.CreateRaw(file.Header) 289 | if err != nil { 290 | return fmt.Errorf("create raw for %q: %w", file.Path, err) 291 | } 292 | 293 | if _, err = io.Copy(fileWriter, file.CompressedData); err != nil { 294 | return fmt.Errorf("write compressed data for %q: %w", file.Path, err) 295 | } 296 | 297 | if file.Overflowed() { 298 | if _, err = file.Overflow.Seek(0, io.SeekStart); err != nil { 299 | return fmt.Errorf("seek overflow for %q: %w", file.Path, err) 300 | } 301 | if _, err = io.Copy(fileWriter, file.Overflow); err != nil { 302 | return fmt.Errorf("copy overflow for %q: %w", file.Path, err) 303 | } 304 | 305 | file.Overflow.Close() 306 | if err = os.Remove(file.Overflow.Name()); err != nil { 307 | return fmt.Errorf("remove overflow for %q: %w", file.Overflow.Name(), err) 308 | } 309 | } 310 | 311 | pool.FilePool.Put(file) 312 | 313 | return nil 314 | } 315 | 316 | // https://cs.opensource.google/go/go/+/refs/tags/go1.21.0:src/archive/zip/writer.go 317 | func detectUTF8(s string) (valid, require bool) { 318 | for i := 0; i < len(s); { 319 | r, size := utf8.DecodeRuneInString(s[i:]) 320 | i += size 321 | 322 | if r < 0x20 || r > 0x7d || r == 0x5c { 323 | if !utf8.ValidRune(r) || (r == utf8.RuneError && size == 1) { 324 | return false, false 325 | } 326 | require = true 327 | } 328 | } 329 | return true, require 330 | } 331 | -------------------------------------------------------------------------------- 
/archiver_options.go: -------------------------------------------------------------------------------- 1 | package pzip 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | const minConcurrency = 1 8 | 9 | type archiverOption func(*archiver) error 10 | 11 | // ArchiverConcurrency sets the number of goroutines used during archiving 12 | // An error is returned if n is less than 1. 13 | func ArchiverConcurrency(n int) archiverOption { 14 | return func(a *archiver) error { 15 | if n < minConcurrency { 16 | return fmt.Errorf("concurrency %d not greater than zero", n) 17 | } 18 | 19 | a.concurrency = n 20 | return nil 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /archiver_test.go: -------------------------------------------------------------------------------- 1 | package pzip 2 | 3 | import ( 4 | "archive/zip" 5 | "bytes" 6 | "context" 7 | "encoding/binary" 8 | "fmt" 9 | "path/filepath" 10 | "testing" 11 | "time" 12 | 13 | "github.com/alecthomas/assert/v2" 14 | "github.com/ybirader/pzip/internal/testutils" 15 | "github.com/ybirader/pzip/pool" 16 | ) 17 | 18 | const ( 19 | testdataRoot = "testdata/" 20 | archivePath = testdataRoot + "archive.zip" 21 | helloTxtFileFixture = testdataRoot + "hello.txt" 22 | helloMarkdownFileFixture = testdataRoot + "hello.md" 23 | helloDirectoryFixture = testdataRoot + "hello/" 24 | ) 25 | 26 | func TestArchive(t *testing.T) { 27 | t.Run("archives a single file with a name", func(t *testing.T) { 28 | archive, cleanup := testutils.CreateTempArchive(t, archivePath) 29 | defer cleanup() 30 | 31 | archiver, err := NewArchiver(archive) 32 | assert.NoError(t, err) 33 | err = archiver.Archive(context.Background(), []string{helloTxtFileFixture}) 34 | assert.NoError(t, err) 35 | archiver.Close() 36 | 37 | archiveReader := testutils.GetArchiveReader(t, archive.Name()) 38 | defer archiveReader.Close() 39 | 40 | assert.Equal(t, 1, len(archiveReader.File)) 41 | testutils.AssertArchiveContainsFile(t, 
archiveReader.File, "hello.txt") 42 | 43 | info := testutils.GetFileInfo(t, helloTxtFileFixture) 44 | 45 | got := archiveReader.File[0].UncompressedSize64 46 | want := uint64(info.Size()) 47 | 48 | assert.Equal(t, want, got, "expected file %s to have raw size %d but got %d", info.Name(), want, got) 49 | }) 50 | 51 | t.Run("retains the last modified date of an archived file", func(t *testing.T) { 52 | archive, cleanup := testutils.CreateTempArchive(t, archivePath) 53 | defer cleanup() 54 | 55 | archiver, err := NewArchiver(archive) 56 | assert.NoError(t, err) 57 | err = archiver.Archive(context.Background(), []string{helloTxtFileFixture}) 58 | assert.NoError(t, err) 59 | archiver.Close() 60 | 61 | archiveReader := testutils.GetArchiveReader(t, archive.Name()) 62 | defer archiveReader.Close() 63 | 64 | info := testutils.GetFileInfo(t, helloTxtFileFixture) 65 | 66 | archivedFile, found := testutils.Find(archiveReader.File, func(file *zip.File) bool { 67 | return file.Name == "hello.txt" 68 | }) 69 | assert.True(t, found) 70 | 71 | assertMatchingTimes(t, archivedFile.Modified, info.ModTime()) 72 | }) 73 | 74 | t.Run("archives two files", func(t *testing.T) { 75 | archive, cleanup := testutils.CreateTempArchive(t, archivePath) 76 | defer cleanup() 77 | 78 | archiver, err := NewArchiver(archive) 79 | assert.NoError(t, err) 80 | err = archiver.Archive(context.Background(), []string{helloTxtFileFixture, helloMarkdownFileFixture}) 81 | assert.NoError(t, err) 82 | archiver.Close() 83 | 84 | archiveReader := testutils.GetArchiveReader(t, archive.Name()) 85 | defer archiveReader.Close() 86 | 87 | assert.Equal(t, 2, len(archiveReader.File)) 88 | }) 89 | 90 | t.Run("archives a directory of files", func(t *testing.T) { 91 | archive, cleanup := testutils.CreateTempArchive(t, archivePath) 92 | defer cleanup() 93 | 94 | archiver, err := NewArchiver(archive) 95 | assert.NoError(t, err) 96 | err = archiver.Archive(context.Background(), []string{helloDirectoryFixture}) 97 | 
assert.NoError(t, err) 98 | archiver.Close() 99 | 100 | archiveReader := testutils.GetArchiveReader(t, archive.Name()) 101 | defer archiveReader.Close() 102 | 103 | assert.Equal(t, 4, len(archiveReader.File)) 104 | }) 105 | 106 | t.Run("can archive files separately", func(t *testing.T) { 107 | archive, cleanup := testutils.CreateTempArchive(t, archivePath) 108 | defer cleanup() 109 | 110 | archiver, err := NewArchiver(archive) 111 | assert.NoError(t, err) 112 | err = archiver.Archive(context.Background(), []string{helloTxtFileFixture}) 113 | assert.NoError(t, err) 114 | err = archiver.Archive(context.Background(), []string{helloMarkdownFileFixture}) 115 | assert.NoError(t, err) 116 | archiver.Close() 117 | 118 | archiveReader := testutils.GetArchiveReader(t, archive.Name()) 119 | defer archiveReader.Close() 120 | 121 | assert.Equal(t, 2, len(archiveReader.File)) 122 | }) 123 | } 124 | 125 | func TestCompress(t *testing.T) { 126 | t.Run("when file has compressed size less than or equal to buffer size", func(t *testing.T) { 127 | archive, cleanup := testutils.CreateTempArchive(t, archivePath) 128 | defer cleanup() 129 | 130 | archiver, err := NewArchiver(archive) 131 | assert.NoError(t, err) 132 | 133 | info := testutils.GetFileInfo(t, helloTxtFileFixture) 134 | file, err := pool.NewFile(helloTxtFileFixture, info, "") 135 | assert.NoError(t, err) 136 | 137 | err = archiver.compress(file) 138 | assert.NoError(t, err) 139 | 140 | assert.False(t, file.Overflowed()) 141 | assert.Equal(t, zip.Deflate, file.Header.Method) 142 | assertMatchingTimes(t, info.ModTime(), file.Header.Modified) 143 | assert.Equal(t, info.Mode(), file.Header.Mode()) 144 | assert.NotZero(t, file.Header.CRC32) 145 | assert.Equal(t, uint64(info.Size()), file.Header.UncompressedSize64) 146 | assert.Equal(t, uint64(file.CompressedData.Len()), file.Header.CompressedSize64) 147 | assert.Equal(t, int64(file.CompressedData.Len()), file.Written()) 148 | assertExtendedTimestamp(t, file.Header.Extra) 149 | }) 
150 | 151 | t.Run("writes a maximum of buffer cap bytes and remainder directly to temp file", func(t *testing.T) { 152 | archive, cleanup := testutils.CreateTempArchive(t, archivePath) 153 | defer cleanup() 154 | 155 | archiver, err := NewArchiver(archive) 156 | assert.NoError(t, err) 157 | 158 | info := testutils.GetFileInfo(t, helloTxtFileFixture) 159 | file, err := pool.NewFile(helloTxtFileFixture, info, "") 160 | assert.NoError(t, err) 161 | bufCap := 5 162 | file.CompressedData = bytes.NewBuffer(make([]byte, 0, bufCap)) 163 | 164 | err = archiver.compress(file) 165 | assert.NoError(t, err) 166 | 167 | assert.Equal(t, file.CompressedData.Len(), bufCap) 168 | assert.True(t, file.Overflowed()) 169 | assertGreaterThan(t, file.Written(), int64(file.CompressedData.Len())) 170 | assert.Equal(t, file.Written(), int64(file.Header.CompressedSize64)) 171 | }) 172 | 173 | t.Run("for directories", func(t *testing.T) { 174 | archive, cleanup := testutils.CreateTempArchive(t, archivePath) 175 | defer cleanup() 176 | 177 | archiver, err := NewArchiver(archive) 178 | assert.NoError(t, err) 179 | 180 | filePath := filepath.Join(helloDirectoryFixture, "nested") 181 | info := testutils.GetFileInfo(t, filePath) 182 | file, err := pool.NewFile(filePath, info, helloDirectoryFixture) 183 | assert.NoError(t, err) 184 | 185 | err = archiver.compress(file) 186 | assert.NoError(t, err) 187 | 188 | assert.Equal(t, "hello/nested/", file.Header.Name) 189 | assert.False(t, file.Overflowed()) 190 | assert.Equal(t, zip.Store, file.Header.Method) 191 | assert.Zero(t, file.Header.CRC32) 192 | assert.Equal(t, 0, file.Header.UncompressedSize64) 193 | assert.Equal(t, 0, file.Header.CompressedSize64) 194 | assert.Equal(t, int64(file.CompressedData.Len()), file.Written()) 195 | }) 196 | } 197 | 198 | func assertExtendedTimestamp(t testing.TB, extraField []byte) { 199 | want := make([]byte, 2) 200 | binary.LittleEndian.PutUint16(want, extendedTimestampTag) 201 | got := extraField[:2] 202 | 
assert.Equal(t, want, got, "expected header to contain extended timestamp") 203 | } 204 | 205 | func assertMatchingTimes(t testing.TB, t1, t2 time.Time) { 206 | t.Helper() 207 | 208 | assert.True(t, 209 | t1.Year() == t2.Year() && t1.YearDay() == t2.YearDay() && t1.Second() == t2.Second(), 210 | fmt.Sprintf("expected %+v to match %+v but didn't", t1, t2)) 211 | } 212 | 213 | func assertGreaterThan(t testing.TB, a, b int64) { 214 | if b >= a { 215 | t.Fatalf("expected %d to be greater than %d", a, b) 216 | } 217 | } 218 | -------------------------------------------------------------------------------- /cli.go: -------------------------------------------------------------------------------- 1 | package pzip 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "os" 7 | ) 8 | 9 | type ArchiverCLI struct { 10 | ArchivePath string 11 | Files []string 12 | Concurrency int 13 | } 14 | 15 | func (a *ArchiverCLI) Archive(ctx context.Context) error { 16 | archive, err := os.Create(a.ArchivePath) 17 | if err != nil { 18 | return fmt.Errorf("create archive at %q: %w", a.ArchivePath, err) 19 | } 20 | defer archive.Close() 21 | 22 | archiver, err := NewArchiver(archive, ArchiverConcurrency(a.Concurrency)) 23 | if err != nil { 24 | return fmt.Errorf("create archiver: %w", err) 25 | } 26 | defer archiver.Close() 27 | 28 | err = archiver.Archive(ctx, a.Files) 29 | if err != nil { 30 | return fmt.Errorf("archive files: %w", err) 31 | } 32 | 33 | return nil 34 | } 35 | 36 | type ExtractorCLI struct { 37 | ArchivePath string 38 | OutputDir string 39 | Concurrency int 40 | } 41 | 42 | func (e *ExtractorCLI) Extract(ctx context.Context) error { 43 | extractor, err := NewExtractor(e.OutputDir, ExtractorConcurrency(e.Concurrency)) 44 | if err != nil { 45 | return fmt.Errorf("new extractor: %w", err) 46 | } 47 | defer extractor.Close() 48 | 49 | if err = extractor.Extract(ctx, e.ArchivePath); err != nil { 50 | return fmt.Errorf("extract %q to %q: %w", e.ArchivePath, e.OutputDir, err) 51 | 52 | } 53 
| 54 | return nil 55 | } 56 | -------------------------------------------------------------------------------- /cli_test.go: -------------------------------------------------------------------------------- 1 | package pzip_test 2 | 3 | import ( 4 | "context" 5 | "os" 6 | "path/filepath" 7 | "runtime" 8 | "testing" 9 | 10 | "github.com/alecthomas/assert/v2" 11 | "github.com/ybirader/pzip" 12 | "github.com/ybirader/pzip/internal/testutils" 13 | ) 14 | 15 | const ( 16 | benchmarkRoot = "testdata/benchmark" 17 | benchmarkDir = "minibench" // modify this to match the file/directory you want to benchmark 18 | benchmarkArchive = "miniextractbench.zip" // modify this to match archive you want to benchmark 19 | testArchiveDirectoryName = "hello" 20 | ) 21 | 22 | func TestArchiverCLI(t *testing.T) { 23 | t.Run("archives a directory and some files", func(t *testing.T) { 24 | files := []string{"testdata/hello", "testdata/hello.txt"} 25 | archivePath := "testdata/archive.zip" 26 | defer os.RemoveAll(archivePath) 27 | 28 | cli := pzip.ArchiverCLI{archivePath, files, runtime.GOMAXPROCS(0)} 29 | err := cli.Archive(context.Background()) 30 | assert.NoError(t, err) 31 | 32 | archiveReader := testutils.GetArchiveReader(t, archivePath) 33 | defer archiveReader.Close() 34 | 35 | assert.Equal(t, 5, len(archiveReader.File)) 36 | }) 37 | } 38 | 39 | func TestExtractorCLI(t *testing.T) { 40 | t.Run("extracts an archive", func(t *testing.T) { 41 | archivePath := "testdata/test.zip" 42 | outputDirPath := "testdata/test" 43 | 44 | err := os.Mkdir(outputDirPath, 0755) 45 | assert.NoError(t, err) 46 | extractedDirPath := filepath.Join(outputDirPath, testArchiveDirectoryName) 47 | defer os.RemoveAll(outputDirPath) 48 | 49 | cli := pzip.ExtractorCLI{archivePath, outputDirPath, runtime.GOMAXPROCS(0)} 50 | err = cli.Extract(context.Background()) 51 | assert.NoError(t, err) 52 | 53 | assert.Equal(t, 3, len(testutils.GetAllFiles(t, extractedDirPath))) 54 | }) 55 | } 56 | 57 | // BenchmarkArchiverCLI 
benchmarks the archiving of a file/directory, referenced by benchmarkDir in the benchmarkRoot directory 58 | func BenchmarkArchiverCLI(b *testing.B) { 59 | outputDirPath := filepath.Join(benchmarkRoot, benchmarkDir) 60 | archivePath := filepath.Join(benchmarkRoot, benchmarkDir+".zip") 61 | 62 | cli := pzip.ArchiverCLI{archivePath, []string{outputDirPath}, runtime.GOMAXPROCS(0)} 63 | 64 | b.ReportAllocs() 65 | b.ResetTimer() 66 | 67 | for i := 0; i < b.N; i++ { 68 | if err := cli.Archive(context.Background()); err != nil { 69 | b.Fatal(err) 70 | } 71 | } 72 | } 73 | 74 | // BenchmarkExtractorCLI benchmarks extracting an archive, referenced by benchmarkArchive 75 | func BenchmarkExtractorCLI(b *testing.B) { 76 | archivePath := filepath.Join(benchmarkRoot, benchmarkArchive) 77 | 78 | cli := pzip.ExtractorCLI{archivePath, benchmarkRoot, runtime.GOMAXPROCS(0)} 79 | 80 | b.ReportAllocs() 81 | b.ResetTimer() 82 | 83 | for i := 0; i < b.N; i++ { 84 | if err := cli.Extract(context.Background()); err != nil { 85 | b.Fatal(err) 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /cmd/punzip/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "flag" 6 | "fmt" 7 | "log" 8 | "os" 9 | "os/signal" 10 | "runtime" 11 | 12 | "github.com/ybirader/pzip" 13 | ) 14 | 15 | const description = "punzip is a tool for extracting files concurrently." 
16 | 17 | func main() { 18 | flag.Usage = func() { 19 | fmt.Fprintln(os.Stderr, description) 20 | fmt.Fprintln(os.Stderr, "\nUsage:") 21 | flag.PrintDefaults() 22 | } 23 | 24 | var concurrency int 25 | var outputDir string 26 | flag.IntVar(&concurrency, "concurrency", runtime.GOMAXPROCS(0), "allow up to n compression routines") 27 | flag.StringVar(&outputDir, "d", ".", "extract files into the specified directory") 28 | 29 | flag.Parse() 30 | 31 | args := flag.Args() 32 | 33 | if len(args) < 1 { 34 | flag.Usage() 35 | return 36 | } 37 | 38 | cli := pzip.ExtractorCLI{ArchivePath: args[0], OutputDir: outputDir, Concurrency: concurrency} 39 | ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt) 40 | go func() { 41 | <-ctx.Done() 42 | stop() 43 | }() 44 | 45 | err := cli.Extract(ctx) 46 | if err != nil { 47 | log.Fatal(err) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /cmd/punzip/punzip_test.go: -------------------------------------------------------------------------------- 1 | package main_test 2 | 3 | import ( 4 | "os/exec" 5 | "path/filepath" 6 | "testing" 7 | 8 | "github.com/alecthomas/assert/v2" 9 | "github.com/ybirader/pzip/adapters/cli" 10 | "github.com/ybirader/pzip/internal/testutils" 11 | "github.com/ybirader/pzip/specifications" 12 | ) 13 | 14 | const ( 15 | testdataRoot = "../../testdata" 16 | archivePath = testdataRoot + "/test.zip" 17 | ) 18 | 19 | func TestPunzip(t *testing.T) { 20 | binPath, cleanup, err := cli.BuildBinary() 21 | if err != nil { 22 | t.Fatal("ERROR: could not build binary", err) 23 | } 24 | t.Cleanup(cleanup) 25 | 26 | t.Run("outputs usage to stderr when no arguments or flags provided", func(t *testing.T) { 27 | pzip := exec.Command(binPath) 28 | out := testutils.GetOutput(t, pzip) 29 | 30 | assert.Contains(t, out, "punzip is a tool for extracting files concurrently.\n") 31 | assert.Contains(t, out, "Usage") 32 | }) 33 | t.Run("extracts an archive", func(t 
*testing.T) { 34 | if testing.Short() { 35 | t.Skip() 36 | } 37 | 38 | absArchivePath, err := filepath.Abs(archivePath) 39 | assert.NoError(t, err) 40 | 41 | driver := cli.NewDriver(binPath, absArchivePath, "") 42 | 43 | specifications.Extract(t, driver) 44 | }) 45 | } 46 | -------------------------------------------------------------------------------- /cmd/pzip/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "flag" 6 | "fmt" 7 | "log" 8 | "os" 9 | "os/signal" 10 | "runtime" 11 | 12 | "github.com/ybirader/pzip" 13 | ) 14 | 15 | const description = "pzip is a tool for archiving files concurrently." 16 | 17 | func main() { 18 | flag.Usage = func() { 19 | fmt.Fprintln(os.Stderr, description) 20 | fmt.Fprintln(os.Stderr, "\nUsage:") 21 | flag.PrintDefaults() 22 | } 23 | 24 | var concurrency int 25 | flag.IntVar(&concurrency, "concurrency", runtime.GOMAXPROCS(0), "allow up to n compression routines") 26 | 27 | flag.Parse() 28 | 29 | args := flag.Args() 30 | 31 | if len(args) < 1 { 32 | flag.Usage() 33 | return 34 | } else if len(args) < 2 { 35 | fmt.Fprintln(os.Stderr, "pzip error: invalid usage") 36 | return 37 | } 38 | 39 | cli := pzip.ArchiverCLI{ArchivePath: args[0], Files: args[1:], Concurrency: concurrency} 40 | ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt) 41 | go func() { 42 | <-ctx.Done() 43 | stop() 44 | }() 45 | 46 | err := cli.Archive(ctx) 47 | if err != nil { 48 | os.RemoveAll(cli.ArchivePath) 49 | log.Fatal(err) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /cmd/pzip/pzip_test.go: -------------------------------------------------------------------------------- 1 | package main_test 2 | 3 | import ( 4 | "os/exec" 5 | "path/filepath" 6 | "testing" 7 | 8 | "github.com/alecthomas/assert/v2" 9 | "github.com/ybirader/pzip/adapters/cli" 10 | "github.com/ybirader/pzip/internal/testutils" 11 | 
"github.com/ybirader/pzip/specifications" 12 | ) 13 | 14 | const ( 15 | testdataRoot = "../../testdata" 16 | archivePath = testdataRoot + "/archive.zip" 17 | dirPath = testdataRoot + "/hello" 18 | ) 19 | 20 | func TestPzip(t *testing.T) { 21 | binPath, cleanup, err := cli.BuildBinary() 22 | if err != nil { 23 | t.Fatal("ERROR: could not build binary", err) 24 | } 25 | t.Cleanup(cleanup) 26 | 27 | t.Run("outputs usage to stderr when no arguments or flags provided", func(t *testing.T) { 28 | pzip := exec.Command(binPath) 29 | out := testutils.GetOutput(t, pzip) 30 | 31 | assert.Contains(t, out, "pzip is a tool for archiving files concurrently.\n") 32 | assert.Contains(t, out, "Usage") 33 | }) 34 | 35 | t.Run("outputs error when only one argument passed", func(t *testing.T) { 36 | pzip := exec.Command(binPath, "archive.zip") 37 | out := testutils.GetOutput(t, pzip) 38 | 39 | assert.Contains(t, out, "pzip error: invalid usage\n") 40 | }) 41 | 42 | t.Run("archives directory", func(t *testing.T) { 43 | if testing.Short() { 44 | t.Skip() 45 | } 46 | 47 | absArchivePath, err := filepath.Abs(archivePath) 48 | if err != nil { 49 | t.Fatalf("ERROR: could not get path to archive %s", archivePath) 50 | } 51 | 52 | absDirPath, err := filepath.Abs(dirPath) 53 | if err != nil { 54 | t.Fatalf("ERROR: could not get path to directory %s", dirPath) 55 | } 56 | 57 | driver := cli.NewDriver(binPath, absArchivePath, absDirPath) 58 | 59 | specifications.Archive(t, driver) 60 | }) 61 | } 62 | -------------------------------------------------------------------------------- /extra.go: -------------------------------------------------------------------------------- 1 | package pzip 2 | 3 | import ( 4 | "encoding/binary" 5 | "time" 6 | ) 7 | 8 | const extendedTimestampTag = 0x5455 9 | 10 | // ExtendedTimeStampExtraField is the extended timestamp field, as defined in the zip specification (See 4.5.3 https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT). 
11 | type ExtendedTimestampExtraField struct { 12 | modified time.Time 13 | } 14 | 15 | func NewExtendedTimestampExtraField(modified time.Time) *ExtendedTimestampExtraField { 16 | return &ExtendedTimestampExtraField{ 17 | modified, 18 | } 19 | } 20 | 21 | // Encode returns the modified time of the associated ExtendedTimestampExtraField as a slice of bytes. 22 | func (e *ExtendedTimestampExtraField) Encode() []byte { 23 | extraBuf := make([]byte, 0, 9) // 2*SizeOf(uint16) + SizeOf(uint) + SizeOf(uint32) 24 | extraBuf = binary.LittleEndian.AppendUint16(extraBuf, extendedTimestampTag) 25 | extraBuf = binary.LittleEndian.AppendUint16(extraBuf, 5) // block size 26 | extraBuf = append(extraBuf, uint8(1)) // flags 27 | extraBuf = binary.LittleEndian.AppendUint32(extraBuf, uint32(e.modified.Unix())) 28 | return extraBuf 29 | } 30 | -------------------------------------------------------------------------------- /extractor.go: -------------------------------------------------------------------------------- 1 | package pzip 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "io" 7 | "os" 8 | "path/filepath" 9 | "runtime" 10 | "strings" 11 | 12 | "github.com/klauspost/compress/zip" 13 | "github.com/ybirader/pzip/pool" 14 | ) 15 | 16 | type extractor struct { 17 | outputDir string 18 | archiveReader *zip.ReadCloser 19 | fileWorkerPool pool.WorkerPool[zip.File] 20 | concurrency int 21 | } 22 | 23 | // NewExtractor returns a new pzip extractor. The extractor can be configured by passing in a number of options. 24 | // Available options include ExtractorConcurrency(n int). 
It returns an error if the extractor can't be created.
60 | func (e *extractor) Extract(ctx context.Context, archivePath string) (err error) { 61 | e.archiveReader, err = zip.OpenReader(archivePath) 62 | if err != nil { 63 | return fmt.Errorf("open archive %q: %w", archivePath, err) 64 | } 65 | 66 | e.fileWorkerPool.Start(ctx) 67 | 68 | for _, file := range e.archiveReader.File { 69 | e.fileWorkerPool.Enqueue(file) 70 | } 71 | 72 | if err = e.fileWorkerPool.Close(); err != nil { 73 | return fmt.Errorf("close file worker pool: %w", err) 74 | } 75 | 76 | return nil 77 | } 78 | 79 | func (e *extractor) Close() error { 80 | if err := e.archiveReader.Close(); err != nil { 81 | return fmt.Errorf("close archive reader: %w", err) 82 | } 83 | 84 | return nil 85 | } 86 | 87 | func (e *extractor) extractFile(file *zip.File) (err error) { 88 | outputPath := e.outputPath(file.Name) 89 | 90 | dir := filepath.Dir(outputPath) 91 | if err = os.MkdirAll(dir, 0755); err != nil { 92 | return fmt.Errorf("create directory %q: %w", dir, err) 93 | } 94 | 95 | if e.isDir(file.Name) { 96 | if err = e.writeDir(outputPath, file); err != nil { 97 | return fmt.Errorf("write directory %q: %w", file.Name, err) 98 | } 99 | return nil 100 | } 101 | 102 | if err = e.writeFile(outputPath, file); err != nil { 103 | return fmt.Errorf("write file %q: %w", file.Name, err) 104 | } 105 | 106 | return nil 107 | } 108 | 109 | func (e *extractor) writeDir(outputPath string, file *zip.File) error { 110 | err := os.Mkdir(outputPath, file.Mode()) 111 | if os.IsExist(err) { 112 | if err = os.Chmod(outputPath, file.Mode()); err != nil { 113 | return fmt.Errorf("chmod directory %q: %w", outputPath, err) 114 | } 115 | } else if err != nil { 116 | return fmt.Errorf("create directory %q: %w", outputPath, err) 117 | } 118 | 119 | return nil 120 | } 121 | 122 | func (e *extractor) writeFile(outputPath string, file *zip.File) (err error) { 123 | outputFile, err := os.OpenFile(outputPath, os.O_CREATE|os.O_WRONLY, file.Mode()) 124 | if err != nil { 125 | return 
fmt.Errorf("create file %q: %w", outputPath, err) 126 | } 127 | defer func() { 128 | if cerr := outputFile.Close(); cerr != nil && err == nil { 129 | err = fmt.Errorf("close output file %q: %w", outputPath, cerr) 130 | } 131 | }() 132 | 133 | srcFile, err := file.Open() 134 | if err != nil { 135 | return fmt.Errorf("open file %q: %w", file.Name, err) 136 | } 137 | defer func() { 138 | if cerr := srcFile.Close(); cerr != nil && err == nil { 139 | err = fmt.Errorf("close source file %q: %w", file.Name, cerr) 140 | } 141 | }() 142 | 143 | if _, err = io.Copy(outputFile, srcFile); err != nil { 144 | return fmt.Errorf("decompress file %q: %w", file.Name, err) 145 | } 146 | 147 | return nil 148 | } 149 | 150 | func (e *extractor) isDir(name string) bool { 151 | return strings.HasSuffix(filepath.ToSlash(name), "/") 152 | } 153 | 154 | func (e *extractor) outputPath(name string) string { 155 | return filepath.Join(e.outputDir, name) 156 | } 157 | -------------------------------------------------------------------------------- /extractor_options.go: -------------------------------------------------------------------------------- 1 | package pzip 2 | 3 | import "fmt" 4 | 5 | type extractorOption func(*extractor) error 6 | 7 | // ExtractorConcurrency sets the number of goroutines used during extraction 8 | // An error is returned if n is less than 1. 
9 | func ExtractorConcurrency(n int) extractorOption { 10 | return func(e *extractor) error { 11 | if n < minConcurrency { 12 | return fmt.Errorf("concurrency %d not greater than zero", n) 13 | } 14 | 15 | e.concurrency = n 16 | return nil 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /extractor_test.go: -------------------------------------------------------------------------------- 1 | package pzip 2 | 3 | import ( 4 | "context" 5 | "io/fs" 6 | "os" 7 | "path/filepath" 8 | "testing" 9 | 10 | "github.com/alecthomas/assert/v2" 11 | "github.com/ybirader/pzip/internal/testutils" 12 | ) 13 | 14 | const ( 15 | testArchiveFixture = testdataRoot + "test.zip" // test.zip fixture is an archive of the helloDirectory fixture 16 | outputDirPath = testdataRoot + "test" 17 | ) 18 | 19 | func TestExtract(t *testing.T) { 20 | t.Run("writes decompressed archive files to output directory", func(t *testing.T) { 21 | err := os.Mkdir(outputDirPath, 0755) 22 | assert.NoError(t, err) 23 | defer os.RemoveAll(outputDirPath) 24 | 25 | extractor, err := NewExtractor(outputDirPath) 26 | assert.NoError(t, err) 27 | defer extractor.Close() 28 | 29 | err = extractor.Extract(context.Background(), testArchiveFixture) 30 | assert.NoError(t, err) 31 | 32 | files := testutils.GetAllFiles(t, filepath.Join(outputDirPath, "hello")) 33 | assert.Equal(t, []string{"hello.txt", "nested", "hello.md"}, testutils.Map(files, func(element fs.FileInfo) string { 34 | return element.Name() 35 | })) 36 | 37 | helloFileInfo := files[0] 38 | assert.NotZero(t, helloFileInfo.Size()) 39 | }) 40 | } 41 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/ybirader/pzip 2 | 3 | go 1.21 4 | 5 | require ( 6 | github.com/alecthomas/assert/v2 v2.3.0 7 | github.com/klauspost/compress v1.16.7 8 | golang.org/x/sync v0.3.0 9 | ) 10 | 11 | 
require ( 12 | github.com/alecthomas/repr v0.2.0 // indirect 13 | github.com/hexops/gotextdiff v1.0.3 // indirect 14 | ) 15 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/alecthomas/assert/v2 v2.3.0 h1:mAsH2wmvjsuvyBvAmCtm7zFsBlb8mIHx5ySLVdDZXL0= 2 | github.com/alecthomas/assert/v2 v2.3.0/go.mod h1:pXcQ2Asjp247dahGEmsZ6ru0UVwnkhktn7S0bBDLxvQ= 3 | github.com/alecthomas/repr v0.2.0 h1:HAzS41CIzNW5syS8Mf9UwXhNH1J9aix/BvDRf1Ml2Yk= 4 | github.com/alecthomas/repr v0.2.0/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4= 5 | github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= 6 | github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg= 7 | github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= 8 | github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= 9 | golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E= 10 | golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= 11 | -------------------------------------------------------------------------------- /internal/testutils/archiver.go: -------------------------------------------------------------------------------- 1 | package testutils 2 | 3 | import ( 4 | "archive/zip" 5 | "fmt" 6 | "io/fs" 7 | "os" 8 | "os/exec" 9 | "path/filepath" 10 | "testing" 11 | 12 | "github.com/alecthomas/assert/v2" 13 | ) 14 | 15 | func CreateTempArchive(t testing.TB, name string) (*os.File, func()) { 16 | t.Helper() 17 | 18 | archive, err := os.Create(name) 19 | assert.NoError(t, err, fmt.Sprintf("could not create archive %s: %v", name, err)) 20 | 21 | cleanup := func() { 22 | archive.Close() 23 | os.RemoveAll(archive.Name()) 24 | } 25 | 26 | return archive, cleanup 27 | } 28 | 29 | func GetFileInfo(t testing.TB, name string) 
// Map applies cb to each element of elements and returns the transformed
// values, preserving order. A nil or empty input yields an empty slice.
func Map[T, K any](elements []T, cb func(element T) K) []K {
	out := make([]K, 0, len(elements))
	for _, e := range elements {
		out = append(out, cb(e))
	}

	return out
}
-------------------------------------------------------------------------------- 1 | package pool 2 | 3 | import ( 4 | "archive/zip" 5 | "bytes" 6 | "fmt" 7 | "io/fs" 8 | "os" 9 | "path/filepath" 10 | "sync" 11 | 12 | "github.com/klauspost/compress/flate" 13 | ) 14 | 15 | const DefaultBufferSize = 2 * 1024 * 1024 16 | 17 | var FilePool = sync.Pool{ 18 | New: func() any { 19 | return &File{CompressedData: bytes.NewBuffer(make([]byte, DefaultBufferSize))} 20 | }, 21 | } 22 | 23 | // A File refers to a file-backed buffer 24 | type File struct { 25 | Info fs.FileInfo 26 | Header *zip.FileHeader 27 | CompressedData *bytes.Buffer 28 | Overflow *os.File 29 | Compressor *flate.Writer 30 | Path string 31 | written int64 32 | } 33 | 34 | func NewFile(path string, info fs.FileInfo, relativeTo string) (*File, error) { 35 | f := FilePool.Get().(*File) 36 | err := f.Reset(path, info, relativeTo) 37 | return f, err 38 | } 39 | 40 | // Reset resets the file-backed buffer ready to be used by another file. 
41 | func (f *File) Reset(path string, info fs.FileInfo, relativeTo string) error { 42 | hdr, err := zip.FileInfoHeader(info) 43 | if err != nil { 44 | return fmt.Errorf("file info header for %q: %w", path, err) 45 | } 46 | f.Path = path 47 | f.Info = info 48 | f.Header = hdr 49 | f.CompressedData.Reset() 50 | f.Overflow = nil 51 | f.written = 0 52 | 53 | if f.Compressor == nil { 54 | f.Compressor, err = flate.NewWriter(f, flate.DefaultCompression) 55 | if err != nil { 56 | return fmt.Errorf("new compressor: %w", err) 57 | } 58 | } else { 59 | f.Compressor.Reset(f) 60 | } 61 | 62 | if relativeTo != "" { 63 | if err := f.setNameRelativeTo(relativeTo); err != nil { 64 | return fmt.Errorf("set name relative to %q: %w", relativeTo, err) 65 | } 66 | } 67 | 68 | return nil 69 | } 70 | 71 | func (f *File) Write(p []byte) (n int, err error) { 72 | if f.CompressedData.Available() != 0 { 73 | maxWriteable := min(f.CompressedData.Available(), len(p)) 74 | f.written += int64(maxWriteable) 75 | f.CompressedData.Write(p[:maxWriteable]) 76 | p = p[maxWriteable:] 77 | } 78 | 79 | if len(p) > 0 { 80 | if f.Overflow == nil { 81 | if f.Overflow, err = os.CreateTemp("", "pzip-overflow"); err != nil { 82 | return len(p), fmt.Errorf("create temporary file: %w", err) 83 | } 84 | } 85 | 86 | if _, err := f.Overflow.Write(p); err != nil { 87 | return len(p), fmt.Errorf("write temporary file for %q: %w", f.Header.Name, err) 88 | } 89 | f.written += int64(len(p)) 90 | } 91 | 92 | return len(p), nil 93 | } 94 | 95 | // Written returns the number of bytes of the file compressed and written to a destination 96 | func (f *File) Written() int64 { 97 | return f.written 98 | } 99 | 100 | // Overflowed returns true if the compressed contents of the file was too large to fit in the in-memory buffer. 101 | // The overflowed contents are written to a temporary file. 
102 | func (f *File) Overflowed() bool { 103 | return f.Overflow != nil 104 | } 105 | 106 | func (f *File) setNameRelativeTo(root string) error { 107 | relativeToRoot, err := filepath.Rel(root, f.Path) 108 | if err != nil { 109 | return fmt.Errorf("relative path of %q to root %q: %w", f.Path, root, err) 110 | } 111 | f.Header.Name = filepath.ToSlash(filepath.Join(filepath.Base(root), relativeToRoot)) 112 | return nil 113 | } 114 | -------------------------------------------------------------------------------- /pool/file_test.go: -------------------------------------------------------------------------------- 1 | package pool_test 2 | 3 | import ( 4 | "path/filepath" 5 | "testing" 6 | 7 | "github.com/alecthomas/assert/v2" 8 | "github.com/ybirader/pzip/internal/testutils" 9 | "github.com/ybirader/pzip/pool" 10 | ) 11 | 12 | const ( 13 | testdataRoot = "../testdata/" 14 | archivePath = testdataRoot + "archive.zip" 15 | helloTxtFileFixture = testdataRoot + "hello.txt" 16 | helloMarkdownFileFixture = testdataRoot + "hello.md" 17 | helloDirectoryFixture = testdataRoot + "hello/" 18 | ) 19 | 20 | func TestNewFile(t *testing.T) { 21 | t.Run("with file name relative to archive root when file path is relative", func(t *testing.T) { 22 | info := testutils.GetFileInfo(t, helloTxtFileFixture) 23 | file, err := pool.NewFile(helloTxtFileFixture, info, "") 24 | assert.NoError(t, err) 25 | 26 | assert.Equal(t, "hello.txt", file.Header.Name) 27 | }) 28 | 29 | t.Run("with file name relative to archive root when file path is absolute", func(t *testing.T) { 30 | absFilePath, err := filepath.Abs(helloTxtFileFixture) 31 | assert.NoError(t, err) 32 | info := testutils.GetFileInfo(t, absFilePath) 33 | file, err := pool.NewFile(absFilePath, info, "") 34 | assert.NoError(t, err) 35 | 36 | assert.Equal(t, "hello.txt", file.Header.Name) 37 | }) 38 | 39 | t.Run("with file name relative to archive root for directories", func(t *testing.T) { 40 | filePath := filepath.Join(helloDirectoryFixture, 
// Config holds the tunable parameters of a FileWorkerPool.
type Config struct {
	// Concurrency is the number of worker goroutines; NewFileWorkerPool
	// rejects values below one.
	Concurrency int
	// Capacity is the buffer size of the task queue, i.e. how many files
	// may be enqueued before Enqueue blocks.
	Capacity int
}
22 | type FileWorkerPool[T any] struct { 23 | tasks chan *T 24 | executor func(f *T) error 25 | g *errgroup.Group 26 | ctxCancel func(error) 27 | concurrency int 28 | capacity int 29 | } 30 | 31 | func NewFileWorkerPool[T any](executor func(f *T) error, config *Config) (*FileWorkerPool[T], error) { 32 | if config.Concurrency < minConcurrency { 33 | return nil, fmt.Errorf("concurrency %d not greater than zero", config.Concurrency) 34 | } 35 | 36 | return &FileWorkerPool[T]{ 37 | tasks: make(chan *T, config.Capacity), 38 | executor: executor, 39 | g: new(errgroup.Group), 40 | concurrency: config.Concurrency, 41 | capacity: config.Capacity, 42 | }, nil 43 | } 44 | 45 | // Start creates n goroutine workers, where n can be configured by setting 46 | // the concurrency option of the FileWorkerPool. The workers listen and execute tasks 47 | // as they are enqueued. The workers are shut down when an error occurs or the associated 48 | // ctx is canceled. 49 | func (f *FileWorkerPool[T]) Start(ctx context.Context) { 50 | f.reset() 51 | 52 | ctx, cancel := context.WithCancelCause(ctx) 53 | f.ctxCancel = cancel 54 | 55 | for i := 0; i < f.concurrency; i++ { 56 | f.g.Go(func() error { 57 | if err := f.listen(ctx); err != nil { 58 | f.ctxCancel(err) 59 | return err 60 | } 61 | 62 | return nil 63 | }) 64 | } 65 | } 66 | 67 | // Enqueue enqueues a file for processing 68 | func (f *FileWorkerPool[T]) Enqueue(file *T) { 69 | f.tasks <- file 70 | } 71 | 72 | // PendingFiles returns the number of tasks that are waiting to be processed 73 | func (f FileWorkerPool[T]) PendingFiles() int { 74 | return len(f.tasks) 75 | } 76 | 77 | // Close gracefully shuts down the FileWorkerPool, ensuring all enqueued tasks have been processed. 78 | // Files cannot be enqueued after Close has been called; attempting this will cause a panic. 79 | // Close returns the first error that was encountered during file processing. 
80 | func (f *FileWorkerPool[T]) Close() error { 81 | close(f.tasks) 82 | err := f.g.Wait() 83 | f.ctxCancel(err) 84 | return err 85 | } 86 | 87 | func (f *FileWorkerPool[T]) listen(ctx context.Context) error { 88 | for file := range f.tasks { 89 | if err := f.executor(file); err != nil { 90 | return fmt.Errorf("process file: %w", err) 91 | } else if err := ctx.Err(); err != nil { 92 | return err 93 | } 94 | } 95 | 96 | return nil 97 | } 98 | 99 | func (f *FileWorkerPool[T]) reset() { 100 | f.tasks = make(chan *T, f.capacity) 101 | } 102 | -------------------------------------------------------------------------------- /pool/file_worker_pool_test.go: -------------------------------------------------------------------------------- 1 | package pool_test 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "errors" 7 | "testing" 8 | 9 | "github.com/alecthomas/assert/v2" 10 | "github.com/ybirader/pzip/pool" 11 | ) 12 | 13 | func TestFileWorkerPool(t *testing.T) { 14 | t.Run("can enqueue tasks", func(t *testing.T) { 15 | fileProcessPool, err := pool.NewFileWorkerPool(func(f *pool.File) error { return nil }, &pool.Config{Concurrency: 1, Capacity: 1}) 16 | assert.NoError(t, err) 17 | fileProcessPool.Start(context.Background()) 18 | 19 | fileProcessPool.Enqueue(&pool.File{}) 20 | 21 | assert.Equal(t, 1, fileProcessPool.PendingFiles()) 22 | }) 23 | 24 | t.Run("has workers process files to completion", func(t *testing.T) { 25 | output := bytes.Buffer{} 26 | executor := func(_ *pool.File) error { 27 | output.WriteString("hello, world!") 28 | return nil 29 | } 30 | 31 | fileProcessPool, err := pool.NewFileWorkerPool(executor, &pool.Config{Concurrency: 1, Capacity: 1}) 32 | assert.NoError(t, err) 33 | fileProcessPool.Start(context.Background()) 34 | 35 | fileProcessPool.Enqueue(&pool.File{}) 36 | 37 | err = fileProcessPool.Close() 38 | 39 | assert.NoError(t, err) 40 | assert.Equal(t, 0, fileProcessPool.PendingFiles()) 41 | assert.Equal(t, "hello, world!", output.String()) 42 | }) 43 
| 44 | t.Run("returns an error if number of workers is less than one", func(t *testing.T) { 45 | executor := func(_ *pool.File) error { return nil } 46 | 47 | _, err := pool.NewFileWorkerPool(executor, &pool.Config{Concurrency: 0, Capacity: 1}) 48 | assert.Error(t, err) 49 | }) 50 | 51 | t.Run("can be closed and restarted", func(t *testing.T) { 52 | output := bytes.Buffer{} 53 | executor := func(_ *pool.File) error { 54 | output.WriteString("hello ") 55 | return nil 56 | } 57 | 58 | fileProcessPool, err := pool.NewFileWorkerPool(executor, &pool.Config{Concurrency: 1, Capacity: 1}) 59 | assert.NoError(t, err) 60 | 61 | fileProcessPool.Start(context.Background()) 62 | fileProcessPool.Enqueue(&pool.File{}) 63 | err = fileProcessPool.Close() 64 | assert.NoError(t, err) 65 | 66 | fileProcessPool.Start(context.Background()) 67 | fileProcessPool.Enqueue(&pool.File{}) 68 | err = fileProcessPool.Close() 69 | 70 | assert.NoError(t, err) 71 | assert.Equal(t, "hello hello ", output.String()) 72 | }) 73 | 74 | t.Run("stops workers with first error encountered by a goroutine", func(t *testing.T) { 75 | executor := func(file *pool.File) error { 76 | if file.Path == "1" { 77 | return errors.New("file is corrupt") 78 | } 79 | 80 | return nil 81 | } 82 | 83 | fileProcessPool, err := pool.NewFileWorkerPool(executor, &pool.Config{Concurrency: 2, Capacity: 1}) 84 | assert.NoError(t, err) 85 | 86 | fileProcessPool.Start(context.Background()) 87 | 88 | fileProcessPool.Enqueue(&pool.File{}) 89 | fileProcessPool.Enqueue(&pool.File{}) 90 | fileProcessPool.Enqueue(&pool.File{Path: "1"}) 91 | 92 | err = fileProcessPool.Close() 93 | 94 | assert.Error(t, err) 95 | }) 96 | } 97 | -------------------------------------------------------------------------------- /pool/worker_pool.go: -------------------------------------------------------------------------------- 1 | package pool 2 | 3 | import "context" 4 | 5 | type WorkerPool[T any] interface { 6 | Start(ctx context.Context) 7 | Close() error 8 | 
Enqueue(v *T) 9 | } 10 | -------------------------------------------------------------------------------- /specifications/archive.go: -------------------------------------------------------------------------------- 1 | package specifications 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "os/exec" 7 | "path/filepath" 8 | "testing" 9 | 10 | "github.com/alecthomas/assert/v2" 11 | ) 12 | 13 | type Archiver interface { 14 | ArchivePath() string 15 | DirPath() string 16 | Archive() 17 | } 18 | 19 | func Archive(t *testing.T, driver Archiver) { 20 | driver.Archive() 21 | defer os.RemoveAll(driver.ArchivePath()) 22 | 23 | assertValidArchive(t, driver.ArchivePath(), driver.DirPath()) 24 | } 25 | 26 | func assertValidArchive(t testing.TB, archivePath, dirPath string) { 27 | t.Helper() 28 | 29 | tmpDirPath, err := os.MkdirTemp("", "unzipped-archive") 30 | if err != nil { 31 | t.Fatal("ERROR: could not create temp directory", err) 32 | } 33 | defer os.RemoveAll(tmpDirPath) 34 | 35 | unzip := exec.Command("unzip", archivePath, "-d", tmpDirPath) 36 | unzipOutput, err := unzip.CombinedOutput() 37 | if err != nil { 38 | t.Fatalf("ERROR: could not unzip archive %s: %s: %v", archivePath, unzipOutput, err) 39 | } 40 | 41 | diff := exec.Command("diff", "--recursive", "--brief", dirPath, filepath.Join(tmpDirPath, filepath.Base(dirPath))) 42 | diffOutput, err := diff.Output() 43 | if err != nil { 44 | t.Fatal("ERROR: could not get stdout of diff", err) 45 | } 46 | 47 | assert.Zero(t, len(diffOutput), fmt.Sprintf("expected no output from diff but got %s", diffOutput)) 48 | } 49 | -------------------------------------------------------------------------------- /specifications/extract.go: -------------------------------------------------------------------------------- 1 | package specifications 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | "testing" 7 | ) 8 | 9 | const testArchiveDirectoryName = "hello" 10 | 11 | type Extractor interface { 12 | DirPath() string 13 | ArchivePath() string 14 | 
Extract() 15 | } 16 | 17 | func Extract(t *testing.T, driver Extractor) { 18 | driver.Extract() 19 | dirPath := filepath.Join(driver.DirPath(), testArchiveDirectoryName) 20 | defer os.RemoveAll(dirPath) 21 | 22 | assertValidArchive(t, driver.ArchivePath(), dirPath) 23 | } 24 | -------------------------------------------------------------------------------- /testdata/hello.md: -------------------------------------------------------------------------------- 1 | This is a second file that needs archiving 2 | -------------------------------------------------------------------------------- /testdata/hello.txt: -------------------------------------------------------------------------------- 1 | hello, world! 2 | -------------------------------------------------------------------------------- /testdata/hello/hello.txt: -------------------------------------------------------------------------------- 1 | This is a file at the top-level of the test directory 2 | -------------------------------------------------------------------------------- /testdata/hello/nested/hello.md: -------------------------------------------------------------------------------- 1 | This is a nested file within the test directory tree. 2 | -------------------------------------------------------------------------------- /testdata/test.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ybirader/pzip/9eb13490a5a50cc90eb522783ce372a55bcd5196/testdata/test.zip --------------------------------------------------------------------------------