├── .cargo └── config.toml ├── .gitattributes ├── .github ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE │ ├── bug_report_bin.md │ └── bug_report_lib.md ├── SUPPORT.md ├── pull_request_template.md └── workflows │ ├── build.yml │ └── test.yml ├── .gitignore ├── .readthedocs.yaml ├── .vscode └── settings.json ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE.txt ├── README.md ├── doc ├── Makefile ├── README.md ├── cli_reference.md ├── compiling.md ├── conf.py ├── downloads.md ├── export_import.md ├── index.md ├── install.md ├── install_manual.md ├── integration.md ├── intro_to_cli.md ├── make.bat ├── requirements.txt ├── setup.md └── usage_examples.md ├── examples ├── README.md ├── c-sharp │ ├── .gitignore │ ├── .vscode │ │ └── settings.json │ ├── WarcatExample.sln │ ├── WarcatExample │ │ ├── Decode.cs │ │ ├── Encode.cs │ │ ├── Message.cs │ │ ├── Program.cs │ │ └── WarcatExample.csproj │ └── example.warc ├── decode.rs ├── encode.rs ├── example.warc └── python │ ├── decode.py │ ├── encode.py │ └── message.py ├── misc └── release_digests │ ├── README.md │ ├── keys.toml │ ├── v0.1.0.toml │ ├── v0.2.0.toml │ ├── v0.3.0.toml │ ├── v0.3.1.toml │ ├── v0.3.2.toml │ ├── v0.3.3.toml │ └── v0.3.4.toml ├── roadmap.md ├── src ├── app.rs ├── app │ ├── arg.rs │ ├── common.rs │ ├── dump_help.rs │ ├── export.rs │ ├── extract.rs │ ├── filter.rs │ ├── format.rs │ ├── get.rs │ ├── import.rs │ ├── io.rs │ ├── list.rs │ ├── logging.rs │ ├── model.rs │ ├── progress.rs │ ├── self_.rs │ └── verify.rs ├── compress.rs ├── compress │ ├── decode.rs │ ├── encode.rs │ ├── zstd.rs │ └── zstd │ │ ├── decode.rs │ │ └── encode.rs ├── dataseq.rs ├── digest.rs ├── error.rs ├── extract.rs ├── fields.rs ├── fields │ ├── de.rs │ └── ser.rs ├── header.rs ├── header │ └── fields.rs ├── http.rs ├── http │ ├── h1.rs │ └── h1 │ │ ├── codec.rs │ │ ├── codec │ │ ├── chunked.rs │ │ └── compress.rs │ │ ├── error.rs │ │ ├── header.rs │ │ ├── header │ │ ├── fields.rs │ │ └── parse.rs │ │ ├── recv.rs │ │ └── send.rs ├── io.rs ├── lib.rs ├── main.rs ├── parse.rs ├── parse │ ├── fields.rs │ ├── fields_str.rs │ ├── header_deliminator.rs │ └── warc.rs ├── util.rs ├── verify.rs ├── warc.rs └── warc │ ├── decode.rs │ └── encode.rs ├── tests ├── test_decode.rs └── warc_generator.rs └── xtask ├── Cargo.toml ├── README.md └── src ├── digest.rs ├── dist_license.txt ├── dist_readme.txt ├── doc.rs ├── gh.rs ├── license.rs ├── main.rs └── package.rs /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [alias] 2 | xtask = "run --release --package xtask --" 3 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.warc binary -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | * If you encounter a bug and want to report it, please visit the [*Issues*](https://github.com/chfoo/warcat-rs/issues) page. Try searching if the problem already exists to help avoiding duplicate reports. When reporting bugs, try to fill out as much as the template as possible. 4 | * If there is something limiting the functionality of the software/library and you have details on greatly improving it, file a feature request in *Issues* page as well. 
5 | * If you want to contribute bug fixes, documentation, tests, or examples, please feel free to submit a Pull Request. If you want to submit a feature and are unsure whether it is useful, feel free to file an Issue first. 6 | * If you need help using Warcat, brainstorming ideas, or want to have a general discussion, please use the [*Discussions*](https://github.com/chfoo/warcat-rs/discussions) page instead. Keeping the Issues page on-topic will help keep it organized. 7 | 8 | ## Style guide 9 | 10 | * Please configure your IDE to use [Rustfmt](https://github.com/rust-lang/rustfmt). This is the code formatting style used by the project. 11 | * Also configure your IDE to use [Clippy](https://github.com/rust-lang/rust-clippy). This is optional but recommended. 12 | * Important: CLI code is put under the `bin` feature, which is not enabled by default. (This is a workaround to keep the library crate lightweight.) You need to configure your IDE/Clippy to enable the `bin` feature. 13 | * There is an inadvertent mix of line endings (CRLF/LF). For old files, please keep them as is for now. For new files, use LF. -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report_bin.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report (application) 3 | about: Create a bug report for the CLI application 4 | title: "[✏️Put a short title here, something error when doing something]" 5 | labels: 6 | assignees: 7 | --- 8 | 9 | **Describe the bug** 10 | 11 | [✏️A clear and concise description of what the bug is.] 12 | 13 | **To Reproduce** 14 | 15 | Program arguments: 16 | 17 | ``` 18 | [✏️e.g. warcat extract --input my_warc_file.warc.gz --output workspace/the_data/] 19 | ``` 20 | 21 | Steps to reproduce the behavior: 22 | 23 | 1. [✏️Remove this list if not applicable.] 24 | 2. ... 25 | 3. ... 26 | 27 | **Expected behavior** 28 | 29 | [✏️A clear and concise description of what you expected to happen.] 30 | 31 | **Screenshots/Logs** 32 | 33 | [✏️If applicable, attach sample files, screenshots, or log files to help explain your problem. Otherwise, delete this section.] 34 | 35 | **System** 36 | 37 | - OS: [✏️e.g. Windows 11, macOS 15, Ubuntu 24.04] 38 | - Terminal: [✏️e.g. Windows Console, Windows Terminal, macOS Terminal, GNOME Console, Konsole] 39 | - Program Version (Check with `--version`): [✏️e.g. 1.0.0.] 40 | 41 | **Additional context** 42 | 43 | [✏️Add any other context about the problem here or delete this section.] 44 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report_lib.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report (library) 3 | about: Create a bug report for the library 4 | title: "[✏️Put a short title here, something error when calling something]" 5 | labels: 6 | assignees: 7 | --- 8 | 9 | **Describe the bug** 10 | 11 | [✏️A clear and concise description of what the bug is.] 12 | 13 | **To Reproduce** 14 | 15 | ``` 16 | [✏️Sample code here that reproduces the behavior or attach a sample program.] 17 | ``` 18 | 19 | **Expected behavior** 20 | 21 | [✏️A clear and concise description of what you expected to happen.] 22 | 23 | **Screenshots/Logs** 24 | 25 | [✏️If applicable, attach screenshots or log files to help explain your problem. Otherwise, delete this section.] 26 | 27 | **System** 28 | 29 | - OS: [✏️e.g. 
Windows 11, macOS 15, Ubuntu 24.04] 30 | - Rust Version (Check with `rustc --version`): [✏️e.g. 1.80.0] 31 | - Crate Version (Check with `cargo tree --package warcat --depth 0`): [✏️e.g. 1.0.0] 32 | 33 | **Additional context** 34 | 35 | [✏️Add any other context about the problem here or delete this section. ] 36 | -------------------------------------------------------------------------------- /.github/SUPPORT.md: -------------------------------------------------------------------------------- 1 | # Support 2 | 3 | * If you need help with using Warcat, you can ask on the [*Discussions*](https://github.com/chfoo/warcat-rs/discussions) page. 4 | * If you want to file a bug report, use the [*Issues*](https://github.com/chfoo/warcat-rs/issues) page. 5 | * Please note that I'm not always available to provide help. Check out alternative places to ask questions in the section below. 6 | 7 | ## Alternative forums 8 | 9 | * For chatting about archiving in general, join [#archiveteam-bs](ircs://irc.hackint.org:6697/archiveteam-bs) on Hackint ([details](https://wiki.archiveteam.org/index.php/Archiveteam:IRC)). Note that this is a chat room; you may not receive an instant response due to time zones. 10 | * Alternatively, ask on the [ArchiveTeam Reddit](https://www.reddit.com/r/Archiveteam/). Note that this is a low-traffic forum. 11 | * For help on how to use command line programs or software in general, try searching or asking on [Super User](https://superuser.com/). 12 | * For help on programming software in general, try searching or asking on [Stack Overflow](https://stackoverflow.com/). -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | **Summary:** 2 | 3 | [✏️A description of the changes proposed in the pull request.] 4 | 5 | **Related issues:** 6 | 7 | [✏️Any references to related issues if applicable. Otherwise, delete this section.] 8 | 9 | **Other:** 10 | 11 | [✏️Comments on whether your PR needs further testing, has working tests, is part of a Hacktoberfest, etc. Otherwise, delete this section.] 
-------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build and package 2 | 3 | on: workflow_dispatch 4 | 5 | env: 6 | CARGO_TERM_COLOR: always 7 | 8 | jobs: 9 | build: 10 | strategy: 11 | fail-fast: false 12 | matrix: 13 | params: 14 | - os: ubuntu-22.04 15 | target: x86_64-unknown-linux-musl 16 | apt-install: musl-tools 17 | use-cross-rs: false 18 | - os: ubuntu-22.04 19 | target: aarch64-unknown-linux-musl 20 | apt-install: musl-tools 21 | use-cross-rs: true 22 | - os: windows-2022 23 | target: x86_64-pc-windows-msvc 24 | use-cross-rs: false 25 | - os: windows-2022 26 | target: aarch64-pc-windows-msvc 27 | use-cross-rs: false 28 | - os: macos-14 29 | target: x86_64-apple-darwin 30 | use-cross-rs: false 31 | - os: macos-14 32 | target: aarch64-apple-darwin 33 | use-cross-rs: false 34 | 35 | runs-on: ${{ matrix.params.os }} 36 | steps: 37 | - uses: kaven-universe/github-action-current-date-time@v1.4.0 38 | name: Current date time 39 | id: datetime 40 | with: 41 | format: YYYYMMDD_HHmmss 42 | - uses: imesense/gha-echo-action@v0.2 43 | name: Debug info 44 | with: 45 | input-string: | 46 | OS: ${{ matrix.params.os }} 47 | Target: ${{ matrix.params.target }} 48 | Date: ${{ steps.datetime.outputs.time }} 49 | - name: Install packages (apt) 50 | if: ${{ matrix.params.apt-install }} 51 | run: sudo apt-get -y install ${{ matrix.params.apt-install }} 52 | - uses: actions/checkout@v4 53 | - uses: Swatinem/rust-cache@v2 54 | with: 55 | key: ${{ matrix.params.os }}.${{ matrix.params.target }} 56 | - name: Install target 57 | run: rustup target add ${{ matrix.params.target }} 58 | - name: Run release build 59 | if: ${{ !matrix.params.use-cross-rs }} 60 | run: cargo build --features=bin --release --verbose --target ${{ matrix.params.target }} 61 | - name: Run cross release build 62 | if: ${{ matrix.params.use-cross-rs }} 63 | uses: houseabsolute/actions-rust-cross@v1.0.4 64 | with: 65 | command: build 66 | args: "--features=bin --release --verbose" 67 | target: ${{ matrix.params.target }} 68 | cross-version: 51f46f296253d8122c927c5bb933e3c4f27cc317 69 | - name: Package binary 70 | run: cargo xtask package-bin ${{ matrix.params.target }} 71 | - uses: actions/upload-artifact@v4 72 | name: Save artifact 73 | with: 74 | name: artifact.${{ matrix.params.target }}.${{ steps.datetime.outputs.time }} 75 | if-no-files-found: error 76 | path: | 77 | target/xtask-package-bin-output/* -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Cargo test 2 | 3 | on: 4 | push: 5 | branches: [ "main", "gh" ] 6 | pull_request: 7 | branches: [ "main", "gh" ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | test: 14 | name: Test on latest Ubuntu 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | - uses: Swatinem/rust-cache@v2 19 | - name: Run tests 20 | run: cargo test --verbose --features=bin 21 | lint: 22 | name: Lint check on latest Ubuntu 23 | runs-on: ubuntu-latest 24 | steps: 25 | - uses: actions/checkout@v4 26 | - uses: Swatinem/rust-cache@v2 27 | - name: Run clippy 28 | run: cargo clippy --verbose --features=bin 29 | - name: Make annotation 30 | run: if ! 
cargo clippy --quiet --features=bin -- -D warnings; then echo "::warning::Lint check failed"; fi 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Rust output 2 | /target/ 3 | 4 | # Sphinx output 5 | /doc/_build/ 6 | 7 | # Python output 8 | __pycache__ -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file for Sphinx projects 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the OS, Python version and other tools you might need 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.12" 12 | # You can also specify other tool versions: 13 | # nodejs: "20" 14 | # rust: "1.70" 15 | # golang: "1.20" 16 | 17 | # Build documentation in the "docs/" directory with Sphinx 18 | sphinx: 19 | configuration: doc/conf.py 20 | # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs 21 | # builder: "dirhtml" 22 | # Fail on all warnings to avoid broken references 23 | # fail_on_warning: true 24 | 25 | # Optionally build your docs in additional formats such as PDF and ePub 26 | # formats: 27 | # - pdf 28 | # - epub 29 | 30 | # Optional but recommended, declare the Python requirements required 31 | # to build your documentation 32 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 33 | python: 34 | install: 35 | - requirements: doc/requirements.txt -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "rust-analyzer.cargo.features": [ 3 | "bin" 4 | ] 5 | } -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.3.4 (2025-06-02) 4 | 5 | * Fixed: WARC header incorrectly rejected as invalid (#8). 6 | * Fixed: Unexpected end of file error during decompression of the last record. 7 | * Changed: `--id` is now optional for `get export` and `get extract` (#7). 8 | 9 | ### Library 10 | 11 | * Added: PushDecompressor::write_eof() 12 | * Changed: PushDecoderEvent and PushDecoder::write_eof() 13 | 14 | ## 0.3.3 (2025-05-26) 15 | 16 | * Fixed: parse error for HTTP responses without a space after the status code (#3). 17 | * Fixed: wrong record boundaries listed for uncompressed WARC files (#4). 18 | * Added: new errors for unknown headers or unexpected compressed files (#5). 19 | 20 | ## 0.3.2 (2024-11-14) 21 | 22 | * Fixed: application named with version isn't detected as installer on macOS/Linux. 23 | 24 | ## 0.3.1 (2024-10-22) 25 | 26 | * Fixed: memory error reading ".warc.zst" files with compressed dictionaries. 27 | * Fixed: corrupted data reading and file offsets for highly compressed ".warc.zst" files. 28 | * Fixed: exclude-check from verify command not respected. 29 | * Fixed: ANSI codes written to log files. 30 | * Fixed: corrupted decoding Chunk-Transfer Encoding in cases where data aligns within a boundary. 31 | 32 | ## 0.3.0 (2024-10-20) 33 | 34 | * Fixed: false positive Payload Digest problem during verify for "revisit" records. 
35 | * Added: Get command for exporting/extracting single records. 36 | * Added: Record-at-time compression check to verify. 37 | * Added: Zstandard (.warc.zst) support. 38 | 39 | ### Library 40 | 41 | * Changed: `compress`: structs now take a configuration, renamed function for reading concatenated members 42 | * Added `warc::PushDecoder`. 43 | 44 | ## 0.2.0 (2024-10-12) 45 | 46 | * Fixed: HTTP decoder (and Extract command) incorrectly truncated data with Content-Length. 47 | * Fixed: Verify functionality: block and payload digest checks were not functional. 48 | * Added: filter options for Extract command. 49 | * Added: extract option for Export command. 50 | * Changed: Made the EndOfFile message explicit for the Export and Import commands. 51 | 52 | ## 0.1.0 (2024-10-11) 53 | 54 | * First release. 55 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["xtask"] 3 | 4 | [package] 5 | name = "warcat" 6 | version = "0.3.4" 7 | edition = "2024" 8 | license = "MPL-2.0" 9 | authors = ["Christopher Foo", "Warcat-rs contributors"] 10 | description = "Command-line tool and library for handling Web ARChive (WARC) files" 11 | repository = "https://github.com/chfoo/warcat-rs" 12 | categories = ["command-line-utilities", "parser-implementations"] 13 | keywords = ["archiving", "warc"] 14 | rust-version = "1.85" 15 | exclude = [ 16 | "/.cargo", 17 | "/.github/ISSUE_TEMPLATE", 18 | "/.github/pull_request_template.md", 19 | "/.github/workflows", 20 | "/.vscode", 21 | "/.readthedocs.yaml", 22 | "/misc" 23 | ] 24 | 25 | [lints.rust] 26 | 27 | [dependencies] 28 | # Dependencies for the binary, enabled by "bin" feature: 29 | anyhow = { version = "1.0.86", optional = true } 30 | clap = { version = "4.5.16", features = ["cargo", "derive"], optional = true } 31 | clap-markdown = { version = "0.1.4", optional = true } 32 | indicatif = { version = "0.17.8", optional = true } 33 | takecrate = { version = "1.0.0", optional = true } 34 | tempfile = { version = "3.12.0", optional = true } 35 | tracing-subscriber = { version = "0.3.18", features = ["json"], optional = true } 36 | # Everything: 37 | blake2 = "0.10.6" 38 | blake3 = { version = "1.5.4", features = ["pure", "traits-preview"] } 39 | brotli = "8.0.1" 40 | chrono = "0.4.38" 41 | ciborium = "0.2.2" 42 | crc32c = "0.6.8" 43 | crc32fast = "1.4.2" 44 | csv = "1.3.0" 45 | data-encoding = "2.6.0" 46 | digest = "0.10.7" 47 | flate2 = "1.0.31" 48 | md-5 = "0.10.6" 49 | nom = "8.0.0" 50 | percent-encoding = "2.3.1" 51 | redb = "2.1.3" 52 | regex = { version = "1.10.6", default-features = false, features = ["std", "perf"] } 53 | serde = "1.0.209" 54 | serde_json = "1.0.127" 55 | serde_with = { version = "3.11.0", features = ["base64", "hex"] } 56 | sha1 = "0.10.6" 57 | sha2 = "0.10.8" 58 | sha3 = "0.10.8" 59 | thiserror = "2.0.0" 60 | tracing = "0.1.40" 61 | url = "2.5.2" 62 | uuid = { version = "1.10.0", features = ["v7"] } 63 | xxhash-rust = { version = "0.8.12", features = ["std", "xxh3"] } 64 | zstd = { version = "0.13.2", optional = true } 65 | 66 | [dev-dependencies] 67 | anyhow = "1.0.86" 68 | rand = "0.9.1" 69 | rand_xoshiro = "0.7.0" 70 | tracing-test = { version = "0.2.5", features = ["no-env-filter"] } 71 | 72 | [features] 73 | default = ["zstd"] 74 | 75 | # Enables support for Zstandard and related APIs. 
76 | # zstd is optional because the crate relies on a C library that might not 77 | # be fully portable. 78 | zstd = ["dep:zstd"] 79 | 80 | # FIXME: blake3: a way to provide a "blake3-opt" feature to enable 81 | # compiling native code. The crate misuses the "pure" feature as a 82 | # subtractive feature and defaults to compiling. This is undesirable as it can 83 | # only check whether a compiler is supported, not whether it is installed. 84 | 85 | # This feature is intended to be used only for building the binary (main.rs) 86 | bin = [ 87 | "dep:anyhow", 88 | "dep:clap", 89 | "dep:clap-markdown", 90 | "dep:indicatif", 91 | "dep:takecrate", 92 | "dep:tempfile", 93 | "dep:tracing-subscriber", 94 | "serde/derive", 95 | ] 96 | 97 | [[bin]] 98 | name = "warcat" 99 | required-features = ["bin"] 100 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # warcat-rs 2 | 3 | Command-line tool and Rust library for handling Web ARChive (WARC) files. 4 | 5 | This project is a rewrite of the [warcat](https://github.com/chfoo/warcat/) project. 6 | 7 | ## Getting started 8 | 9 | * [💿 Downloads](https://github.com/chfoo/warcat-rs/releases) 10 | * [📖 User guide ![Read the Docs](https://img.shields.io/readthedocs/warcat-rs) 11 | ](https://warcat-rs.readthedocs.io/) 12 | * [📦 Crate repository ![Crates.io Version](https://img.shields.io/crates/v/warcat) 13 | ](https://crates.io/crates/warcat) 14 | * [📑 API documentation ![docs.rs](https://img.shields.io/docsrs/warcat) 15 | ](https://docs.rs/warcat) 16 | 17 | ## Compiling 18 | 19 | If you want to compile the program yourself, set up a [Rust environment](https://www.rust-lang.org/tools/install). 20 | 21 | * Project requires Rust 1.85 or higher. 22 | 23 | Once you have Rust installed, use the cargo build tool: 24 | 25 | ```sh 26 | cargo build --features=bin --release 27 | ``` 28 | 29 | The program will be placed in the `target` directory. 30 | 31 | ## Contributing & support 32 | 33 | * [Contributing](https://github.com/chfoo/warcat-rs/blob/main/.github/CONTRIBUTING.md) 34 | * [Support](https://github.com/chfoo/warcat-rs/blob/main/.github/SUPPORT.md) 35 | * [Changelog](https://github.com/chfoo/warcat-rs/blob/main/CHANGELOG.md) 36 | * [Development roadmap](https://github.com/chfoo/warcat-rs/blob/main/roadmap.md) 37 | 38 | ## License 39 | 40 | Copyright 2024-2025 Christopher Foo and Warcat-rs contributors. Licensed under Mozilla Public License 2.0 -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | This directory contains a [Sphinx documentation](https://www.sphinx-doc.org/) project. It is written in [MyST](https://myst-parser.readthedocs.io/), which is a superset of [CommonMark](https://commonmark.org/), which in turn standardizes Markdown. 4 | -------------------------------------------------------------------------------- /doc/compiling.md: -------------------------------------------------------------------------------- 1 | # Compiling it yourself (advanced) 2 | 3 | Compiling the application should only be done if you are comfortable doing it yourself. 4 | 5 | ## Steps 6 | 7 | Set up a [Rust environment](https://www.rust-lang.org/tools/install). The latest version of Rust should work. (Rust versions ≥ 1.85, < 2.0 are supported.) 8 | 9 | Once you have Rust installed, use the cargo build tool: 10 | 11 | ```sh 12 | cargo build --features=bin --release 13 | ``` 14 | 15 | The program will be placed in the `target` directory. You can run it as is, or install it by adding a "-installer" suffix to the filename before running it. -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = 'Warcat' 10 | copyright = '2024-2025 Warcat contributors' 11 | author = 'Warcat contributors' 12 | 13 | # -- General configuration --------------------------------------------------- 14 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 15 | 16 | extensions = ['myst_parser'] 17 | 18 | templates_path = ['_templates'] 19 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'README.md'] 20 | 21 | 22 | 23 | # -- Options for HTML output ------------------------------------------------- 24 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 25 | 26 | html_theme = 'sphinx_book_theme' 27 | html_static_path = ['_static'] 28 | -------------------------------------------------------------------------------- /doc/downloads.md: -------------------------------------------------------------------------------- 1 | # Downloads 2 | 3 | Downloads are available on the [Releases](https://github.com/chfoo/warcat-rs/releases) page. 
4 | 5 | ## Supported platforms 6 | 7 | * Windows 10 or newer 8 | * macOS 10.12 or newer 9 | * Linux (kernel compatible with musl 1.2.3) 10 | 11 | ### CPU types 12 | 13 | * x86_64 (x64): 64-bit Intel and AMD CPUs 14 | * Typically for most Windows and Linux devices 15 | * aarch64 (arm64): 64-bit ARM CPUs 16 | * Typically for newer macOS devices -------------------------------------------------------------------------------- /doc/export_import.md: -------------------------------------------------------------------------------- 1 | # Export/import format 2 | 3 | This section describes the message format used during the export and import commands. 4 | 5 | ## Message types 6 | 7 | ### Metadata 8 | 9 | The metadata message is provided only during the export command. It is produced at the start of a WARC record. 10 | 11 | map: 12 | 13 | * `Metadata` - map 14 | * `file` - string: The input filename of the WARC. 15 | * `position` - integer: The position in the WARC file where the record is located. For compressed files, this position is only valid if the file was compressed by concatenating compressed streams. 16 | 17 | Example: 18 | 19 | ```json 20 | { 21 | "Metadata": { 22 | "file": "./my_file.warc.gz", 23 | "position": 123 24 | } 25 | } 26 | ``` 27 | 28 | ### Header 29 | 30 | The header message is provided for both the export and import commands. It is produced when a header from a WARC record has been read. 31 | 32 | map: 33 | 34 | * `Header` - map 35 | * `version` - string: The WARC version string such as "WARC/1.1" 36 | * `fields` - array[[string, string]]: Name-value pairs. 37 | 38 | ```json 39 | { 40 | "Header": { 41 | "version": "WARC/1.1", 42 | "fields": [ 43 | ["WARC-Record-Type", "metadata"], 44 | ["Content-Length", "123"] 45 | ] 46 | } 47 | } 48 | ``` 49 | 50 | ### Block chunk 51 | 52 | The block chunk message is provided for both the export and import commands. It is produced when a segment of a block from a WARC record has been read. 53 | 54 | map: 55 | 56 | * `BlockChunk` - map 57 | * `data` - bytes: A segment of block data. For JSON, this is a string in base64 standard (with padding) encoding. 58 | 59 | ```json 60 | { 61 | "BlockChunk": { 62 | "data": "Zm9vYmFy" 63 | } 64 | } 65 | ``` 66 | 67 | ### Block end 68 | 69 | The block end message is provided for both the export and import commands. It is produced at the end of reading a block and its WARC record. 70 | 71 | map: 72 | 73 | * `BlockEnd` - map 74 | * `crc32` - integer (optional, unsigned 32-bit): CRC32 (ITU-T V.42) checksum of the block data. 75 | * `crc32c` - integer (optional, unsigned 32-bit): CRC32C checksum of the block data. 76 | * `xxh3` - integer (optional, unsigned 64-bit): XxHash XXH3 checksum of the block data. 77 | 78 | The checksum is used to ensure that message processing was implemented properly. 79 | 80 | When importing, it is required that at least one of the fields "crc32", "crc32c", or "xxh3" be provided. When exporting, all fields will be filled. 81 | 82 | ```json 83 | { 84 | "BlockEnd": { 85 | "crc32c": 123456 86 | } 87 | } 88 | ``` 89 | 90 | ### Extract metadata 91 | 92 | The extract metadata message is provided only during the export command with the extract option. 93 | 94 | map: 95 | 96 | * `ExtractMetadata` - map 97 | * `has_content` - boolean: Whether data can be extracted from this record. 98 | * `file_path_components` - array\[string\]: A safe filename for writing to disk. 99 | * `is_truncated` - bool: As recorded in the header field, whether the content is truncated. 
100 | 101 | Example: 102 | 103 | ```json 104 | { 105 | "ExtractMetadata": { 106 | "has_content": true, 107 | "file_path_components": ["http", "www.example.com", "index.html"], 108 | "is_truncated": false 109 | } 110 | } 111 | ``` 112 | 113 | ### Extract chunk 114 | 115 | The extract chunk message is provided for the export command with the extract option. It is produced when content can be extracted from a segment of block data. 116 | 117 | map: 118 | 119 | * `ExtractChunk` - map 120 | * `data` - bytes: A segment of block data. For JSON, this is a string in base64 standard (with padding) encoding. 121 | 122 | ```json 123 | { 124 | "ExtractChunk": { 125 | "data": "Zm9vYmFy" 126 | } 127 | } 128 | ``` 129 | 130 | ### Extract end 131 | 132 | The extract end message is provided for the export command with the extract option. It is produced at the end of extracting a record's block. 133 | 134 | map: 135 | 136 | * `ExtractEnd` - map 137 | * `crc32` - integer (optional, unsigned 32-bit): CRC32 (ITU-T V.42) checksum of the extracted content. 138 | * `crc32c` - integer (optional, unsigned 32-bit): CRC32C checksum of the extracted content. 139 | * `xxh3` - integer (optional, unsigned 64-bit): XxHash XXH3 checksum of the extracted content. 140 | 141 | ```json 142 | { 143 | "ExtractEnd": { 144 | "crc32c": 123456 145 | } 146 | } 147 | ``` 148 | 149 | ### End of file 150 | 151 | The end of file message indicates that the output stream is ending and no other messages will be sent. 152 | 153 | map: 154 | 155 | * `EndOfFile` - map 156 | 157 | ```json 158 | { 159 | "EndOfFile": {} 160 | } 161 | ``` 162 | 163 | ## Message flows 164 | 165 | During the export command, every record consists of: 166 | 167 | * 1 `Metadata` 168 | * 1 `Header` 169 | * 0 or more `BlockChunk` 170 | * 1 `BlockEnd` 171 | 172 | During the import command, every record consists of: 173 | 174 | * 1 `Header` 175 | * 0 or more `BlockChunk` 176 | * 1 `BlockEnd` 177 | 178 | After all records are processed, the `EndOfFile` message is sent. -------------------------------------------------------------------------------- /doc/index.md: -------------------------------------------------------------------------------- 1 | # Warcat User Guide 2 | 3 | This documentation provides a user guide for the Warcat application. 4 | 5 | * [Project homepage](https://github.com/chfoo/warcat-rs) 6 | * If you are looking for the API docs, see [this page](https://docs.rs/warcat) 7 | 8 | ```{toctree} 9 | :maxdepth: 2 10 | :caption: Contents: 11 | 12 | setup 13 | intro_to_cli 14 | usage_examples 15 | integration 16 | export_import 17 | cli_reference 18 | ``` 19 | -------------------------------------------------------------------------------- /doc/install.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Unzip the compressed file 4 | 5 | Before the application can be run, it needs to be unzipped from the compressed file you have downloaded. 6 | 7 | * How to [unzip on Windows](https://support.microsoft.com/en-us/windows/zip-and-unzip-files-f6dde0a7-0fec-8294-e1d3-703ed85e7ebc) 8 | * How to [unzip on macOS](https://support.apple.com/en-us/guide/mac-help/mchlp2528/mac) 9 | 10 | ## Installer 11 | 12 | The application supports installing itself, which is the default behavior. You can double-click it to run it. 13 | 14 | ### Disable the installer functionality 15 | 16 | To run the application as a standalone program, remove the "-installer" suffix from the filename. 17 | 18 | To manually install, see [this section](install_manual.md). 
19 | 20 | ## Common problems 21 | 22 | ### Windows 23 | 24 | The application is not signed and Windows may refuse to run it. To allow an exception, click on "Details" and select "Run anyway". 25 | 26 | ### macOS 27 | 28 | The application is not signed and macOS will refuse to run it by default. 29 | 30 | To allow an exception, right-click the program file and select "Open" or [follow these instructions](https://support.apple.com/en-us/guide/mac-help/mh40616/mac). 31 | 32 | ### macOS and Linux 33 | 34 | If you get an error saying it is not an executable program, you need to set the executable bit of the file. To do this, open the terminal and run a command similar to `chmod +x warcat-1.2.3`. -------------------------------------------------------------------------------- /doc/install_manual.md: -------------------------------------------------------------------------------- 1 | # Manual installation (advanced) 2 | 3 | If you do not want to use the automated installer, you can follow the instructions below to install it to your user account. 4 | 5 | ## Windows 6 | 7 | Place the executable in the `%LOCALAPPDATA%\Programs\warcat\bin\` folder. To access the Programs folder, press Windows+R and open `%LOCALAPPDATA%\Programs`. Then, create the folders needed if they do not exist. 8 | 9 | Ensure it is in the `Path` environment variable. To edit environment variables, press Windows+R and open 10 | `rundll32 sysdm.cpl,EditEnvironmentVariables`. Then, 11 | 12 | 1. Under User variables, select Path 13 | 2. Press "Edit..." to open a dialog window with a list 14 | 3. Press "New" to edit a blank line 15 | 4. Enter `%LOCALAPPDATA%\Programs\warcat\bin\` in the list. 16 | 5. Press "OK" to close the dialog window with a list 17 | 6. Press "OK" to save changes 18 | 7. If you have any opened Console/Terminal windows, close and reopen them again for changes to take effect. 19 | 20 | ## macOS or Linux 21 | 22 | Place the binary in the `$HOME/.local/bin` directory. You may need to create the directory if it does not exist. 23 | 24 | Ensure it is in the `PATH` environment variable. Check if this section is in the `$HOME/.profile` configuration file: 25 | 26 | ```sh 27 | if [ -d "$HOME/.local/bin" ] ; then 28 | PATH="$HOME/.local/bin:$PATH" 29 | fi 30 | ``` 31 | 32 | If not, add it. Then, log out and back in for changes to take effect. (If you do not want to close existing terminal windows, run `source $HOME/.profile`.) -------------------------------------------------------------------------------- /doc/integration.md: -------------------------------------------------------------------------------- 1 | # Integration with other programs 2 | 3 | Integration with other programs is done through standard input and output using the `export` and `import` commands. 4 | 5 | For reading WARC files, the `export` command will format the data into messages, such as JSON, which your program can ingest and process. Likewise for writing WARC files, the `import` command accepts messages from your program. 6 | 7 | For working examples, [see here](https://github.com/chfoo/warcat-rs/tree/main/examples). 8 | 9 | The format of the messages is documented in the next section. 10 | 11 | ## Overview 12 | 13 | In order to integrate with your programming language of choice, your language's libraries must be able to launch other programs and communicate using standard input and output. 14 | 15 | This section uses pseudocode to give an overview of how to read a WARC file. 
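For comparison, here is a minimal concrete sketch of the same flow in Python. It is an illustrative sketch only: it assumes `warcat` is on the search path, uses a hypothetical input filename `example.warc.gz`, and skips the error handling a real integration would need. (The repository's `examples/` directory contains more complete Python and C# programs.)

```python
import base64
import json
import subprocess

# Run warcat so it exports the WARC as JSON Lines messages on standard output.
with subprocess.Popen(
    ["warcat", "export", "--input=example.warc.gz", "--format=jsonl"],
    stdout=subprocess.PIPE,
) as process:
    for line in process.stdout:
        message = json.loads(line)

        if "Header" in message:
            # Start of a record: print its WARC header fields.
            for name, value in message["Header"]["fields"]:
                print(f"{name}: {value}")
        elif "BlockChunk" in message:
            # A segment of the record body, transported as base64.
            data = base64.b64decode(message["BlockChunk"]["data"])
            print(f"read {len(data)} bytes")
        elif "EndOfFile" in message:
            # No further messages will be sent.
            break
```

The pseudocode below walks through the same steps one message at a time.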
16 | 17 | To begin reading, run the warcat program with options to export and output in JSON Lines: 18 | 19 | ``` 20 | process <- run_process("warcat", "export", "--input", "example.warc.gz", "--format=jsonl") 21 | ``` 22 | 23 | Next, get the record header by reading lines containing JSON: 24 | 25 | ``` 26 | metadata_line <- process.stdout.read_line() 27 | metadata <- decode_json(metadata_line) 28 | 29 | print("Reading file " + metadata["Metadata"]["file"]) 30 | 31 | header_line <- process.stdout.read_line() 32 | header <- decode_json(header_line) 33 | 34 | header_fields <- header["Header"]["fields"] 35 | 36 | for each field <- header_fields do 37 | name = field[0] 38 | value = field[1] 39 | 40 | print("Header name: " + name + " value: " + value) 41 | end for 42 | ``` 43 | 44 | Next, get the record block data by reading a message at the top of each loop iteration: 45 | 46 | ``` 47 | loop do 48 | message_line <- process.stdout.read_line() 49 | message <- decode_json(message_line) 50 | 51 | if message.has_key("BlockEnd") then 52 | break loop 53 | end if 54 | 55 | block_chunk <- message 56 | b64_data <- block_chunk["BlockChunk"]["data"] 57 | data <- decode_base64(b64_data) 58 | 59 | print("Read " + data.length() + " bytes") 60 | end loop 61 | ``` 62 | 63 | Once you have read the end of the record, repeat the steps for each record until the end of file message is reached: 64 | 65 | ``` 66 | message_line <- process.stdout.read_line() 67 | message <- decode_json(message_line) 68 | 69 | is_end_of_file <- message.has_key("EndOfFile") 70 | ``` 71 | -------------------------------------------------------------------------------- /doc/intro_to_cli.md: -------------------------------------------------------------------------------- 1 | # Introduction to the CLI application 2 | 3 | To begin, open the terminal application. 4 | 5 | On Windows, right-click the Start icon or press Windows+X. Then, select [Terminal](https://learn.microsoft.com/en-us/windows/terminal/). 6 | 7 | On macOS, open Finder, then select Applications, Utilities, then [Terminal](https://support.apple.com/en-us/guide/terminal/apd5265185d-f365-44cb-8b09-71a064a42125/mac). 8 | 9 | On Linux, open Applications. Select System, then Terminal. Or, search for "terminal". 10 | 11 | The terminal application will then present a command line interface (CLI). On Windows, this is [PowerShell](https://learn.microsoft.com/en-us/powershell/). On macOS or Linux, this is typically [Bash shell](https://www.gnu.org/software/bash/manual/bash.html). 12 | 13 | If the application is under the search path (PATH environment variable), to run it, type: 14 | 15 | ```sh 16 | warcat 17 | ``` 18 | 19 | and press enter. 20 | 21 | Or, enter the location of the executable directly. For example (Windows): 22 | 23 | ```powershell 24 | .\Downloads\warcat.exe 25 | ``` 26 | 27 | macOS/Linux: 28 | ```sh 29 | ./Downloads/warcat 30 | ``` 31 | 32 | and press enter. 33 | 34 | If it is successful, the warcat application will display help information. 35 | 36 | Entering 37 | 38 | ```sh 39 | warcat help 40 | ``` 41 | 42 | will also show a list of commands and options. `help` is known as an argument that is passed to the program. 43 | 44 | For example, using the `list` command: 45 | 46 | ```sh 47 | warcat list --input my_warc_file.warc.gz 48 | ``` 49 | 50 | The above command has 3 arguments to the program: 51 | 52 | 1. `list` is the command. 53 | 2. `--input` is an option. It starts with 2 hyphens. This specifies that the program should accept an input filename. 54 | 3. `my_warc_file.warc.gz` is a value to the `input` option. 
55 | 56 | If an option value has spaces or special symbols, put quotation marks: 57 | 58 | ```sh 59 | warcat list --input "My WARC File (Copy).warc.gz" 60 | ``` 61 | 62 | Option values can also be specified by a `=` character if it helps with clarity: 63 | 64 | ```sh 65 | warcat list --input=my_warc_file.warc.gz 66 | warcat list --input="My WARC File (Copy).warc.gz" 67 | ``` 68 | 69 | Note that some options don't take a value. These options are also known as flags (as in boolean true/false): 70 | 71 | ```sh 72 | warcat --quiet 73 | ``` 74 | 75 | If you need help in a command, enter something like: 76 | 77 | ```sh 78 | warcat help list 79 | ``` -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /doc/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==8.0.2 2 | sphinx-book-theme==1.1.3 3 | myst-parser==4.0.0 -------------------------------------------------------------------------------- /doc/setup.md: -------------------------------------------------------------------------------- 1 | # Setting up Warcat 2 | 3 | ```{toctree} 4 | :maxdepth: 1 5 | :caption: Contents: 6 | 7 | downloads 8 | install 9 | compiling 10 | ``` 11 | 12 | ```{toctree} 13 | :hidden: 14 | 15 | install_manual 16 | ``` -------------------------------------------------------------------------------- /doc/usage_examples.md: -------------------------------------------------------------------------------- 1 | # Usage examples 2 | 3 | ## Extract everything 4 | 5 | Extract resources from a WARC file as much as possible: 6 | 7 | ```sh 8 | warcat extract --input my_warc_file.warc.gz --output my_output_folder 9 | ``` 10 | 11 | ## Extract a single item 12 | 13 | First locate where the item is within the WARC file: 14 | 15 | ```sh 16 | warcat list --input my_warc_file.warc.gz --format csv 17 | ``` 18 | 19 | For the purposes of this example, we'll use this hypothetical listing: 20 | 21 | ```csv 22 | 45678,,response,application/http; msgtype=response,https://example.com/index.html 23 | ``` 24 | 25 | Then provide the position and ID to the `get extract` command: 26 | 27 | ```sh 28 | warcat get extract --input my_warc_file.warc.gz --position 45678 --id "" --output index.html 29 | ``` 30 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # 
Examples 2 | 3 | For languages other than Rust, please ensure `warcat` is accessible on your search path ("PATH"). 4 | 5 | ## Rust 6 | 7 | From the project root directory, run: 8 | 9 | ```sh 10 | cargo run --example decode 11 | cargo run --example encode 12 | ``` 13 | 14 | ## C# 15 | 16 | From the `examples/c-sharp/` directory, run: 17 | 18 | ```sh 19 | dotnet run --project WarcatExample 20 | ``` 21 | 22 | ## Python 23 | 24 | From the project root directory, run: 25 | 26 | ```sh 27 | python examples/python/decode.py 28 | python examples/python/encode.py 29 | ``` -------------------------------------------------------------------------------- /examples/c-sharp/.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | obj/ -------------------------------------------------------------------------------- /examples/c-sharp/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "dotnet.defaultSolution": "WarcatExample.sln" 3 | } -------------------------------------------------------------------------------- /examples/c-sharp/WarcatExample.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.0.31903.59 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "WarcatExample", "WarcatExample\WarcatExample.csproj", "{2AF7D93D-A2EA-4F23-B80E-2317C74E244B}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Any CPU = Debug|Any CPU 11 | Release|Any CPU = Release|Any CPU 12 | EndGlobalSection 13 | GlobalSection(SolutionProperties) = preSolution 14 | HideSolutionNode = FALSE 15 | EndGlobalSection 16 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 17 | {2AF7D93D-A2EA-4F23-B80E-2317C74E244B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 18 | {2AF7D93D-A2EA-4F23-B80E-2317C74E244B}.Debug|Any CPU.Build.0 = Debug|Any CPU 19 | {2AF7D93D-A2EA-4F23-B80E-2317C74E244B}.Release|Any CPU.ActiveCfg = Release|Any CPU 20 | {2AF7D93D-A2EA-4F23-B80E-2317C74E244B}.Release|Any CPU.Build.0 = Release|Any CPU 21 | EndGlobalSection 22 | EndGlobal 23 | -------------------------------------------------------------------------------- /examples/c-sharp/WarcatExample/Decode.cs: -------------------------------------------------------------------------------- 1 | // Example on how to read WARC files. 2 | using System.Diagnostics; 3 | using System.Text.Json; 4 | 5 | namespace WarcatExample; 6 | 7 | class Decode 8 | { 9 | public static void Run() 10 | { 11 | var options = Message.Options(); 12 | 13 | // Launch the warcat program. The options provided will tell it to write 14 | // JSON as a line to standard out. 15 | // Ensure you have warcat on the search path or adjust the path as needed. 
16 | using (var process = new Process()) 17 | { 18 | process.StartInfo.FileName = "warcat"; 19 | process.StartInfo.ArgumentList.Add("export"); 20 | process.StartInfo.ArgumentList.Add("--input=example.warc"); 21 | process.StartInfo.ArgumentList.Add("--format=jsonl"); 22 | process.StartInfo.RedirectStandardOutput = true; 23 | process.Start(); 24 | 25 | while (true) 26 | { 27 | var line = process.StandardOutput.ReadLine(); 28 | 29 | if (line == null) 30 | { 31 | break; 32 | } 33 | 34 | // Decode each message 35 | var message = JsonSerializer.Deserialize(line, options)!; 36 | 37 | if (message.Header != null) 38 | { 39 | // We decoded the start of the record. 40 | foreach (var field in message.Header.Fields) 41 | { 42 | Console.WriteLine($"{field[0]}:{field[1]}"); 43 | } 44 | } 45 | else if (message.BlockChunk != null) 46 | { 47 | // We decoded the body of the record. 48 | Console.WriteLine($"{message.BlockChunk.Data.Length}"); 49 | } 50 | else if (message.EndOfFile != null) 51 | { 52 | // The end of the record was reached. 53 | Console.WriteLine("---"); 54 | } 55 | } 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /examples/c-sharp/WarcatExample/Encode.cs: -------------------------------------------------------------------------------- 1 | // Example on how to write WARC files. 2 | using System.Diagnostics; 3 | using System.IO.Hashing; 4 | using System.Text; 5 | using System.Text.Json; 6 | 7 | namespace WarcatExample; 8 | 9 | class Encode 10 | { 11 | public static void Run() 12 | { 13 | var options = Message.Options(); 14 | 15 | // Launch the warcat program. The options provided will tell it to read 16 | // JSON as a line from standard in. 17 | // Ensure you have warcat on the search path or adjust the path as needed. 18 | using (var process = new Process()) 19 | { 20 | process.StartInfo.FileName = "warcat"; 21 | process.StartInfo.ArgumentList.Add("import"); 22 | process.StartInfo.ArgumentList.Add("--compression=none"); 23 | process.StartInfo.ArgumentList.Add("--format=jsonl"); 24 | process.StartInfo.RedirectStandardInput = true; 25 | process.Start(); 26 | 27 | // Write a record header with the given header fields. 28 | // Note: this header is not valid; it is simply a concise demonstration. 29 | 30 | var header = new Message() 31 | { 32 | Header = new Header() 33 | { 34 | Version = "WARC/1.1", 35 | Fields = [ 36 | ["WARC-Record-Type", "resource"], 37 | ["Content-Length", "12"], 38 | ] 39 | } 40 | }; 41 | process.StandardInput.WriteLine(JsonSerializer.Serialize(header, options)); 42 | 43 | // Write the record block data. 44 | var hasher = new XxHash3(); 45 | 46 | var data = Encoding.UTF8.GetBytes("Hello world!"); 47 | hasher.Append(data); 48 | 49 | var block_chunk = new Message() 50 | { 51 | BlockChunk = new BlockChunk() 52 | { 53 | Data = data 54 | } 55 | }; 56 | process.StandardInput.WriteLine(JsonSerializer.Serialize(block_chunk, options)); 57 | 58 | // Write the end of the block message. 59 | var block_end = new Message() 60 | { 61 | BlockEnd = new BlockEnd() 62 | { 63 | Xxh3 = hasher.GetCurrentHashAsUInt64() 64 | } 65 | }; 66 | process.StandardInput.WriteLine(JsonSerializer.Serialize(block_end, options)); 67 | 68 | // Finish writing the file. 
69 | process.StandardInput.WriteLine(JsonSerializer.Serialize(new Message() { EndOfFile = new EndOfFile() }, options)); 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /examples/c-sharp/WarcatExample/Message.cs: -------------------------------------------------------------------------------- 1 | using System.Text.Json; 2 | using System.Text.Json.Serialization; 3 | 4 | namespace WarcatExample; 5 | 6 | public class Message 7 | { 8 | [JsonPropertyName("Metadata")] 9 | public Metadata? Metadata { get; set; } 10 | [JsonPropertyName("Header")] 11 | public Header? Header { get; set; } 12 | [JsonPropertyName("BlockChunk")] 13 | public BlockChunk? BlockChunk { get; set; } 14 | [JsonPropertyName("BlockEnd")] 15 | public BlockEnd? BlockEnd { get; set; } 16 | [JsonPropertyName("ExtractMetadata")] 17 | public ExtractMetadata? ExtractMetadata { get; set; } 18 | [JsonPropertyName("ExtractChunk")] 19 | public ExtractChunk? ExtractChunk { get; set; } 20 | [JsonPropertyName("ExtractEnd")] 21 | public ExtractEnd? ExtractEnd { get; set; } 22 | [JsonPropertyName("EndOfFile")] 23 | public EndOfFile? EndOfFile { get; set; } 24 | 25 | public static JsonSerializerOptions Options() 26 | { 27 | // Use snake_case for names. 28 | var options = new JsonSerializerOptions 29 | { 30 | PropertyNamingPolicy = JsonNamingPolicy.SnakeCaseLower, 31 | DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull 32 | }; 33 | 34 | return options; 35 | } 36 | } 37 | 38 | public class Metadata 39 | { 40 | public required string File { get; set; } 41 | public required ulong Position { get; set; } 42 | } 43 | 44 | public class Header 45 | { 46 | public required string Version { get; set; } 47 | public required List Fields { get; set; } 48 | } 49 | 50 | public class BlockChunk 51 | { 52 | public required byte[] Data { get; set; } 53 | } 54 | 55 | public class BlockEnd 56 | { 57 | public uint? Crc32 { get; set; } 58 | public uint? Crc32c { get; set; } 59 | public ulong? Xxh3 { get; set; } 60 | } 61 | 62 | public class ExtractMetadata 63 | { 64 | public required bool HasContent { get; set; } 65 | public required List FilePathComponents { get; set; } 66 | public required bool IsTruncated { get; set; } 67 | } 68 | 69 | public class ExtractChunk 70 | { 71 | public required byte[] Data { get; set; } 72 | } 73 | 74 | public class ExtractEnd 75 | { 76 | public uint? Crc32 { get; set; } 77 | public uint? Crc32c { get; set; } 78 | public ulong? 
Xxh3 { get; set; } 79 | } 80 | 81 | public class EndOfFile { } 82 | -------------------------------------------------------------------------------- /examples/c-sharp/WarcatExample/Program.cs: -------------------------------------------------------------------------------- 1 | if (args.Length == 0) 2 | { 3 | System.Console.WriteLine("Specify 'encode' or 'decode'"); 4 | return 1; 5 | } 6 | 7 | if (args[0] == "encode") 8 | { 9 | WarcatExample.Encode.Run(); 10 | } 11 | else 12 | { 13 | WarcatExample.Decode.Run(); 14 | } 15 | 16 | return 0; 17 | -------------------------------------------------------------------------------- /examples/c-sharp/WarcatExample/WarcatExample.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Exe 5 | net8.0 6 | enable 7 | enable 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /examples/c-sharp/example.warc: -------------------------------------------------------------------------------- 1 | WARC/1.1 2 | WARC-Record-Type: resource 3 | Content-Length: 12 4 | 5 | Hello world! 6 | 7 | -------------------------------------------------------------------------------- /examples/decode.rs: -------------------------------------------------------------------------------- 1 | //! Example showing how to decode a WARC file by records. 2 | use std::{fs::File, io::Read}; 3 | 4 | use warcat::warc::{Decoder, DecoderConfig}; 5 | 6 | fn main() -> anyhow::Result<()> { 7 | // Source file 8 | let mut warc_file = File::open("examples/example.warc")?; 9 | 10 | // Configure the compression format if needed, otherwise use default 11 | let config = DecoderConfig::default(); 12 | 13 | // Create a new WARC decoder 14 | let mut decoder = Decoder::new(&mut warc_file, config)?; 15 | 16 | loop { 17 | // Check for end of file 18 | if !decoder.has_next_record()? { 19 | break; 20 | } 21 | 22 | // Get the header of the WARC record and a decoder for the 23 | // block part of a record. Note that `read_header()` consumes the 24 | // decoder and returns another decoder with a different type. This 25 | // is known as the typestate pattern. 26 | let (header, mut block_decoder) = decoder.read_header()?; 27 | println!("Header: {:?}", header); 28 | 29 | // Reading the block is like reading a file 30 | let mut buf = Vec::new(); 31 | block_decoder.read_to_end(&mut buf)?; 32 | println!("Block len: {}", buf.len()); 33 | 34 | // Get a header decoder. Again, this is the typestate pattern. 35 | decoder = block_decoder.finish_block()?; 36 | } 37 | 38 | // Get the inner reader if needed 39 | let _file = decoder.into_inner(); 40 | 41 | Ok(()) 42 | } 43 | -------------------------------------------------------------------------------- /examples/encode.rs: -------------------------------------------------------------------------------- 1 | //! Example on how to encode a WARC file by records 2 | use std::io::Write; 3 | 4 | use warcat::{ 5 | header::WarcHeader, 6 | warc::{Encoder, EncoderConfig}, 7 | }; 8 | 9 | fn main() -> anyhow::Result<()> { 10 | // For this example, our file is just a in-memory buffer 11 | let mut warc_file = Vec::new(); 12 | 13 | // Configure the compression format if needed, otherwise use default 14 | let config = EncoderConfig::default(); 15 | 16 | // Create a new WARC encoder 17 | let mut encoder = Encoder::new(&mut warc_file, config); 18 | 19 | // Write a header of a WARC record and return a block encoder. 
20 | // Note that `write_header()` consumes the encoder and returns a 21 | // decoder of a different type. This is known as the typestate pattern. 22 | let header = WarcHeader::new(12, "Resource"); 23 | let mut block_encoder = encoder.write_header(&header)?; 24 | 25 | // Write the block like a file. 26 | block_encoder.write_all(b"Hello world!")?; 27 | 28 | // Get a header encoder. Again, this is the typestate pattern. 29 | encoder = block_encoder.finish_block()?; 30 | 31 | // Get the inner writer if needed 32 | let _file = encoder.finish()?; 33 | 34 | println!("Wrote {} bytes", warc_file.len()); 35 | 36 | Ok(()) 37 | } 38 | -------------------------------------------------------------------------------- /examples/example.warc: -------------------------------------------------------------------------------- 1 | WARC/1.1 2 | WARC-Record-Type: resource 3 | Content-Length: 12 4 | 5 | Hello world! 6 | 7 | -------------------------------------------------------------------------------- /examples/python/decode.py: -------------------------------------------------------------------------------- 1 | # Example on how to read WARC files. 2 | import subprocess 3 | 4 | import message 5 | 6 | 7 | def main(): 8 | # Launch the warcat program. The options provided will tell it to write 9 | # JSON as a line to standard out. 10 | # Ensure you have warcat on the search path or adjust the path as needed. 11 | with subprocess.Popen( 12 | [ 13 | "warcat", 14 | "export", 15 | "--input=examples/example.warc", 16 | "--format=jsonl", 17 | ], 18 | stdout=subprocess.PIPE, 19 | ) as process: 20 | # Decode each message by using our helper module. 21 | for msg in message.decode(process.stdout): 22 | if isinstance(msg, message.Header): 23 | # We decoded the start of the record. 24 | print(msg.fields) 25 | elif isinstance(msg, message.BlockChunk): 26 | # We decoded the body of the record. 27 | print(len(msg.data)) 28 | elif isinstance(msg, message.BlockEnd): 29 | # The end of the record was reached. 30 | print("---") 31 | 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /examples/python/encode.py: -------------------------------------------------------------------------------- 1 | # Example on how to write WARC files. 2 | import subprocess 3 | import zlib 4 | 5 | import message 6 | 7 | 8 | def main(): 9 | # Launch the warcat program. The options provided will tell it to read 10 | # JSON as a line from standard in. 11 | # Ensure you have warcat on the search path or adjust the path as needed. 12 | with subprocess.Popen( 13 | [ 14 | "warcat", 15 | "import", 16 | "--compression=none", 17 | "--format=jsonl", 18 | ], 19 | stdin=subprocess.PIPE, 20 | ) as process: 21 | # Write a record header with the given header fields. 22 | # Note: this header is not valid; it is simply a concise demonstration. 23 | header = message.Header( 24 | "WARC/1.1", 25 | [ 26 | ("WARC-Record-Type", "resource"), 27 | ("Content-Length", "12"), 28 | ], 29 | ) 30 | message.encode(process.stdin, header) 31 | 32 | # Write the record block data. 33 | checksum = 0 34 | 35 | data = b"Hello world!" 36 | checksum = zlib.crc32(data, checksum) 37 | 38 | block_chunk = message.BlockChunk(data) 39 | message.encode(process.stdin, block_chunk) 40 | 41 | # Write the end of the block message. 42 | block_end = message.BlockEnd(checksum) 43 | message.encode(process.stdin, block_end) 44 | 45 | # Finish writing the file. 
46 | message.encode(process.stdin, message.EndOfFile()) 47 | 48 | 49 | if __name__ == "__main__": 50 | main() 51 | -------------------------------------------------------------------------------- /examples/python/message.py: -------------------------------------------------------------------------------- 1 | # This is a helper module that assists in encoding/decoding JSON messages from warcat. 2 | import json 3 | import base64 4 | import io 5 | 6 | 7 | # Represents the Metadata message. 8 | class Metadata: 9 | file: str 10 | position: int 11 | 12 | def __init__(self, file: str, position: int): 13 | self.file = file 14 | self.position = position 15 | 16 | def deserialize(file: str, position: str): 17 | return Metadata(file, int(position)) 18 | 19 | def serialize(self) -> dict: 20 | return { 21 | "Metadata": { 22 | "file": self.file, 23 | "position": self.position, 24 | } 25 | } 26 | 27 | 28 | # Represents the Header message. 29 | class Header: 30 | version: str 31 | fields: list 32 | 33 | def __init__(self, version: str, fields: list): 34 | self.version = version 35 | self.fields = fields 36 | 37 | def deserialize(version: str, fields: list): 38 | return Header(version, fields) 39 | 40 | def serialize(self) -> dict: 41 | return { 42 | "Header": { 43 | "version": self.version, 44 | "fields": self.fields, 45 | } 46 | } 47 | 48 | 49 | # Represents the BlockChunk message. 50 | class BlockChunk: 51 | data: bytes 52 | 53 | def __init__(self, data: bytes): 54 | self.data = data 55 | 56 | def deserialize(data: str): 57 | return BlockChunk(base64.b64decode(data)) 58 | 59 | def serialize(self) -> dict: 60 | return {"BlockChunk": {"data": base64.b64encode(self.data).decode("utf8")}} 61 | 62 | 63 | # Represents the BlockEnd message. 64 | class BlockEnd: 65 | crc32c: int 66 | 67 | def __init__(self, crc32: int = None, crc32c: int = None, xxh3: int = None): 68 | self.crc32 = crc32 69 | self.crc32c = crc32c 70 | self.xxh3 = xxh3 71 | 72 | def deserialize(crc32: int = None, crc32c: int = None, xxh3: int = None): 73 | return BlockEnd(crc32, crc32c, xxh3) 74 | 75 | def serialize(self) -> dict: 76 | return { 77 | "BlockEnd": {"crc32": self.crc32, "crc32c": self.crc32c, "xxh3": self.xxh3} 78 | } 79 | 80 | 81 | # Represents the EndOfFile message. 82 | class EndOfFile: 83 | def __init__(self): 84 | pass 85 | 86 | def deserialize(): 87 | return EndOfFile() 88 | 89 | def serialize(self) -> dict: 90 | return {"EndOfFile": {}} 91 | 92 | 93 | MESSAGE_TABLE = { 94 | "Metadata": Metadata, 95 | "Header": Header, 96 | "BlockChunk": BlockChunk, 97 | "BlockEnd": BlockEnd, 98 | "EndOfFile": EndOfFile, 99 | } 100 | 101 | 102 | class MessageEncoder(json.JSONEncoder): 103 | def default(self, o): 104 | if hasattr(o, "serialize"): 105 | return o.serialize() 106 | 107 | return super().default(o) 108 | 109 | 110 | def message_object_hook(obj: dict): 111 | for k, v in MESSAGE_TABLE.items(): 112 | if k in obj: 113 | return MESSAGE_TABLE[k].deserialize(**obj[k]) 114 | 115 | return obj 116 | 117 | 118 | # Write a message as a line of JSON to the given stream. 119 | def encode(stream: io.BufferedIOBase, message): 120 | data = MessageEncoder().encode(message).encode("utf8") 121 | 122 | stream.write(data) 123 | stream.write(b"\n") 124 | 125 | 126 | # A generator that produces messages by reading lines containing JSON from 127 | # the given stream.
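# Each line is parsed with message_object_hook, so known message types become the classes above and any unrecognized object is returned as a plain dict.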
128 | def decode(stream: io.BufferedIOBase): 129 | for line in stream.readlines(): 130 | segment = line.decode("utf8") 131 | 132 | yield json.loads(segment, object_hook=message_object_hook) 133 | -------------------------------------------------------------------------------- /misc/release_digests/README.md: -------------------------------------------------------------------------------- 1 | # Release digests 2 | 3 | Hash digests and signatures of published releases. 4 | 5 | These files are provided for those who want to verify the downloads. 6 | 7 | Digests are in hexadecimal. 8 | 9 | These file are intended to be updated by the publisher and to be authenticated using signed git commits. The xtask crate can be used to generate the contents. 10 | 11 | PGP and Minisign files are not uploaded because GitHub's interface shows all uploaded files in a single list which can confuse users with too many files to choose from. -------------------------------------------------------------------------------- /misc/release_digests/keys.toml: -------------------------------------------------------------------------------- 1 | [minisign-public-keys] 2 | chfoo = "RWQuQKHwtF7mWxV+/DmYv9NAic64DuxIjr8JDers7Aru4WJSfGZPiLqx" 3 | -------------------------------------------------------------------------------- /misc/release_digests/v0.1.0.toml: -------------------------------------------------------------------------------- 1 | [files] 2 | 3 | [files."warcat-0.1.0-linux-aarch64.tar.gz"] 4 | sha256 = "0ebe17af4a55b7a1c958bf0bb1bf64c9ca0ff7d69a4261ae72ccd84da5e30295" 5 | sha512 = "0cdcfd077dd11d1c3b12980bf1c2ec4eb31df1c087b752e70662f7fd836a43f80e419cc14a3628af184e19ba362121844052e9fe8b9e00157aa73e632e715bbb" 6 | blake2b = "80e1c0246ec39231f0c900ee30804fc81733e5002db584751e885bf35608faedc906cda0ff7c999f697cd9849debf3aeef0fc8fbb8990a5eb4c79b6d8fe01437" 7 | blake3 = "a1f540491b4d3bd8856f2fc2ebdb1926d4992c988364a67f82b1a250b98f9efb" 8 | minisign = """ 9 | untrusted comment: signature from rsign secret key 10 | RUQuQKHwtF7mW+bXhuP7T2wHqfu0i895qd5u4NeNfZ4tdfnD3dqbxqE4/lRGVCNsgM6GYfyzGA8w+qsw/TGQh7vq/F4GkgHtUw0= 11 | trusted comment: timestamp:1728978389 12 | EzdOPEWcIvMctC4VDHhjJKT4wx/qvtMpSDZDpVNgBSIvemf8qkdnbQx7reKLXuOlGyP5iJFoTOXlYh+rJK7bAw== 13 | """ 14 | 15 | [files."warcat-0.1.0-linux-x86_64.tar.gz"] 16 | sha256 = "846d6309ecfa1e03eb59edcece79bfe76c7fcad4e8c07c461843c90a09d38cd4" 17 | sha512 = "7cfa5c0a27f9b940fc6a43ec87c197861d30b79dbfad72fc6867ceda48d4ad83cc8f950c7310aa8c77d0bb01e95139e7a0abb3fb46ccf083e10902d2a0ac1418" 18 | blake2b = "fe426de7c158e3557a09811bed2dd3e28f0fbba8f8d7b5b40fb958cc6411c15fb5189f61b59c59edac60cddec6f01d9c9523d725a8af425f29dd87ca8e706a94" 19 | blake3 = "bc11a7ec992318809a611318a46694e2ca2efb417a4bb5525c6c896b872ccb69" 20 | minisign = """ 21 | untrusted comment: signature from rsign secret key 22 | RUQuQKHwtF7mW/+IjDs6Eqiq6xLLyvxyn3jW1+iDEjR0EF3qj4c6FkSb9ahJOitzu0jIn9+mRIF6EFB50IO5OJHwJdc09ppTcwg= 23 | trusted comment: timestamp:1728978389 24 | gpIZdh2hydI4yYkHfqhfl6bo1B3x/lASihJS1jR+YavyvMOnFQkariewHHl22PzXQHzzeJqSHJ1VLgCYog9wCA== 25 | """ 26 | 27 | [files."warcat-0.1.0-macos-aarch64.tgz"] 28 | sha256 = "66399651b2a6327262352dc22ee45da57f6a3f9d4d2ea24f24884313e5c32f49" 29 | sha512 = "910d292234d39c360f57c437f62bb6141900cd99fc5aba8a562a58406f5443fe91523252639a1be5b1e645276fcc606efb48b147d5fc9620ab0eb852bb5275f2" 30 | blake2b = "00cb723c7c7841521f0bd671fa52d4c7cd7f79bbfd91ac4d73dec791bfb4ad79aad2352df2bae50028997ee52c6dbdc650f3266ed388b1dbccb3a0d90631d33c" 31 | blake3 = 
"3a47793f2650dffbf91dd0bef1c18f9c41c619721772a78e12b6a660efd86c72" 32 | minisign = """ 33 | untrusted comment: signature from rsign secret key 34 | RUQuQKHwtF7mWxK4MMuvLqDiiZ9guft39n9JC8q8bI6Fh/U+xn1MgbXWXjAPIZIkYOzr90tO/4wBPEsl4VtRHVd6obBn8MNpTgw= 35 | trusted comment: timestamp:1728978389 36 | d/44ZG9Masie+f/WqU1AyBQtNnwqNpbOQtSuZ3ZMLB0/2liWJNNWk2V6tT0vq/TwvIVfS1pLJ9Uio1btW8KbCQ== 37 | """ 38 | 39 | [files."warcat-0.1.0-macos-x86_64.tgz"] 40 | sha256 = "596410f7afabe41b9e43f0e335d10642c95ba2af437153ce5a0eeddc582c990c" 41 | sha512 = "ba906154bfd23e4904c73cc7752f4978a06315141dc6bd3ee2a781f9057c4e765245870c49c03f7fd749ea1c1588f63090e15f1664c4a09dd127afe452304efa" 42 | blake2b = "9de64ef6c99d1d28e7dfb24029f73c19c87a2562db0c21af8b28c92a91026c634efc1286c6f6f78a53f55f6b9fa5524adc19dfd51db86a7f6020ef359eb0649b" 43 | blake3 = "e52d4c56a97e66074832c7850e6545ed7378779d6940cc446cf43d5aca1d3b22" 44 | minisign = """ 45 | untrusted comment: signature from rsign secret key 46 | RUQuQKHwtF7mW6KCRqUj6KtchRhudKpYLdM5dOCVJfDut/iF6SuPAdCArEjqUS/cxstS2vqadQZ2ls6dHxr3O+/D8brVBWt6dwA= 47 | trusted comment: timestamp:1728978389 48 | lMd7kMueZozMix9jpI13dY6JDf2LbUKwUZwssUG5588EPHo3r3aGxQzmQbfg5/nF4jgOCZDPw1j3SEA5gP99Dg== 49 | """ 50 | 51 | [files."warcat-0.1.0-windows-aarch64.zip"] 52 | sha256 = "9d14610de3f03a3cb67d770702906d77d9577921b0cc56b64c0e861b860366bc" 53 | sha512 = "eae838140b109f19d3208fd6741b23d1cf43d0706ef100df83874c834ddcd4b48041968a7d007ee98a472d493053319d01c784f18e80031e3277bc66c47e39fd" 54 | blake2b = "77981f4d67d4aff0c5c21cb9625eab730ce1f566ad252013b3d65905abf8f8476d2ea86a2b966ef9b49430128943525d5495686bea307f9f4cf48b8876b2d3c6" 55 | blake3 = "3200b094fccfed461af5442bbb558dec497ab0c654abaa794a029d16985a3752" 56 | minisign = """ 57 | untrusted comment: signature from rsign secret key 58 | RUQuQKHwtF7mW4i4DEj8V+SHjKnFyll9RSsLf6dOcrWvs+d0WYL6lCX+hnFMMTjg07bvllLhWZxFyQJRGYNvz02x+Dul7V8inww= 59 | trusted comment: timestamp:1728978389 60 | hYIJgW2H0BP8RTACNAT8pMh/Xqka3VqgjtrM44WS2BOhcQDGu+hnXHf8WZeCc1ecechaOG4oki20QlRxtTaSDg== 61 | """ 62 | 63 | [files."warcat-0.1.0-windows-x86_64.zip"] 64 | sha256 = "cc5096df315f6c6d98503a63685b1e45ff4395412c4cb894966dd396cbef86ef" 65 | sha512 = "856409ea4b399298094a2181e863bdf2e6112989c6f6939a18e11ce5e7b5bd19ee3124c287e4b5bb799c061d475a8971b587bc40f9295dc2eebd3aecc2622dbf" 66 | blake2b = "9503df06ac18930f9a3938c915e953880b1dcb422ad0e2757cce5da99c2d0b4dafa522f36f17ca49b18f7ab77d5628345d2c5a6ebb4d5d09890b4a37cbb87284" 67 | blake3 = "206931c5d99506ff6355e7d1ed78101163b8fdc8f28b411d19d8868bf1a11ab2" 68 | minisign = """ 69 | untrusted comment: signature from rsign secret key 70 | RUQuQKHwtF7mW63+qSubLSfrejtgNII+7UIqAriTat2e8TiR+is3/gBkW+DUF2lm1U7nKtgMtCNZdr14fAmS8GbU9vYWDd9Qcgg= 71 | trusted comment: timestamp:1728978389 72 | 1iYQVwNe46HsY7zZCUySdumWQX8Rg6fFCndT7keEGN/spMEl6QQEMBU3JOT+T+EBqg7AWqpCZOneEc95lzU9Cg== 73 | """ 74 | -------------------------------------------------------------------------------- /misc/release_digests/v0.2.0.toml: -------------------------------------------------------------------------------- 1 | [files] 2 | 3 | [files."warcat-0.2.0-linux-aarch64.tar.gz"] 4 | sha256 = "2a51fd906c0e6fa336413c9c28fef0ba4a3f7da58ac8e9c5124ea4c768109452" 5 | sha512 = "4bfba495a56de8e0501c57e012e4db7c921ce144f839e1ce2d56b805a466b69192d33d7c7583d40852ef99cca3d09cac1c32ab3652bc9bb079797baf7e81d06b" 6 | blake2b = "f3e6c7aee6d771e95e0c96e24171de142bfbe3d3001334e08859f21fccbd79b260f417b30f77c7d31cc8f96e9f5506ebeaeb4b8d03831320d7dff1a945705f51" 7 | blake3 = 
"9a2f3e13a57e0794812047782386d96838841b5cd276a7bc405899e9c5b02725" 8 | minisign = """ 9 | untrusted comment: signature from rsign secret key 10 | RUQuQKHwtF7mW29ZDrTDN7kXMS6lfOtOxyEtDKkLNdyApSTxpK+rJkMiuJs+aOX/DAI2NSPsD+ToUo69VsJs2+x+rGBXpAj76ws= 11 | trusted comment: timestamp:1728978389 12 | K4nDuKcks6FcXknnCRXPfLIrZfAxQsY31ls6BZjzcQs5Yn3DZ7UvG27b5tj0rkWH2kacWDOzvKrDBYyhaNf3AQ== 13 | """ 14 | 15 | [files."warcat-0.2.0-linux-x86_64.tar.gz"] 16 | sha256 = "b85d093e2dbce2b143923d6b9204165235eeea2219296b3f6763e512a43f20d2" 17 | sha512 = "0db021d416b0f38d362c2015d5905418574eb352bd35b29c2b93e6788767f2e82c4d380c91934af8b3a1042d574799c18ea15ba207768814200ceb2399c9b0f1" 18 | blake2b = "d66cb71aac57126995653fe195dba857236c46319c51a16cdea30d14abff570316da0df2e07da5966e81b027b8e0041e25378275f12e7eee4470c55442c11c7e" 19 | blake3 = "9b902f728c190566f0ee489ae1794a2f15a2e99d451888cd8c4ab9dba59de607" 20 | minisign = """ 21 | untrusted comment: signature from rsign secret key 22 | RUQuQKHwtF7mW6+t2rQMqgsWIfx9Y4nfD3sbPeNNJkdFkWs6OU8Jpno386ZGxZFDta6ToViozgk96MUjjKyilIf2EFhZyQqFFwI= 23 | trusted comment: timestamp:1728978389 24 | cAbX8ogfJogclVWSk4XYE30Y++k4WXAtJbsO1Ak1Vj1iFcWZbq9T3a8Cc3wCi9aNYvJaR+CvsFD4KBiKyHRqBg== 25 | """ 26 | 27 | [files."warcat-0.2.0-macos-aarch64.tgz"] 28 | sha256 = "7108a64406ca9949e137ecbae3986fd37e860c9d5c5c526085c334f08c4453aa" 29 | sha512 = "b2a7171208d39a634978971f3cbbfddc092b0f487ab044d023e52035968921b1032afea134323b11c284bb1a9861166fb6acbc17d778709619d54d29b0976950" 30 | blake2b = "86a5c8d45ebbfb9d91122331ecf16c39a898c0fb0a33985013b57b2f57e811275a5625bf71b77c28af401591d1339900897fd40d7f453b39cde74ef678ad7705" 31 | blake3 = "72616bd00e24f5b00981860e94904e10ccf64a0128e4c15fcb41f41e8ba8152d" 32 | minisign = """ 33 | untrusted comment: signature from rsign secret key 34 | RUQuQKHwtF7mW3oeI6HIKz19fogsqpYZmsx3N6xrNJkhNfVkmoTtYfljEC9SbaS9jfQwZlNFwBNZCk4VYuzpJH1NH1el1xm64AU= 35 | trusted comment: timestamp:1728978389 36 | jgoOLYPm/j4Of1WEGSscWZeGY1hD4l+I0h9FRmhkvDmXZW0ohbyWW6D+GHH6YBnF7M7tjPPq1HwRyLOL73JPDA== 37 | """ 38 | 39 | [files."warcat-0.2.0-macos-x86_64.tgz"] 40 | sha256 = "9c406ba6eec2236b027a94508dbce7d939e65fa6d875f1e029c5e5b86596cf45" 41 | sha512 = "db6c2e4fa96c7b5a39a73494abf6475d1de05c14a8ed724fb308c8143ef920e34f63b6b2599e54e7bb0bf473367a6c86728849f391c90db699bc03b8c2ce6f3a" 42 | blake2b = "15c8a1afa6f01080d6205a5051e46afd5c79cba75a6dc8d58aaf188198d6f8faa75f1d44a5eedbc54e2c590505b90341ba9144bed245280a44ee91e8e7d2fdab" 43 | blake3 = "58251a94f9a271e6d67acc992057c61cf1bbfdacc7114b73f7eb17b9dc8498a7" 44 | minisign = """ 45 | untrusted comment: signature from rsign secret key 46 | RUQuQKHwtF7mW84VqjJwHHdbkR5E6gFlRLN451oeHVmLM6ztIJYEWpCrBAEWKZtOHhxqXyviJcPEVm+6f+yD1segt7PUy3/eEAc= 47 | trusted comment: timestamp:1728978390 48 | g5bsEHQ6/jUJITQjDh+xXdAN/tYSdpvdO2iw295bGpmKOuNa/LGTC3ZHu3mtD5Kt4YgRutbRuZxUfBs7uYh5BA== 49 | """ 50 | 51 | [files."warcat-0.2.0-windows-aarch64.zip"] 52 | sha256 = "34874c0cbf10fd55d056757f24d0e869c36c23411e7c405384b5f431af6b9237" 53 | sha512 = "4625debf653147c9959bf0941976191dde3a42610a936482f7dddda06d82b0b3bdebb45ae2ccea141a77b0c08f619bc212a9065359c0a49f6d4c9a0401cdd92d" 54 | blake2b = "bcca9e0f6d461bdb2c6d12b77d6fc4e58d3b568ed14f8572c4886e55f53d8fe1108f5411c6120113fe18e1c24285fab24bde3ea7f160f0ceeca4e24d6dc3adf7" 55 | blake3 = "c4d25b8613fe43f327755111dfad4499ace3fa7e35a89d714ef48005fdc3114b" 56 | minisign = """ 57 | untrusted comment: signature from rsign secret key 58 | 
RUQuQKHwtF7mW+jGbbjGXQumuC+jOplUJVE8knjlrOwbgcR8q9PTcVfKdep1Q6srtm/c+/jsL9TqdDyUUNqi92VpoVvEALIctwI= 59 | trusted comment: timestamp:1728978390 60 | tSggzsS8js0vtgosl2uON77MAyjNH56twBPHu8wcdO5dkMoIAGHl5Co2/H/L0OtyEhZn/7r/b2Xa4+72CC6RBw== 61 | """ 62 | 63 | [files."warcat-0.2.0-windows-x86_64.zip"] 64 | sha256 = "c252580a57da42641154da3131110aabd3c79462d8499f0bbb4407af067c1861" 65 | sha512 = "c356afdf787bbe6b8fea1c1d6f947246c120ceb4677bfb1e2c5ec3113b09d46ef2f81436980c9e41f150a33597f63d9c5d5ea030de311fa5e9b9e0149fd37567" 66 | blake2b = "6b9df7ec2a4ccae4e4c32196ffbbc73ffcd45b920cee08b29950438a4979b0563367c5ef3e15fc7fc5d9c942af2b19fc40706ae58b405bdc21af9ecfa53f221a" 67 | blake3 = "ebebe4cc439cd7b5108141166beeb9b60f43515785cf4ac97662a4c64658308e" 68 | minisign = """ 69 | untrusted comment: signature from rsign secret key 70 | RUQuQKHwtF7mW7qI255kp3l9RUtgZVij+hrAbE8cHBa0sMI2CqUvyNfOZjEnaGUPfl8mP3bFnQvBgxY2GP/ysdDAp5f7SvcwCQo= 71 | trusted comment: timestamp:1728978390 72 | Onrfl4wIUJox3iAC/MTf274MQwCQcvncaOKguGJD6yQJoDb55G1jQ/cODWkTtki3x/VjTAov18hoTZDiOl/aDQ== 73 | """ -------------------------------------------------------------------------------- /misc/release_digests/v0.3.0.toml: -------------------------------------------------------------------------------- 1 | [files] 2 | 3 | [files."warcat-0.3.0-linux-aarch64.tar.gz"] 4 | sha256 = "5a061616dafef0df239e43ad3491977b628febb99e44066c2ad6b0a9cd92e42d" 5 | sha512 = "5919377ef028d5d22a5a97f39bc51e744a81406f46b0645e487da0e8ee327f64c0e8630c83b0dd450c27c264649af40d7f7b8605fe0b1a1f44e3b92a6a77f5cc" 6 | blake2b = "dd3c2c4a386266a53a0e7a147f09243baa5c6786a8321e6bb2af98cc9a850f57438af704d7a4ccccaaadb206f0bb9dc37bc2aa710e94a76bcd1d6bd2a9e5a3b8" 7 | blake3 = "a34c3d419b72b7fd75cce5f8a1c6b99cca56eb88e4d68d4d1ccf7e3b8d15930b" 8 | minisign = """ 9 | untrusted comment: signature from rsign secret key 10 | RUQuQKHwtF7mW2glP/mfzY1+yqGzqIB7Y2DHkGvz4R/5YBXaia4co6/53I66bwHHqVGvRfVzSwsBn8L481+zzpg76fYIb2RAgg4= 11 | trusted comment: timestamp:1729508179 12 | XV7DRudNouqe38es3cSMbc1DB2VN0yJyuUImfkoq7nfYsRo+2KcOv0BAEFhCBdjUYExbeG35FFaQxj3e4bWwDQ== 13 | """ 14 | 15 | [files."warcat-0.3.0-linux-x86_64.tar.gz"] 16 | sha256 = "36b2bfe4fe633f81499fb7bd33ad78f155852bc062bc5c4253d724b42670f4fc" 17 | sha512 = "8a4a577cca44a7e6085a1df5d2b4bb2cede0e35cc4ac9a336077aa21bb69944f6b8f2864625d7dd2d871268ed12e60f5c4d329ba4acf4360d01152f02371b28d" 18 | blake2b = "1b70ce138eb72c218ddfce7d092b00d478392af4d764a1e829b209bc65b2fe393d0eede5a70008f19428d4dd4f6d4c5f0ae31df1c535fcb065255a94f98c5cde" 19 | blake3 = "94ff9651028d0c16f2007ce8e99e3decef6d0e37dd1559e2db8ff563761714cc" 20 | minisign = """ 21 | untrusted comment: signature from rsign secret key 22 | RUQuQKHwtF7mWyNjnkNrJiXja0r5B8EZJ+heYwWe1Gy4++UmZT0LdsngzLIP0P9x1o4i4ag0vvp40x8z8QQu85yc4va4iTWxygY= 23 | trusted comment: timestamp:1729508179 24 | haeqbaf44hBuIoBXiJ0L1/Nq4l/GMJBOdm7W8WvDBDJBHLDE+KKYRp3PmbUcVOZLz5/+Yr7lLeIoZzBL53PmCQ== 25 | """ 26 | 27 | [files."warcat-0.3.0-macos-aarch64.tgz"] 28 | sha256 = "f4af75a06c7b69512322769d95e7494a332ee0d566f204736c2d647d79fd8ebc" 29 | sha512 = "1b47bb338c58afe34f2bdfad6aae8d037a582aeea7d2c844d3e886f617f12a1eeb4938a5ad2833e99a2b8150a00041b209015585a04ed14f9b9950ebd8e8e04a" 30 | blake2b = "781e7180b2dc2020e101ce01a7e90149f928ec4672a8ac53f35fa5395a10b8d704ee5c389c0a8d08818dd591031e5097b434dd930f07d120281de0405877111f" 31 | blake3 = "0924084bb9f4644c78db377d69c2a4fa7e8d4c54e00a894d86eefa7b30086901" 32 | minisign = """ 33 | untrusted comment: signature from rsign secret key 34 | 
RUQuQKHwtF7mW7tMZgq1+NCPhnqab83FH1mWz/WuVwghKEF3SpPilmTQGqCKV8WkGN6Cg60f1uGKFmlxXD046ipuHXxse6wMKQs= 35 | trusted comment: timestamp:1729508179 36 | Vp8d/VcJhingkgc0/djgmg2gFsGvTUduk0N1bTj7N73vrpzrbcmVG5aXTjm9VcQ8zUOpc0Vk2QDRe+Ww23+JCA== 37 | """ 38 | 39 | [files."warcat-0.3.0-macos-x86_64.tgz"] 40 | sha256 = "a15d30f4cbdd2394a1218d2692008077cd5754d6040b2adf8081311770ec9800" 41 | sha512 = "fc1125bab186bf8cacc7acdb0f31d69e1aa6bd92b2aeae5d9210cd1b5a9a613997c119d62d39330d12b3bf49a74082e533673bc0aa198520a370d4551ba85792" 42 | blake2b = "17a33aec8132c7c9c32cc5988d883b797daec0795cc6845e9c96ca7331cc41e383040d0125baffc6d4f3a80d1f300f577f787dc83232056a327dba63935d7fb3" 43 | blake3 = "754aa921560f8b6d38b5f6def654ea32e4c353ddc6265183a283f28365b3fc2f" 44 | minisign = """ 45 | untrusted comment: signature from rsign secret key 46 | RUQuQKHwtF7mW450mR2d1ariE5pq3OjeVHDCUfNEhNHGi9PlEmEnkAwNI/H8bp8jEUAzID0HjG9tJiMwHrhD2IbSjvMUq/VEuwo= 47 | trusted comment: timestamp:1729508179 48 | cIRQh5nVXse9ROtdr91x5Ti907P4/0jGW49gKqy2qzO3EQmL/AH5+KxjwDW62B2QyijlFeYS/YcqD5xWO9mRAQ== 49 | """ 50 | 51 | [files."warcat-0.3.0-windows-aarch64.zip"] 52 | sha256 = "90874e7fef402150386f6fe068cd1412031e6a6b9bbe65b264ba763870e36dad" 53 | sha512 = "22aeca4a8a2040c6277d2148719fc95943248eef315e8a32a989c6a3f01b5e4dd6ecd07c5eebd7dc3a6325e75c2082ce68d48dd82c781eebdac05eb113e03fa6" 54 | blake2b = "43f12064eb728d7ff628fdec5948a81b2c31fcf4203ba0ac9b0f03596e90ff1463519e63ae49ccc03516da7227296b7e0e82793b118ed41b2a1d4daed7f1331f" 55 | blake3 = "83d1c2c9cdfdf40b5eccd8b05cc8a2a89bed6cf94a759cb05f9fa4ff7f12aa69" 56 | minisign = """ 57 | untrusted comment: signature from rsign secret key 58 | RUQuQKHwtF7mW9GGFWkc1EiPU2ANVwZiUwGd8LY2t0qmmdsFR1RMBmcgi4OYwoPb+c5gcNXXc94YS1+0/28hG2rHQnZgwCv3ygQ= 59 | trusted comment: timestamp:1729508179 60 | KPrCjs0/iGkXvkGssn+kE1dHs856xMxOQNmiWJjMIt0ZLcbiA1oGP5LKUPhCXpSvIlAP04MtozM7smDBXRj3AA== 61 | """ 62 | 63 | [files."warcat-0.3.0-windows-x86_64.zip"] 64 | sha256 = "caa918d8f5e34ba391c18a2b66251c11c41002585e3656b8d3e2a12d0a707284" 65 | sha512 = "7a485a55dad0f08bbf297ea2815894d0127749e6e6142ed63616868ff72c2db4b7d3105faef60c675fa04cc339f6b577dfc876efdb7ce7ee434978de1a2c4b88" 66 | blake2b = "e021289784d23170ebe38a81009c5233c25aff9c033451bbf3d14456f4c5f59752f248a7eb36756073fddab25d51ddb5eb25eee35fb810b2d464d273b9598a97" 67 | blake3 = "ede997240902ce46718992d7e67e55e82aec914b06cbcf3545cf4ff30e6fb8c3" 68 | minisign = """ 69 | untrusted comment: signature from rsign secret key 70 | RUQuQKHwtF7mW0RLXxdrRIh+kzv7RBhJTo+rlik2gzZotkEbeJ6VyhtbJwQNN9MJZ2W6wwkwIjPShAgmWFHmfALU/BSIeSp+tgI= 71 | trusted comment: timestamp:1729508179 72 | 6zUwlz/SgmCDtnNBywJmQNP9BGsle3sdTUze/m+eWFEGh0Hdbruz/HzpjIh2eBriIZ2ND5OVp06yr1AYfY72Cw== 73 | """ 74 | -------------------------------------------------------------------------------- /misc/release_digests/v0.3.1.toml: -------------------------------------------------------------------------------- 1 | [files] 2 | 3 | [files."warcat-0.3.1-linux-aarch64.tar.gz"] 4 | sha256 = "db7a498f3b5461635cdccbcd3795265d6fedb2f3c983238ae06f3f87f56197bb" 5 | sha512 = "3e99f7f990d15893097adee12d76125531ca588bc2c7ca5c66888a0110b79f6fa200fa56acd2c32873a57ff70f6d2220c41e64652fedf054db3a21f79bab185a" 6 | blake2b = "4839a4e79ac340ffbf0b8b35cac96a81f50d0aa8e965f9c4fda313f5a1e5e6792717fe77062006c4ec397dbc7c72de4812b8c2463789d5137faf1a36d76731dd" 7 | blake3 = "af2c260c6cc53a6fcfcb3576638fb64f07ab9f3703483f99161cd585dd57d731" 8 | minisign = """ 9 | untrusted comment: signature from rsign secret key 10 | 
RUQuQKHwtF7mWzZVBWQl1OOrKdlYTA7gB67II9mvwGwRmU1EEqF6749uGehC1mrW48VTGmug0us0pEA7Do8QKKUHwN4Y/wDEpQk= 11 | trusted comment: timestamp:1729611596 12 | Z8sBZmCHU7xQPdZx/rNkOIxFJ4tKpBxXaXJF6yiQbDp36BBeHOy06AXIWIr+mDpb7o2icqT0nHk61aqGE1+jBA== 13 | """ 14 | 15 | [files."warcat-0.3.1-linux-x86_64.tar.gz"] 16 | sha256 = "69bf2e09b86315ddbbc9fc5873584e31b9a7d98c5238ea53b9a753a741722898" 17 | sha512 = "834d03b8f7db79bba997ad02e4aea2472f4676aff2ad683847bf80f7f1f45ebfb9d0aa08c437bb0a98fe149c0d7e342e6b6edb0ba356bc75e62f6f0dce707e4b" 18 | blake2b = "1e6e70c3b98a5460301f0269c8d91b1bca98b6bd6bfdce6fd1973e4413778f18e8c4a2db272e26327a6b80b383fa54d8a33b07a496988e654d59c3c02e95bbfd" 19 | blake3 = "24c4643c08f23e60e9a298e6dcb05707c53f62019e4b0eb146c07accf4926fa7" 20 | minisign = """ 21 | untrusted comment: signature from rsign secret key 22 | RUQuQKHwtF7mWwnCvGPbYJ0BZHfi9kiQBz3Vp709+0BJrknH4qVD1BXT8rdEOHRLyUS8oPcELlDzXpYkomBxr2sSo73AS1J9PAg= 23 | trusted comment: timestamp:1729611596 24 | O6CWwlSUolDNkDqW3YNH1qjyRXh3wX6etbA6mLawusaRFYUix5feQ+xUFX98CPhiQp79nJs5sU9jioOFDqanBQ== 25 | """ 26 | 27 | [files."warcat-0.3.1-macos-aarch64.tgz"] 28 | sha256 = "a51fe97bcbd01ac7354b47296d2acd9c382913699275e9a7238ddf9c1a17baf2" 29 | sha512 = "03f616f16733a0c353c42f17cacc03b167bacffdf1f32f19791131214f674c15bb8bfc8a2d9c103c0f19b1f1788eff184b0413182bb250101fd7bc1597f3314f" 30 | blake2b = "1b7a52d0cd2249c18838884b64c6ee9fb855efcd5d1effebc03abad66f12afb043a970406d9fcd90aa1d23dc00622a85a60e6e8a194d98df76e84610e1335349" 31 | blake3 = "64fb8498b10be5b2bc2bba8227c0a1973227a9a75ce78b5c0641036eb7d5ffa5" 32 | minisign = """ 33 | untrusted comment: signature from rsign secret key 34 | RUQuQKHwtF7mW18BMOk1Fs8+SvIPFcivrGBURRQZtD74np8nsV0SME/jie+Pov3VCxT96qE4tsCn5A2TDvVyCrxKoJMBubHBHw8= 35 | trusted comment: timestamp:1729611596 36 | JS8RGbYhCvR0ZEYRqUImfwbZOgrwH7u9rX4eqDAdOXdPMbQn7CzkAgfVELx6V5VbY/8UhLqx+FYHYGcYGoBFCQ== 37 | """ 38 | 39 | [files."warcat-0.3.1-macos-x86_64.tgz"] 40 | sha256 = "16876e9fff07b11a22571c84e61e7baee33ee99d7754e4750e49c65d5aba5ed1" 41 | sha512 = "0f233dd84c4171b1e6c380bb0b9c0e620d0d088857e5c55b0df9398e0e9f2104a5e3fffde5025e0ffca598574f2a8c8e023a5bc120b4283262b25b1728d79345" 42 | blake2b = "9b7c30e32f6f3383d919963db2d8475cadd5d035bfc266b83b7a19a2be6524b22ad5b01d61745063e7c82ac24bcfd954db6ea0e3e4889c0ce1144a2b568f9e69" 43 | blake3 = "8001d0074b47d6e8ebd91835236d0ff9ad747097db37dcf96fdff3f0142b2dee" 44 | minisign = """ 45 | untrusted comment: signature from rsign secret key 46 | RUQuQKHwtF7mWx2g3o9n/EajWgIoC5Fzaza0AuTMjEEE3sIjdY7E43jpOZAcsdbHr3UGn8yEjQ5dVYWivcndaRuuTOW8aPeRZQg= 47 | trusted comment: timestamp:1729611597 48 | zPSPGcF8EZF7da93HoJJnqgnhNy2O4pWa3cgXu39CPSqGwmPR7XQS1XVGo5DMUBfP7ZHLRquIZx1Vo0W4GaLAA== 49 | """ 50 | 51 | [files."warcat-0.3.1-windows-aarch64.zip"] 52 | sha256 = "07dd870555eea8729e0fda030920cedf8319c0b10b50996f09d2fdddca284265" 53 | sha512 = "8c770f83696a78780fab9bacb43dfa7450a24ca49229c0f911bd4ad9ab77f19716f400f4421d51e9b95c8e22f762bafdecca2494a0eec2b7ed62125ae0242fda" 54 | blake2b = "a29402e987f6f5e43f35b76ef3847288a9ac585050f6a40f29ffe559a749b237f5909d680d0e5f3031ef29fa368fa698c7f2fd69f731d253bc145b55a64bb275" 55 | blake3 = "5fd2e69589836a883bda842c14323e517823ad724f77fd87739ee127f8834ccd" 56 | minisign = """ 57 | untrusted comment: signature from rsign secret key 58 | RUQuQKHwtF7mW5/4HT0kLpr8uLMXE3EGWTwgwg0I+ZIW9TmTAIBuX8A0MJ/zULP9fAh6PbczBXpj1XboieJeqqCqRD6G5L27SgM= 59 | trusted comment: timestamp:1729611597 60 | 
6SnhbKWz1YtYsHcQuEdzZI/Ed5Mqx1vuBjV+DbL/+JCB9NAbha/U50wwdXSEmwJzxX7QXxzBOOaaeAG9drONDw== 61 | """ 62 | 63 | [files."warcat-0.3.1-windows-x86_64.zip"] 64 | sha256 = "a3e8919292e23761565685dcccdd65b5a7c9a564593e21cc1f2e10ab6c5c8931" 65 | sha512 = "dbdf820d6155c9d8b15829a86aa9e3870e4c2fa57d7b9df08acfa5feb46914a9853bcd78bfb189c10373c4b624ba301af52b89573813e25517701047287885c7" 66 | blake2b = "2fc21d8f8fb4d7096d9c9cddd1860f2adb53a2cb9eb1d48c4d19719b33b8e8d39c09a32b130ced07f06cfd8f5dd6fd6590d95276bd54c14383faa7e3f3397774" 67 | blake3 = "2d405777001aeea60acee3f4da0db886cb4f4432a0649e5e8b649e610d846d1a" 68 | minisign = """ 69 | untrusted comment: signature from rsign secret key 70 | RUQuQKHwtF7mW0fyddxLCdU/iQ+G04Bk9Fa2arbnT304Xi1NLnoTzsIrfCWs12pHsn5HB3bLddt4edsRpV/30apsnvn2ezaKGQ4= 71 | trusted comment: timestamp:1729611597 72 | /YfdCn/V2jqbTSQu09znhgF5T1CuRTcTs+h5p3Mark2lVQokebEICb+mv/Pe/2xSLwRlRoaa/4O4gFqgUWNcCA== 73 | """ -------------------------------------------------------------------------------- /misc/release_digests/v0.3.2.toml: -------------------------------------------------------------------------------- 1 | [files] 2 | 3 | [files."warcat-0.3.2-linux-aarch64.tar.gz"] 4 | sha256 = "2c4c8b7e84bf60cc5b21f01ab4d2e575e5b246989ddcce77b1cc94215a1e20e6" 5 | sha512 = "c88a93cd43af142f9af47ee399d70a78e9bcad1d2d64b71379adb2a8a9b4c4ed231b7df13b19a6591cda4df5ee45683fea042e2b342633fa04874fcc57de664f" 6 | blake2b = "28a4a0788d9179678f9283fd98597f4d2c4284e574f11129c785bd5ffc5b9db8a0077ab4ab1ef7df0197801e54885e89ccc6628f655bba27cd7ec61c444b5f47" 7 | blake3 = "7995c640649198c9ae3e549ba21fc038d7e95d08ffbdac46273b786fb3292c0f" 8 | minisign = """ 9 | untrusted comment: signature from rsign secret key 10 | RUQuQKHwtF7mWxNyyySj7+Mq6LP2ZuTZC3FgYeNbjqokuzpda68tJP5Oy8ryDYWv9VHKiFfNwwVzugpxJQsEqRSNYzw0P78XSgc= 11 | trusted comment: timestamp:1731620824 12 | TZ3C3Tdif4Ba9LbG2rRulzgkawXn4ZeF6p6wnxJNqpxvIpQ/RexQsmEpqQibcFHG8rebVSbNm3y9hevS0+WHBg== 13 | """ 14 | 15 | [files."warcat-0.3.2-linux-x86_64.tar.gz"] 16 | sha256 = "2543a8789dacd8ec5670146edab1682fa530f3c643e617ba6dc9200a056fc8b3" 17 | sha512 = "d52f8eba7d8867519b63ed1edc8469fb4c8fcce40c0188c3b789e5e5fdf5c8032a3864a2b9e9f57e358d8971f61105da506588f09c87f809bdb9bb09aa0dce6d" 18 | blake2b = "9340154e85e40a8abe66dca1e42fa9b72a4ff3d22d5304bafcf5d24848d8fb6dab670dbb02947b02379bc32fd7241a9c8c14b465ee046c46276dfa034c2d1306" 19 | blake3 = "39095b5bf23bef0bac3e87fca6f43210e4891e23a95fe6c6335422b8cab9effb" 20 | minisign = """ 21 | untrusted comment: signature from rsign secret key 22 | RUQuQKHwtF7mW1qFZ8sJkVXGtFrKH2Fokha1EPAxGpTfXdftysmJ/EkCcr6l+HETqxytgszRAAxWKd/ylaSzhKmfj5SuTEl5HAc= 23 | trusted comment: timestamp:1731620824 24 | 9vHqEh5ZmcAgEWUXgpnJ/FTvft9fdY1stzQBxYps3CLPOXY9AkjZXXAJK0xQsgNuci5QD5DTNGPFhXtSvLYFCQ== 25 | """ 26 | 27 | [files."warcat-0.3.2-macos-aarch64.tgz"] 28 | sha256 = "6ed9822215a289ed3056dcd57049937b667bd77ee5ae772f566bdb66ecc42258" 29 | sha512 = "8da71adfddeaefd539305becd1d7cf01290cdd00974f4385c336b4299030fbe5e919ac25c0349c34c61e9af19508a1d9536ff0128db06f1ceb58f5e678e5aae2" 30 | blake2b = "036a68788cfc524cb2a32528e2bdc78197af71fbe98cb5e00b002131981b976d09a2d0c30963fd43ee1ef742f05b2da51d94ccac167c4d54ed9a2b718b350675" 31 | blake3 = "2632fc21455140de891c9f514ba3674d6cda382add1bf5875d556e5234d16f5c" 32 | minisign = """ 33 | untrusted comment: signature from rsign secret key 34 | RUQuQKHwtF7mW+QYr40RvEFjBtkMI1rFJnt7A38M57OR7iVG9/GDsv6eBU2HMMnBFWtv+efmgMtnFr2cmoOhouRbe8w+qgrqOQA= 35 | trusted comment: timestamp:1731620824 36 | 
3VMy/L/+2aH2NP4F317DbCtb0uqRVwRNS1Ues1IqHNTlE2wMQgv3v3H++zZTcxcibUvwAmtLUCHIZkVy+YuIDQ== 37 | """ 38 | 39 | [files."warcat-0.3.2-macos-x86_64.tgz"] 40 | sha256 = "f336bd5cc4ed383e278bd1c1b0ce16db55df1c7a4f57b2bca90ed2ae02cb26e1" 41 | sha512 = "534954200c095769ddeee520106795a6e0901f3272a6672bc34b34f789e38c731dfb5f010b59b49099fc6e26e0babf9c689830d9851b61b22a4d9afa252f2862" 42 | blake2b = "35fbae98443191ef40a1325be6dfb2cac54fc757b664943e259b2cfc14b37747844b44bf238d58332836d5c7ad9374972883e3ec012c568c776d32c9958a90f5" 43 | blake3 = "430a0a1ec11f6d06e5f8487958abbd08f7c2c2b1a5831a02748064a3e25d10d5" 44 | minisign = """ 45 | untrusted comment: signature from rsign secret key 46 | RUQuQKHwtF7mWxRPIJtm1LU6LpHKSU70bV04RGHKP4t+iaUDaq8s2LPseh0LEt1xmbLQFevp2Td7J7/O8w+iWDgDYasDDV4RHwQ= 47 | trusted comment: timestamp:1731620824 48 | VmzK6tVL4m9514wNj/nQM+kCsXVda4JDt2pdv4rU/VGN8VJ7ZTnVInwYM8G2/F11jdDoXykczAM57ZtvaWb2BQ== 49 | """ 50 | 51 | [files."warcat-0.3.2-windows-aarch64.zip"] 52 | sha256 = "e3b86877b8016ce31732797940476c124b5f8caa4cc05ee9ee08dbf5eaa18606" 53 | sha512 = "8b5244afc7f7da9000c735c38c779743215089ca5d862bc96a811346ca9b5f5e8d70ffe44cc7793dc56d6ba34213f7f25bacf8f338c89524c0ae4c74beb91d15" 54 | blake2b = "1bafd9e07ce3470b28ff49d91ea0b2b7b42bb0ad725c62bba195b0270c4bdb88ec33d488882af6f84ffb95c29e8992d7ecd7e64a9b11f433e25d3bbfc305f247" 55 | blake3 = "691ccf6eed798b98b644b185abfee281c3d73ed6da88e908d29f3e03bf440ca4" 56 | minisign = """ 57 | untrusted comment: signature from rsign secret key 58 | RUQuQKHwtF7mW9IWe4OsvpgFAqlfQbgkEW3wq7qm10f2RlaAgBdtzdhH5y5OSWX18h5t1/eFYUvsc7EJkJr+6evHcSbXCSZmGAM= 59 | trusted comment: timestamp:1731620824 60 | or69yv7Uv1iSZGN5ISztpNDM4F+UgljqaGNzOB+cJEyzIDoYWFo1zzEeIKHHubc8RHyzQz032Bz9V2vxthZJBg== 61 | """ 62 | 63 | [files."warcat-0.3.2-windows-x86_64.zip"] 64 | sha256 = "86e8b7bbfe1117dc890c2caca90d63efab41e682f47b9c81eb8a2aadf8fbe62e" 65 | sha512 = "643f95208f3ee476ee0180cdcfd690947500e390c0a0a7f8a33ee103fc5ee5d89090d71d4787b027951d664cb8470dced9912c131cc9b8a22263afeaf55c28b5" 66 | blake2b = "520d566b89669f785e5c49b15b977bb2cfb1acb726952f48e1b4e04729823f2a4a4557b5a9c09964ab065f7e919d8fafefb3669533901cdd08574e682f68e805" 67 | blake3 = "c81cc5136d2362a4c7b2f0fa04134c21d935b36ed0f7ddb0408d6c3ec16ae00e" 68 | minisign = """ 69 | untrusted comment: signature from rsign secret key 70 | RUQuQKHwtF7mW60n6KlQrFdn1b4ue3p7DbA3JoD1Q1c8akvHSmu0RBQ95rOECCUNcTQXcx/2UM1msyR3OwO2FluXKZ5DG9NRzwU= 71 | trusted comment: timestamp:1731620824 72 | 9iQS2QD7+JCy1ODFhaP9gnVtPqq/yB3Ubw0QFlyPU/kEEz2pax8ewJBha4nz+Hma/5jgePvYd9JuIttN1ZM6DQ== 73 | """ 74 | -------------------------------------------------------------------------------- /misc/release_digests/v0.3.3.toml: -------------------------------------------------------------------------------- 1 | [files] 2 | 3 | [files."warcat-0.3.3-linux-aarch64.tar.gz"] 4 | sha256 = "e4bb918467fbb7fc4f4dbeda8faceee76a71bc3b45e516665d3d0b1138fb7974" 5 | sha512 = "4d95f9bee1690a01005758fc1bf3e222817ecbc39d29ab6e3cd54be2763ebb5669b5cbea72236c4bbe74039353643f51e156d6e3e0e45cc174cc67cbd402c809" 6 | blake2b = "b9b3f075b2f045d9cfb7b5f7a0cc7bc25263522190347fdcec9f5661be04eda68144627d1ba938fc9148a19835471eee303b726a88a31679cfbd705665aecb24" 7 | blake3 = "b348d964e743cdcf82f455dac7457ea0b25166f70e352463d1252c32958d50a2" 8 | minisign = """ 9 | untrusted comment: signature from rsign secret key 10 | RUQuQKHwtF7mW42veK9lQEUVufcJ80g6DJ0hKRVGIM9hQ8KDjEaUMgwKGH15xezLGlxEEBlJwN1llhyZr35N09LYB4aaTskhLg0= 11 | trusted comment: timestamp:1748237016 12 | 
g59H2OHzSY5Le0tsvcEFlwz3X8NPDXovxtx19k3Er3Q0/OTUavrgHCNRYokpRPXcgJWMUSq6ajXW/298PT9FCQ== 13 | """ 14 | 15 | [files."warcat-0.3.3-linux-x86_64.tar.gz"] 16 | sha256 = "3005a5becb621e067e832a89c2c43301cd8dfd79549b0503b5370ad801a6cc5a" 17 | sha512 = "41a919b6e2e5f7c310adbaeb6809182eac4fbd424bba5631c11ba3eb5957baa3a8f57e55e05783bda84c9a22097a041079f522dc0beb768e8ddfe3531b6d3306" 18 | blake2b = "b9830c77ad4f66315eae0cc7695de628d10b8a7afc1e7e5eedc384bff8c0f62a7d5929e27877e61a329bab3ed998381efa2193d77fc5bd4df959078b0c5e0ad4" 19 | blake3 = "d0f5d1df6651dc117396f6a4ba784ca14a4e19d581794a0a3b665faf1c9d47ab" 20 | minisign = """ 21 | untrusted comment: signature from rsign secret key 22 | RUQuQKHwtF7mWw1spmlvisGpn3L9Nnohx9jPvIqA+K1iZ+NXyMdNOvU/VrOFhfG3+mR8o/+mxbmKgpcqWUyZxZ4dCDOIx2AZRQ8= 23 | trusted comment: timestamp:1748237016 24 | lSa5nNRCGUuhnvzm1TzwcpriJ9fiJIpJSL01M2h1pHcFONKOjiaanDbLKiE4tWopvVf6fyHDJShOogoh3kaqBQ== 25 | """ 26 | 27 | [files."warcat-0.3.3-macos-aarch64.tgz"] 28 | sha256 = "deeec9cc99284da3fbc5c783e7ad4cbc5d2382807163505d29748d8a99f22cf9" 29 | sha512 = "f4f61dab95136eee19dd21076efc0138395e8163a2c72546f28b8358bd18a7c3af06a0573d89a15dee2afb08d1148586d425984873033938d59f7a377ae02f23" 30 | blake2b = "e9b254bee7cf427a639bb5784a97ab6e3a86be03ab17a097d641a9cc69deb1155059c646ffc54d2be87ee5f82d441def5330f6631e5fe85a46565ccdf14483a7" 31 | blake3 = "4d7d1b6e60e6755f4087574fe039b351a65a85c1963b6cc1718ec9b56e33bb8a" 32 | minisign = """ 33 | untrusted comment: signature from rsign secret key 34 | RUQuQKHwtF7mW/EdcNltIyNGZtcBirOt6cONSeLXE39k91Whv1MKi2mple7g1C/qfgh+oXRLMeStbkbNcpFN+iYBRNOqT7DWjQQ= 35 | trusted comment: timestamp:1748237016 36 | TfKGdTrjdc/UqIOSYhWQVWE8ap2FddLEe7StfumHuz2jtuwQYp3wvAWHoROQFMCzzhBDWIPGRIvRMEUdLMh9AA== 37 | """ 38 | 39 | [files."warcat-0.3.3-macos-x86_64.tgz"] 40 | sha256 = "768fca5de72c20be9747cf2fc31bf52a55391ebbf69b4940420e43bd2dee9ad3" 41 | sha512 = "ec194aabcb99a42992a2179ecefa79e2c797e31e4d0154be4218eb6216ecaedc2738ce16a32e03b400d5e61eed536f900a9654e7ba334c7cbbf9713f0ac60445" 42 | blake2b = "064949dd1910be51ae98af6b6cecd0586042ac978c9d81234e28f27003d1e7251e5fb38e95efe576d51e1ec8903f7f370db4faaac8151a0cbccea5d9ecc4881a" 43 | blake3 = "b3391e624d9ca6bf02da3b7bbbfcaa55f499ed1683e4375b8ea294a418048761" 44 | minisign = """ 45 | untrusted comment: signature from rsign secret key 46 | RUQuQKHwtF7mW4XhetPbXij4ndIOP8NAlrfpPbI/AMGkLm39eiP6c+q8p9slNLCIud+CI/ByuowKHYZUX/t/EwBQicUMCavKewE= 47 | trusted comment: timestamp:1748237016 48 | XZySMItAunjsSGIH8lZDp+jU9NSAVeG5IS0u0xIDtVRO1BsQWrcmo+UHaq+5k9vQQtL4pwfxQkRcy/G/PgQcBQ== 49 | """ 50 | 51 | [files."warcat-0.3.3-windows-aarch64.zip"] 52 | sha256 = "e9039a69bb77b9c34ddc8225f2f6dee30c26d879030e06ea8350bece89cfd628" 53 | sha512 = "f5c31dce3adde00ce236bfda17e2e79d831345c445e570eb74e621dfeb9c9b35b881a7701ad7d61395d546bef69b1b59944b48fbdb8d6b53894c8fd27c55fd8d" 54 | blake2b = "4f8f08f578e7f1c0243d575f8ed6abc0381f7f0b45d0f6d05f8d49f93639aff8d6d948306f8c71f7b0929d2c2b33a6a9d62d52299ed2f92fd4e0ad60df006e56" 55 | blake3 = "32bd372553be0b02379e3f094330c956c3181bb5b444937a6f250bb8d4c43946" 56 | minisign = """ 57 | untrusted comment: signature from rsign secret key 58 | RUQuQKHwtF7mWwUIIRhscwmfhC99vERX0e28Wr61RNFkFZ3On1eS/z2O9JYpU/8hMc8wJTwwBKlGXsjJl5yPZgndlHJ15gT5Dgc= 59 | trusted comment: timestamp:1748237016 60 | Eb6EYoVRSo0y0I83myjT9a/cYNHNTTZaeGvK3s5ih3PaOq5uUv1GHi9d3GfO5g/ItvxGBrRmdX4FzHe+U4MVDA== 61 | """ 62 | 63 | [files."warcat-0.3.3-windows-x86_64.zip"] 64 | sha256 = 
"803eaabd2eca7e33b25899e5f55b9fc5481869dcf8f74c0bf64a11fde34ce616" 65 | sha512 = "1fdce43d39f2a191f7dcd49681f8bb11bd2fb33fbef862ce3063080ce511f17c95785178e5bb68d7c496b841ed4f5ff6500fb2770e9a9df4185ddd1eea46e146" 66 | blake2b = "2d4301c701118a74b508a4f1a52add5df5f6e78ffcaa5d8f3f3e2da917ca098062ce3b48d8f1216464f338b6029ab216a3245c75061ab727c49aadab4a555ea8" 67 | blake3 = "5175dbc197c88f0fae260e569931053c8c7a92279cec1b9c5d77e3e023979f19" 68 | minisign = """ 69 | untrusted comment: signature from rsign secret key 70 | RUQuQKHwtF7mW2odmVvqLm9p8mMFYmklcyRaj4/R2uaZ8Brzs/HWSxrDyWuq3uIgXJGt/0okbhrxq5ewlDVjm+asiYybhHQ68QE= 71 | trusted comment: timestamp:1748237016 72 | EdHxwfiYcZlif+/PYbx/ksFLeXFjjhsdrJ+hEsdAFfJIlGzVL1O+XU0VWFn63RtZuKKd7T5Izh7KqlfKdy+WCw== 73 | """ 74 | -------------------------------------------------------------------------------- /misc/release_digests/v0.3.4.toml: -------------------------------------------------------------------------------- 1 | [files] 2 | 3 | [files."warcat-0.3.4-linux-aarch64.tar.gz"] 4 | sha256 = "16ee0a729a91dad7728d12791da0153500697659bf7bb2078a09b7674d19b64b" 5 | sha512 = "58a2653a395933931ae52edcff53f094a71920a8d67866b1bda34fc58f51acee2cb96209cb15878704a714e73b984a9e2b217350daa1c291c6372a4952a764fe" 6 | blake2b = "5f536df37e8db1509df1883b26d87ab538b015f63047c320d90fd79bca535c94824ecee7e73fd662ed2a5f60ab544bf9eb3dfec06e8c3a2c9da45525459b7ae9" 7 | blake3 = "d4c023ee3af169c8d9833133a2f7a3197f5bc47b99035b5bb29606b078d22057" 8 | minisign = """ 9 | untrusted comment: signature from rsign secret key 10 | RUQuQKHwtF7mW3PWRDCS7M3DQx3rEb4Brwz+v9SNtqX3zcPWmPEdWUeXldtLPeTlJIpSIHj67hH2doJ+vpszm9dJ4oEBx0Ol+wc= 11 | trusted comment: timestamp:1748855122 12 | xmbrMbSMuzjFM/9py4/66p00rniUyW9mbghgpbW9l4iR/lcuWcJ6t0rYkDvfrFB5aQ3GE7E72sN8c/98RG2kBQ== 13 | """ 14 | 15 | [files."warcat-0.3.4-linux-x86_64.tar.gz"] 16 | sha256 = "97435b1337d4edcd4fcb803ebebc4744df39224cb69610095869cc2dc3e69ea8" 17 | sha512 = "e7ae2d5cf271ecefddd31bf3e4d12f8f2f0a901744588afc45da107d3e1d1118871b16c05515b11aabf60334b0e9c5f376578c8d9770e1cf953037c820a14ea8" 18 | blake2b = "28e1b468cb84e0d491ab94e530bb50af86ef942afecfe0d521601a1f40c619ced5e0d652135894ecdae369719f17436ef654c8565151cef45742bfa49d04ecc2" 19 | blake3 = "0684948757b867cf78bd0f68efaf8557ee2592ce8e5dc0b5b50247370fb4591f" 20 | minisign = """ 21 | untrusted comment: signature from rsign secret key 22 | RUQuQKHwtF7mW+ITks3Q2UmG+lQlPEzhTpVLAh3TFqQYEzQD99sbnl5J+K1lrxEoW/4tVovB82+dddn4CnwXvYwGjIMryyM2YAk= 23 | trusted comment: timestamp:1748855122 24 | ON8zlyt/9j7R6X20Q4TCDj697p/SW+6sUgZThHyLFTiW25VNOg+uwpkngqKOBeMiKNCufN9lCtnQscl5kDdHAQ== 25 | """ 26 | 27 | [files."warcat-0.3.4-macos-aarch64.tgz"] 28 | sha256 = "9032e55d7521b246c4a6b2bbf2f5a3b997a9d523b18cb327e4c01e81083214ef" 29 | sha512 = "8965fc38923f25c37b1e23897bf690d5c7d37bd45a5e6c9ce9374c65dc8ce7ff31c7375556c3c6d4038d92807aa30f2c385cb1d794264ffcbf2ad9e3ea3b8156" 30 | blake2b = "5d9d32c8449dae2cf28605e51ea6dc6ffcd5401df0e08e50dd94c3da51d736d7885032d6b6698a638154c5ca3290fcb39147d91e9174c33c53614465cecae963" 31 | blake3 = "f0ae34eb36b95f8f489ccb36c979a809440ddb7a5b5c5704a8a6cf1f9c9a3a88" 32 | minisign = """ 33 | untrusted comment: signature from rsign secret key 34 | RUQuQKHwtF7mWzO/7YgeELilPKSSmy+FnQWuQIzS4W3PvWEUcViUZ2Yeuz1zLLX3CEf5X1OoknVFIOzu0VfwCa5bynnu8gQAggQ= 35 | trusted comment: timestamp:1748855122 36 | OFv7JdniPRQXZAJoNBPzyxub5mUFR3VR/4zcL5kFth9lApUP81lPCLI0ndaUVjFepRvqWyvoP+m0KEdGMZmzBA== 37 | """ 38 | 39 | [files."warcat-0.3.4-macos-x86_64.tgz"] 40 | sha256 = 
"67f1d7937a9d01255923e6ee2c52af6410638ad2aab9133ace9a57248d0183c7" 41 | sha512 = "7da6a090bea2d5c5eba3173fe814ab34b588051b63de037c71495dfd430a79c4da306366c11cf4a7b065ae3a00e1eb264d696a210291c5db0306c10168b04bd3" 42 | blake2b = "f888814504851ba2da95410ed5e1704d02f88a03d2f7f0999560b43d493a48ccf8309e6e3df4a4ecb0e7f81477cff0d3978c652acf72f98c7b66ec5b52f2ec44" 43 | blake3 = "c3ec7cb1d58b224a758d148062ac7b1f53197b0043dbf1d869de231ca63b7edc" 44 | minisign = """ 45 | untrusted comment: signature from rsign secret key 46 | RUQuQKHwtF7mW+tyXzmPSoPB9ZSaqfZ6soP9xSE5065gWv8+iX0Y7Pf2zTrA8C8hk2f35yZTSFd8luKjvMxAR/UPsggTF3nP0gU= 47 | trusted comment: timestamp:1748855122 48 | pJmeUtNEM4PUDs5NQHGV7UZn7fzsG9J4BPne5NR2XkRn0WItbAUUbQeVX12mt0rB/sjCWRnjoCIigrd5HkplBQ== 49 | """ 50 | 51 | [files."warcat-0.3.4-windows-aarch64.zip"] 52 | sha256 = "9600a93d56b8c33a329d03b03952ba11170e48fe5423d874129fcda037d4ca8b" 53 | sha512 = "60697264a82116ac37adef2eefb83beb4d69ed6b35433a9212fa9b5dbe0c340b6cbc819ff6f4ed778a8d9fd3d918772d1cffea129394139929f2d847358d75d8" 54 | blake2b = "a35b772082e6638bb5d70b8f8b925dd808d6ede1577a81a4f3b3a1ee5a4f262e3f025104292917bbc940bea38764662fc0dea9d43a01ead3f17c4fae2fe9512b" 55 | blake3 = "671ebaff5bfc70fc54269818cd9c997905daa77a047345edc20837f012749ae5" 56 | minisign = """ 57 | untrusted comment: signature from rsign secret key 58 | RUQuQKHwtF7mW/uj82KHauSeIqogN1R0am5EjW3T+bkaZpGRERAZKweXztsnM2moP3gdVLRVc0lNPV0mxaErKn2Lg9JeJr0Xhwg= 59 | trusted comment: timestamp:1748855122 60 | FtdS2HfvAq4WY1H1YQlgGfHCdSyM5K+Ja4xDtbdGn2+d/tZ4lxr7r0THBItcQznJcwJ8BDSlAe0TP7Pyq09SBQ== 61 | """ 62 | 63 | [files."warcat-0.3.4-windows-x86_64.zip"] 64 | sha256 = "5147af022507e300de74ac1b05ad96aa37df54d9088c80b7fb04979a5da1d770" 65 | sha512 = "ce7cc41a2d6bb0cf6c8214e995d7522811ed3e6b042d65294541a096c386f18f56c1ce7c7f04d7b4a7a5493adae2f2168748cbc8d00e9f16ae5b72e703409655" 66 | blake2b = "e488a581efd19844b0d57255092980be6af1e685fd4b747591ebda1648e83ffb1984a4d5d6d70cafb32d4177d6e498e1db786b5b3b8c18ed9a5eaf607e84f69f" 67 | blake3 = "78892ecb3eb6bdd218a067d2fd5141eec388fbb932c27da97ef9f236c7327e6c" 68 | minisign = """ 69 | untrusted comment: signature from rsign secret key 70 | RUQuQKHwtF7mW/UjGgfqoE1otDuNKtCwcUf3KWCavEKOD1pnpWEEANDuD+w2fJvt1PhSBeWlgaFzX3vlTp7noBzedGdnM9qohgI= 71 | trusted comment: timestamp:1748855122 72 | I3vNPDyosSd1rKIW+L7uN3UsdXDMyRQnK1Q1D/fg1rrNbixFzcaVxSBlU/QXxEQA7KiGcyWxrv2w6KWg46DJDA== 73 | """ 74 | -------------------------------------------------------------------------------- /roadmap.md: -------------------------------------------------------------------------------- 1 | # Development Roadmap 2 | 3 | ## Current work 4 | 5 | * Testing of Zstd support 6 | 7 | ## Planned future features 8 | 9 | * WARC indexing support 10 | * Extract command: support segment records. 
11 | * Split and join WARC files 12 | * Similar to the import command, a guided way to write HTTP requests/responses -------------------------------------------------------------------------------- /src/app.rs: -------------------------------------------------------------------------------- 1 | use std::process::ExitCode; 2 | 3 | use clap::Parser; 4 | 5 | use self::arg::Args; 6 | use self::arg::Command; 7 | 8 | mod arg; 9 | mod common; 10 | mod dump_help; 11 | mod export; 12 | mod extract; 13 | mod filter; 14 | mod format; 15 | mod get; 16 | mod import; 17 | mod io; 18 | mod list; 19 | mod logging; 20 | mod model; 21 | mod progress; 22 | mod self_; 23 | mod verify; 24 | 25 | pub fn run() -> ExitCode { 26 | match run_impl() { 27 | Ok(exit_code) => exit_code, 28 | Err(error) => { 29 | tracing::error!(?error); 30 | eprintln!("{:#}", error); 31 | ExitCode::FAILURE 32 | } 33 | } 34 | } 35 | 36 | fn run_impl() -> anyhow::Result { 37 | if self::self_::is_installer() { 38 | self::self_::install_interactive()?; 39 | return Ok(ExitCode::SUCCESS); 40 | } 41 | 42 | let args = Args::parse(); 43 | 44 | if args.quiet { 45 | self::progress::disable_global_progress_bar(); 46 | } 47 | 48 | self::logging::set_up_logging(args.log_level, args.log_file.as_deref(), args.log_json)?; 49 | 50 | let exit_code = match args.command { 51 | Command::Export(args) => { 52 | self::export::export(&args)?; 53 | ExitCode::SUCCESS 54 | } 55 | Command::Import(args) => { 56 | self::import::import(&args)?; 57 | ExitCode::SUCCESS 58 | } 59 | Command::List(args) => { 60 | self::list::list(&args)?; 61 | ExitCode::SUCCESS 62 | } 63 | Command::Get(args) => { 64 | self::get::get(&args)?; 65 | ExitCode::SUCCESS 66 | } 67 | Command::Extract(args) => { 68 | self::extract::extract(&args)?; 69 | ExitCode::SUCCESS 70 | } 71 | Command::Verify(args) => self::verify::verify(&args)?, 72 | Command::Self_(args) => { 73 | self::self_::self_(&args)?; 74 | ExitCode::SUCCESS 75 | } 76 | Command::DumpHelp => { 77 | self::dump_help::dump_help()?; 78 | ExitCode::SUCCESS 79 | } 80 | }; 81 | 82 | self::progress::global_progress_bar().println("Done.")?; 83 | 84 | Ok(exit_code) 85 | } 86 | -------------------------------------------------------------------------------- /src/app/common.rs: -------------------------------------------------------------------------------- 1 | use std::{io::Read, path::Path}; 2 | 3 | use anyhow::Context; 4 | use indicatif::ProgressBar; 5 | 6 | use crate::{ 7 | compress::{Dictionary, Format}, 8 | header::WarcHeader, 9 | io::LogicalPosition, 10 | warc::{DecStateBlock, DecStateHeader, Decoder, DecoderConfig}, 11 | }; 12 | 13 | use super::io::{ProgramInput, ProgramOutput}; 14 | 15 | const BUFFER_LENGTH: usize = crate::io::IO_BUFFER_LENGTH; 16 | 17 | pub fn open_input(path: &Path) -> anyhow::Result { 18 | ProgramInput::open(path).context("opening input file failed") 19 | } 20 | 21 | pub fn open_output(path: &Path) -> anyhow::Result { 22 | ProgramOutput::open(path).context("opening output file failed") 23 | } 24 | 25 | pub enum ReaderEvent<'a> { 26 | Header { 27 | header: WarcHeader, 28 | record_boundary_position: u64, 29 | }, 30 | Block { 31 | data: &'a [u8], 32 | }, 33 | } 34 | 35 | #[derive(Debug)] 36 | enum ReaderState { 37 | None, 38 | Header(Decoder), 39 | Block(Decoder), 40 | } 41 | 42 | impl ReaderState { 43 | fn take(&mut self) -> Self { 44 | std::mem::replace(self, Self::None) 45 | } 46 | 47 | #[allow(clippy::result_large_err)] 48 | fn try_into_header(self) -> Result, Self> { 49 | if let Self::Header(v) = self { 50 | Ok(v) 51 
| } else { 52 | Err(self) 53 | } 54 | } 55 | 56 | #[allow(clippy::result_large_err)] 57 | fn try_into_block(self) -> Result, Self> { 58 | if let Self::Block(v) = self { 59 | Ok(v) 60 | } else { 61 | Err(self) 62 | } 63 | } 64 | } 65 | 66 | pub struct ReaderPipeline 67 | where 68 | C: FnMut(ReaderEvent) -> anyhow::Result<()>, 69 | { 70 | progress_bar: ProgressBar, 71 | state: ReaderState, 72 | buf: Vec, 73 | callback: C, 74 | pub has_record_at_time_compression_fault: bool, 75 | } 76 | 77 | impl ReaderPipeline 78 | where 79 | C: FnMut(ReaderEvent) -> anyhow::Result<()>, 80 | { 81 | pub fn new( 82 | callback: C, 83 | input: ProgramInput, 84 | compression_format: Format, 85 | file_len: Option, 86 | ) -> anyhow::Result { 87 | let progress_bar = super::progress::make_bytes_progress_bar(file_len); 88 | 89 | let mut config = DecoderConfig::default(); 90 | config.decompressor.format = compression_format; 91 | config.decompressor.dictionary = Dictionary::WarcZstd(Vec::new()); 92 | 93 | let reader = Decoder::new(input, config)?; 94 | 95 | Ok(Self { 96 | progress_bar, 97 | state: ReaderState::Header(reader), 98 | buf: Vec::new(), 99 | callback, 100 | has_record_at_time_compression_fault: false, 101 | }) 102 | } 103 | 104 | pub fn run(&mut self) -> anyhow::Result<()> { 105 | super::progress::global_progress_bar().add(self.progress_bar.clone()); 106 | 107 | loop { 108 | self.process_header()?; 109 | self.process_block()?; 110 | 111 | let mut reader = self.state.take().try_into_header().unwrap(); 112 | let has_more = reader.has_next_record()?; 113 | self.state = ReaderState::Header(reader); 114 | 115 | if !has_more { 116 | break; 117 | } 118 | } 119 | 120 | self.progress_bar.finish(); 121 | super::progress::global_progress_bar().remove(&self.progress_bar); 122 | 123 | Ok(()) 124 | } 125 | 126 | fn process_header(&mut self) -> anyhow::Result<()> { 127 | let reader = self.state.take().try_into_header().unwrap(); 128 | 129 | self.has_record_at_time_compression_fault = reader.has_record_at_time_compression_fault(); 130 | 131 | let (header, reader) = reader.read_header().context("invalid WARC header")?; 132 | 133 | let record_id = header 134 | .fields 135 | .get("WARC-Record-ID") 136 | .map(|s| s.as_str()) 137 | .unwrap_or_default(); 138 | self.progress_bar 139 | .set_message(format!("Processing record {}", record_id)); 140 | tracing::info!(record_id, "processing record"); 141 | self.progress_bar.set_position(reader.logical_position()); 142 | 143 | (self.callback)(ReaderEvent::Header { 144 | header, 145 | record_boundary_position: reader.record_boundary_position(), 146 | })?; 147 | 148 | self.state = ReaderState::Block(reader); 149 | 150 | Ok(()) 151 | } 152 | 153 | fn process_block(&mut self) -> anyhow::Result<()> { 154 | let mut reader = self.state.take().try_into_block().unwrap(); 155 | 156 | loop { 157 | self.buf.resize(BUFFER_LENGTH, 0); 158 | 159 | let read_length = reader.read(&mut self.buf)?; 160 | self.buf.truncate(read_length); 161 | 162 | if read_length == 0 { 163 | break; 164 | } 165 | 166 | self.progress_bar.set_position(reader.logical_position()); 167 | 168 | (self.callback)(ReaderEvent::Block { data: &self.buf })?; 169 | } 170 | 171 | (self.callback)(ReaderEvent::Block { data: &[] })?; 172 | 173 | self.state = ReaderState::Header(reader.finish_block()?); 174 | 175 | Ok(()) 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /src/app/dump_help.rs: -------------------------------------------------------------------------------- 1 | pub fn 
dump_help() -> anyhow::Result<()> { 2 | let config = clap_markdown::MarkdownOptions::new() 3 | .show_footer(false) 4 | .show_table_of_contents(false) 5 | .title("{title}".to_string()); 6 | 7 | println!( 8 | "{}", 9 | clap_markdown::help_markdown_custom::(&config) 10 | ); 11 | 12 | Ok(()) 13 | } 14 | -------------------------------------------------------------------------------- /src/app/filter.rs: -------------------------------------------------------------------------------- 1 | use regex::Regex; 2 | 3 | use crate::header::WarcHeader; 4 | 5 | #[derive(Debug, Clone)] 6 | pub struct FieldFilter { 7 | includes: Vec<(String, Option)>, 8 | excludes: Vec<(String, Option)>, 9 | include_patterns: Vec<(String, Regex)>, 10 | exclude_patterns: Vec<(String, Regex)>, 11 | } 12 | 13 | impl FieldFilter { 14 | pub fn new() -> Self { 15 | Self { 16 | includes: Vec::new(), 17 | excludes: Vec::new(), 18 | include_patterns: Vec::new(), 19 | exclude_patterns: Vec::new(), 20 | } 21 | } 22 | 23 | pub fn add_include(&mut self, rule: &str) { 24 | if let Some((name, value)) = rule.split_once(":") { 25 | self.includes 26 | .push((name.to_string(), Some(value.to_string()))); 27 | } else { 28 | self.includes.push((rule.to_string(), None)); 29 | } 30 | } 31 | 32 | pub fn add_exclude(&mut self, rule: &str) { 33 | if let Some((name, value)) = rule.split_once(":") { 34 | self.excludes 35 | .push((name.to_string(), Some(value.to_string()))); 36 | } else { 37 | self.excludes.push((rule.to_string(), None)); 38 | } 39 | } 40 | 41 | pub fn add_include_pattern(&mut self, rule: &str) -> anyhow::Result<()> { 42 | let (name, value) = rule.split_once(":").unwrap_or((rule, "")); 43 | 44 | self.include_patterns 45 | .push((name.to_string(), Regex::new(value)?)); 46 | 47 | Ok(()) 48 | } 49 | 50 | pub fn add_exclude_pattern(&mut self, rule: &str) -> anyhow::Result<()> { 51 | let (name, value) = rule.split_once(":").unwrap_or((rule, "")); 52 | 53 | self.exclude_patterns 54 | .push((name.to_string(), Regex::new(value)?)); 55 | 56 | Ok(()) 57 | } 58 | 59 | pub fn is_allow(&self, header: &WarcHeader) -> bool { 60 | for (rule_name, rule_value) in &self.excludes { 61 | if let Some(rule_value) = rule_value { 62 | for value in header.fields.get_all(rule_name) { 63 | if value == rule_value { 64 | return false; 65 | } 66 | } 67 | } else if header.fields.contains_name(rule_name) { 68 | return false; 69 | } 70 | } 71 | 72 | for (rule_name, value_pattern) in &self.exclude_patterns { 73 | for value in header.fields.get_all(rule_name) { 74 | if value_pattern.is_match(value) { 75 | return false; 76 | } 77 | } 78 | } 79 | 80 | for (rule_name, rule_value) in &self.includes { 81 | if let Some(rule_value) = rule_value { 82 | for value in header.fields.get_all(rule_name) { 83 | if value == rule_value { 84 | return true; 85 | } 86 | } 87 | } else if header.fields.contains_name(rule_name) { 88 | return true; 89 | } 90 | } 91 | 92 | for (rule_name, value_pattern) in &self.include_patterns { 93 | for value in header.fields.get_all(rule_name) { 94 | if value_pattern.is_match(value) { 95 | return true; 96 | } 97 | } 98 | } 99 | 100 | self.includes.is_empty() && self.include_patterns.is_empty() 101 | } 102 | } 103 | 104 | #[cfg(test)] 105 | mod tests { 106 | use super::*; 107 | 108 | #[test] 109 | fn test_filter() { 110 | let mut header1 = WarcHeader::empty(); 111 | header1.fields.insert("n".to_string(), "cat".to_string()); 112 | let mut header2 = WarcHeader::empty(); 113 | header2.fields.insert("n".to_string(), "dog".to_string()); 114 | let mut header3 = 
WarcHeader::empty(); 115 | header3.fields.insert("n".to_string(), "bird".to_string()); 116 | let mut header4 = WarcHeader::empty(); 117 | header4 118 | .fields 119 | .insert("n".to_string(), "cat-and-dog".to_string()); 120 | 121 | let mut filter = FieldFilter::new(); 122 | filter.add_include("n:dog"); 123 | filter.add_exclude("n:cat"); 124 | 125 | assert!(!filter.is_allow(&header1)); 126 | assert!(filter.is_allow(&header2)); 127 | assert!(!filter.is_allow(&header3)); 128 | assert!(!filter.is_allow(&header4)); 129 | } 130 | 131 | #[test] 132 | fn test_filter_empty_value() { 133 | let mut header1 = WarcHeader::empty(); 134 | header1.fields.insert("a".to_string(), "".to_string()); 135 | let mut header2 = WarcHeader::empty(); 136 | header2.fields.insert("b".to_string(), "".to_string()); 137 | 138 | let mut filter = FieldFilter::new(); 139 | filter.add_include("a"); 140 | filter.add_exclude("b"); 141 | 142 | assert!(filter.is_allow(&header1)); 143 | assert!(!filter.is_allow(&header2)); 144 | } 145 | 146 | #[test] 147 | fn test_filter_regex() { 148 | let mut header1 = WarcHeader::empty(); 149 | header1.fields.insert("n".to_string(), "cat".to_string()); 150 | let mut header2 = WarcHeader::empty(); 151 | header2.fields.insert("n".to_string(), "dog".to_string()); 152 | let mut header3 = WarcHeader::empty(); 153 | header3.fields.insert("n".to_string(), "bird".to_string()); 154 | let mut header4 = WarcHeader::empty(); 155 | header4 156 | .fields 157 | .insert("n".to_string(), "cat-and-dog".to_string()); 158 | 159 | let mut filter = FieldFilter::new(); 160 | filter.add_include_pattern(r"n:\bdog\b").unwrap(); 161 | filter.add_exclude_pattern(r"n:\bcat\b").unwrap(); 162 | 163 | assert!(!filter.is_allow(&header1)); 164 | assert!(filter.is_allow(&header2)); 165 | assert!(!filter.is_allow(&header3)); 166 | assert!(!filter.is_allow(&header4)); 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /src/app/format.rs: -------------------------------------------------------------------------------- 1 | use std::path::Path; 2 | 3 | use crate::compress::Format; 4 | 5 | pub fn filename_compression_format(path: &Path) -> Option { 6 | if let Some(filename) = path.file_name() { 7 | let filename = filename.to_string_lossy().to_ascii_lowercase(); 8 | 9 | if filename.ends_with(".warc") { 10 | return Some(Format::Identity); 11 | } 12 | if filename.ends_with(".warc.gz") { 13 | return Some(Format::Gzip); 14 | } 15 | #[cfg(feature = "zstd")] 16 | if filename.ends_with(".warc.zst") { 17 | return Some(Format::Zstandard); 18 | } 19 | } 20 | 21 | None 22 | } 23 | -------------------------------------------------------------------------------- /src/app/get.rs: -------------------------------------------------------------------------------- 1 | use std::io::{Read, Seek, Write}; 2 | 3 | use crate::{ 4 | app::export::Exporter, 5 | compress::{Dictionary, Format}, 6 | dataseq::SeqWriter, 7 | error::{ProtocolError, ProtocolErrorKind}, 8 | extract::WarcExtractor, 9 | header::fields::FieldsExt, 10 | warc::{Decoder, DecoderConfig}, 11 | }; 12 | 13 | use super::arg::{GetCommand, GetExportSubcommand, GetExtractSubcommand, GetSubcommand}; 14 | 15 | pub fn get(args: &GetCommand) -> anyhow::Result<()> { 16 | match &args.subcommand { 17 | GetSubcommand::Export(sub_args) => export(sub_args), 18 | GetSubcommand::Extract(sub_args) => extract(sub_args), 19 | } 20 | } 21 | 22 | // FIXME: refactor the copypaste boilerplate 23 | 24 | fn export(args: &GetExportSubcommand) -> anyhow::Result<()> { 25 | 
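// Export a single record: open the input, seek to the requested position if one was given, decode that record's header and block, and write them out through the Exporter.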
let input_path = &args.input; 26 | let output_path = &args.output; 27 | let span = tracing::info_span!("export", path = ?input_path); 28 | let _span_guard = span.enter(); 29 | 30 | let input = super::common::open_input(input_path)?; 31 | let output = super::common::open_output(output_path)?; 32 | 33 | tracing::info!("opened file"); 34 | 35 | let compression_format = args.compression.try_into_native(input_path)?; 36 | let seq_format = args.format.into(); 37 | let writer = SeqWriter::new(output, seq_format); 38 | 39 | let mut exporter = Exporter::new(input_path, writer, args.no_block, args.extract); 40 | 41 | let mut config = DecoderConfig::default(); 42 | config.decompressor.format = compression_format; 43 | config.decompressor.dictionary = get_dictionary(compression_format); 44 | 45 | let mut decoder = Decoder::new(input, config)?; 46 | 47 | if args.position != 0 { 48 | decoder.prepare_for_seek()?; 49 | decoder 50 | .get_mut() 51 | .seek(std::io::SeekFrom::Start(args.position))?; 52 | } 53 | 54 | let (header, mut decoder) = decoder.read_header()?; 55 | 56 | let record_id = header.fields.get_or_default("WARC-Record-ID"); 57 | 58 | if args.id.as_ref().is_some_and(|id| id != record_id) { 59 | return Err(ProtocolError::new(ProtocolErrorKind::NotFound).into()); 60 | } 61 | 62 | let progress_bar = super::progress::make_bytes_progress_bar(Some(header.content_length()?)); 63 | super::progress::global_progress_bar().add(progress_bar.clone()); 64 | 65 | exporter.process_header(&header, decoder.record_boundary_position())?; 66 | 67 | let mut buf = Vec::with_capacity(8192); 68 | 69 | loop { 70 | buf.resize(8192, 0); 71 | 72 | let bytes_read = decoder.read(&mut buf)?; 73 | 74 | if bytes_read == 0 { 75 | break; 76 | } 77 | 78 | progress_bar.inc(bytes_read as u64); 79 | buf.truncate(bytes_read); 80 | exporter.process_block(&buf)?; 81 | } 82 | 83 | decoder.finish_block()?; 84 | exporter.finish()?; 85 | 86 | tracing::info!("closed file"); 87 | 88 | progress_bar.finish(); 89 | super::progress::global_progress_bar().remove(&progress_bar); 90 | 91 | Ok(()) 92 | } 93 | 94 | fn extract(args: &GetExtractSubcommand) -> anyhow::Result<()> { 95 | let input_path = &args.input; 96 | let output_path = &args.output; 97 | let span = tracing::info_span!("export", path = ?input_path); 98 | let _span_guard = span.enter(); 99 | 100 | let input = super::common::open_input(input_path)?; 101 | let mut output = super::common::open_output(output_path)?; 102 | 103 | tracing::info!("opened file"); 104 | 105 | let compression_format = args.compression.try_into_native(input_path)?; 106 | 107 | let mut extractor = WarcExtractor::new(); 108 | 109 | let mut config = DecoderConfig::default(); 110 | config.decompressor.format = compression_format; 111 | config.decompressor.dictionary = get_dictionary(compression_format); 112 | 113 | let mut decoder = Decoder::new(input, config)?; 114 | 115 | if args.position != 0 { 116 | decoder.prepare_for_seek()?; 117 | decoder 118 | .get_mut() 119 | .seek(std::io::SeekFrom::Start(args.position))?; 120 | } 121 | 122 | let (header, mut decoder) = decoder.read_header()?; 123 | 124 | let record_id = header.fields.get_or_default("WARC-Record-ID"); 125 | 126 | if args.id.as_ref().is_some_and(|id| id != record_id) { 127 | return Err(ProtocolError::new(ProtocolErrorKind::NotFound).into()); 128 | } 129 | 130 | let progress_bar = super::progress::make_bytes_progress_bar(Some(header.content_length()?)); 131 | super::progress::global_progress_bar().add(progress_bar.clone()); 132 | 133 | 
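// Hand the record header to the extractor; it uses the header to decide whether this record has any content that can be extracted.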
extractor.read_header(&header)?; 134 | 135 | if !extractor.has_content() { 136 | return Err(ProtocolError::new(ProtocolErrorKind::NoContent).into()); 137 | } 138 | 139 | let mut buf = Vec::with_capacity(8192); 140 | 141 | loop { 142 | buf.resize(8192, 0); 143 | 144 | let bytes_read = decoder.read(&mut buf)?; 145 | 146 | if bytes_read == 0 { 147 | break; 148 | } 149 | 150 | progress_bar.inc(bytes_read as u64); 151 | buf.truncate(bytes_read); 152 | extractor.extract_data(&buf, &mut output)?; 153 | } 154 | 155 | decoder.finish_block()?; 156 | output.flush()?; 157 | 158 | tracing::info!("closed file"); 159 | 160 | progress_bar.finish(); 161 | super::progress::global_progress_bar().remove(&progress_bar); 162 | 163 | Ok(()) 164 | } 165 | 166 | fn get_dictionary(format: Format) -> Dictionary { 167 | #[cfg(feature = "zstd")] 168 | if format == Format::Zstandard { 169 | return Dictionary::WarcZstd(Vec::new()); 170 | } 171 | 172 | Dictionary::None 173 | } 174 | -------------------------------------------------------------------------------- /src/app/io.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | fs::File, 3 | io::{Read, Seek, Stdin, Stdout, Write}, 4 | path::Path, 5 | }; 6 | 7 | use crate::error::{ProtocolError, ProtocolErrorKind}; 8 | 9 | #[derive(Debug)] 10 | pub enum ProgramInput { 11 | File(File), 12 | Stdin(Stdin), 13 | } 14 | 15 | impl ProgramInput { 16 | pub fn open>(path: P) -> std::io::Result { 17 | let path = path.as_ref(); 18 | 19 | if path.to_str() == Some("-") { 20 | Ok(Self::Stdin(std::io::stdin())) 21 | } else { 22 | let file = File::options().read(true).open(path)?; 23 | Ok(Self::File(file)) 24 | } 25 | } 26 | } 27 | 28 | impl Read for ProgramInput { 29 | fn read(&mut self, buf: &mut [u8]) -> std::io::Result { 30 | match self { 31 | ProgramInput::File(r) => r.read(buf), 32 | ProgramInput::Stdin(r) => r.read(buf), 33 | } 34 | } 35 | } 36 | 37 | impl Seek for ProgramInput { 38 | fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result { 39 | match self { 40 | ProgramInput::File(file) => file.seek(pos), 41 | ProgramInput::Stdin(_stdin) => Err(std::io::Error::other(ProtocolError::new( 42 | ProtocolErrorKind::IoNotSeekable, 43 | ))), 44 | } 45 | } 46 | } 47 | 48 | #[derive(Debug)] 49 | pub enum ProgramOutput { 50 | File(File), 51 | Stdout(Stdout), 52 | } 53 | 54 | impl ProgramOutput { 55 | pub fn open>(path: P) -> std::io::Result { 56 | let path = path.as_ref(); 57 | 58 | if path.to_str() == Some("-") { 59 | Ok(Self::Stdout(std::io::stdout())) 60 | } else { 61 | let file = File::options() 62 | .write(true) 63 | .create(true) 64 | .truncate(true) 65 | .open(path)?; 66 | Ok(Self::File(file)) 67 | } 68 | } 69 | } 70 | 71 | impl Write for ProgramOutput { 72 | fn write(&mut self, buf: &[u8]) -> std::io::Result { 73 | match self { 74 | ProgramOutput::File(w) => w.write(buf), 75 | ProgramOutput::Stdout(w) => w.write(buf), 76 | } 77 | } 78 | 79 | fn flush(&mut self) -> std::io::Result<()> { 80 | match self { 81 | ProgramOutput::File(w) => w.flush(), 82 | ProgramOutput::Stdout(w) => w.flush(), 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/app/list.rs: -------------------------------------------------------------------------------- 1 | use crate::{app::common::ReaderEvent, dataseq::SeqWriter}; 2 | 3 | use super::{arg::ListCommand, common::ReaderPipeline}; 4 | 5 | pub fn list(args: &ListCommand) -> anyhow::Result<()> { 6 | let output_path = &args.output; 7 | let 
seq_format = args.format.into(); 8 | 9 | for input_path in &args.input { 10 | let span = tracing::info_span!("list", path = ?input_path); 11 | let _span_guard = span.enter(); 12 | 13 | let input = super::common::open_input(input_path)?; 14 | let output = super::common::open_output(output_path)?; 15 | 16 | tracing::info!("opened file"); 17 | 18 | let compression_format = args.compression.try_into_native(input_path)?; 19 | let file_len = std::fs::metadata(input_path).map(|m| m.len()).ok(); 20 | let mut writer = SeqWriter::new(output, seq_format); 21 | 22 | ReaderPipeline::new( 23 | |event| match event { 24 | ReaderEvent::Header { 25 | header, 26 | record_boundary_position, 27 | } => { 28 | let mut values = Vec::new(); 29 | 30 | for name in &args.field { 31 | if name == ":position" { 32 | values.push(serde_json::Value::Number(record_boundary_position.into())); 33 | } else if name == ":file" { 34 | values.push(serde_json::Value::String( 35 | input_path.to_string_lossy().to_string(), 36 | )); 37 | } else { 38 | let value = header.fields.get(name).cloned().unwrap_or_default(); 39 | values.push(serde_json::Value::String(value)); 40 | } 41 | } 42 | 43 | writer.put(values)?; 44 | 45 | Ok(()) 46 | } 47 | ReaderEvent::Block { data: _ } => Ok(()), 48 | }, 49 | input, 50 | compression_format, 51 | file_len, 52 | )? 53 | .run()?; 54 | 55 | tracing::info!("closed file"); 56 | } 57 | 58 | Ok(()) 59 | } 60 | -------------------------------------------------------------------------------- /src/app/logging.rs: -------------------------------------------------------------------------------- 1 | use std::{fs::File, io::Write, path::Path, str::FromStr, sync::Mutex}; 2 | 3 | use tracing_subscriber::{layer::SubscriberExt, Layer}; 4 | 5 | use super::progress::global_progress_bar; 6 | 7 | #[derive(Debug, Clone, Copy, PartialEq, Eq, clap::ValueEnum)] 8 | pub enum Level { 9 | Trace, 10 | Debug, 11 | Info, 12 | Warn, 13 | Error, 14 | Off, 15 | } 16 | 17 | impl Level { 18 | fn as_level_filter(&self) -> tracing_subscriber::filter::LevelFilter { 19 | match self { 20 | Self::Trace => tracing_subscriber::filter::LevelFilter::TRACE, 21 | Self::Debug => tracing_subscriber::filter::LevelFilter::DEBUG, 22 | Self::Info => tracing_subscriber::filter::LevelFilter::INFO, 23 | Self::Warn => tracing_subscriber::filter::LevelFilter::WARN, 24 | Self::Error => tracing_subscriber::filter::LevelFilter::ERROR, 25 | Self::Off => tracing_subscriber::filter::LevelFilter::OFF, 26 | } 27 | } 28 | } 29 | 30 | impl Default for Level { 31 | fn default() -> Self { 32 | Self::Off 33 | } 34 | } 35 | 36 | impl FromStr for Level { 37 | type Err = (); 38 | 39 | fn from_str(s: &str) -> Result { 40 | match s { 41 | "trace" => Ok(Self::Trace), 42 | "debug" => Ok(Self::Debug), 43 | "info" => Ok(Self::Info), 44 | "warn" => Ok(Self::Warn), 45 | "error" => Ok(Self::Error), 46 | "off" => Ok(Self::Off), 47 | _ => Err(()), 48 | } 49 | } 50 | } 51 | 52 | struct ProgressBarMutexWriter { 53 | dest: W, 54 | } 55 | 56 | impl ProgressBarMutexWriter { 57 | fn new(dest: W) -> Self { 58 | Self { dest } 59 | } 60 | } 61 | 62 | impl Write for ProgressBarMutexWriter { 63 | fn write(&mut self, buf: &[u8]) -> std::io::Result { 64 | global_progress_bar().suspend(|| self.dest.write(buf)) 65 | } 66 | 67 | fn flush(&mut self) -> std::io::Result<()> { 68 | global_progress_bar().suspend(|| self.dest.flush()) 69 | } 70 | } 71 | 72 | pub fn set_up_logging(level: Level, file: Option<&Path>, json: bool) -> std::io::Result<()> { 73 | let file_sub = if let Some(path) = file { 74 | let 
writer = File::options().create(true).append(true).open(path)?; 75 | Some( 76 | tracing_subscriber::fmt::layer() 77 | .with_ansi(false) 78 | .with_writer(Mutex::new(writer)), 79 | ) 80 | } else { 81 | None 82 | }; 83 | 84 | let stderr_sub = if file.is_none() { 85 | let writer = ProgressBarMutexWriter::new(std::io::stderr()); 86 | Some(tracing_subscriber::fmt::layer().with_writer(Mutex::new(writer))) 87 | } else { 88 | None 89 | }; 90 | 91 | let json_sub = if json { 92 | Some(tracing_subscriber::fmt::layer().json()) 93 | } else { 94 | None 95 | }; 96 | 97 | let sub = tracing_subscriber::Registry::default(); 98 | let sub = sub.with(file_sub.with_filter(level.as_level_filter())); 99 | let sub = sub.with(stderr_sub.with_filter(level.as_level_filter())); 100 | let sub = sub.with(json_sub); 101 | tracing::subscriber::set_global_default(sub).unwrap(); 102 | 103 | tracing::debug!("logging configured"); 104 | 105 | Ok(()) 106 | } 107 | -------------------------------------------------------------------------------- /src/app/model.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | 3 | use serde::{Deserialize, Serialize}; 4 | 5 | #[derive(Debug, Clone, Deserialize, Serialize)] 6 | pub enum WarcMessage { 7 | Metadata(Metadata), 8 | Header(Header), 9 | BlockChunk(BlockChunk), 10 | BlockEnd(BlockEnd), 11 | ExtractMetadata(ExtractMetadata), 12 | ExtractChunk(ExtractChunk), 13 | ExtractEnd(ExtractEnd), 14 | EndOfFile(EndOfFile), 15 | } 16 | 17 | #[derive(Debug, Clone, Deserialize, Serialize)] 18 | pub struct Metadata { 19 | pub file: PathBuf, 20 | pub position: u64, 21 | } 22 | 23 | #[derive(Debug, Clone, Deserialize, Serialize)] 24 | pub struct Header { 25 | pub version: String, 26 | pub fields: Vec<(String, String)>, 27 | } 28 | 29 | #[serde_with::serde_as] 30 | #[derive(Debug, Clone, Deserialize, Serialize)] 31 | pub struct BlockChunk { 32 | #[serde_as(as = "serde_with::IfIsHumanReadable")] 33 | pub data: Vec, 34 | } 35 | 36 | #[derive(Debug, Clone, Deserialize, Serialize)] 37 | pub struct BlockEnd { 38 | pub crc32: Option, 39 | pub crc32c: Option, 40 | pub xxh3: Option, 41 | } 42 | 43 | #[derive(Debug, Clone, Deserialize, Serialize)] 44 | pub struct ExtractMetadata { 45 | pub has_content: bool, 46 | pub file_path_components: Vec, 47 | pub is_truncated: bool, 48 | } 49 | 50 | #[serde_with::serde_as] 51 | #[derive(Debug, Clone, Deserialize, Serialize)] 52 | pub struct ExtractChunk { 53 | #[serde_as(as = "serde_with::IfIsHumanReadable")] 54 | pub data: Vec, 55 | } 56 | 57 | #[derive(Debug, Clone, Deserialize, Serialize)] 58 | pub struct ExtractEnd { 59 | pub crc32: Option, 60 | pub crc32c: Option, 61 | pub xxh3: Option, 62 | } 63 | 64 | #[derive(Debug, Clone, Deserialize, Serialize)] 65 | pub struct EndOfFile {} 66 | -------------------------------------------------------------------------------- /src/app/progress.rs: -------------------------------------------------------------------------------- 1 | use std::sync::{LazyLock, Mutex, MutexGuard}; 2 | 3 | use indicatif::{MultiProgress, ProgressBar, ProgressDrawTarget, ProgressStyle}; 4 | 5 | pub fn global_progress_bar() -> MutexGuard<'static, MultiProgress> { 6 | static PROGRESS_BAR: LazyLock> = LazyLock::new(|| { 7 | let bar = MultiProgress::with_draw_target(ProgressDrawTarget::stderr_with_hz(4)); 8 | bar.set_move_cursor(true); 9 | Mutex::new(bar) 10 | }); 11 | 12 | (*PROGRESS_BAR).lock().unwrap() 13 | } 14 | 15 | pub fn disable_global_progress_bar() { 16 | 
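// Typical lifecycle of a bar attached to the global MultiProgress, as done in
// src/app/get.rs above (total_len and chunk_len are placeholders):
//
//     let bar = make_bytes_progress_bar(Some(total_len));
//     global_progress_bar().add(bar.clone());
//     // ... while copying data ...
//     bar.inc(chunk_len as u64);
//     // ... when done ...
//     bar.finish();
//     global_progress_bar().remove(&bar);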
global_progress_bar().set_draw_target(ProgressDrawTarget::hidden()); 17 | } 18 | 19 | pub fn make_bytes_progress_bar(len: Option) -> ProgressBar { 20 | if let Some(len) = len { 21 | let style = ProgressStyle::with_template( 22 | "[{bar:30.cyan/cyan.dim}] {percent:.green}% {binary_bytes:.dim} / {binary_total_bytes:.dim} {msg}", 23 | ) 24 | .unwrap(); 25 | let style = style.progress_chars("=>."); 26 | 27 | ProgressBar::new(len).with_style(style) 28 | } else { 29 | let style = ProgressStyle::with_template("{spinner:.cyan} {msg}").unwrap(); 30 | let style = style.tick_strings(&[ 31 | "[= ]", "[ = ]", "[ = ]", "[ =)", "[ =]", "[ = ]", "[ = ]", "(= ]", 32 | "[====]", 33 | ]); 34 | 35 | ProgressBar::new_spinner().with_style(style) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/app/self_.rs: -------------------------------------------------------------------------------- 1 | use regex::Regex; 2 | use takecrate::{ 3 | inst::{InstallConfig, PackageManifest}, 4 | manifest::AppId, 5 | }; 6 | 7 | use super::arg::{SelfCommand, SelfSubcommand}; 8 | 9 | pub fn self_(args: &SelfCommand) -> anyhow::Result<()> { 10 | match &args.command { 11 | SelfSubcommand::Install { quiet } => { 12 | if *quiet { 13 | install_quiet() 14 | } else { 15 | install_interactive() 16 | } 17 | } 18 | SelfSubcommand::Uninstall { quiet } => { 19 | if *quiet { 20 | uninstall_quiet() 21 | } else { 22 | uninstall_interactive() 23 | } 24 | } 25 | } 26 | } 27 | 28 | pub fn is_installer() -> bool { 29 | if std::env::args().len() > 1 { 30 | return false; 31 | } 32 | 33 | let name = std::env::current_exe().unwrap_or_default(); 34 | let name = if !std::env::consts::EXE_SUFFIX.is_empty() { 35 | name.file_stem().unwrap_or(name.as_os_str()) 36 | } else { 37 | name.as_os_str() 38 | }; 39 | let name = name.to_string_lossy(); 40 | let pattern = Regex::new(r"(?-ui:[. 
_-]installer)$").unwrap(); 41 | pattern.is_match(&name) 42 | } 43 | 44 | pub fn install_interactive() -> anyhow::Result<()> { 45 | let manifest = package_manifest()?; 46 | takecrate::install_interactive(&manifest)?; 47 | Ok(()) 48 | } 49 | 50 | pub fn install_quiet() -> anyhow::Result<()> { 51 | let manifest = package_manifest()?; 52 | let config = InstallConfig::new_user()?; 53 | takecrate::install(&manifest, &config)?; 54 | Ok(()) 55 | } 56 | 57 | pub fn uninstall_interactive() -> anyhow::Result<()> { 58 | let app_id = app_id(); 59 | takecrate::uninstall_interactive(&app_id)?; 60 | Ok(()) 61 | } 62 | 63 | pub fn uninstall_quiet() -> anyhow::Result<()> { 64 | let app_id = app_id(); 65 | takecrate::uninstall(&app_id)?; 66 | Ok(()) 67 | } 68 | 69 | fn app_id() -> AppId { 70 | AppId::new("io.github.chfoo.warcat-rs").unwrap() 71 | } 72 | 73 | fn package_manifest() -> anyhow::Result { 74 | let mut manifest = PackageManifest::new(&app_id()) 75 | .with_interactive_uninstall_args(&["self", "uninstall"]) 76 | .with_quiet_uninstall_args(&["self", "uninstall", "--quiet"]) 77 | .with_self_exe_renamed(format!("warcat{}", std::env::consts::EXE_SUFFIX))?; 78 | 79 | manifest.app_metadata.display_name = "Warcat".to_string(); 80 | manifest.app_metadata.display_version = clap::crate_version!().to_string(); 81 | 82 | Ok(manifest) 83 | } 84 | -------------------------------------------------------------------------------- /src/app/verify.rs: -------------------------------------------------------------------------------- 1 | use std::{cell::RefCell, process::ExitCode, rc::Rc}; 2 | 3 | use crate::{ 4 | app::common::{ReaderEvent, ReaderPipeline}, 5 | dataseq::SeqWriter, 6 | verify::{Check, Verifier, VerifyStatus}, 7 | }; 8 | 9 | use super::arg::VerifyCommand; 10 | 11 | const VERIFY_FAILED_EXIT_CODE: u8 = 8; 12 | 13 | pub fn verify(args: &VerifyCommand) -> anyhow::Result { 14 | let output_path = &args.output; 15 | let output = super::common::open_output(output_path)?; 16 | let seq_format = args.format.into(); 17 | 18 | let mut writer = SeqWriter::new(output, seq_format); 19 | let mut problem_count = 0u64; 20 | let mut verifier = if let Some(path) = &args.database { 21 | Verifier::open(path)? 
22 | } else { 23 | Verifier::new() 24 | }; 25 | 26 | for exclude in &args.exclude_check { 27 | verifier.checks_mut().remove(&Check::from(*exclude)); 28 | } 29 | 30 | let verifier = Rc::new(RefCell::new(verifier)); 31 | 32 | for input_path in &args.input { 33 | let span = tracing::info_span!("verify", path = ?input_path); 34 | let _span_guard = span.enter(); 35 | 36 | let input = super::common::open_input(input_path)?; 37 | 38 | tracing::info!("opened file"); 39 | 40 | let compression_format = args.compression.try_into_native(input_path)?; 41 | let file_len = std::fs::metadata(input_path).map(|m| m.len()).ok(); 42 | 43 | let mut reader = ReaderPipeline::new( 44 | |event| match event { 45 | ReaderEvent::Header { 46 | header, 47 | record_boundary_position: _, 48 | } => { 49 | let mut verifier = verifier.borrow_mut(); 50 | 51 | for problem in verifier.problems() { 52 | problem_count += 1; 53 | writer.put(problem)?; 54 | } 55 | verifier.problems_mut().clear(); 56 | verifier.begin_record(&header)?; 57 | 58 | Ok(()) 59 | } 60 | ReaderEvent::Block { data } => { 61 | let mut verifier = verifier.borrow_mut(); 62 | 63 | if data.is_empty() { 64 | verifier.end_record(); 65 | } else { 66 | verifier.block_data(data); 67 | } 68 | 69 | Ok(()) 70 | } 71 | }, 72 | input, 73 | compression_format, 74 | file_len, 75 | )?; 76 | reader.run()?; 77 | 78 | let mut verifier = verifier.borrow_mut(); 79 | 80 | if reader.has_record_at_time_compression_fault { 81 | verifier.add_not_record_at_time_compression(); 82 | } 83 | 84 | loop { 85 | let action = verifier.verify_end()?; 86 | 87 | for problem in verifier.problems() { 88 | problem_count += 1; 89 | writer.put(problem)?; 90 | } 91 | verifier.problems_mut().clear(); 92 | 93 | match action { 94 | VerifyStatus::HasMore => {} 95 | VerifyStatus::Done => break, 96 | } 97 | } 98 | 99 | tracing::info!("closed file"); 100 | } 101 | 102 | let exit_code = if problem_count == 0 { 103 | ExitCode::SUCCESS 104 | } else { 105 | ExitCode::from(VERIFY_FAILED_EXIT_CODE) 106 | }; 107 | 108 | Ok(exit_code) 109 | } 110 | -------------------------------------------------------------------------------- /src/compress/encode.rs: -------------------------------------------------------------------------------- 1 | use std::{fmt::Debug, io::Write}; 2 | 3 | #[cfg(feature = "zstd")] 4 | use super::zstd::ZstdEncoder; 5 | use brotli::CompressorWriter as BrEncoder; 6 | use flate2::write::{GzEncoder, ZlibEncoder}; 7 | 8 | use super::{Dictionary, Format, Level}; 9 | 10 | pub enum Encoder { 11 | Identity(W), 12 | Deflate(ZlibEncoder), 13 | Gzip(GzEncoder), 14 | Brotli(Box>), 15 | #[cfg(feature = "zstd")] 16 | Zstandard(ZstdEncoder), 17 | None, 18 | } 19 | 20 | impl Encoder { 21 | pub fn new(dest: W, format: Format, level: Level, dictionary: &Dictionary) -> Encoder { 22 | let level = get_encoder_level(format, level); 23 | 24 | match format { 25 | Format::Identity => Encoder::Identity(dest), 26 | Format::Deflate => Encoder::Deflate(ZlibEncoder::new( 27 | dest, 28 | flate2::Compression::new(level as u32), 29 | )), 30 | Format::Gzip => { 31 | Encoder::Gzip(GzEncoder::new(dest, flate2::Compression::new(level as u32))) 32 | } 33 | Format::Brotli => { 34 | Encoder::Brotli(Box::new(BrEncoder::new(dest, 4096, level as u32, 22))) 35 | } 36 | #[cfg(feature = "zstd")] 37 | Format::Zstandard => { 38 | Encoder::Zstandard(ZstdEncoder::new(dest, level, dictionary.clone()).unwrap()) 39 | } 40 | } 41 | } 42 | 43 | pub fn get_ref(&self) -> &W { 44 | match self { 45 | Self::Identity(w) => w, 46 | Self::Deflate(codec) => 
codec.get_ref(), 47 | Self::Gzip(codec) => codec.get_ref(), 48 | Self::Brotli(codec) => codec.get_ref(), 49 | #[cfg(feature = "zstd")] 50 | Self::Zstandard(codec) => codec.get_ref(), 51 | Self::None => unreachable!(), 52 | } 53 | } 54 | 55 | pub fn get_mut(&mut self) -> &mut W { 56 | match self { 57 | Self::Identity(w) => w, 58 | Self::Deflate(codec) => codec.get_mut(), 59 | Self::Gzip(codec) => codec.get_mut(), 60 | Self::Brotli(codec) => codec.get_mut(), 61 | #[cfg(feature = "zstd")] 62 | Self::Zstandard(codec) => codec.get_mut(), 63 | Self::None => unreachable!(), 64 | } 65 | } 66 | 67 | pub fn finish(self) -> std::io::Result { 68 | match self { 69 | Self::Identity(w) => Ok(w), 70 | Self::Deflate(codec) => codec.finish(), 71 | Self::Gzip(codec) => codec.finish(), 72 | Self::Brotli(codec) => Ok(codec.into_inner()), 73 | #[cfg(feature = "zstd")] 74 | Self::Zstandard(codec) => codec.finish(), 75 | Self::None => unreachable!(), 76 | } 77 | } 78 | } 79 | 80 | impl Write for Encoder { 81 | fn write(&mut self, buf: &[u8]) -> std::io::Result { 82 | match self { 83 | Self::Identity(w) => w.write(buf), 84 | Self::Deflate(w) => w.write(buf), 85 | Self::Gzip(w) => w.write(buf), 86 | Self::Brotli(w) => w.write(buf), 87 | #[cfg(feature = "zstd")] 88 | Self::Zstandard(w) => w.write(buf), 89 | Self::None => unreachable!(), 90 | } 91 | } 92 | 93 | fn flush(&mut self) -> std::io::Result<()> { 94 | match self { 95 | Self::Identity(w) => w.flush(), 96 | Self::Deflate(w) => w.flush(), 97 | Self::Gzip(w) => w.flush(), 98 | Self::Brotli(w) => w.flush(), 99 | #[cfg(feature = "zstd")] 100 | Self::Zstandard(w) => w.flush(), 101 | Self::None => unreachable!(), 102 | } 103 | } 104 | } 105 | 106 | impl Debug for Encoder { 107 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 108 | match self { 109 | Self::Identity(_arg0) => f.debug_tuple("Identity").finish(), 110 | Self::Deflate(_arg0) => f.debug_tuple("Deflate").finish(), 111 | Self::Gzip(_arg0) => f.debug_tuple("Gzip").finish(), 112 | Self::Brotli(_arg0) => f.debug_tuple("Brotli").finish(), 113 | #[cfg(feature = "zstd")] 114 | Self::Zstandard(_arg0) => f.debug_tuple("Zstandard").finish(), 115 | Self::None => write!(f, "None"), 116 | } 117 | } 118 | } 119 | 120 | fn get_encoder_level(format: Format, level: Level) -> i32 { 121 | match format { 122 | Format::Identity => match level { 123 | Level::Balanced => 0, 124 | Level::High => 0, 125 | Level::Low => 0, 126 | }, 127 | Format::Deflate | Format::Gzip => match level { 128 | Level::Balanced => 6, 129 | Level::High => 9, 130 | Level::Low => 1, 131 | }, 132 | 133 | Format::Brotli => match level { 134 | Level::Balanced => 4, 135 | Level::High => 7, 136 | Level::Low => 0, 137 | }, 138 | #[cfg(feature = "zstd")] 139 | Format::Zstandard => match level { 140 | Level::Balanced => 3, 141 | Level::High => 9, 142 | Level::Low => 1, 143 | }, 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /src/compress/zstd.rs: -------------------------------------------------------------------------------- 1 | use std::io::Read; 2 | 3 | #[cfg(feature = "zstd")] 4 | pub(crate) use decode::{ZstdDecoder, ZstdPushDecoder}; 5 | #[cfg(feature = "zstd")] 6 | pub(crate) use encode::ZstdEncoder; 7 | 8 | #[cfg(feature = "zstd")] 9 | mod decode; 10 | #[cfg(feature = "zstd")] 11 | mod encode; 12 | 13 | const WARC_DICT_FRAME: u32 = 0x184D2A5D; 14 | const ZSTD_FRAME: u32 = 0xFD2FB528; 15 | const BULK_BUFFER_LENGTH: usize = 16 * 1024 * 1024; 16 | 17 | pub fn 
is_skippable_frame(magic_number: u32) -> bool { 18 | (0x184D2A50..=0x184D2A5F).contains(&magic_number) 19 | } 20 | 21 | pub fn extract_warc_zst_dictionary( 22 | mut input: R, 23 | ) -> Result, WarcZstDictExtractError> { 24 | let mut buf = [0u8; 8]; 25 | 26 | input.read_exact(&mut buf)?; 27 | 28 | let magic_number = u32::from_le_bytes(buf[0..4].try_into().unwrap()); 29 | let length = u32::from_le_bytes(buf[4..8].try_into().unwrap()); 30 | 31 | if length > BULK_BUFFER_LENGTH as u32 { 32 | return Err(WarcZstDictExtractError::TooLarge); 33 | } 34 | 35 | if magic_number != WARC_DICT_FRAME { 36 | return Err(WarcZstDictExtractError::NotDict); 37 | } 38 | 39 | let mut buf = vec![0u8; length as usize]; 40 | input.read_exact(&mut buf)?; 41 | 42 | if buf.starts_with(&ZSTD_FRAME.to_le_bytes()) { 43 | #[cfg(feature = "zstd")] 44 | { 45 | let buf2 = zstd::bulk::decompress(&buf, BULK_BUFFER_LENGTH)?; 46 | 47 | Ok(buf2) 48 | } 49 | #[cfg(not(feature = "zstd"))] 50 | { 51 | Err(std::io::Error::other( 52 | "failed to read compressed .warc.zst dictionary: zstd feature is not enabled", 53 | )) 54 | } 55 | } else { 56 | Ok(buf) 57 | } 58 | } 59 | 60 | #[derive(Debug, thiserror::Error)] 61 | pub enum WarcZstDictExtractError { 62 | #[error("dictionary too large")] 63 | TooLarge, 64 | #[error("not a .warc.zst dictionary")] 65 | NotDict, 66 | #[error(transparent)] 67 | Other(#[from] std::io::Error), 68 | } 69 | -------------------------------------------------------------------------------- /src/compress/zstd/encode.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | 3 | use zstd::stream::write::Encoder as ZstdEncoderImpl; 4 | 5 | use crate::compress::Dictionary; 6 | 7 | use super::WARC_DICT_FRAME; 8 | 9 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 10 | enum WarcDictionaryState { 11 | None, 12 | PendingFrameWrite, 13 | Ok, 14 | } 15 | 16 | pub struct ZstdEncoder { 17 | level: i32, 18 | dictionary: Dictionary, 19 | warc_dict_state: WarcDictionaryState, 20 | encoder_impl: Option>, 21 | } 22 | 23 | impl ZstdEncoder { 24 | pub fn new(dest: W, level: i32, dictionary: Dictionary) -> std::io::Result { 25 | let warc_dict_state = match &dictionary { 26 | Dictionary::None => WarcDictionaryState::None, 27 | Dictionary::Zstd(_vec) => WarcDictionaryState::None, 28 | Dictionary::WarcZstd(_vec) => WarcDictionaryState::PendingFrameWrite, 29 | }; 30 | let mut encoder_impl = match &dictionary { 31 | Dictionary::None => ZstdEncoderImpl::new(dest, level)?, 32 | Dictionary::Zstd(vec) => ZstdEncoderImpl::with_dictionary(dest, level, vec)?, 33 | Dictionary::WarcZstd(vec) => ZstdEncoderImpl::with_dictionary(dest, level, vec)?, 34 | }; 35 | Self::config_encoder(&mut encoder_impl)?; 36 | Ok(Self { 37 | level, 38 | dictionary, 39 | warc_dict_state, 40 | encoder_impl: Some(encoder_impl), 41 | }) 42 | } 43 | 44 | fn config_encoder(encoder: &mut ZstdEncoderImpl<'static, W>) -> std::io::Result<()> { 45 | encoder.include_checksum(true)?; 46 | Ok(()) 47 | } 48 | 49 | pub fn get_ref(&self) -> &W { 50 | self.encoder_impl.as_ref().unwrap().get_ref() 51 | } 52 | 53 | pub fn get_mut(&mut self) -> &mut W { 54 | self.encoder_impl.as_mut().unwrap().get_mut() 55 | } 56 | 57 | fn write_warc_dictionary(&mut self) -> std::io::Result<()> { 58 | if let Dictionary::WarcZstd(data) = &self.dictionary { 59 | let dest = self.encoder_impl.as_mut().unwrap().get_mut(); 60 | dest.write_all(&WARC_DICT_FRAME.to_le_bytes())?; 61 | dest.write_all(&(data.len() as u32).to_le_bytes())?; 62 | dest.write_all(data)?; 
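// The bytes written above form a zstd "skippable frame" carrying the WARC
// dictionary: magic 0x184D2A5D (within the skippable range
// 0x184D2A50..=0x184D2A5F), a little-endian u32 length, then the dictionary
// itself (optionally a zstd-compressed payload starting with 0xFD2FB528).
// Round-trip sketch against extract_warc_zst_dictionary() in
// src/compress/zstd.rs above (the dictionary bytes are placeholders):
//
//     let dict = b"raw dictionary bytes";
//     let mut frame = Vec::new();
//     frame.extend_from_slice(&WARC_DICT_FRAME.to_le_bytes());
//     frame.extend_from_slice(&(dict.len() as u32).to_le_bytes());
//     frame.extend_from_slice(dict);
//
//     let recovered = super::extract_warc_zst_dictionary(&frame[..]).unwrap();
//     assert_eq!(recovered, dict.to_vec());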
63 | } 64 | 65 | Ok(()) 66 | } 67 | 68 | pub fn finish(self) -> std::io::Result { 69 | self.encoder_impl.unwrap().finish() 70 | } 71 | 72 | pub fn start_new_frame(&mut self) -> std::io::Result<()> { 73 | // FIXME: We should be reusing the zstd context but the API is a bit difficult. 74 | 75 | let dest = self.encoder_impl.take().unwrap().finish()?; 76 | 77 | let mut encoder_impl = match &self.dictionary { 78 | Dictionary::None => ZstdEncoderImpl::new(dest, self.level)?, 79 | Dictionary::Zstd(vec) => ZstdEncoderImpl::with_dictionary(dest, self.level, vec)?, 80 | Dictionary::WarcZstd(vec) => ZstdEncoderImpl::with_dictionary(dest, self.level, vec)?, 81 | }; 82 | Self::config_encoder(&mut encoder_impl)?; 83 | 84 | self.encoder_impl = Some(encoder_impl); 85 | 86 | Ok(()) 87 | } 88 | } 89 | 90 | impl Write for ZstdEncoder { 91 | fn write(&mut self, buf: &[u8]) -> std::io::Result { 92 | if self.warc_dict_state == WarcDictionaryState::PendingFrameWrite { 93 | self.warc_dict_state = WarcDictionaryState::Ok; 94 | 95 | self.write_warc_dictionary()?; 96 | } 97 | 98 | self.encoder_impl.as_mut().unwrap().write(buf) 99 | } 100 | 101 | fn flush(&mut self) -> std::io::Result<()> { 102 | self.encoder_impl.as_mut().unwrap().flush() 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/fields/de.rs: -------------------------------------------------------------------------------- 1 | use std::marker::PhantomData; 2 | 3 | use serde::{de::Visitor, Deserialize, Deserializer}; 4 | 5 | use super::FieldMap; 6 | 7 | struct FieldMapVisitor { 8 | _n: PhantomData, 9 | _v: PhantomData, 10 | } 11 | 12 | impl FieldMapVisitor { 13 | fn new() -> Self { 14 | Self { 15 | _n: PhantomData, 16 | _v: PhantomData, 17 | } 18 | } 19 | } 20 | 21 | impl<'de, N, V> Visitor<'de> for FieldMapVisitor 22 | where 23 | N: Deserialize<'de>, 24 | V: Deserialize<'de>, 25 | { 26 | type Value = FieldMap; 27 | 28 | fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { 29 | formatter.write_str("sequence of two-item tuples") 30 | } 31 | 32 | fn visit_seq(self, mut seq: A) -> Result 33 | where 34 | A: serde::de::SeqAccess<'de>, 35 | { 36 | let mut items = Vec::new(); 37 | 38 | while let Some(item) = seq.next_element()? { 39 | items.push(item); 40 | } 41 | 42 | Ok(FieldMap { fields: items }) 43 | } 44 | } 45 | 46 | impl<'de, N, V> Deserialize<'de> for FieldMap 47 | where 48 | N: Deserialize<'de>, 49 | V: Deserialize<'de>, 50 | { 51 | fn deserialize(deserializer: D) -> Result, D::Error> 52 | where 53 | D: Deserializer<'de>, 54 | { 55 | deserializer.deserialize_seq(FieldMapVisitor::new()) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/fields/ser.rs: -------------------------------------------------------------------------------- 1 | use serde::{ser::SerializeSeq, Serialize, Serializer}; 2 | 3 | use super::FieldMap; 4 | 5 | impl Serialize for FieldMap 6 | where 7 | N: Serialize, 8 | V: Serialize, 9 | { 10 | fn serialize(&self, serializer: S) -> Result 11 | where 12 | S: Serializer, 13 | { 14 | let mut seq = serializer.serialize_seq(Some(self.fields.len()))?; 15 | 16 | for item in &self.fields { 17 | seq.serialize_element(item)?; 18 | } 19 | seq.end() 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/header.rs: -------------------------------------------------------------------------------- 1 | //! 
WARC headers 2 | use std::io::Write; 3 | 4 | use chrono::Utc; 5 | 6 | use crate::{ 7 | error::{ParseError, ProtocolError, ProtocolErrorKind}, 8 | fields::FieldMap, 9 | }; 10 | 11 | pub mod fields; 12 | 13 | pub type WarcFields = FieldMap; 14 | 15 | /// Data structure for representing a WARC header. 16 | #[derive(Debug, Clone)] 17 | pub struct WarcHeader { 18 | /// The version string such as "WARC/1.1". 19 | pub version: String, 20 | /// The name-value fields of the header. 21 | pub fields: WarcFields, 22 | } 23 | 24 | impl WarcHeader { 25 | /// Create a new empty header. 26 | /// 27 | /// The version and fields will be empty. 28 | pub fn empty() -> Self { 29 | Self { 30 | version: String::new(), 31 | fields: FieldMap::new(), 32 | } 33 | } 34 | 35 | /// Create a new header with the bare minimum values. 36 | /// 37 | /// The user supplies the `Content-Length` and `WARC-Type`. 38 | /// `WARC-Record-ID` and `WARC-Date` is automatically generated. 39 | pub fn new(content_length: u64, warc_type: WT) -> Self 40 | where 41 | WT: Into, 42 | { 43 | let mut header = WarcHeader::empty(); 44 | header.version = "WARC/1.1".to_string(); 45 | let uuid = uuid::Uuid::now_v7(); 46 | let date_now = Utc::now(); 47 | 48 | header 49 | .fields 50 | .insert("WARC-Record-ID".to_string(), format!("<{}>", uuid.urn())); 51 | header 52 | .fields 53 | .insert("WARC-Type".to_string(), warc_type.into()); 54 | header 55 | .fields 56 | .insert("WARC-Date".to_string(), date_now.to_rfc3339()); 57 | header.set_content_length(content_length); 58 | 59 | header 60 | } 61 | 62 | /// Parses a WARC header from the given bytes. 63 | pub fn parse(input: &[u8]) -> Result { 64 | let (remain, version) = crate::parse::warc::version_line(input)?; 65 | 66 | let mut header = Self::empty(); 67 | header.version = String::from_utf8(version.to_vec())?; 68 | 69 | let (_remain, pairs) = crate::parse::fields::field_pairs(remain)?; 70 | 71 | for pair in pairs { 72 | let name = String::from_utf8(pair.name.to_vec())?; 73 | let value = String::from_utf8(crate::parse::remove_line_folding(pair.value).to_vec())?; 74 | 75 | header.fields.insert(name, value); 76 | } 77 | 78 | Ok(header) 79 | } 80 | 81 | /// Returns the value of `Content-Length` as an integer. 82 | pub fn content_length(&self) -> Result { 83 | if let Some(value) = self.fields.get_u64_strict("Content-Length") { 84 | Ok(value.map_err(|e| { 85 | ProtocolError::new(ProtocolErrorKind::InvalidContentLength).with_source(e) 86 | })?) 87 | } else { 88 | Err(ProtocolError::new(ProtocolErrorKind::InvalidContentLength)) 89 | } 90 | } 91 | 92 | /// Sets the value of `Content-Length` as an integer. 93 | pub fn set_content_length(&mut self, value: u64) { 94 | self.fields 95 | .insert("Content-Length".to_string(), value.to_string()); 96 | } 97 | 98 | /// Returns whether the header is a valid WARC formatted header. 99 | /// 100 | /// **Important:** This function does not validate whether the *contents* of 101 | /// the header conforms to the WARC specification! 102 | pub fn validate(&self) -> Result<(), ParseError> { 103 | crate::parse::warc::version(self.version.as_bytes())?; 104 | 105 | for (name, value) in &self.fields { 106 | crate::parse::validate_field_name(name.as_bytes())?; 107 | crate::parse::validate_field_value(value.as_bytes(), false)?; 108 | } 109 | 110 | Ok(()) 111 | } 112 | 113 | /// Write the WARC header as serialized bytes. 
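// Usage sketch for the constructors above (hypothetical field values; errors
// unwrapped for brevity):
//
//     let mut header = WarcHeader::new(0, "warcinfo");
//     header
//         .fields
//         .insert("WARC-Filename".to_string(), "example.warc".to_string());
//     header.validate().unwrap();
//
//     let mut buf = Vec::new();
//     header.serialize(&mut buf).unwrap();
//     // buf now starts with b"WARC/1.1\r\n" followed by the field lines and
//     // a blank line, as in the unit test at the bottom of this file.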
114 | pub fn serialize(&self, mut buf: W) -> std::io::Result<()> { 115 | buf.write_all(self.version.as_bytes())?; 116 | buf.write_all(b"\r\n")?; 117 | 118 | for (name, value) in &self.fields { 119 | buf.write_all(name.as_bytes())?; 120 | buf.write_all(b": ")?; 121 | buf.write_all(value.as_bytes())?; 122 | buf.write_all(b"\r\n")?; 123 | } 124 | 125 | buf.write_all(b"\r\n")?; 126 | 127 | Ok(()) 128 | } 129 | } 130 | 131 | #[cfg(test)] 132 | mod tests { 133 | use super::*; 134 | 135 | #[test] 136 | fn test_header_parse_serialize() { 137 | let data = "WARC/1.1\r\n\ 138 | WARC-Record-ID: \r\n\ 139 | Content-Length: 0\r\n\ 140 | \r\n"; 141 | let header = WarcHeader::parse(data.as_bytes()).unwrap(); 142 | 143 | assert_eq!(&header.version, "WARC/1.1"); 144 | assert_eq!(header.fields.len(), 2); 145 | 146 | let mut buf = Vec::new(); 147 | 148 | header.serialize(&mut buf).unwrap(); 149 | 150 | assert_eq!(&buf, data.as_bytes()); 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /src/header/fields.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::HashMap, net::IpAddr, str::FromStr}; 2 | 3 | use chrono::{DateTime, FixedOffset}; 4 | use url::Url; 5 | 6 | use crate::error::ParseError; 7 | 8 | use super::WarcFields; 9 | 10 | pub trait FieldsExt { 11 | /// Returns the value if the name is present, otherwise empty string. 12 | fn get_or_default>(&self, name: N) -> &str; 13 | 14 | /// Parse a "content-type" field. 15 | fn get_media_type>(&self, name: N) -> Option>; 16 | 17 | /// Parse a ISO8601 field. 18 | fn get_date>(&self, name: N) 19 | -> Option, ParseError>>; 20 | 21 | /// Returns whether the value is delimitated by `<` and `>`. 22 | fn is_formatted_bad_spec_url>(&self, name: N) -> bool; 23 | 24 | /// Returns the value with the deliminator `<` and `>` removed. 25 | fn get_url_str>(&self, name: N) -> Option<&str>; 26 | 27 | /// Parse a URL (with the deliminator `<` and `>` removed). 28 | fn get_url>(&self, name: N) -> Option>; 29 | 30 | /// Parse an IP address. 
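// Sketch of reading typed values out of a record header with this trait
// (field names follow the WARC spec; `header` is a WarcHeader and the values
// are hypothetical):
//
//     use crate::header::fields::FieldsExt;
//
//     let uri = header.fields.get_or_default("WARC-Target-URI");
//     let date = header.fields.get_date("WARC-Date");          // Option<Result<DateTime<FixedOffset>, ParseError>>
//     let url = header.fields.get_url("WARC-Target-URI");      // Option<Result<Url, ParseError>>
//     let addr = header.fields.get_ip_addr("WARC-IP-Address"); // Option<Result<IpAddr, ParseError>>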
31 | fn get_ip_addr>(&self, name: N) -> Option>; 32 | } 33 | 34 | #[derive(Debug, Clone, Default)] 35 | pub struct MediaType { 36 | pub type_: String, 37 | pub subtype: String, 38 | pub parameters: HashMap, 39 | } 40 | 41 | impl MediaType { 42 | pub fn empty() -> Self { 43 | Self { 44 | ..Default::default() 45 | } 46 | } 47 | } 48 | 49 | impl FromStr for MediaType { 50 | type Err = ParseError; 51 | 52 | fn from_str(s: &str) -> Result { 53 | let (_remain, output) = crate::parse::fields::media_type(s.as_bytes())?; 54 | 55 | Ok(Self { 56 | type_: String::from_utf8_lossy(output.type_).to_string(), 57 | subtype: String::from_utf8_lossy(output.subtype).to_string(), 58 | parameters: HashMap::from_iter(output.parameters.iter().map(|(k, v)| { 59 | ( 60 | String::from_utf8_lossy(k).to_string(), 61 | String::from_utf8_lossy(v).to_string(), 62 | ) 63 | })), 64 | }) 65 | } 66 | } 67 | 68 | impl FieldsExt for WarcFields { 69 | fn get_or_default>(&self, name: N) -> &str { 70 | self.get(name.as_ref()) 71 | .map(String::as_str) 72 | .unwrap_or_default() 73 | } 74 | 75 | fn get_media_type>(&self, name: N) -> Option> { 76 | self.get(name.as_ref()) 77 | .map(|value| MediaType::from_str(value)) 78 | } 79 | 80 | fn get_date>( 81 | &self, 82 | name: N, 83 | ) -> Option, ParseError>> { 84 | self.get(name.as_ref()) 85 | .map(|value| DateTime::parse_from_rfc3339(value).map_err(|error| error.into())) 86 | } 87 | 88 | fn is_formatted_bad_spec_url>(&self, name: N) -> bool { 89 | if let Some(value) = self.get(name.as_ref()) { 90 | value.starts_with("<") && value.ends_with(">") 91 | } else { 92 | false 93 | } 94 | } 95 | 96 | fn get_url_str>(&self, name: N) -> Option<&str> { 97 | if let Some(value) = self.get(name.as_ref()) { 98 | if value.starts_with("<") && value.ends_with(">") { 99 | Some(value.trim_start_matches("<").trim_end_matches(">")) 100 | } else { 101 | Some(value) 102 | } 103 | } else { 104 | None 105 | } 106 | } 107 | 108 | fn get_url>(&self, name: N) -> Option> { 109 | if let Some(value) = self.get(name.as_ref()) { 110 | let value = if value.starts_with("<") && value.ends_with(">") { 111 | value.trim_start_matches("<").trim_end_matches(">") 112 | } else { 113 | value 114 | }; 115 | 116 | Some(Url::parse(value).map_err(|error| error.into())) 117 | } else { 118 | None 119 | } 120 | } 121 | 122 | fn get_ip_addr>(&self, name: N) -> Option> { 123 | self.get(name.as_ref()) 124 | .map(|value| IpAddr::from_str(value).map_err(|error| error.into())) 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/http.rs: -------------------------------------------------------------------------------- 1 | //! Things dealing with the HTTP protocol 2 | pub mod h1; 3 | -------------------------------------------------------------------------------- /src/http/h1.rs: -------------------------------------------------------------------------------- 1 | //! Minimal, low-level HTTP 1.1 protocol implementation 2 | //! 3 | //! This module is sans-IO; it doesn't use networking sockets. 
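// Minimal sketch of driving the sans-IO sender (mirrors the unit test in
// send.rs further below); the caller owns the socket and is responsible for
// shipping the drained bytes:
//
//     use crate::http::h1::{header::MessageHeader, send::Sender};
//
//     let mut sender = Sender::new();
//     let header = MessageHeader::new_request("GET", "/index.html");
//     sender.send_header(&header).unwrap();
//     sender.send_body(b"Hello world!").unwrap();
//     sender.end_message().unwrap();
//
//     let mut buf = [0u8; 1024];
//     loop {
//         let n = sender.read_output(&mut buf);
//         if n == 0 {
//             break;
//         }
//         // hand &buf[..n] to the transport
//     }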
4 | pub mod codec; 5 | pub mod error; 6 | pub mod header; 7 | pub mod recv; 8 | pub mod send; 9 | -------------------------------------------------------------------------------- /src/http/h1/codec/compress.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | use std::str::FromStr; 3 | 4 | use crate::{ 5 | compress::{Compressor, Format as CompressionFormat, PushDecompressor}, 6 | error::{GeneralError, ProtocolError, ProtocolErrorKind}, 7 | }; 8 | 9 | use super::Codec; 10 | 11 | #[derive(Debug)] 12 | pub struct CompressionEncoder { 13 | compressor: Option>>, 14 | } 15 | 16 | impl CompressionEncoder { 17 | pub fn new(compressor: Compressor>) -> Self { 18 | Self { 19 | compressor: Some(compressor), 20 | } 21 | } 22 | 23 | pub fn try_of_name(name: &str) -> Result { 24 | let format = CompressionFormat::from_str(name) 25 | .map_err(|_| ProtocolError::new(ProtocolErrorKind::UnsupportedCompressionFormat))?; 26 | 27 | Ok(Self::new(Compressor::new(Vec::new(), format))) 28 | } 29 | } 30 | 31 | impl Codec for CompressionEncoder { 32 | fn transform(&mut self, input: &[u8], output: &mut Vec) -> Result<(), GeneralError> { 33 | if let Some(compressor) = &mut self.compressor { 34 | compressor.write_all(input)?; 35 | 36 | output.extend_from_slice(compressor.get_ref()); 37 | compressor.get_mut().clear(); 38 | } 39 | 40 | Ok(()) 41 | } 42 | 43 | fn finish_input(&mut self, output: &mut Vec) -> Result<(), GeneralError> { 44 | if let Some(mut compressor) = self.compressor.take() { 45 | compressor.flush()?; 46 | 47 | let buf = compressor.finish()?; 48 | 49 | output.extend_from_slice(&buf); 50 | } 51 | 52 | Ok(()) 53 | } 54 | } 55 | 56 | #[derive(Debug)] 57 | pub struct CompressionDecoder { 58 | decompressor: PushDecompressor>, 59 | } 60 | 61 | impl CompressionDecoder { 62 | pub fn new(decompressor: PushDecompressor>) -> Self { 63 | Self { decompressor } 64 | } 65 | 66 | pub fn try_of_name(name: &str) -> Result { 67 | let format = CompressionFormat::from_str(name) 68 | .map_err(|_| ProtocolError::new(ProtocolErrorKind::UnsupportedCompressionFormat))?; 69 | 70 | Ok(Self::new(PushDecompressor::new(Vec::new(), format)?)) 71 | } 72 | } 73 | 74 | impl Codec for CompressionDecoder { 75 | fn transform(&mut self, input: &[u8], output: &mut Vec) -> Result<(), GeneralError> { 76 | self.decompressor.write_all(input)?; 77 | self.decompressor.flush()?; 78 | 79 | output.extend_from_slice(self.decompressor.get_ref()); 80 | self.decompressor.get_mut().clear(); 81 | 82 | Ok(()) 83 | } 84 | } 85 | 86 | #[cfg(test)] 87 | mod tests { 88 | use super::*; 89 | 90 | #[test] 91 | fn test_compression() { 92 | let mut encoder = CompressionEncoder::try_of_name("gzip").unwrap(); 93 | let mut buf = Vec::new(); 94 | 95 | encoder.transform(b"Hello world!", &mut buf).unwrap(); 96 | encoder.finish_input(&mut buf).unwrap(); 97 | 98 | let mut output = Vec::new(); 99 | 100 | let mut decoder = CompressionDecoder::try_of_name("gzip").unwrap(); 101 | decoder.transform(&buf, &mut output).unwrap(); 102 | 103 | assert_eq!(&output, b"Hello world!"); 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/http/h1/error.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chfoo/warcat-rs/cf5f71a67e5c19464039eadde41c616149c5ec11/src/http/h1/error.rs -------------------------------------------------------------------------------- /src/http/h1/header/fields.rs: 
-------------------------------------------------------------------------------- 1 | use std::borrow::Cow; 2 | 3 | use super::{HeaderFields, Hstring}; 4 | 5 | pub trait FieldsExt { 6 | fn get_comma_list<'a>(&'a self, name: &'a str) -> impl Iterator>; 7 | 8 | fn get_u64_strict>( 9 | &self, 10 | name: N, 11 | ) -> Option>; 12 | } 13 | 14 | impl FieldsExt for HeaderFields { 15 | fn get_comma_list<'a>(&'a self, name: &'a str) -> impl Iterator> { 16 | let mut list = Vec::new(); 17 | 18 | for value in self.get_all(name) { 19 | if let Some(value) = value.as_text() { 20 | for item in value.split(",") { 21 | let item = crate::util::to_ascii_lowercase_cow(item.trim()); 22 | 23 | if !list.contains(&item) { 24 | list.push(item); 25 | } 26 | } 27 | } 28 | } 29 | 30 | list.into_iter() 31 | } 32 | 33 | fn get_u64_strict>( 34 | &self, 35 | name: N, 36 | ) -> Option> { 37 | if let Some(Hstring::Text(value)) = self.get(name.as_ref()) { 38 | Some(crate::parse::parse_u64_strict(value)) 39 | } else { 40 | None 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/http/h1/header/parse.rs: -------------------------------------------------------------------------------- 1 | use nom::{ 2 | IResult, Parser, 3 | branch::alt, 4 | bytes::complete::{tag, tag_no_case, take_while, take_while1}, 5 | character::complete::{digit1, line_ending}, 6 | combinator::{map, recognize, verify}, 7 | sequence::terminated, 8 | }; 9 | 10 | pub enum StartLine<'a> { 11 | RequestLine(RequestLine<'a>), 12 | StatusLine(StatusLine<'a>), 13 | } 14 | 15 | pub struct RequestLine<'a> { 16 | pub method: &'a [u8], 17 | pub request_target: &'a [u8], 18 | pub http_version: &'a [u8], 19 | } 20 | 21 | pub struct StatusLine<'a> { 22 | pub http_version: &'a [u8], 23 | pub status_code: &'a [u8], 24 | pub reason_phrase: &'a [u8], 25 | } 26 | 27 | pub fn start_line(input: &[u8]) -> IResult<&[u8], StartLine<'_>> { 28 | let status_line = map(status_line, StartLine::StatusLine); 29 | let request_line = map(request_line, StartLine::RequestLine); 30 | 31 | terminated(alt((status_line, request_line)), line_ending).parse(input) 32 | } 33 | 34 | pub fn request_line(input: &[u8]) -> IResult<&[u8], RequestLine<'_>> { 35 | let parts = (method, tag(" "), request_target, tag(" "), http_version); 36 | 37 | #[allow(clippy::type_complexity)] 38 | map(parts, |output: (&[u8], &[u8], &[u8], &[u8], &[u8])| { 39 | RequestLine { 40 | method: output.0, 41 | request_target: output.2, 42 | http_version: output.4, 43 | } 44 | }) 45 | .parse(input) 46 | } 47 | 48 | pub fn status_line(input: &[u8]) -> IResult<&[u8], StatusLine<'_>> { 49 | alt((status_line_strict, status_line_non_strict)).parse(input) 50 | } 51 | 52 | fn status_line_strict(input: &[u8]) -> IResult<&[u8], StatusLine<'_>> { 53 | let parts = (http_version, tag(" "), status_code, tag(" "), reason_phrase); 54 | 55 | #[allow(clippy::type_complexity)] 56 | map(parts, |output: (&[u8], &[u8], &[u8], &[u8], &[u8])| { 57 | StatusLine { 58 | http_version: output.0, 59 | status_code: output.2, 60 | reason_phrase: output.4, 61 | } 62 | }) 63 | .parse(input) 64 | } 65 | 66 | fn status_line_non_strict(input: &[u8]) -> IResult<&[u8], StatusLine<'_>> { 67 | // https://mailman.nginx.org/pipermail/nginx/2013-June/039186.html 68 | let parts = (http_version, tag(" "), status_code); 69 | 70 | map(parts, |output: (&[u8], &[u8], &[u8])| StatusLine { 71 | http_version: output.0, 72 | status_code: output.2, 73 | reason_phrase: b"", 74 | }) 75 | .parse(input) 76 | } 77 | 78 | fn 
method(input: &[u8]) -> IResult<&[u8], &[u8]> { 79 | crate::parse::fields::token(input) 80 | } 81 | 82 | fn request_target(input: &[u8]) -> IResult<&[u8], &[u8]> { 83 | take_while1(|c: u8| c.is_ascii_graphic())(input) 84 | } 85 | 86 | fn http_version(input: &[u8]) -> IResult<&[u8], &[u8]> { 87 | // Newer HTTP specifications requires the http-name to be case-sensitive, 88 | // but we should be lenient instead. 89 | recognize(( 90 | tag_no_case("HTTP"), 91 | tag("/"), 92 | one_digit, 93 | tag("."), 94 | one_digit, 95 | )) 96 | .parse(input) 97 | } 98 | 99 | fn one_digit(input: &[u8]) -> IResult<&[u8], &[u8]> { 100 | verify(digit1, |i: &[u8]| i.len() == 1).parse(input) 101 | } 102 | 103 | fn status_code(input: &[u8]) -> IResult<&[u8], &[u8]> { 104 | verify(digit1, |i: &[u8]| i.len() == 3).parse(input) 105 | } 106 | 107 | fn reason_phrase(input: &[u8]) -> IResult<&[u8], &[u8]> { 108 | take_while(|b: u8| { 109 | b.is_ascii_graphic() || b == b' ' || b == b'\t' || crate::parse::fields::is_obs_text(b) 110 | })(input) 111 | } 112 | -------------------------------------------------------------------------------- /src/http/h1/send.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::VecDeque, io::Read}; 2 | 3 | use crate::error::GeneralError; 4 | 5 | use super::{ 6 | codec::CodecPipeline, 7 | header::{MessageHeader, TrailerFields}, 8 | }; 9 | 10 | /// Encodes a HTTP request/response message. 11 | /// 12 | /// Important: This struct makes no semantic validation! It simply outputs 13 | /// what you call. 14 | pub struct Sender { 15 | codec_pipeline: CodecPipeline, 16 | output_buf: VecDeque, 17 | } 18 | 19 | impl Sender { 20 | pub fn new() -> Self { 21 | Self { 22 | codec_pipeline: CodecPipeline::default(), 23 | output_buf: VecDeque::new(), 24 | } 25 | } 26 | 27 | /// Send the header. 28 | pub fn send_header(&mut self, header: &MessageHeader) -> Result<(), GeneralError> { 29 | let mut codecs = Vec::new(); 30 | super::codec::build_encoders(header, &mut codecs)?; 31 | 32 | self.codec_pipeline = CodecPipeline::new(codecs); 33 | 34 | header.serialize(&mut self.output_buf).unwrap(); 35 | 36 | Ok(()) 37 | } 38 | 39 | /// Send body data. 40 | pub fn send_body(&mut self, data: &[u8]) -> Result<(), GeneralError> { 41 | self.codec_pipeline.transform(data, &mut self.output_buf)?; 42 | 43 | Ok(()) 44 | } 45 | 46 | /// Ends the message with a chunked-transfer encoding. 47 | /// 48 | /// Flushes any buffered output and outputs the trailer. 49 | pub fn send_trailer(&mut self, fields: &TrailerFields) -> Result<(), GeneralError> { 50 | self.codec_pipeline.finish_input(&mut self.output_buf)?; 51 | 52 | fields.serialize(&mut self.output_buf).unwrap(); 53 | 54 | Ok(()) 55 | } 56 | 57 | /// Ends the message, flushing any buffered output. 58 | pub fn end_message(&mut self) -> Result<(), GeneralError> { 59 | self.codec_pipeline.finish_input(&mut self.output_buf)?; 60 | 61 | Ok(()) 62 | } 63 | 64 | /// At the end of the message, reset the internal state for a new message. 65 | pub fn reset(&mut self) { 66 | self.codec_pipeline = CodecPipeline::default(); 67 | } 68 | 69 | /// Writes the output data into the given buffer and returns the amount written. 
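// The start-line parsers in http/h1/header/parse.rs above accept both request
// and status lines (the status form is tried first). Parsing sketch with a
// hypothetical input, using start_line and StartLine as defined in that file:
//
//     let (_rest, line) = start_line(b"HTTP/1.1 200 OK\r\n").unwrap();
//     match line {
//         StartLine::StatusLine(status) => {
//             assert_eq!(status.http_version, b"HTTP/1.1");
//             assert_eq!(status.status_code, b"200");
//             assert_eq!(status.reason_phrase, b"OK");
//         }
//         StartLine::RequestLine(_) => unreachable!(),
//     }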
70 | pub fn read_output(&mut self, buf: &mut [u8]) -> usize { 71 | self.output_buf.read(buf).unwrap() 72 | } 73 | } 74 | 75 | impl Default for Sender { 76 | fn default() -> Self { 77 | Self::new() 78 | } 79 | } 80 | 81 | #[cfg(test)] 82 | mod tests { 83 | use super::*; 84 | 85 | #[tracing_test::traced_test] 86 | #[test] 87 | fn test_send() { 88 | let mut output = Vec::new(); 89 | let mut sender = Sender::new(); 90 | 91 | let header = MessageHeader::new_request("GET", "/index.html"); 92 | sender.send_header(&header).unwrap(); 93 | sender.send_body(b"Hello world!").unwrap(); 94 | sender.end_message().unwrap(); 95 | 96 | loop { 97 | let mut buf = [0u8; 1024]; 98 | let len = sender.read_output(&mut buf); 99 | 100 | if len == 0 { 101 | break; 102 | } 103 | 104 | output.extend_from_slice(&buf[0..len]); 105 | } 106 | 107 | assert_eq!( 108 | output, 109 | b"GET /index.html HTTP/1.1\r\n\ 110 | \r\n\ 111 | Hello world!" 112 | ); 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /src/io.rs: -------------------------------------------------------------------------------- 1 | //! IO utilities 2 | use std::io::{BufRead, Read, Write}; 3 | 4 | pub(crate) const IO_BUFFER_LENGTH: usize = 4096; 5 | 6 | /// Indicate position in the stream 7 | pub trait LogicalPosition { 8 | /// Returns the position in the stream without accounting for seeks. 9 | /// 10 | /// This value should be the same as the number of bytes read from 11 | /// the stream. 12 | fn logical_position(&self) -> u64; 13 | } 14 | 15 | /// A [`BufRead`] implementation 16 | /// 17 | /// This is an alternative to [`std::io::BufReader`] but implements [`LogicalPosition`] 18 | /// and allows getting a stream position without seekable streams. 19 | #[derive(Debug)] 20 | pub struct BufferReader { 21 | reader: R, 22 | buffer: Vec, 23 | buffer_position: usize, 24 | logical_position: u64, 25 | } 26 | 27 | impl BufferReader { 28 | /// Create a new buffered reader. 29 | pub fn new(reader: R) -> Self { 30 | Self { 31 | reader, 32 | buffer: Vec::new(), 33 | buffer_position: 0, 34 | logical_position: 0, 35 | } 36 | } 37 | 38 | /// Returns a reference to the underlying reader. 39 | pub fn get_ref(&self) -> &R { 40 | &self.reader 41 | } 42 | 43 | /// Returns a mutable reference to the underlying reader. 44 | /// 45 | /// Modifying the underlying reader may cause unexpected behavior. 46 | pub fn get_mut(&mut self) -> &mut R { 47 | &mut self.reader 48 | } 49 | 50 | /// Returns the underlying reader. 51 | pub fn into_inner(self) -> R { 52 | self.reader 53 | } 54 | 55 | /// Returns a slice of the internal buffer. 56 | pub fn buffer(&self) -> &[u8] { 57 | &self.buffer[self.buffer_position..] 58 | } 59 | 60 | /// Fills the internal buffer with more data from the underlying reader. 61 | /// 62 | /// Returns the number of bytes read. 63 | pub fn fill_buffer(&mut self) -> std::io::Result { 64 | let original_len = self.buffer.len(); 65 | self.buffer.resize(original_len + IO_BUFFER_LENGTH, 0); 66 | 67 | let range = original_len..; 68 | 69 | match self.reader.read(&mut self.buffer[range]) { 70 | Ok(read_len) => { 71 | self.buffer.truncate(original_len + read_len); 72 | Ok(read_len) 73 | } 74 | Err(error) => { 75 | self.buffer.truncate(original_len); 76 | Err(error) 77 | } 78 | } 79 | } 80 | 81 | /// Fills the internal buffer only if it is empty. 
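// Usage sketch for BufferReader (hypothetical input): read_until() comes from
// the BufRead impl below, and logical_position() tracks bytes consumed
// without requiring Seek.
//
//     let mut reader = BufferReader::new(&b"WARC/1.1\r\nContent-Length: 0\r\n\r\n"[..]);
//     let mut line = Vec::new();
//     reader.read_until(b'\n', &mut line).unwrap();
//     assert_eq!(line, b"WARC/1.1\r\n");
//     assert_eq!(reader.logical_position(), 10);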
82 | pub fn fill_buffer_if_empty(&mut self) -> std::io::Result { 83 | if self.buffer.is_empty() { 84 | self.fill_buffer() 85 | } else { 86 | Ok(0) 87 | } 88 | } 89 | 90 | fn compact_buffer(&mut self) { 91 | self.buffer.drain(..self.buffer_position); 92 | self.buffer_position = 0; 93 | } 94 | 95 | fn read_using_buffer(&mut self, mut buf: &mut [u8]) -> std::io::Result { 96 | self.fill_buffer_if_empty()?; 97 | 98 | let range = self.buffer_position..self.buffer.len().min(self.buffer_position + buf.len()); 99 | let write_len = range.len(); 100 | 101 | buf.write_all(&self.buffer[range])?; 102 | self.buffer_position += write_len; 103 | 104 | self.clean_up_buffer(); 105 | 106 | Ok(write_len) 107 | } 108 | 109 | fn clean_up_buffer(&mut self) { 110 | if self.buffer_position >= self.buffer.len() { 111 | self.buffer.clear(); 112 | self.buffer_position = 0; 113 | } else if self.buffer_position > IO_BUFFER_LENGTH { 114 | self.compact_buffer(); 115 | } 116 | } 117 | } 118 | 119 | impl Read for BufferReader { 120 | fn read(&mut self, buf: &mut [u8]) -> std::io::Result { 121 | let read_len = if buf.len() >= IO_BUFFER_LENGTH && self.buffer.is_empty() { 122 | self.reader.read(buf) 123 | } else { 124 | self.read_using_buffer(buf) 125 | }?; 126 | 127 | self.logical_position += read_len as u64; 128 | Ok(read_len) 129 | } 130 | } 131 | 132 | impl BufRead for BufferReader { 133 | fn fill_buf(&mut self) -> std::io::Result<&[u8]> { 134 | self.fill_buffer_if_empty()?; 135 | 136 | Ok(self.buffer()) 137 | } 138 | 139 | fn consume(&mut self, amt: usize) { 140 | self.buffer_position += amt; 141 | self.logical_position += amt as u64; 142 | self.clean_up_buffer(); 143 | } 144 | } 145 | 146 | impl LogicalPosition for BufferReader { 147 | fn logical_position(&self) -> u64 { 148 | self.logical_position 149 | } 150 | } 151 | 152 | #[cfg(test)] 153 | mod tests { 154 | use std::io::Cursor; 155 | 156 | use super::*; 157 | 158 | #[test] 159 | fn test_buffer_reader() { 160 | let mut source = Vec::new(); 161 | let data_len = 50000; 162 | 163 | for i in 0..data_len { 164 | source.push(i as u8); 165 | } 166 | 167 | let mut r = BufferReader::new(Cursor::new(source)); 168 | let mut actual = Vec::new(); 169 | let mut remain_len = data_len; 170 | let mut buf = Vec::new(); 171 | 172 | for buf_size in [10, 2000, 4000, 4096, 4096, 5000].iter().cycle() { 173 | if remain_len == 0 { 174 | break; 175 | } 176 | let read_len = (*buf_size).min(remain_len); 177 | buf.resize(read_len, 0); 178 | r.read_exact(&mut buf).unwrap(); 179 | 180 | actual.extend_from_slice(&buf); 181 | remain_len -= read_len; 182 | } 183 | 184 | let source = r.into_inner().into_inner(); 185 | 186 | assert_eq!(source, actual); 187 | } 188 | 189 | #[test] 190 | fn test_buffer_reader_until() { 191 | let mut source = Vec::new(); 192 | let data_len = 10000; 193 | 194 | for i in 0..data_len { 195 | if i == 5000 { 196 | source.push(b'\n'); 197 | } else { 198 | source.push(0); 199 | } 200 | } 201 | 202 | let mut r = BufferReader::new(Cursor::new(source)); 203 | let mut buf = Vec::new(); 204 | r.read_until(b'\n', &mut buf).unwrap(); 205 | 206 | assert_eq!(buf.len(), 5001); 207 | } 208 | } 209 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Warcat: WARC Archiving Tool 2 | //! 3 | //! This crate provides both a library API and a binary CLI application. 4 | //! The library can be used to read and write WARC files and 5 | //! 
as well perform functions provided by the binary. 6 | //! 7 | //! In general cases, users working with WARC files do not need to program 8 | //! directly with the library. The CLI application (the tool portion) is 9 | //! designed to be part of a Unix-style pipeline. 10 | //! 11 | //! This documentation is for the library portion. 12 | //! For details on the CLI, see the [user guide](https://warcat-rs.readthedocs.io/). 13 | //! 14 | //! The library is designed first in mind for the binary, so some parts of 15 | //! the API will be unstable or not relevant. 16 | //! 17 | //! The main entrypoints to this library is [`warc::Decoder`]/[`warc::PushDecoder`] and [`warc::Encoder`]. 18 | 19 | #![cfg_attr(docsrs, feature(doc_auto_cfg))] 20 | 21 | pub mod compress; 22 | pub mod dataseq; 23 | pub mod digest; 24 | pub mod error; 25 | pub mod extract; 26 | pub mod fields; 27 | pub mod header; 28 | pub mod http; 29 | pub mod io; 30 | pub mod parse; 31 | pub(crate) mod util; 32 | pub mod verify; 33 | pub mod warc; 34 | 35 | #[cfg(feature = "bin")] 36 | #[doc(hidden)] 37 | pub mod app; 38 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | fn main() -> std::process::ExitCode { 2 | warcat::app::run() 3 | } 4 | -------------------------------------------------------------------------------- /src/parse.rs: -------------------------------------------------------------------------------- 1 | //! Parsing utilities. 2 | use std::{borrow::Cow, cell::LazyCell}; 3 | 4 | use nom::Parser; 5 | use regex::bytes::Regex; 6 | 7 | use crate::error::ParseError; 8 | 9 | pub(crate) mod fields; 10 | pub(crate) mod header_deliminator; 11 | pub(crate) mod warc; 12 | 13 | /// Get the index (inclusive) of the header deliminator (an empty line). 14 | pub fn scan_header_deliminator(data: &[u8]) -> Option { 15 | match header_deliminator::field_lines(data) { 16 | Ok((_input, output)) => Some(output.len()), 17 | Err(_) => None, 18 | } 19 | } 20 | 21 | /// Parse a HTTP-like fields of name-value pairs. 22 | pub fn parse_name_value_fields(value: &[u8]) -> Result, ParseError> { 23 | match fields::field_pairs(value) { 24 | Ok((_input, output)) => Ok(output), 25 | Err(error) => Err(error.into()), 26 | } 27 | } 28 | 29 | /// Returns whether the value is a valid name in a HTTP-like field. 30 | pub fn validate_field_name(value: &[u8]) -> Result<(), ParseError> { 31 | match nom::combinator::all_consuming(fields::field_name).parse(value) { 32 | Ok((_input, _output)) => Ok(()), 33 | Err(error) => Err(error.into()), 34 | } 35 | } 36 | 37 | /// Returns whether the value is a valid value in a HTTP-like field. 38 | /// 39 | /// When `multiline` is `true`, obsolete line folding is permitted. 40 | pub fn validate_field_value(value: &[u8], multiline: bool) -> Result<(), ParseError> { 41 | let f = if multiline { 42 | fields::field_value 43 | } else { 44 | fields::field_value_no_multline 45 | }; 46 | match nom::combinator::all_consuming(f).parse(value) { 47 | Ok((_input, _output)) => Ok(()), 48 | Err(error) => Err(error.into()), 49 | } 50 | } 51 | 52 | /// Parse a value into a `u64`. 53 | /// 54 | /// Unlike [`u64::try_from()`], only ASCII digits are permitted. Use of std 55 | /// library parsing functions may lead to security issues. 
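// Behaviour sketch (illustrative values): only plain ASCII digits pass, so
// inputs that str::parse::<u64>() would tolerate, such as a leading '+', are
// rejected here.
//
//     assert_eq!(parse_u64_strict("1234").unwrap(), 1234);
//     assert!(parse_u64_strict("+1234").is_err());
//     assert!(parse_u64_strict("12 34").is_err());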
56 | pub fn parse_u64_strict(value: &str) -> Result { 57 | if !value.chars().all(|c| c.is_ascii_digit()) { 58 | return "?".parse(); 59 | } 60 | 61 | value.parse() 62 | } 63 | 64 | /// Remove line folding from a HTTP-like field value. 65 | pub fn remove_line_folding(value: &[u8]) -> Cow<'_, [u8]> { 66 | let re = LazyCell::new(|| Regex::new(r"(?:\r\n|\n)[ \t]+").unwrap()); 67 | re.replace_all(value, b" ") 68 | } 69 | 70 | #[cfg(test)] 71 | mod tests { 72 | use super::*; 73 | 74 | #[test] 75 | fn test_scan_header_none() { 76 | assert_eq!(scan_header_deliminator(b""), None); 77 | assert_eq!(scan_header_deliminator(b"a"), None); 78 | } 79 | 80 | #[test] 81 | fn test_scan_header() { 82 | assert_eq!(scan_header_deliminator(b"\r\nz"), Some(2)); 83 | assert_eq!(scan_header_deliminator(b"a\r\n\r\nz"), Some(5)); 84 | assert_eq!(scan_header_deliminator(b"a\r\nb\r\n\r\nz"), Some(8)); 85 | assert_eq!(scan_header_deliminator(b"a\nb\n\nz"), Some(5)); 86 | } 87 | 88 | #[test] 89 | fn test_remove_line_folding() { 90 | assert_eq!(*remove_line_folding(b"abc"), *b"abc"); 91 | assert_eq!(*remove_line_folding(b"abc\r\n def"), *b"abc def"); 92 | assert_eq!( 93 | *remove_line_folding(b"abc\r\n def\r\n\t123"), 94 | *b"abc def 123" 95 | ); 96 | assert_eq!(*remove_line_folding(b"abc\n def"), *b"abc def"); 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/parse/fields.rs: -------------------------------------------------------------------------------- 1 | use nom::{ 2 | IResult, Parser, 3 | branch::alt, 4 | bytes::complete::{tag, take_while, take_while_m_n, take_while1}, 5 | character::complete::{line_ending, space0, space1}, 6 | combinator::{all_consuming, map, recognize}, 7 | multi::{many0, many0_count}, 8 | sequence::{delimited, pair, preceded, separated_pair, terminated}, 9 | }; 10 | 11 | pub struct FieldPairRef<'a> { 12 | pub name: &'a [u8], 13 | pub value: &'a [u8], 14 | } 15 | 16 | impl<'a> From<(&'a [u8], &'a [u8])> for FieldPairRef<'a> { 17 | fn from(value: (&'a [u8], &'a [u8])) -> Self { 18 | Self { 19 | name: value.0, 20 | value: value.1, 21 | } 22 | } 23 | } 24 | 25 | pub fn field_pairs(input: &[u8]) -> IResult<&[u8], Vec>> { 26 | many0(terminated(field_pair, line_ending)).parse(input) 27 | } 28 | 29 | fn field_pair(input: &[u8]) -> IResult<&[u8], FieldPairRef<'_>> { 30 | let val = delimited(space0, field_value, space0); 31 | let pair = separated_pair(field_name, tag(":"), val); 32 | 33 | map(pair, |p| p.into()).parse(input) 34 | } 35 | 36 | pub fn field_name(input: &[u8]) -> IResult<&[u8], &[u8]> { 37 | token(input) 38 | } 39 | 40 | pub fn token(input: &[u8]) -> IResult<&[u8], &[u8]> { 41 | take_while1(is_tchar)(input) 42 | } 43 | 44 | pub fn field_value(input: &[u8]) -> IResult<&[u8], &[u8]> { 45 | let a = alt((field_content, obs_fold)); 46 | recognize(many0_count(a)).parse(input) 47 | } 48 | 49 | pub fn field_value_no_multline(input: &[u8]) -> IResult<&[u8], &[u8]> { 50 | recognize(many0_count(field_content)).parse(input) 51 | } 52 | 53 | fn field_content(input: &[u8]) -> IResult<&[u8], &[u8]> { 54 | recognize(pair( 55 | take_while_m_n(1, 1, is_field_vchar), 56 | take_while(is_field_char), 57 | )) 58 | .parse(input) 59 | } 60 | 61 | fn is_field_vchar(b: u8) -> bool { 62 | b.is_ascii_graphic() || is_obs_text(b) 63 | } 64 | 65 | fn is_field_char(b: u8) -> bool { 66 | is_field_vchar(b) || b == b' ' || b == b'\t' 67 | } 68 | 69 | pub fn is_tchar(b: u8) -> bool { 70 | b.is_ascii_alphanumeric() || b"!#$%&'*+-.^_`|~".contains(&b) 71 | } 72 | 73 | pub fn 
is_obs_text(b: u8) -> bool { 74 | b >= 0x80 75 | } 76 | 77 | fn obs_fold(input: &[u8]) -> IResult<&[u8], &[u8]> { 78 | recognize(pair(line_ending, space1)).parse(input) 79 | } 80 | 81 | pub struct MediaType<'a> { 82 | pub type_: &'a [u8], 83 | pub subtype: &'a [u8], 84 | pub parameters: Vec<(&'a [u8], &'a [u8])>, 85 | } 86 | 87 | pub fn media_type(input: &[u8]) -> IResult<&[u8], MediaType<'_>> { 88 | let types = separated_pair(type_, tag("/"), subtype); 89 | 90 | map( 91 | all_consuming(pair(types, parameters)), 92 | |(types, parameters)| MediaType { 93 | type_: types.0, 94 | subtype: types.1, 95 | parameters, 96 | }, 97 | ) 98 | .parse(input) 99 | } 100 | 101 | fn type_(input: &[u8]) -> IResult<&[u8], &[u8]> { 102 | token(input) 103 | } 104 | 105 | fn subtype(input: &[u8]) -> IResult<&[u8], &[u8]> { 106 | token(input) 107 | } 108 | 109 | type ParametersList<'a> = Vec<(&'a [u8], &'a [u8])>; 110 | 111 | fn parameters(input: &[u8]) -> IResult<&[u8], ParametersList> { 112 | many0(preceded(delimited(space0, tag(";"), space0), parameter)).parse(input) 113 | } 114 | 115 | fn parameter(input: &[u8]) -> IResult<&[u8], (&[u8], &[u8])> { 116 | separated_pair(attribute, tag("="), value).parse(input) 117 | } 118 | 119 | fn attribute(input: &[u8]) -> IResult<&[u8], &[u8]> { 120 | token(input) 121 | } 122 | 123 | fn value(input: &[u8]) -> IResult<&[u8], &[u8]> { 124 | // FIXME: implement quoted-string 125 | token(input) 126 | } 127 | 128 | #[cfg(test)] 129 | mod tests { 130 | use super::*; 131 | 132 | #[test] 133 | fn test_field_pairs_empty() { 134 | let (_remain, output) = field_pairs(b"").unwrap(); 135 | assert!(output.is_empty()); 136 | } 137 | 138 | #[test] 139 | fn test_field_pairs_1() { 140 | let (_remain, output) = field_pairs(b"n1:\r\n").unwrap(); 141 | 142 | assert_eq!(output.len(), 1); 143 | assert_eq!(output[0].name, b"n1"); 144 | assert_eq!(output[0].value, b""); 145 | 146 | let (_remain, output) = field_pairs(b"n1:v1\r\n").unwrap(); 147 | 148 | assert_eq!(output.len(), 1); 149 | assert_eq!(output[0].name, b"n1"); 150 | assert_eq!(output[0].value, b"v1"); 151 | } 152 | 153 | #[test] 154 | fn test_field_pairs_many() { 155 | let (_remain, output) = field_pairs(b"n1:v1\r\nn2:\r\nn3:v3\r\n").unwrap(); 156 | 157 | assert_eq!(output.len(), 3); 158 | assert_eq!(output[0].name, b"n1"); 159 | assert_eq!(output[0].value, b"v1"); 160 | assert_eq!(output[1].name, b"n2"); 161 | assert_eq!(output[1].value, b""); 162 | assert_eq!(output[2].name, b"n3"); 163 | assert_eq!(output[2].value, b"v3"); 164 | } 165 | 166 | #[test] 167 | fn test_field_pairs_line_folding() { 168 | let (_remain, output) = field_pairs(b"n1:v1\r\n 1\r\nn2:v2\r\n").unwrap(); 169 | 170 | assert_eq!(output.len(), 2); 171 | assert_eq!(output[0].name, b"n1"); 172 | assert_eq!(output[0].value, b"v1\r\n 1"); 173 | assert_eq!(output[1].name, b"n2"); 174 | assert_eq!(output[1].value, b"v2"); 175 | } 176 | } 177 | -------------------------------------------------------------------------------- /src/parse/fields_str.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chfoo/warcat-rs/cf5f71a67e5c19464039eadde41c616149c5ec11/src/parse/fields_str.rs -------------------------------------------------------------------------------- /src/parse/header_deliminator.rs: -------------------------------------------------------------------------------- 1 | use nom::{ 2 | bytes::complete::take_till1, character::complete::line_ending, combinator::recognize, 3 | multi::many0_count, 
sequence::terminated, IResult, Parser, 4 | }; 5 | 6 | fn field_line(input: &[u8]) -> IResult<&[u8], &[u8]> { 7 | terminated(take_till1(|b| b == b'\r' || b == b'\n'), line_ending).parse(input) 8 | } 9 | 10 | pub fn field_lines(input: &[u8]) -> IResult<&[u8], &[u8]> { 11 | recognize(terminated(many0_count(field_line), line_ending)).parse(input) 12 | } 13 | -------------------------------------------------------------------------------- /src/parse/warc.rs: -------------------------------------------------------------------------------- 1 | use nom::{ 2 | bytes::complete::{tag, take_while}, 3 | character::complete::line_ending, 4 | combinator::recognize, 5 | sequence::{pair, terminated}, 6 | IResult, Parser, 7 | }; 8 | 9 | pub fn version(input: &[u8]) -> IResult<&[u8], &[u8]> { 10 | let tag = tag("WARC/"); 11 | let digits = take_while(|c: u8| c.is_ascii_digit() || c == b'.'); 12 | 13 | recognize(pair(tag, digits)).parse(input) 14 | } 15 | 16 | pub fn version_line(input: &[u8]) -> IResult<&[u8], &[u8]> { 17 | terminated(version, line_ending).parse(input) 18 | } 19 | -------------------------------------------------------------------------------- /src/util.rs: -------------------------------------------------------------------------------- 1 | use std::borrow::Cow; 2 | 3 | pub fn to_ascii_uppercase_cow(text: &str) -> Cow<'_, str> { 4 | if text.chars().any(|c| c.is_ascii_lowercase()) { 5 | Cow::Owned(text.to_ascii_uppercase()) 6 | } else { 7 | Cow::Borrowed(text) 8 | } 9 | } 10 | 11 | pub fn to_ascii_lowercase_cow(text: &str) -> Cow<'_, str> { 12 | if text.chars().any(|c| c.is_ascii_uppercase()) { 13 | Cow::Owned(text.to_ascii_lowercase()) 14 | } else { 15 | Cow::Borrowed(text) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/warc.rs: -------------------------------------------------------------------------------- 1 | //! WARC file format 2 | pub use decode::*; 3 | pub use encode::*; 4 | 5 | mod decode; 6 | mod encode; 7 | -------------------------------------------------------------------------------- /src/warc/encode.rs: -------------------------------------------------------------------------------- 1 | //! WARC file writing 2 | use std::io::{BufWriter, Write}; 3 | 4 | use crate::{ 5 | compress::{Compressor, CompressorConfig}, 6 | error::GeneralError, 7 | header::WarcHeader, 8 | }; 9 | 10 | /// Configuration for a [`Encoder`]. 11 | #[derive(Debug, Clone, Default)] 12 | #[non_exhaustive] 13 | pub struct EncoderConfig { 14 | /// Configuration for compressing the written file 15 | pub compressor: CompressorConfig, 16 | } 17 | 18 | pub struct EncStateHeader; 19 | pub struct EncStateBlock { 20 | length: u64, 21 | written: u64, 22 | } 23 | 24 | /// WARC format writer 25 | pub struct Encoder { 26 | state: S, 27 | output: BufWriter>, 28 | config: EncoderConfig, 29 | } 30 | 31 | impl Encoder { 32 | pub fn get_ref(&self) -> &W { 33 | self.output.get_ref().get_ref() 34 | } 35 | 36 | pub fn get_mut(&mut self) -> &mut W { 37 | self.output.get_mut().get_mut() 38 | } 39 | } 40 | 41 | impl Encoder { 42 | /// Create a new encoder. 43 | /// 44 | /// The destination writer should not be a compression stream. To enable 45 | /// compression, you must configure it with [`EncoderConfig`]. 
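As a sketch of the configuration described above, compression is requested through `EncoderConfig` rather than by wrapping the destination writer; this mirrors how the WARC generator in the tests later in this document sets it up, and the output path is an illustrative assumption:

```rust
use std::fs::File;

use warcat::warc::{Encoder, EncoderConfig};

fn main() -> std::io::Result<()> {
    // Request gzip output through the configuration; the destination
    // writer stays a plain, uncompressed `File`.
    let mut config = EncoderConfig::default();
    config.compressor.format = warcat::compress::Format::Gzip;

    let file = File::create("example.warc.gz")?; // illustrative path
    let encoder = Encoder::new(file, config);

    // ... write records here, then flush everything out:
    let _file = encoder.finish()?;
    Ok(())
}
```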
46 | pub fn new(dest: W, config: EncoderConfig) -> Self { 47 | let output = Compressor::with_config(dest, config.compressor.clone()); 48 | 49 | Self { 50 | state: EncStateHeader, 51 | output: BufWriter::new(output), 52 | config, 53 | } 54 | } 55 | 56 | /// Start a new WARC record with a given header. 57 | /// 58 | /// The validation function will be called on the header before 59 | /// writing it to the stream. 60 | /// 61 | /// Consumes the writer and returns a writer that has typestate 62 | /// transitioned to writing the WARC block portion of the record. 63 | pub fn write_header( 64 | mut self, 65 | header: &WarcHeader, 66 | ) -> Result, GeneralError> { 67 | header.validate()?; 68 | header.serialize(&mut self.output)?; 69 | 70 | let length = header.content_length()?; 71 | 72 | Ok(Encoder { 73 | state: EncStateBlock { length, written: 0 }, 74 | output: self.output, 75 | config: self.config, 76 | }) 77 | } 78 | 79 | /// Flushes any buffered data and returns the underlying stream. 80 | /// 81 | /// You must call this function before dropping the struct in order 82 | /// to have a valid WARC file. 83 | pub fn finish(self) -> std::io::Result { 84 | self.output.into_inner()?.finish() 85 | } 86 | } 87 | 88 | impl Encoder { 89 | fn write_block_impl(&mut self, buf: &[u8]) -> std::io::Result { 90 | let remain_length = self.state.length - self.state.written; 91 | let buf_upper = buf 92 | .len() 93 | .min(usize::try_from(remain_length).unwrap_or(usize::MAX)); 94 | let buf = &buf[0..buf_upper]; 95 | 96 | let write_length = self.output.write(buf)?; 97 | self.state.written += write_length as u64; 98 | 99 | debug_assert!(self.state.length >= self.state.written); 100 | 101 | if self.state.length == self.state.written { 102 | self.write_finish_block()?; 103 | } 104 | 105 | Ok(write_length) 106 | } 107 | 108 | fn write_finish_block(&mut self) -> std::io::Result<()> { 109 | self.output.write_all(b"\r\n\r\n")?; 110 | self.output.flush()?; 111 | self.output.get_mut().start_new_segment()?; 112 | Ok(()) 113 | } 114 | 115 | /// Indicate writing the block portion of a WARC record has completed. 116 | /// 117 | /// Consumes the writer and returns a typestate transitioned 118 | /// writer for writing a new record. 119 | pub fn finish_block(self) -> std::io::Result> { 120 | if self.state.length != self.state.written { 121 | return Err(std::io::Error::other(ContentLengthMismatch::new( 122 | self.state.length, 123 | self.state.written, 124 | ))); 125 | } 126 | 127 | Ok(Encoder { 128 | state: EncStateHeader, 129 | output: self.output, 130 | config: self.config, 131 | }) 132 | } 133 | } 134 | 135 | impl Write for Encoder { 136 | fn write(&mut self, buf: &[u8]) -> std::io::Result { 137 | self.write_block_impl(buf) 138 | } 139 | 140 | fn flush(&mut self) -> std::io::Result<()> { 141 | self.output.flush() 142 | } 143 | } 144 | 145 | /// Error for a block size mismatch in a WARC record. 
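To make the typestate flow described above concrete, here is a minimal sketch modeled on the crate's own `test_writer` test and the WARC generator used by the integration tests; the record contents are illustrative:

```rust
use std::io::Write;

use warcat::header::WarcHeader;
use warcat::warc::{Encoder, EncoderConfig};

fn main() {
    // Header state: a new encoder only accepts a record header.
    let encoder = Encoder::new(Vec::new(), EncoderConfig::default());

    let body = b"Hello world!";
    let mut header = WarcHeader::new(body.len() as u64, "resource");
    header.fields.insert(
        "WARC-Target-URI".to_string(),
        "urn:example:test".to_string(),
    );

    // Block state: `write_header` consumes the encoder and returns one that
    // accepts exactly the declared number of block bytes.
    let mut block = encoder.write_header(&header).unwrap();
    block.write_all(body).unwrap();

    // Back to the header state, ready for the next record.
    let encoder = block.finish_block().unwrap();

    let warc_bytes = encoder.finish().unwrap();
    assert!(warc_bytes.starts_with(b"WARC/1.1\r\n"));
}
```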
146 | #[derive(Debug, Default, thiserror::Error)] 147 | #[error("content length mismatch: expected {expected}, got {actual}")] 148 | pub struct ContentLengthMismatch { 149 | expected: u64, 150 | actual: u64, 151 | } 152 | 153 | impl ContentLengthMismatch { 154 | pub fn new(expected: u64, actual: u64) -> Self { 155 | Self { expected, actual } 156 | } 157 | } 158 | 159 | #[cfg(test)] 160 | mod tests { 161 | use super::*; 162 | 163 | #[tracing_test::traced_test] 164 | #[test] 165 | fn test_writer() { 166 | let buf = Vec::new(); 167 | let writer = Encoder::new(buf, EncoderConfig::default()); 168 | 169 | let header = WarcHeader::new(12, "a"); 170 | let mut writer = writer.write_header(&header).unwrap(); 171 | writer.write_all(b"Hello world!").unwrap(); 172 | let writer = writer.finish_block().unwrap(); 173 | 174 | let header = WarcHeader::new(0, "a"); 175 | let mut writer = writer.write_header(&header).unwrap(); 176 | writer.write_all(b"").unwrap(); 177 | let writer = writer.finish_block().unwrap(); 178 | 179 | let buf = writer.finish().unwrap(); 180 | 181 | assert!(buf.starts_with(b"WARC/1.1\r\n")); 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /tests/test_decode.rs: -------------------------------------------------------------------------------- 1 | use std::io::{Cursor, Read, Write}; 2 | 3 | use warcat::{ 4 | compress::Dictionary, 5 | io::LogicalPosition, 6 | verify::Verifier, 7 | warc::{Decoder, DecoderConfig, PushDecoder, PushDecoderEvent}, 8 | }; 9 | 10 | mod warc_generator; 11 | 12 | #[tracing_test::traced_test] 13 | #[test] 14 | fn test_decode_gzip() { 15 | let (input, offsets) = warc_generator::generate_warc_gzip(); 16 | dbg!(input.len()); 17 | 18 | let mut config = DecoderConfig::default(); 19 | config.decompressor.format = warcat::compress::Format::Gzip; 20 | 21 | check_push_decoder(input.clone(), config.clone(), offsets); 22 | check_decoder(input, config); 23 | } 24 | 25 | #[cfg(feature = "zstd")] 26 | #[tracing_test::traced_test] 27 | #[test] 28 | fn test_decode_zst() { 29 | let (input, offsets) = warc_generator::generate_warc_zst(false); 30 | dbg!(input.len()); 31 | 32 | let mut config = DecoderConfig::default(); 33 | config.decompressor.format = warcat::compress::Format::Zstandard; 34 | config.decompressor.dictionary = Dictionary::WarcZstd(Vec::new()); 35 | 36 | check_push_decoder(input.clone(), config.clone(), offsets); 37 | check_decoder(input, config); 38 | } 39 | 40 | #[cfg(feature = "zstd")] 41 | #[tracing_test::traced_test] 42 | #[test] 43 | fn test_decode_zst_compressed_dict() { 44 | let (input, offsets) = warc_generator::generate_warc_zst(true); 45 | dbg!(input.len()); 46 | 47 | let mut config = DecoderConfig::default(); 48 | config.decompressor.format = warcat::compress::Format::Zstandard; 49 | config.decompressor.dictionary = Dictionary::WarcZstd(Vec::new()); 50 | 51 | check_push_decoder(input, config, offsets); 52 | } 53 | 54 | fn check_push_decoder(input: Vec<u8>, config: DecoderConfig, mut offsets: Vec<u64>) { 55 | let mut decoder = PushDecoder::new(config).unwrap(); 56 | let mut verifier = Verifier::new(); 57 | let mut input = Cursor::new(input); 58 | 59 | // dbg!(&offsets); 60 | 61 | loop { 62 | match decoder.get_event().unwrap() { 63 | PushDecoderEvent::Ready | PushDecoderEvent::WantData => { 64 | let mut buf = vec![0; 4096]; 65 | let len = input.read(&mut buf).unwrap(); 66 | buf.truncate(len); 67 | decoder.write_all(&buf).unwrap(); 68 | 69 | if len == 0 { 70 | decoder.write_eof().unwrap(); 71 | break; 72 | } 73 |
} 74 | 75 | PushDecoderEvent::Continue => {} 76 | PushDecoderEvent::Header { header } => { 77 | assert_eq!(decoder.record_boundary_position(), offsets[0]); 78 | offsets.drain(0..1); 79 | verifier.begin_record(&header).unwrap(); 80 | } 81 | PushDecoderEvent::BlockData { data } => { 82 | verifier.block_data(data); 83 | } 84 | PushDecoderEvent::EndRecord => { 85 | verifier.end_record(); 86 | } 87 | PushDecoderEvent::Finished => { 88 | break; 89 | } 90 | } 91 | } 92 | } 93 | 94 | fn check_decoder(input: Vec, config: DecoderConfig) { 95 | let mut decoder = Decoder::new(Cursor::new(input), config).unwrap(); 96 | let mut verifier = Verifier::new(); 97 | let mut count = 0; 98 | 99 | while decoder.has_next_record().unwrap() { 100 | dbg!(count); 101 | dbg!(decoder.logical_position()); 102 | dbg!(&decoder.get_ref().position()); 103 | 104 | let (header, mut block_decoder) = decoder.read_header().unwrap(); 105 | 106 | verifier.begin_record(&header).unwrap(); 107 | 108 | let mut buf = [0u8; 4096]; 109 | loop { 110 | let read_len = block_decoder.read(&mut buf).unwrap(); 111 | 112 | if read_len == 0 { 113 | break; 114 | } 115 | 116 | verifier.block_data(&buf[0..read_len]); 117 | } 118 | 119 | verifier.end_record(); 120 | decoder = block_decoder.finish_block().unwrap(); 121 | 122 | if !verifier.problems().is_empty() { 123 | println!("{:?}", verifier.problems()); 124 | } 125 | assert!(verifier.problems().is_empty()); 126 | 127 | count += 1; 128 | } 129 | 130 | decoder.into_inner(); 131 | 132 | println!("{:?}", verifier.problems()); 133 | assert!(verifier.problems().is_empty()); 134 | } 135 | -------------------------------------------------------------------------------- /tests/warc_generator.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | 3 | use rand::{Rng, RngCore}; 4 | use rand_xoshiro::{Xoshiro256PlusPlus, rand_core::SeedableRng}; 5 | use warcat::{ 6 | compress::Dictionary, 7 | digest::{AlgorithmName, Digest, Hasher}, 8 | header::WarcHeader, 9 | warc::{EncStateHeader, Encoder, EncoderConfig}, 10 | }; 11 | 12 | pub fn generate_warc_gzip() -> (Vec, Vec) { 13 | let mut config = EncoderConfig::default(); 14 | config.compressor.format = warcat::compress::Format::Gzip; 15 | let encoder = Encoder::new(Vec::new(), config); 16 | 17 | generate(encoder) 18 | } 19 | 20 | #[cfg(feature = "zstd")] 21 | pub fn generate_warc_zst(compressed_dict: bool) -> (Vec, Vec) { 22 | let mut sample = vec![0; 10000]; 23 | let mut rng = Xoshiro256PlusPlus::seed_from_u64(1234567); 24 | rng.fill_bytes(&mut sample); 25 | let sizes = [100usize; 100]; 26 | 27 | let mut dictionary = zstd::dict::from_continuous(&sample, &sizes, 10000).unwrap(); 28 | 29 | if compressed_dict { 30 | dictionary = zstd::bulk::compress(&dictionary, 3).unwrap(); 31 | } 32 | 33 | let mut config = EncoderConfig::default(); 34 | config.compressor.format = warcat::compress::Format::Zstandard; 35 | config.compressor.dictionary = Dictionary::WarcZstd(dictionary); 36 | let encoder = Encoder::new(Vec::new(), config); 37 | 38 | generate(encoder) 39 | } 40 | 41 | fn generate(mut encoder: Encoder>) -> (Vec, Vec) { 42 | let mut offsets = Vec::new(); 43 | 44 | for round in 0..100 { 45 | offsets.push(encoder.get_ref().len() as u64); 46 | let mut rng = Xoshiro256PlusPlus::seed_from_u64(round); 47 | 48 | let length: u64 = rng.random_range(100 + round * 1234..200 + round * 1234); 49 | 50 | let mut data: Vec = vec![0; length as usize]; 51 | 52 | if rng.random_bool(0.5) { 53 | // Easy to compress 54 | for value in 
data.iter_mut().step_by(10) { 55 | *value = 0xff; 56 | } 57 | } else { 58 | // Difficult to compress 59 | rng.fill_bytes(&mut data); 60 | } 61 | 62 | let mut hasher = Hasher::new(AlgorithmName::Sha1); 63 | hasher.update(&data); 64 | let digest = Digest::new(AlgorithmName::Sha1, hasher.finish()); 65 | 66 | let mut header = WarcHeader::new(length, "resource"); 67 | header 68 | .fields 69 | .insert("WARC-Block-Digest".to_string(), digest.to_string()); 70 | header.fields.insert( 71 | "WARC-Target-URI".to_string(), 72 | "urn:example:test".to_string(), 73 | ); 74 | 75 | let mut block_encoder = encoder.write_header(&header).unwrap(); 76 | block_encoder.write_all(&data).unwrap(); 77 | encoder = block_encoder.finish_block().unwrap(); 78 | } 79 | 80 | (encoder.finish().unwrap(), offsets) 81 | } 82 | -------------------------------------------------------------------------------- /xtask/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "xtask" 3 | version = "0.0.0" 4 | edition = "2024" 5 | publish = false 6 | 7 | [dependencies] 8 | anyhow = "1.0.86" 9 | blake2 = "0.10.6" 10 | blake3 = { version = "1.5.4", features = ["pure", "traits-preview"] } 11 | cargo_metadata = "0.19.2" 12 | cargo-license = "0.6.1" 13 | cargo-license_cargo_metadata = { package = "cargo_metadata", version = "0.18.1"} 14 | clap = { version = "4.5.16", features = ["derive"] } 15 | data-encoding = "2.6.0" 16 | digest = "0.10.7" 17 | minisign = "0.7.8" 18 | reqwest = { version = "0.12.8", default-features = false, features = ["blocking", "rustls-tls", "gzip", "json"], optional = true } 19 | rpassword = "7.3.1" 20 | serde_json = "1.0.128" 21 | sha2 = "0.10.8" 22 | tempfile = "3.13.0" 23 | toml_edit = "0.22.22" 24 | zip = { version = "4.0.0", default-features = false, features = ["deflate64", "deflate"] } 25 | 26 | [features] 27 | default = [] 28 | bloat = ["dep:reqwest"] -------------------------------------------------------------------------------- /xtask/README.md: -------------------------------------------------------------------------------- 1 | # xtask 2 | 3 | This is a [cargo xtask](https://github.com/matklad/cargo-xtask) crate. 4 | -------------------------------------------------------------------------------- /xtask/src/digest.rs: -------------------------------------------------------------------------------- 1 | use std::{io::Cursor, path::Path}; 2 | 3 | use data_encoding::HEXLOWER; 4 | use digest::Digest; 5 | use minisign::SecretKey; 6 | 7 | pub fn compute_digests(minisign_secret_key: Option<&Path>) -> anyhow::Result<()> { 8 | let minisign_secret_key = if let Some(path) = minisign_secret_key { 9 | Some(get_minisign_secret_key(path)?) 
10 | } else { 11 | None 12 | }; 13 | 14 | let package_dir = crate::package::target_dir()?.join("github-artifacts"); 15 | 16 | let mut entries: Vec<_> = package_dir.read_dir()?.collect(); 17 | entries.sort_unstable_by_key(|item| item.as_ref().unwrap().file_name()); 18 | 19 | let mut doc = toml_edit::DocumentMut::new(); 20 | let mut file_table = toml_edit::Table::new(); 21 | 22 | for entry in entries { 23 | let entry = entry.unwrap(); 24 | let filename = entry 25 | .file_name() 26 | .into_string() 27 | .map_err(|_| anyhow::anyhow!("non-utf-8 path"))?; 28 | 29 | let data = std::fs::read(entry.path())?; 30 | 31 | let mut sha256_hasher = sha2::Sha256::new(); 32 | let mut sha512_hasher = sha2::Sha512::new(); 33 | let mut blake2b_hasher = blake2::Blake2b512::new(); 34 | let mut blake3_hasher = blake3::Hasher::new(); 35 | 36 | sha256_hasher.update(&data); 37 | sha512_hasher.update(&data); 38 | blake2b_hasher.update(&data); 39 | blake3_hasher.update(&data); 40 | 41 | let sha256_digest = sha256_hasher.finalize(); 42 | let sha512_digest = sha512_hasher.finalize(); 43 | let blake2b_digest = blake2b_hasher.finalize(); 44 | let blake3_digest = blake3_hasher.finalize(); 45 | 46 | let mut values = toml_edit::Table::new(); 47 | values.insert("sha256", HEXLOWER.encode(&sha256_digest).into()); 48 | values.insert("sha512", HEXLOWER.encode(&sha512_digest).into()); 49 | values.insert("blake2b", HEXLOWER.encode(&blake2b_digest).into()); 50 | values.insert("blake3", HEXLOWER.encode(blake3_digest.as_slice()).into()); 51 | 52 | if let Some(key) = &minisign_secret_key { 53 | let signature = minisign::sign(None, key, Cursor::new(&data), None, None)?; 54 | values.insert("minisign", signature.to_string().into()); 55 | } 56 | 57 | file_table.insert(&filename, toml_edit::Item::Table(values)); 58 | } 59 | 60 | doc.insert("files", toml_edit::Item::Table(file_table)); 61 | let text = doc.to_string(); 62 | println!("{}", text); 63 | 64 | Ok(()) 65 | } 66 | 67 | fn get_minisign_secret_key(path: &Path) -> anyhow::Result { 68 | let password = rpassword::prompt_password("Secret key password: ")?; 69 | eprintln!("Loading key..."); 70 | let key = minisign::SecretKey::from_file(path, Some(password))?; 71 | eprintln!("OK"); 72 | Ok(key) 73 | } 74 | -------------------------------------------------------------------------------- /xtask/src/dist_readme.txt: -------------------------------------------------------------------------------- 1 | Warcat-rs 2 | ========= 3 | 4 | This package contains warcat, a command-line tool for handling Web ARChive (WARC) files. 5 | 6 | To disable the installer functionality, rename the file and remove the "-installer" part. 7 | 8 | Project repository: https://github.com/chfoo/warcat-rs 9 | 10 | User guide: https://warcat-rs.readthedocs.io/ 11 | 12 | Support: https://github.com/chfoo/warcat-rs/blob/main/.github/SUPPORT.md 13 | 14 | Contributing: https://github.com/chfoo/warcat-rs/blob/main/.github/CONTRIBUTING.md 15 | -------------------------------------------------------------------------------- /xtask/src/doc.rs: -------------------------------------------------------------------------------- 1 | use std::process::Command; 2 | 3 | pub fn build_doc() -> anyhow::Result<()> { 4 | let status = if cfg!(windows) { 5 | Command::new("cmd.exe") 6 | .arg("/c") 7 | .arg("make.bat") 8 | .arg("html") 9 | .current_dir("doc/") 10 | .status()? 11 | } else { 12 | Command::new("make") 13 | .arg("html") 14 | .current_dir("doc/") 15 | .status()? 
16 | }; 17 | 18 | if !status.success() { 19 | anyhow::bail!("command failure {:?}", status.code()) 20 | } else { 21 | Ok(()) 22 | } 23 | } 24 | 25 | pub fn gen_cli_doc() -> anyhow::Result<()> { 26 | let cargo = std::env::var("CARGO")?; 27 | let output = Command::new(cargo) 28 | .arg("run") 29 | .arg("--features=bin") 30 | .arg("--") 31 | .arg("dump-help") 32 | .stderr(std::process::Stdio::inherit()) 33 | .output()?; 34 | 35 | let text = String::from_utf8(output.stdout)?; 36 | let text = text.replace("{title}", "CLI Reference"); 37 | 38 | let text = "% ATTENTION: This file was automatically generated using cargo xtask.\n\ 39 | % Do not manually edit this file!\n\n" 40 | .to_owned() 41 | + &text; 42 | 43 | std::fs::write("doc/cli_reference.md", text.as_bytes())?; 44 | 45 | Ok(()) 46 | } 47 | -------------------------------------------------------------------------------- /xtask/src/gh.rs: -------------------------------------------------------------------------------- 1 | use std::{fs::File, path::Path}; 2 | 3 | use reqwest::{ 4 | blocking::Client, 5 | header::{HeaderMap, HeaderValue}, 6 | }; 7 | use zip::ZipArchive; 8 | 9 | const REPO_USER: &str = "chfoo"; 10 | const REPO_NAME: &str = "warcat-rs"; 11 | 12 | pub fn download_artifacts(access_token: &Path, workflow_id: &str) -> anyhow::Result<()> { 13 | let token = std::fs::read_to_string(access_token)?; 14 | let token = token.trim_ascii(); 15 | 16 | let mut headers = HeaderMap::new(); 17 | let mut token_value = HeaderValue::from_str(&format!("Bearer {}", token))?; 18 | token_value.set_sensitive(true); 19 | headers.insert("Accept", "application/vnd.github+json".try_into()?); 20 | headers.insert("Authorization", token_value); 21 | headers.insert("X-GitHub-Api-Version", "2022-11-28".try_into()?); 22 | headers.insert("User-Agent", "warcat-rs-xtask".try_into()?); 23 | 24 | let client = Client::builder() 25 | .https_only(true) 26 | .gzip(true) 27 | .default_headers(headers) 28 | .build()?; 29 | 30 | eprintln!("Getting artifacts.."); 31 | let response = client 32 | .get(format!( 33 | "https://api.github.com/repos/{}/{}/actions/runs/{}/artifacts", 34 | REPO_USER, REPO_NAME, workflow_id 35 | )) 36 | .send()?; 37 | 38 | eprintln!(" .. {}", response.status()); 39 | 40 | if !response.status().is_success() { 41 | eprintln!(" {:?}", &response); 42 | eprintln!(" {:?}", response.text()); 43 | 44 | anyhow::bail!("response error") 45 | } 46 | 47 | let doc: serde_json::Value = response.json()?; 48 | 49 | let artifacts = doc 50 | .as_object() 51 | .unwrap() 52 | .get("artifacts") 53 | .unwrap() 54 | .as_array() 55 | .unwrap(); 56 | 57 | let artifact_ids: Vec = artifacts 58 | .iter() 59 | .map(|value| { 60 | value 61 | .as_object() 62 | .unwrap() 63 | .get("id") 64 | .unwrap() 65 | .as_u64() 66 | .unwrap() 67 | }) 68 | .collect(); 69 | 70 | let download_dir = tempfile::tempdir()?; 71 | let output_dir = super::package::target_dir()?.join("github-artifacts"); 72 | 73 | eprintln!("Output directory {:?}", output_dir); 74 | std::fs::create_dir_all(&output_dir)?; 75 | 76 | for artifact_id in artifact_ids { 77 | eprintln!("Downloading artifact {}", artifact_id); 78 | let mut response = client 79 | .get(format!( 80 | "https://api.github.com/repos/{}/{}/actions/artifacts/{}/zip", 81 | REPO_USER, REPO_NAME, artifact_id 82 | )) 83 | .send()?; 84 | 85 | eprintln!(" .. 
{}", response.status()); 86 | response.error_for_status_ref()?; 87 | 88 | let artifact_path = download_dir.path().join(format!("{}.zip", artifact_id)); 89 | let mut file = File::options() 90 | .write(true) 91 | .truncate(true) 92 | .create(true) 93 | .open(&artifact_path)?; 94 | std::io::copy(&mut response, &mut file)?; 95 | 96 | eprintln!("Extracting {:?}", &artifact_path); 97 | let file = File::open(&artifact_path)?; 98 | let mut zip = ZipArchive::new(file)?; 99 | zip.extract(&output_dir)?; 100 | } 101 | 102 | download_dir.close()?; 103 | eprintln!("Done"); 104 | Ok(()) 105 | } 106 | -------------------------------------------------------------------------------- /xtask/src/license.rs: -------------------------------------------------------------------------------- 1 | use std::{fs::File, io::Write}; 2 | 3 | use cargo_license::GetDependenciesOpt; 4 | use cargo_license_cargo_metadata::MetadataCommand; 5 | 6 | pub fn generate_license_file() -> anyhow::Result<()> { 7 | let mut command = MetadataCommand::new(); 8 | command.features(cargo_license_cargo_metadata::CargoOpt::SomeFeatures(vec![ 9 | "bin".to_string() 10 | ])); 11 | 12 | let opt = GetDependenciesOpt { 13 | avoid_build_deps: true, 14 | avoid_dev_deps: true, 15 | ..Default::default() 16 | }; 17 | 18 | let dependencies = cargo_license::get_dependencies_from_cargo_lock(command, opt)?; 19 | 20 | let mut file = File::options() 21 | .write(true) 22 | .create(true) 23 | .truncate(true) 24 | .open("xtask/src/dist_license.txt")?; 25 | 26 | writeln!( 27 | file, 28 | "Automatically generated using xtask. Do not manually edit!" 29 | )?; 30 | writeln!(file, "")?; 31 | 32 | writeln!(file, "License")?; 33 | writeln!(file, "=======")?; 34 | writeln!(file)?; 35 | 36 | for dependency in dependencies { 37 | writeln!(file, "{} {}", &dependency.name, &dependency.version)?; 38 | writeln!(file, "-----")?; 39 | writeln!(file)?; 40 | 41 | writeln!(file, "Authors:")?; 42 | for author in dependency 43 | .authors 44 | .as_deref() 45 | .unwrap_or("") 46 | .split("|") 47 | { 48 | writeln!(file, " {}", author)? 49 | } 50 | 51 | writeln!(file, "License:")?; 52 | writeln!( 53 | file, 54 | " {}", 55 | dependency.license.as_deref().unwrap_or("") 56 | )?; 57 | 58 | writeln!(file, "Repository:")?; 59 | writeln!( 60 | file, 61 | " {}", 62 | dependency.repository.as_deref().unwrap_or("") 63 | )?; 64 | 65 | writeln!(file)?; 66 | writeln!(file)?; 67 | } 68 | 69 | Ok(()) 70 | } 71 | -------------------------------------------------------------------------------- /xtask/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | 3 | use clap::{Parser, Subcommand}; 4 | 5 | mod digest; 6 | mod doc; 7 | #[cfg(feature = "bloat")] 8 | mod gh; 9 | mod license; 10 | mod package; 11 | 12 | #[derive(Parser, Debug)] 13 | #[command(version)] 14 | struct Args { 15 | #[command(subcommand)] 16 | command: Command, 17 | } 18 | 19 | #[derive(Debug, Subcommand)] 20 | pub enum Command { 21 | /// Convenience command to build Sphinx HTML user guide. 22 | BuildDoc, 23 | /// Generate CLI reference to doc directory. 24 | GenCliDoc, 25 | /// Package a built release binary along with supporting files for distribution. 26 | PackageBin { target: String }, 27 | /// Download the artifacts from GitHub Actions containing the packages. 28 | DownloadArtifacts { 29 | #[arg(long, short)] 30 | access_token: PathBuf, 31 | #[arg(long, short)] 32 | workflow_id: String, 33 | }, 34 | /// Output a hash of the packages. 
35 | Digests { 36 | #[arg(long)] 37 | minisign_secret_key: Option, 38 | }, 39 | /// Generate the license file of dependencies. 40 | GenLicense, 41 | } 42 | 43 | fn main() -> anyhow::Result<()> { 44 | let args = Args::parse(); 45 | 46 | match args.command { 47 | Command::BuildDoc => crate::doc::build_doc(), 48 | Command::GenCliDoc => crate::doc::gen_cli_doc(), 49 | Command::PackageBin { target } => crate::package::package_bin(&target), 50 | Command::DownloadArtifacts { 51 | access_token, 52 | workflow_id, 53 | } => { 54 | #[cfg(feature = "bloat")] 55 | { 56 | crate::gh::download_artifacts(&access_token, &workflow_id) 57 | } 58 | #[cfg(not(feature = "bloat"))] 59 | { 60 | let _ = access_token; 61 | let _ = workflow_id; 62 | unimplemented!("feature 'bloat' required") 63 | } 64 | } 65 | Command::Digests { 66 | minisign_secret_key, 67 | } => crate::digest::compute_digests(minisign_secret_key.as_deref()), 68 | Command::GenLicense => crate::license::generate_license_file(), 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /xtask/src/package.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | env::consts::EXE_SUFFIX, 3 | io::Write, 4 | path::{Path, PathBuf}, 5 | process::Command, 6 | }; 7 | 8 | use tempfile::NamedTempFile; 9 | 10 | pub fn package_bin(target_triple: &str) -> anyhow::Result<()> { 11 | let packager = Packager::new(target_triple.to_string()); 12 | 13 | match std::env::consts::OS { 14 | "windows" => packager.package_zip(), 15 | "macos" => packager.package_tar("tgz"), 16 | "linux" => packager.package_tar("tar.gz"), 17 | _ => unimplemented!(), 18 | } 19 | } 20 | 21 | struct Packager { 22 | target_triple: String, 23 | } 24 | 25 | impl Packager { 26 | fn new(target_triple: String) -> Self { 27 | Self { target_triple } 28 | } 29 | 30 | fn package_zip(&self) -> anyhow::Result<()> { 31 | let staging_dir = self.prepare_staging_dir()?; 32 | let output_dir = self.prepare_output_dir()?; 33 | let package_name = self.build_package_name()?; 34 | let output_file = output_dir.join(format!("{}.zip", package_name)); 35 | 36 | eprintln!("Creating archive {:?} of {:?}", output_file, staging_dir); 37 | let status = Command::new(r"C:\Program Files\7-Zip\7z.exe") 38 | .arg("a") 39 | .arg(&output_file) 40 | .arg("./") 41 | .current_dir(&staging_dir) 42 | .status()?; 43 | 44 | anyhow::ensure!(status.success()); 45 | eprintln!("Done"); 46 | 47 | Ok(()) 48 | } 49 | 50 | fn package_tar(&self, archive_extension: &str) -> anyhow::Result<()> { 51 | let staging_dir = self.prepare_staging_dir()?; 52 | let output_dir = self.prepare_output_dir()?; 53 | let package_name = self.build_package_name()?; 54 | let output_file = output_dir.join(format!("{}.{}", package_name, archive_extension)); 55 | 56 | let mut staging_dir_contents = Vec::new(); 57 | 58 | for entry in std::fs::read_dir(&staging_dir)? 
{ 59 | let entry = entry?; 60 | 61 | staging_dir_contents.push(entry.file_name()); 62 | } 63 | 64 | eprintln!("Creating archive {:?} of {:?}", output_file, staging_dir); 65 | let status = Command::new("tar") 66 | .arg("-c") 67 | .arg("-f") 68 | .arg(&output_file) 69 | .arg("-v") 70 | .arg("-z") 71 | .args(staging_dir_contents) 72 | .current_dir(&staging_dir) 73 | .status()?; 74 | 75 | anyhow::ensure!(status.success()); 76 | eprintln!("Done"); 77 | 78 | Ok(()) 79 | } 80 | 81 | fn build_package_name(&self) -> anyhow::Result { 82 | let version = package_version()?; 83 | let friendly_target = target_triple_to_friendly_name(&self.target_triple); 84 | let package_name = format!("warcat-{}-{}", version, friendly_target); 85 | 86 | Ok(package_name) 87 | } 88 | 89 | fn prepare_staging_dir(&self) -> anyhow::Result { 90 | let version = package_version()?; 91 | // let package_name = self.build_package_name()?; 92 | 93 | let target_dir = target_dir()?; 94 | let staging_dir = target_dir.join("xtask-package-bin-staging"); 95 | // let content_dir = staging_dir.join(&package_name); 96 | let content_dir = staging_dir.clone(); 97 | 98 | if staging_dir.exists() { 99 | eprintln!("Removing directory {:?}", staging_dir); 100 | std::fs::remove_dir_all(&staging_dir)?; 101 | } 102 | 103 | eprintln!("Creating directory {:?}", content_dir); 104 | std::fs::create_dir_all(&content_dir)?; 105 | 106 | let source_bin_path = target_dir 107 | .join(&self.target_triple) 108 | .join("release") 109 | .join(format!("warcat{}", EXE_SUFFIX)); 110 | 111 | let dest_bin_path = content_dir.join(format!("warcat-{}-installer{}", version, EXE_SUFFIX)); 112 | let license_file = self.license_file()?; 113 | 114 | for (from, to) in [ 115 | (source_bin_path.as_path(), dest_bin_path.as_path()), 116 | (license_file.path(), &content_dir.join("license.txt")), 117 | ( 118 | Path::new("xtask/src/dist_readme.txt"), 119 | &content_dir.join("readme.txt"), 120 | ), 121 | ] { 122 | eprintln!("Copying {:?} -> {:?}", from, to); 123 | std::fs::copy(from, to)?; 124 | } 125 | 126 | Ok(staging_dir) 127 | } 128 | 129 | fn prepare_output_dir(&self) -> anyhow::Result { 130 | let target_dir = target_dir()?; 131 | let output_dir = target_dir.join("xtask-package-bin-output"); 132 | 133 | if output_dir.exists() { 134 | eprintln!("Removing directory {:?}", output_dir); 135 | std::fs::remove_dir_all(&output_dir)?; 136 | } 137 | 138 | eprintln!("Creating directory {:?}", output_dir); 139 | std::fs::create_dir_all(&output_dir)?; 140 | 141 | Ok(output_dir) 142 | } 143 | 144 | fn license_file(&self) -> anyhow::Result { 145 | let mut file = NamedTempFile::new()?; 146 | 147 | let content = std::fs::read_to_string("xtask/src/dist_license.txt")?; 148 | let (_header, content) = content.split_once("").expect("missing license template header"); 149 | let content = content.trim_ascii_start(); 150 | 151 | file.write_all(content.as_bytes())?; 152 | file.flush()?; 153 | 154 | Ok(file) 155 | } 156 | } 157 | 158 | fn target_triple_to_friendly_name(target_triple: &str) -> &str { 159 | match target_triple { 160 | "x86_64-pc-windows-msvc" => "windows-x86_64", 161 | "aarch64-pc-windows-msvc" => "windows-aarch64", 162 | "x86_64-apple-darwin" => "macos-x86_64", 163 | "aarch64-apple-darwin" => "macos-aarch64", 164 | "x86_64-unknown-linux-musl" => "linux-x86_64", 165 | "aarch64-unknown-linux-musl" => "linux-aarch64", 166 | _ => unimplemented!(), 167 | } 168 | } 169 | 170 | pub fn target_dir() -> anyhow::Result { 171 | let metadata = cargo_metadata::MetadataCommand::new().exec()?; 172 | 
Ok(metadata.target_directory.into_std_path_buf()) 173 | } 174 | 175 | fn package_version() -> anyhow::Result<String> { 176 | let metadata = cargo_metadata::MetadataCommand::new().exec()?; 177 | let package = metadata 178 | .packages 179 | .iter() 180 | .find(|package| package.name == "warcat") 181 | .ok_or_else(|| anyhow::anyhow!("couldn't get package version"))?; 182 | Ok(package.version.to_string()) 183 | } 184 | --------------------------------------------------------------------------------
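As a rough sketch (not part of the repository) of how a downloaded release archive could be checked against a published `sha256` digest such as those under `misc/release_digests/`, reusing the same `sha2` and `data-encoding` crates the xtask uses; the file name and expected digest below are placeholders:

```rust
use data_encoding::HEXLOWER;
use sha2::{Digest, Sha256};

fn main() -> std::io::Result<()> {
    // Placeholder values: substitute the real artifact name and the matching
    // `sha256` value from misc/release_digests/v*.toml.
    let path = "warcat-0.0.0-linux-x86_64.tar.gz";
    let expected = "0000000000000000000000000000000000000000000000000000000000000000";

    let data = std::fs::read(path)?;
    let digest = Sha256::digest(&data);

    assert_eq!(HEXLOWER.encode(&digest), expected);
    println!("{path} OK");
    Ok(())
}
```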