├── .github ├── actions │ └── setup-rust │ │ └── action.yml └── workflows │ ├── ci.yml │ ├── fuzz.yml │ ├── miri.yml │ └── release-plz.yml ├── .gitignore ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── benches ├── .gitignore ├── compress.rs └── micro.rs ├── examples ├── file_compressor.rs └── round_trip.rs ├── fuzz ├── .gitignore ├── Cargo.lock ├── Cargo.toml └── fuzz_targets │ ├── fuzz_compress.rs │ └── fuzz_train.rs ├── logo.webp ├── renovate.json ├── rust-toolchain.toml ├── src ├── builder.rs ├── lib.rs └── lossy_pht.rs └── tests ├── correctness.rs └── fixtures ├── art_of_war.txt └── declaration.txt /.github/actions/setup-rust/action.yml: -------------------------------------------------------------------------------- 1 | name: "Setup Rust" 2 | description: "Toolchain setup and Initial compilation" 3 | 4 | inputs: 5 | targets: 6 | description: "optional targets override (e.g. wasm32-unknown-unknown)" 7 | required: false 8 | 9 | runs: 10 | using: "composite" 11 | steps: 12 | - name: Rust Version 13 | id: rust-version 14 | shell: bash 15 | run: echo "version=$(cat rust-toolchain.toml | grep channel | awk -F'\"' '{print $2}')" >> $GITHUB_OUTPUT 16 | 17 | - name: Rust Toolchain 18 | id: rust-toolchain 19 | uses: dtolnay/rust-toolchain@master 20 | if: steps.rustup-cache.outputs.cache-hit != 'true' 21 | with: 22 | toolchain: "${{ steps.rust-version.outputs.version }}" 23 | targets: "${{inputs.targets || ''}}" 24 | components: clippy, rustfmt, miri, llvm-tools-preview 25 | 26 | - name: Rust Dependency Cache 27 | uses: Swatinem/rust-cache@v2 28 | with: 29 | save-if: ${{ github.ref_name == 'develop' }} 30 | shared-key: "rust-cache-${{ runner.os }}-${{ runner.environment }}" # To allow reuse across jobs 31 | 32 | - name: Rust Compile Cache 33 | uses: mozilla-actions/sccache-action@v0.0.9 34 | - name: Rust Compile Cache Config 35 | shell: bash 36 | run: | 37 | echo "SCCACHE_GHA_ENABLED=true" >> $GITHUB_ENV 38 | echo "RUSTC_WRAPPER=sccache" >> $GITHUB_ENV 39 | echo "CARGO_INCREMENTAL=0" >> $GITHUB_ENV 40 | - name: Export Path 41 | shell: bash 42 | run: echo "PATH=$PATH" >> $GITHUB_ENV 43 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: ["develop"] 6 | pull_request: {} 7 | workflow_dispatch: {} 8 | 9 | permissions: 10 | actions: read 11 | contents: read 12 | 13 | jobs: 14 | build: 15 | name: Build 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: ./.github/actions/setup-rust 20 | 21 | 22 | - name: Rust Build 23 | run: cargo build --all-features --all-targets 24 | - name: Rust Lint - Format 25 | run: cargo fmt --all --check 26 | - name: Rust Lint - Clippy 27 | run: cargo clippy --all-features --all-targets 28 | - name: Rust Test 29 | run: cargo test --workspace --all-features 30 | 31 | bench-codspeed: 32 | name: Benchmark with Codspeed 33 | runs-on: ubuntu-latest 34 | steps: 35 | - uses: actions/checkout@v4 36 | - uses: ./.github/actions/setup-rust 37 | 38 | - name: Install Codspeed 39 | shell: bash 40 | run: cargo install --force cargo-codspeed --locked 41 | 42 | - name: Build benchmarks 43 | env: 44 | RUSTFLAGS: "-C target-cpu=native" 45 | run: | 46 | cargo codspeed build --profile bench 47 | 48 | - name: Run benchmarks 49 | uses: CodSpeedHQ/action@v3 50 | with: 51 | run: cargo codspeed run 52 | token: ${{ secrets.CODSPEED_TOKEN }} 
-------------------------------------------------------------------------------- /.github/workflows/fuzz.yml: -------------------------------------------------------------------------------- 1 | name: Fuzz 2 | 3 | on: 4 | schedule: 5 | - cron: "0 0 * * *" # daily 6 | workflow_dispatch: 7 | 8 | jobs: 9 | fuzz: 10 | name: "fuzz" 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Install cargo fuzz 15 | run: cargo install cargo-fuzz 16 | - name: Run fuzzing target 17 | run: cargo fuzz run fuzz_compress -- -max_total_time=600 18 | continue-on-error: true 19 | - name: Archive crash artifacts 20 | uses: actions/upload-artifact@v4 21 | with: 22 | name: fuzzing-crash-artifacts 23 | path: fuzz/artifacts 24 | - name: Archive fuzzing corpus 25 | uses: actions/upload-artifact@v4 26 | with: 27 | name: fuzzing-corpus 28 | path: fuzz/corpus 29 | -------------------------------------------------------------------------------- /.github/workflows/miri.yml: -------------------------------------------------------------------------------- 1 | name: Miri 2 | 3 | on: 4 | push: 5 | branches: ["develop"] 6 | pull_request: {} 7 | workflow_dispatch: {} 8 | 9 | permissions: 10 | actions: read 11 | contents: read 12 | 13 | jobs: 14 | miri: 15 | name: "miri" 16 | runs-on: ubuntu-latest 17 | env: 18 | RUST_BACKTRACE: 1 19 | MIRIFLAGS: -Zmiri-strict-provenance -Zmiri-symbolic-alignment-check -Zmiri-backtrace=full 20 | steps: 21 | - uses: actions/checkout@v4 22 | 23 | - name: Rust Version 24 | id: rust-version 25 | shell: bash 26 | run: echo "version=$(cat rust-toolchain.toml | grep channel | awk -F'\"' '{print $2}')" >> $GITHUB_OUTPUT 27 | 28 | - name: Rust Toolchain 29 | id: rust-toolchain 30 | uses: dtolnay/rust-toolchain@master 31 | if: steps.rustup-cache.outputs.cache-hit != 'true' 32 | with: 33 | toolchain: "${{ steps.rust-version.outputs.version }}" 34 | components: miri 35 | 36 | - name: Rust Dependency Cache 37 | uses: Swatinem/rust-cache@v2 38 | with: 39 | save-if: ${{ github.ref == 'refs/heads/develop' }} 40 | shared-key: "shared" # To allow reuse across jobs 41 | 42 | - name: Rust Compile Cache 43 | uses: mozilla-actions/sccache-action@v0.0.9 44 | - name: Rust Compile Cache Config 45 | shell: bash 46 | run: | 47 | echo "SCCACHE_GHA_ENABLED=true" >> $GITHUB_ENV 48 | echo "RUSTC_WRAPPER=sccache" >> $GITHUB_ENV 49 | echo "CARGO_INCREMENTAL=0" >> $GITHUB_ENV 50 | 51 | - name: Run tests with Miri 52 | run: cargo miri test 53 | -------------------------------------------------------------------------------- /.github/workflows/release-plz.yml: -------------------------------------------------------------------------------- 1 | name: Release-plz 2 | 3 | permissions: 4 | pull-requests: write 5 | contents: write 6 | 7 | on: 8 | push: 9 | branches: 10 | - develop 11 | 12 | jobs: 13 | release-plz: 14 | name: Release-plz 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout repository 18 | uses: actions/checkout@v4 19 | with: 20 | fetch-depth: 0 21 | - name: Install Rust toolchain 22 | uses: dtolnay/rust-toolchain@stable 23 | - name: Run release-plz 24 | uses: MarcoIeni/release-plz-action@v0.5 25 | env: 26 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 27 | CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | .idea/ 3 | 
-------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [Unreleased] 8 | 9 | ## [0.5.2](https://github.com/spiraldb/fsst/compare/v0.5.1...v0.5.2) - 2025-03-17 10 | 11 | ### Added 12 | 13 | - add rebuild from existing function ([#84](https://github.com/spiraldb/fsst/pull/84)) 14 | 15 | ## [0.5.1](https://github.com/spiraldb/fsst/compare/v0.5.0...v0.5.1) - 2025-03-12 16 | 17 | ### Other 18 | 19 | - Decompress in 8-byte blocks ([#82](https://github.com/spiraldb/fsst/pull/82)) 20 | - *(deps)* lock file maintenance ([#83](https://github.com/spiraldb/fsst/pull/83)) 21 | - Assert enough room in decoded buffer ([#79](https://github.com/spiraldb/fsst/pull/79)) 22 | - *(deps)* update rust crate criterion to v2.9.1 ([#80](https://github.com/spiraldb/fsst/pull/80)) 23 | - *(deps)* update mozilla-actions/sccache-action action to v0.0.8 ([#78](https://github.com/spiraldb/fsst/pull/78)) 24 | - *(deps)* lock file maintenance ([#77](https://github.com/spiraldb/fsst/pull/77)) 25 | - Add codspeed ([#76](https://github.com/spiraldb/fsst/pull/76)) 26 | 27 | ## [0.5.0](https://github.com/spiraldb/fsst/compare/v0.4.4...v0.5.0) - 2025-01-31 28 | 29 | ### Added 30 | 31 | - allow specifying exact size of decompress buffer (#70) 32 | 33 | ## [0.4.4](https://github.com/spiraldb/fsst/compare/v0.4.3...v0.4.4) - 2024-12-30 34 | 35 | ### Added 36 | 37 | - add dbtext decompression benchmark (#44) 38 | 39 | ### Other 40 | 41 | - Decode into an uninitialized byte slice ([#63](https://github.com/spiraldb/fsst/pull/63)) 42 | - *(deps)* lock file maintenance (#62) 43 | - *(deps)* lock file maintenance (#60) 44 | - *(deps)* lock file maintenance (#59) 45 | - *(deps)* update mozilla-actions/sccache-action action to v0.0.7 (#58) 46 | - *(deps)* lock file maintenance (#57) 47 | - *(deps)* lock file maintenance (#56) 48 | - *(deps)* lock file maintenance (#55) 49 | - *(deps)* lock file maintenance (#54) 50 | - *(deps)* lock file maintenance (#53) 51 | - *(deps)* lock file maintenance (#50) 52 | - *(deps)* lock file maintenance (#49) 53 | - *(deps)* lock file maintenance (#47) 54 | - *(deps)* lock file maintenance (#46) 55 | 56 | ## [0.4.3](https://github.com/spiraldb/fsst/compare/v0.4.2...v0.4.3) - 2024-10-03 57 | 58 | ### Added 59 | 60 | - 35% faster decompression with less boundary check ([#41](https://github.com/spiraldb/fsst/pull/41)) 61 | 62 | ### Other 63 | 64 | - *(deps)* update rust crate curl to v0.4.47 ([#40](https://github.com/spiraldb/fsst/pull/40)) 65 | - *(deps)* update mozilla-actions/sccache-action action to v0.0.6 ([#38](https://github.com/spiraldb/fsst/pull/38)) 66 | 67 | ## [0.4.2](https://github.com/spiraldb/fsst/compare/v0.4.1...v0.4.2) - 2024-09-17 68 | 69 | ### Fixed 70 | 71 | - search for first non-empty chunk ([#35](https://github.com/spiraldb/fsst/pull/35)) 72 | - docs first3 ([#33](https://github.com/spiraldb/fsst/pull/33)) 73 | 74 | ### Other 75 | 76 | - Assertion should allow empty compression ([#36](https://github.com/spiraldb/fsst/pull/36)) 77 | 78 | ## [0.4.1](https://github.com/spiraldb/fsst/compare/v0.4.0...v0.4.1) - 2024-09-12 79 | 80 | ### Other 81 | 82 | - Use wrapping operations in fsst_hash 
([#31](https://github.com/spiraldb/fsst/pull/31)) 83 | 84 | ## [0.4.0](https://github.com/spiraldb/fsst/compare/v0.3.0...v0.4.0) - 2024-09-03 85 | 86 | ### Fixed 87 | - hash_table_sizing, inline hints, lint rule ([#29](https://github.com/spiraldb/fsst/pull/29)) 88 | 89 | ## [0.3.0](https://github.com/spiraldb/fsst/compare/v0.2.3...v0.3.0) - 2024-09-03 90 | 91 | ### Added 92 | - port in more from the C++ code ([#24](https://github.com/spiraldb/fsst/pull/24)) 93 | 94 | ### Other 95 | - centering ([#26](https://github.com/spiraldb/fsst/pull/26)) 96 | 97 | ## [0.2.3](https://github.com/spiraldb/fsst/compare/v0.2.2...v0.2.3) - 2024-08-22 98 | 99 | ### Added 100 | - reuse and clear instead of allocate, 2x speedup ([#22](https://github.com/spiraldb/fsst/pull/22)) 101 | 102 | ## [0.2.2](https://github.com/spiraldb/fsst/compare/v0.2.1...v0.2.2) - 2024-08-21 103 | 104 | ### Other 105 | - implement second bitmap, ~2x speedup for train ([#21](https://github.com/spiraldb/fsst/pull/21)) 106 | - remove spurious check ([#18](https://github.com/spiraldb/fsst/pull/18)) 107 | 108 | ## [0.2.1](https://github.com/spiraldb/fsst/compare/v0.2.0...v0.2.1) - 2024-08-20 109 | 110 | ### Added 111 | - make Compressor::train 2x faster with bitmap index ([#16](https://github.com/spiraldb/fsst/pull/16)) 112 | 113 | ## [0.2.0](https://github.com/spiraldb/fsst/compare/v0.1.0...v0.2.0) - 2024-08-20 114 | 115 | ### Other 116 | - tput improvements ([#13](https://github.com/spiraldb/fsst/pull/13)) 117 | 118 | ## [0.1.0](https://github.com/spiraldb/fsst/compare/v0.0.1...v0.1.0) - 2024-08-16 119 | 120 | ### Added 121 | - separate Compressor and Decompressor ([#11](https://github.com/spiraldb/fsst/pull/11)) 122 | 123 | ### Other 124 | - add badges ([#10](https://github.com/spiraldb/fsst/pull/10)) 125 | - release v0.0.1 ([#8](https://github.com/spiraldb/fsst/pull/8)) 126 | 127 | ## [0.0.1](https://github.com/spiraldb/fsst/releases/tag/v0.0.1) - 2024-08-15 128 | 129 | ### Fixed 130 | - fix doc link 131 | 132 | ### Other 133 | - turn on release-plz 134 | - add fuzzer, fix bug ([#7](https://github.com/spiraldb/fsst/pull/7)) 135 | - logo ([#6](https://github.com/spiraldb/fsst/pull/6)) 136 | - bugfix, comment fix, force compile fails for big-endian ([#5](https://github.com/spiraldb/fsst/pull/5)) 137 | - Configure Renovate ([#1](https://github.com/spiraldb/fsst/pull/1)) 138 | - Get compress performance to match paper algorithm 4 ([#3](https://github.com/spiraldb/fsst/pull/3)) 139 | - docs 140 | - cleanup 141 | - words 142 | - README 143 | - disable release action for now 144 | - deny(missing_docs), 512 -> 511 145 | - add toolchain 146 | - add actions files 147 | - implementation v0 148 | - initial impl 149 | - Initial commit 150 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 4 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "1.1.3" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "anes" 16 | version = "0.1.6" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" 19 | 20 | [[package]] 21 | name = "anstyle" 22 | version = "1.0.10" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" 25 | 26 | [[package]] 27 | name = "autocfg" 28 | version = "1.4.0" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" 31 | 32 | [[package]] 33 | name = "bitflags" 34 | version = "2.9.1" 35 | source = "registry+https://github.com/rust-lang/crates.io-index" 36 | checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" 37 | 38 | [[package]] 39 | name = "bumpalo" 40 | version = "3.17.0" 41 | source = "registry+https://github.com/rust-lang/crates.io-index" 42 | checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" 43 | 44 | [[package]] 45 | name = "cast" 46 | version = "0.3.0" 47 | source = "registry+https://github.com/rust-lang/crates.io-index" 48 | checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" 49 | 50 | [[package]] 51 | name = "cc" 52 | version = "1.2.24" 53 | source = "registry+https://github.com/rust-lang/crates.io-index" 54 | checksum = "16595d3be041c03b09d08d0858631facccee9221e579704070e6e9e4915d3bc7" 55 | dependencies = [ 56 | "shlex", 57 | ] 58 | 59 | [[package]] 60 | name = "cfg-if" 61 | version = "1.0.0" 62 | source = "registry+https://github.com/rust-lang/crates.io-index" 63 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 64 | 65 | [[package]] 66 | name = "ciborium" 67 | version = "0.2.2" 68 | source = "registry+https://github.com/rust-lang/crates.io-index" 69 | checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" 70 | dependencies = [ 71 | "ciborium-io", 72 | "ciborium-ll", 73 | "serde", 74 | ] 75 | 76 | [[package]] 77 | name = "ciborium-io" 78 | version = "0.2.2" 79 | source = "registry+https://github.com/rust-lang/crates.io-index" 80 | checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" 81 | 82 | [[package]] 83 | name = "ciborium-ll" 84 | version = "0.2.2" 85 | source = "registry+https://github.com/rust-lang/crates.io-index" 86 | checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" 87 | dependencies = [ 88 | "ciborium-io", 89 | "half", 90 | ] 91 | 92 | [[package]] 93 | name = "clap" 94 | version = "4.5.38" 95 | source = "registry+https://github.com/rust-lang/crates.io-index" 96 | checksum = "ed93b9805f8ba930df42c2590f05453d5ec36cbb85d018868a5b24d31f6ac000" 97 | dependencies = [ 98 | "clap_builder", 99 | ] 100 | 101 | [[package]] 102 | name = "clap_builder" 103 | version = "4.5.38" 104 | source = "registry+https://github.com/rust-lang/crates.io-index" 105 | checksum = "379026ff283facf611b0ea629334361c4211d1b12ee01024eec1591133b04120" 106 | dependencies = [ 107 | "anstyle", 108 | "clap_lex", 109 | ] 110 | 111 | [[package]] 112 | name = "clap_lex" 113 | version = "0.7.4" 114 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 115 | checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" 116 | 117 | [[package]] 118 | name = "codspeed" 119 | version = "2.10.1" 120 | source = "registry+https://github.com/rust-lang/crates.io-index" 121 | checksum = "93f4cce9c27c49c4f101fffeebb1826f41a9df2e7498b7cd4d95c0658b796c6c" 122 | dependencies = [ 123 | "colored", 124 | "libc", 125 | "serde", 126 | "serde_json", 127 | "uuid", 128 | ] 129 | 130 | [[package]] 131 | name = "codspeed-criterion-compat" 132 | version = "2.10.1" 133 | source = "registry+https://github.com/rust-lang/crates.io-index" 134 | checksum = "c3c23d880a28a2aab52d38ca8481dd7a3187157d0a952196b6db1db3c8499725" 135 | dependencies = [ 136 | "codspeed", 137 | "codspeed-criterion-compat-walltime", 138 | "colored", 139 | ] 140 | 141 | [[package]] 142 | name = "codspeed-criterion-compat-walltime" 143 | version = "2.10.1" 144 | source = "registry+https://github.com/rust-lang/crates.io-index" 145 | checksum = "7b0a2f7365e347f4f22a67e9ea689bf7bc89900a354e22e26cf8a531a42c8fbb" 146 | dependencies = [ 147 | "anes", 148 | "cast", 149 | "ciborium", 150 | "clap", 151 | "codspeed", 152 | "criterion-plot", 153 | "is-terminal", 154 | "itertools", 155 | "num-traits", 156 | "once_cell", 157 | "oorandom", 158 | "plotters", 159 | "rayon", 160 | "regex", 161 | "serde", 162 | "serde_derive", 163 | "serde_json", 164 | "tinytemplate", 165 | "walkdir", 166 | ] 167 | 168 | [[package]] 169 | name = "colored" 170 | version = "2.2.0" 171 | source = "registry+https://github.com/rust-lang/crates.io-index" 172 | checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" 173 | dependencies = [ 174 | "lazy_static", 175 | "windows-sys 0.59.0", 176 | ] 177 | 178 | [[package]] 179 | name = "criterion-plot" 180 | version = "0.5.0" 181 | source = "registry+https://github.com/rust-lang/crates.io-index" 182 | checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" 183 | dependencies = [ 184 | "cast", 185 | "itertools", 186 | ] 187 | 188 | [[package]] 189 | name = "crossbeam-deque" 190 | version = "0.8.6" 191 | source = "registry+https://github.com/rust-lang/crates.io-index" 192 | checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" 193 | dependencies = [ 194 | "crossbeam-epoch", 195 | "crossbeam-utils", 196 | ] 197 | 198 | [[package]] 199 | name = "crossbeam-epoch" 200 | version = "0.9.18" 201 | source = "registry+https://github.com/rust-lang/crates.io-index" 202 | checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" 203 | dependencies = [ 204 | "crossbeam-utils", 205 | ] 206 | 207 | [[package]] 208 | name = "crossbeam-utils" 209 | version = "0.8.21" 210 | source = "registry+https://github.com/rust-lang/crates.io-index" 211 | checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" 212 | 213 | [[package]] 214 | name = "crunchy" 215 | version = "0.2.3" 216 | source = "registry+https://github.com/rust-lang/crates.io-index" 217 | checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" 218 | 219 | [[package]] 220 | name = "curl" 221 | version = "0.4.47" 222 | source = "registry+https://github.com/rust-lang/crates.io-index" 223 | checksum = "d9fb4d13a1be2b58f14d60adba57c9834b78c62fd86c3e76a148f732686e9265" 224 | dependencies = [ 225 | "curl-sys", 226 | "libc", 227 | "openssl-probe", 228 | "openssl-sys", 229 | "schannel", 230 | "socket2", 231 | "windows-sys 0.52.0", 232 | ] 233 | 234 | [[package]] 235 
| name = "curl-sys" 236 | version = "0.4.80+curl-8.12.1" 237 | source = "registry+https://github.com/rust-lang/crates.io-index" 238 | checksum = "55f7df2eac63200c3ab25bde3b2268ef2ee56af3d238e76d61f01c3c49bff734" 239 | dependencies = [ 240 | "cc", 241 | "libc", 242 | "libz-sys", 243 | "openssl-sys", 244 | "pkg-config", 245 | "vcpkg", 246 | "windows-sys 0.52.0", 247 | ] 248 | 249 | [[package]] 250 | name = "either" 251 | version = "1.15.0" 252 | source = "registry+https://github.com/rust-lang/crates.io-index" 253 | checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" 254 | 255 | [[package]] 256 | name = "fsst-rs" 257 | version = "0.5.2" 258 | dependencies = [ 259 | "codspeed-criterion-compat", 260 | "curl", 261 | ] 262 | 263 | [[package]] 264 | name = "getrandom" 265 | version = "0.3.3" 266 | source = "registry+https://github.com/rust-lang/crates.io-index" 267 | checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" 268 | dependencies = [ 269 | "cfg-if", 270 | "libc", 271 | "r-efi", 272 | "wasi", 273 | ] 274 | 275 | [[package]] 276 | name = "half" 277 | version = "2.6.0" 278 | source = "registry+https://github.com/rust-lang/crates.io-index" 279 | checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" 280 | dependencies = [ 281 | "cfg-if", 282 | "crunchy", 283 | ] 284 | 285 | [[package]] 286 | name = "hermit-abi" 287 | version = "0.5.1" 288 | source = "registry+https://github.com/rust-lang/crates.io-index" 289 | checksum = "f154ce46856750ed433c8649605bf7ed2de3bc35fd9d2a9f30cddd873c80cb08" 290 | 291 | [[package]] 292 | name = "is-terminal" 293 | version = "0.4.16" 294 | source = "registry+https://github.com/rust-lang/crates.io-index" 295 | checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" 296 | dependencies = [ 297 | "hermit-abi", 298 | "libc", 299 | "windows-sys 0.59.0", 300 | ] 301 | 302 | [[package]] 303 | name = "itertools" 304 | version = "0.10.5" 305 | source = "registry+https://github.com/rust-lang/crates.io-index" 306 | checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" 307 | dependencies = [ 308 | "either", 309 | ] 310 | 311 | [[package]] 312 | name = "itoa" 313 | version = "1.0.15" 314 | source = "registry+https://github.com/rust-lang/crates.io-index" 315 | checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" 316 | 317 | [[package]] 318 | name = "js-sys" 319 | version = "0.3.77" 320 | source = "registry+https://github.com/rust-lang/crates.io-index" 321 | checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" 322 | dependencies = [ 323 | "once_cell", 324 | "wasm-bindgen", 325 | ] 326 | 327 | [[package]] 328 | name = "lazy_static" 329 | version = "1.5.0" 330 | source = "registry+https://github.com/rust-lang/crates.io-index" 331 | checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" 332 | 333 | [[package]] 334 | name = "libc" 335 | version = "0.2.172" 336 | source = "registry+https://github.com/rust-lang/crates.io-index" 337 | checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" 338 | 339 | [[package]] 340 | name = "libz-sys" 341 | version = "1.1.22" 342 | source = "registry+https://github.com/rust-lang/crates.io-index" 343 | checksum = "8b70e7a7df205e92a1a4cd9aaae7898dac0aa555503cc0a649494d0d60e7651d" 344 | dependencies = [ 345 | "cc", 346 | "libc", 347 | "pkg-config", 348 | "vcpkg", 349 | ] 350 | 351 | [[package]] 352 | name = "log" 353 | version = "0.4.27" 354 | 
source = "registry+https://github.com/rust-lang/crates.io-index" 355 | checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" 356 | 357 | [[package]] 358 | name = "memchr" 359 | version = "2.7.4" 360 | source = "registry+https://github.com/rust-lang/crates.io-index" 361 | checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" 362 | 363 | [[package]] 364 | name = "num-traits" 365 | version = "0.2.19" 366 | source = "registry+https://github.com/rust-lang/crates.io-index" 367 | checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" 368 | dependencies = [ 369 | "autocfg", 370 | ] 371 | 372 | [[package]] 373 | name = "once_cell" 374 | version = "1.21.3" 375 | source = "registry+https://github.com/rust-lang/crates.io-index" 376 | checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" 377 | 378 | [[package]] 379 | name = "oorandom" 380 | version = "11.1.5" 381 | source = "registry+https://github.com/rust-lang/crates.io-index" 382 | checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" 383 | 384 | [[package]] 385 | name = "openssl-probe" 386 | version = "0.1.6" 387 | source = "registry+https://github.com/rust-lang/crates.io-index" 388 | checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" 389 | 390 | [[package]] 391 | name = "openssl-sys" 392 | version = "0.9.108" 393 | source = "registry+https://github.com/rust-lang/crates.io-index" 394 | checksum = "e145e1651e858e820e4860f7b9c5e169bc1d8ce1c86043be79fa7b7634821847" 395 | dependencies = [ 396 | "cc", 397 | "libc", 398 | "pkg-config", 399 | "vcpkg", 400 | ] 401 | 402 | [[package]] 403 | name = "pkg-config" 404 | version = "0.3.32" 405 | source = "registry+https://github.com/rust-lang/crates.io-index" 406 | checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" 407 | 408 | [[package]] 409 | name = "plotters" 410 | version = "0.3.7" 411 | source = "registry+https://github.com/rust-lang/crates.io-index" 412 | checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" 413 | dependencies = [ 414 | "num-traits", 415 | "plotters-backend", 416 | "plotters-svg", 417 | "wasm-bindgen", 418 | "web-sys", 419 | ] 420 | 421 | [[package]] 422 | name = "plotters-backend" 423 | version = "0.3.7" 424 | source = "registry+https://github.com/rust-lang/crates.io-index" 425 | checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" 426 | 427 | [[package]] 428 | name = "plotters-svg" 429 | version = "0.3.7" 430 | source = "registry+https://github.com/rust-lang/crates.io-index" 431 | checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" 432 | dependencies = [ 433 | "plotters-backend", 434 | ] 435 | 436 | [[package]] 437 | name = "proc-macro2" 438 | version = "1.0.95" 439 | source = "registry+https://github.com/rust-lang/crates.io-index" 440 | checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" 441 | dependencies = [ 442 | "unicode-ident", 443 | ] 444 | 445 | [[package]] 446 | name = "quote" 447 | version = "1.0.40" 448 | source = "registry+https://github.com/rust-lang/crates.io-index" 449 | checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" 450 | dependencies = [ 451 | "proc-macro2", 452 | ] 453 | 454 | [[package]] 455 | name = "r-efi" 456 | version = "5.2.0" 457 | source = "registry+https://github.com/rust-lang/crates.io-index" 458 | checksum = 
"74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" 459 | 460 | [[package]] 461 | name = "rayon" 462 | version = "1.10.0" 463 | source = "registry+https://github.com/rust-lang/crates.io-index" 464 | checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" 465 | dependencies = [ 466 | "either", 467 | "rayon-core", 468 | ] 469 | 470 | [[package]] 471 | name = "rayon-core" 472 | version = "1.12.1" 473 | source = "registry+https://github.com/rust-lang/crates.io-index" 474 | checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" 475 | dependencies = [ 476 | "crossbeam-deque", 477 | "crossbeam-utils", 478 | ] 479 | 480 | [[package]] 481 | name = "regex" 482 | version = "1.11.1" 483 | source = "registry+https://github.com/rust-lang/crates.io-index" 484 | checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" 485 | dependencies = [ 486 | "aho-corasick", 487 | "memchr", 488 | "regex-automata", 489 | "regex-syntax", 490 | ] 491 | 492 | [[package]] 493 | name = "regex-automata" 494 | version = "0.4.9" 495 | source = "registry+https://github.com/rust-lang/crates.io-index" 496 | checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" 497 | dependencies = [ 498 | "aho-corasick", 499 | "memchr", 500 | "regex-syntax", 501 | ] 502 | 503 | [[package]] 504 | name = "regex-syntax" 505 | version = "0.8.5" 506 | source = "registry+https://github.com/rust-lang/crates.io-index" 507 | checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" 508 | 509 | [[package]] 510 | name = "rustversion" 511 | version = "1.0.21" 512 | source = "registry+https://github.com/rust-lang/crates.io-index" 513 | checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" 514 | 515 | [[package]] 516 | name = "ryu" 517 | version = "1.0.20" 518 | source = "registry+https://github.com/rust-lang/crates.io-index" 519 | checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" 520 | 521 | [[package]] 522 | name = "same-file" 523 | version = "1.0.6" 524 | source = "registry+https://github.com/rust-lang/crates.io-index" 525 | checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" 526 | dependencies = [ 527 | "winapi-util", 528 | ] 529 | 530 | [[package]] 531 | name = "schannel" 532 | version = "0.1.27" 533 | source = "registry+https://github.com/rust-lang/crates.io-index" 534 | checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d" 535 | dependencies = [ 536 | "windows-sys 0.59.0", 537 | ] 538 | 539 | [[package]] 540 | name = "serde" 541 | version = "1.0.219" 542 | source = "registry+https://github.com/rust-lang/crates.io-index" 543 | checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" 544 | dependencies = [ 545 | "serde_derive", 546 | ] 547 | 548 | [[package]] 549 | name = "serde_derive" 550 | version = "1.0.219" 551 | source = "registry+https://github.com/rust-lang/crates.io-index" 552 | checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" 553 | dependencies = [ 554 | "proc-macro2", 555 | "quote", 556 | "syn", 557 | ] 558 | 559 | [[package]] 560 | name = "serde_json" 561 | version = "1.0.140" 562 | source = "registry+https://github.com/rust-lang/crates.io-index" 563 | checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" 564 | dependencies = [ 565 | "itoa", 566 | "memchr", 567 | "ryu", 568 | "serde", 569 | ] 570 | 571 | [[package]] 572 | name = "shlex" 573 | 
version = "1.3.0" 574 | source = "registry+https://github.com/rust-lang/crates.io-index" 575 | checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" 576 | 577 | [[package]] 578 | name = "socket2" 579 | version = "0.5.9" 580 | source = "registry+https://github.com/rust-lang/crates.io-index" 581 | checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef" 582 | dependencies = [ 583 | "libc", 584 | "windows-sys 0.52.0", 585 | ] 586 | 587 | [[package]] 588 | name = "syn" 589 | version = "2.0.101" 590 | source = "registry+https://github.com/rust-lang/crates.io-index" 591 | checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" 592 | dependencies = [ 593 | "proc-macro2", 594 | "quote", 595 | "unicode-ident", 596 | ] 597 | 598 | [[package]] 599 | name = "tinytemplate" 600 | version = "1.2.1" 601 | source = "registry+https://github.com/rust-lang/crates.io-index" 602 | checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" 603 | dependencies = [ 604 | "serde", 605 | "serde_json", 606 | ] 607 | 608 | [[package]] 609 | name = "unicode-ident" 610 | version = "1.0.18" 611 | source = "registry+https://github.com/rust-lang/crates.io-index" 612 | checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" 613 | 614 | [[package]] 615 | name = "uuid" 616 | version = "1.17.0" 617 | source = "registry+https://github.com/rust-lang/crates.io-index" 618 | checksum = "3cf4199d1e5d15ddd86a694e4d0dffa9c323ce759fea589f00fef9d81cc1931d" 619 | dependencies = [ 620 | "getrandom", 621 | "js-sys", 622 | "wasm-bindgen", 623 | ] 624 | 625 | [[package]] 626 | name = "vcpkg" 627 | version = "0.2.15" 628 | source = "registry+https://github.com/rust-lang/crates.io-index" 629 | checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" 630 | 631 | [[package]] 632 | name = "walkdir" 633 | version = "2.5.0" 634 | source = "registry+https://github.com/rust-lang/crates.io-index" 635 | checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" 636 | dependencies = [ 637 | "same-file", 638 | "winapi-util", 639 | ] 640 | 641 | [[package]] 642 | name = "wasi" 643 | version = "0.14.2+wasi-0.2.4" 644 | source = "registry+https://github.com/rust-lang/crates.io-index" 645 | checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" 646 | dependencies = [ 647 | "wit-bindgen-rt", 648 | ] 649 | 650 | [[package]] 651 | name = "wasm-bindgen" 652 | version = "0.2.100" 653 | source = "registry+https://github.com/rust-lang/crates.io-index" 654 | checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" 655 | dependencies = [ 656 | "cfg-if", 657 | "once_cell", 658 | "rustversion", 659 | "wasm-bindgen-macro", 660 | ] 661 | 662 | [[package]] 663 | name = "wasm-bindgen-backend" 664 | version = "0.2.100" 665 | source = "registry+https://github.com/rust-lang/crates.io-index" 666 | checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" 667 | dependencies = [ 668 | "bumpalo", 669 | "log", 670 | "proc-macro2", 671 | "quote", 672 | "syn", 673 | "wasm-bindgen-shared", 674 | ] 675 | 676 | [[package]] 677 | name = "wasm-bindgen-macro" 678 | version = "0.2.100" 679 | source = "registry+https://github.com/rust-lang/crates.io-index" 680 | checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" 681 | dependencies = [ 682 | "quote", 683 | "wasm-bindgen-macro-support", 684 | ] 685 | 686 | [[package]] 687 | name = "wasm-bindgen-macro-support" 
688 | version = "0.2.100" 689 | source = "registry+https://github.com/rust-lang/crates.io-index" 690 | checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" 691 | dependencies = [ 692 | "proc-macro2", 693 | "quote", 694 | "syn", 695 | "wasm-bindgen-backend", 696 | "wasm-bindgen-shared", 697 | ] 698 | 699 | [[package]] 700 | name = "wasm-bindgen-shared" 701 | version = "0.2.100" 702 | source = "registry+https://github.com/rust-lang/crates.io-index" 703 | checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" 704 | dependencies = [ 705 | "unicode-ident", 706 | ] 707 | 708 | [[package]] 709 | name = "web-sys" 710 | version = "0.3.77" 711 | source = "registry+https://github.com/rust-lang/crates.io-index" 712 | checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" 713 | dependencies = [ 714 | "js-sys", 715 | "wasm-bindgen", 716 | ] 717 | 718 | [[package]] 719 | name = "winapi-util" 720 | version = "0.1.9" 721 | source = "registry+https://github.com/rust-lang/crates.io-index" 722 | checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" 723 | dependencies = [ 724 | "windows-sys 0.59.0", 725 | ] 726 | 727 | [[package]] 728 | name = "windows-sys" 729 | version = "0.52.0" 730 | source = "registry+https://github.com/rust-lang/crates.io-index" 731 | checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" 732 | dependencies = [ 733 | "windows-targets", 734 | ] 735 | 736 | [[package]] 737 | name = "windows-sys" 738 | version = "0.59.0" 739 | source = "registry+https://github.com/rust-lang/crates.io-index" 740 | checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" 741 | dependencies = [ 742 | "windows-targets", 743 | ] 744 | 745 | [[package]] 746 | name = "windows-targets" 747 | version = "0.52.6" 748 | source = "registry+https://github.com/rust-lang/crates.io-index" 749 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 750 | dependencies = [ 751 | "windows_aarch64_gnullvm", 752 | "windows_aarch64_msvc", 753 | "windows_i686_gnu", 754 | "windows_i686_gnullvm", 755 | "windows_i686_msvc", 756 | "windows_x86_64_gnu", 757 | "windows_x86_64_gnullvm", 758 | "windows_x86_64_msvc", 759 | ] 760 | 761 | [[package]] 762 | name = "windows_aarch64_gnullvm" 763 | version = "0.52.6" 764 | source = "registry+https://github.com/rust-lang/crates.io-index" 765 | checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 766 | 767 | [[package]] 768 | name = "windows_aarch64_msvc" 769 | version = "0.52.6" 770 | source = "registry+https://github.com/rust-lang/crates.io-index" 771 | checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 772 | 773 | [[package]] 774 | name = "windows_i686_gnu" 775 | version = "0.52.6" 776 | source = "registry+https://github.com/rust-lang/crates.io-index" 777 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 778 | 779 | [[package]] 780 | name = "windows_i686_gnullvm" 781 | version = "0.52.6" 782 | source = "registry+https://github.com/rust-lang/crates.io-index" 783 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 784 | 785 | [[package]] 786 | name = "windows_i686_msvc" 787 | version = "0.52.6" 788 | source = "registry+https://github.com/rust-lang/crates.io-index" 789 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 790 | 791 | [[package]] 792 | name = "windows_x86_64_gnu" 793 | version = "0.52.6" 
794 | source = "registry+https://github.com/rust-lang/crates.io-index" 795 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 796 | 797 | [[package]] 798 | name = "windows_x86_64_gnullvm" 799 | version = "0.52.6" 800 | source = "registry+https://github.com/rust-lang/crates.io-index" 801 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 802 | 803 | [[package]] 804 | name = "windows_x86_64_msvc" 805 | version = "0.52.6" 806 | source = "registry+https://github.com/rust-lang/crates.io-index" 807 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 808 | 809 | [[package]] 810 | name = "wit-bindgen-rt" 811 | version = "0.39.0" 812 | source = "registry+https://github.com/rust-lang/crates.io-index" 813 | checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" 814 | dependencies = [ 815 | "bitflags", 816 | ] 817 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "fsst-rs" 3 | version = "0.5.2" 4 | description = "Pure-Rust implementation of Fast Static Symbol Tables algorithm for string compression" 5 | authors = ["SpiralDB Developers "] 6 | license = "Apache-2.0" 7 | repository = "https://github.com/spiraldb/fsst" 8 | edition = "2024" 9 | 10 | [lib] 11 | name = "fsst" 12 | 13 | [lints.rust] 14 | warnings = "deny" 15 | missing_docs = "deny" 16 | 17 | [lints.clippy] 18 | all = { level = "deny", priority = -1 } 19 | if_then_some_else_none = { level = "deny" } 20 | mem_forget = { level = "deny" } 21 | or_fun_call = "deny" 22 | panic_in_result_fn = { level = "deny" } 23 | same_name_method = { level = "deny" } 24 | tests_outside_test_module = { level = "deny" } 25 | unwrap_in_result = { level = "deny" } 26 | use_debug = { level = "deny" } 27 | 28 | [dev-dependencies] 29 | criterion = { package = "codspeed-criterion-compat", version = "2.8" } 30 | curl = "0.4" 31 | 32 | [[example]] 33 | name = "round_trip" 34 | bench = false 35 | test = false 36 | 37 | [[bench]] 38 | name = "compress" 39 | harness = false 40 | 41 | [[bench]] 42 | name = "micro" 43 | harness = false 44 | 45 | [[test]] 46 | name = "correctness" 47 | test = true 48 | bench = false 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | ![fsst-rs logo](logo.webp) 3 |

4 | 5 | ![Crates.io Version](https://img.shields.io/crates/v/fsst_rs) 6 | ![docs.rs](https://img.shields.io/docsrs/fsst-rs) 7 | ![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/spiraldb/fsst/ci.yml?branch=develop) 8 | 9 | 10 | # fsst-rs 11 | 12 | 13 | A pure-Rust, zero-dependency implementation of the [FSST string compression algorithm][whitepaper]. 14 | 15 | FSST is a string compression algorithm meant for use in database systems. It was designed by 16 | [Peter Boncz, Thomas Neumann, and Viktor Leis][whitepaper]. It provides 1-3GB/sec compression 17 | and decompression of strings at compression ratios competitive with or better than LZ4. 18 | 19 | This implementation is somewhat inspired by the [MIT-licensed implementation] from the paper authors, written in C++, 20 | but it is mostly written from a careful reading of the paper. 21 | 22 | **NOTE: This implementation is still in progress and is not production-ready; use it at your own risk.** 23 | 24 | **NOTE: This crate currently only works on little-endian architectures. There are no plans to support big-endian targets.** 25 | 26 | [whitepaper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf 27 | [MIT-licensed implementation]: https://github.com/cwida/fsst 28 | -------------------------------------------------------------------------------- /benches/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | -------------------------------------------------------------------------------- /benches/compress.rs: -------------------------------------------------------------------------------- 1 | //! Benchmarks for FSST compression, decompression, and symbol table training. 2 | //! 3 | //! We use the dbtext data at https://github.com/cwida/fsst/tree/master/paper/dbtext 4 | #![allow(missing_docs)] 5 | use core::str; 6 | use std::{ 7 | error::Error, 8 | fs::{self, DirBuilder, File}, 9 | io::{Read, Write}, 10 | path::Path, 11 | }; 12 | 13 | use criterion::{Criterion, Throughput, criterion_group, criterion_main}; 14 | 15 | use curl::easy::Easy; 16 | use fsst::Compressor; 17 | 18 | fn download_dataset(url: &str, path: impl AsRef<Path>) -> Result<(), Box<dyn Error>> { 19 | let target = path.as_ref(); 20 | 21 | let mut dir_builder = DirBuilder::new(); 22 | dir_builder.recursive(true); 23 | 24 | dir_builder.create(target.parent().unwrap())?; 25 | 26 | // Avoid downloading the file twice.
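// The corpora are cached under benches/data/ (git-ignored via benches/.gitignore); an existing file is reused as-is, so delete it to force a fresh download.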
27 | if target.exists() { 28 | return Ok(()); 29 | } 30 | 31 | let mut handle = Easy::new(); 32 | 33 | let mut buffer = Vec::new(); 34 | handle.url(url)?; 35 | { 36 | let mut transfer = handle.transfer(); 37 | transfer.write_function(|data| { 38 | buffer.extend_from_slice(data); 39 | 40 | Ok(data.len()) 41 | })?; 42 | transfer.perform()?; 43 | } 44 | 45 | let mut output = File::create(target)?; 46 | match output.write_all(&buffer) { 47 | Ok(()) => {} 48 | Err(err) => { 49 | // cleanup in case of failure 50 | fs::remove_file(target).unwrap(); 51 | 52 | return Err(Box::new(err)); 53 | } 54 | } 55 | 56 | Ok(()) 57 | } 58 | 59 | #[allow(clippy::use_debug)] 60 | fn bench_dbtext(c: &mut Criterion) { 61 | fn run_dataset_bench(name: &str, url: &str, path: &str, c: &mut Criterion) { 62 | let mut group = c.benchmark_group(name); 63 | download_dataset(url, path).unwrap(); 64 | 65 | let mut buf = Vec::new(); 66 | { 67 | let mut file = File::open(path).unwrap(); 68 | file.read_to_end(&mut buf).unwrap(); 69 | } 70 | 71 | group.bench_function("train-and-compress", |b| { 72 | b.iter_with_large_drop(|| { 73 | let compressor = Compressor::train(&vec![&buf]); 74 | compressor.compress_bulk(std::hint::black_box(&vec![&buf])) 75 | }); 76 | }); 77 | 78 | let compressor = Compressor::train(&vec![&buf]); 79 | let mut buffer = Vec::with_capacity(200 * 1024 * 1024); 80 | group.throughput(Throughput::Bytes(buf.len() as u64)); 81 | group.bench_function("compress-only", |b| { 82 | b.iter(|| unsafe { compressor.compress_into(&buf, &mut buffer) }); 83 | }); 84 | 85 | unsafe { 86 | compressor.compress_into(&buf, &mut buffer); 87 | }; 88 | let decompressor = compressor.decompressor(); 89 | group.bench_function("decompress", |b| { 90 | b.iter_with_large_drop(|| decompressor.decompress(&buffer)); 91 | }); 92 | 93 | group.finish(); 94 | 95 | // Report the compression factor for this dataset. 
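// Compression factor = uncompressed bytes / compressed bytes (larger is better), measured with a compressor trained on the same input it then compresses.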
96 | let uncompressed_size = buf.len(); 97 | let compressor = Compressor::train(&vec![&buf]); 98 | 99 | let compressed = compressor.compress_bulk(&vec![&buf]); 100 | let compressed_size = compressed.iter().map(|l| l.len()).sum::<usize>(); 101 | let cf = (uncompressed_size as f64) / (compressed_size as f64); 102 | println!( 103 | "compressed {name} {uncompressed_size} => {compressed_size}B (compression factor {cf:.2}:1)" 104 | ) 105 | } 106 | 107 | run_dataset_bench( 108 | "dbtext/wikipedia", 109 | "https://raw.githubusercontent.com/cwida/fsst/4e188a/paper/dbtext/wikipedia", 110 | "benches/data/wikipedia", 111 | c, 112 | ); 113 | 114 | run_dataset_bench( 115 | "dbtext/l_comment", 116 | "https://raw.githubusercontent.com/cwida/fsst/4e188a/paper/dbtext/l_comment", 117 | "benches/data/l_comment", 118 | c, 119 | ); 120 | 121 | run_dataset_bench( 122 | "dbtext/urls", 123 | "https://raw.githubusercontent.com/cwida/fsst/4e188a/paper/dbtext/urls", 124 | "benches/data/urls", 125 | c, 126 | ); 127 | } 128 | 129 | criterion_group!(compress_bench, bench_dbtext); 130 | criterion_main!(compress_bench); 131 | -------------------------------------------------------------------------------- /benches/micro.rs: -------------------------------------------------------------------------------- 1 | #![allow(missing_docs)] 2 | 3 | use criterion::{Criterion, Throughput, criterion_group, criterion_main}; 4 | 5 | use fsst::{CompressorBuilder, Symbol}; 6 | 7 | fn one_megabyte(seed: &[u8]) -> Vec<u8> { 8 | seed.iter().copied().cycle().take(1024 * 1024).collect() 9 | } 10 | 11 | fn bench_compress(c: &mut Criterion) { 12 | let mut group = c.benchmark_group("compress-overhead"); 13 | // Reusable memory to hold outputs 14 | let mut output_buf: Vec<u8> = Vec::with_capacity(8 * 1024 * 1024); 15 | 16 | // We create a symbol table that requires probing the hash table to perform 17 | // decompression.
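// (The `compress-twobytes` case below uses a two-byte symbol that can be short-circuited without probing the hash table.)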
18 | group.bench_function("compress-hashtab", |b| { 19 | let mut compressor = CompressorBuilder::new(); 20 | compressor.insert(Symbol::from_slice(b"abcdefgh"), 8); 21 | let compressor = compressor.build(); 22 | 23 | let word = u64::from_le_bytes([b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h']); 24 | b.iter(|| unsafe { compressor.compress_word(word, output_buf.as_mut_ptr()) }); 25 | }); 26 | 27 | // We create a symbol table that is able to short-circuit the decompression 28 | group.bench_function("compress-twobytes", |b| { 29 | let mut compressor = CompressorBuilder::new(); 30 | compressor.insert(Symbol::from_slice(&[b'a', b'b', 0, 0, 0, 0, 0, 0]), 2); 31 | let compressor = compressor.build(); 32 | 33 | let word = u64::from_le_bytes([b'a', b'b', 0, 0, 0, 0, 0, 0]); 34 | b.iter(|| unsafe { compressor.compress_word(word, output_buf.as_mut_ptr()) }); 35 | }); 36 | group.finish(); 37 | 38 | let mut group = c.benchmark_group("cf=1"); 39 | let test_string = one_megabyte(b"aaaaaaaa"); 40 | group.throughput(Throughput::Bytes(test_string.len() as u64)); 41 | group.bench_function("compress", |b| { 42 | let mut compressor = CompressorBuilder::new(); 43 | assert!(compressor.insert(Symbol::from_u8(b'a'), 1)); 44 | let compressor = compressor.build(); 45 | 46 | b.iter(|| unsafe { 47 | compressor.compress_into(&test_string, &mut output_buf); 48 | }) 49 | }); 50 | group.finish(); 51 | 52 | let mut group = c.benchmark_group("cf=2"); 53 | let test_string = one_megabyte(b"ab"); 54 | 55 | group.throughput(Throughput::Bytes(test_string.len() as u64)); 56 | group.bench_function("compress", |b| { 57 | let mut compressor = CompressorBuilder::new(); 58 | // This outputs two codes for every 4 bytes of text. 59 | assert!(compressor.insert(Symbol::from_slice(&[b'a', 0, 0, 0, 0, 0, 0, 0]), 1)); 60 | assert!(compressor.insert(Symbol::from_slice(&[b'b', b'a', b'b', 0, 0, 0, 0, 0]), 3)); 61 | let compressor = compressor.build(); 62 | 63 | b.iter(|| unsafe { 64 | compressor.compress_into(&test_string, &mut output_buf); 65 | }) 66 | }); 67 | group.finish(); 68 | 69 | let mut group = c.benchmark_group("cf=4"); 70 | let test_string = one_megabyte(b"abcd"); 71 | group.throughput(Throughput::Bytes(test_string.len() as u64)); 72 | group.bench_function("compress", |b| { 73 | let mut compressor = CompressorBuilder::new(); 74 | assert!(compressor.insert(Symbol::from_slice(&[b'a', b'b', b'c', b'd', 0, 0, 0, 0]), 4)); 75 | let compressor = compressor.build(); 76 | 77 | b.iter(|| unsafe { 78 | compressor.compress_into(&test_string, &mut output_buf); 79 | }) 80 | }); 81 | group.finish(); 82 | 83 | let mut group = c.benchmark_group("cf=8"); 84 | let test_string = one_megabyte(b"abcdefgh"); 85 | group.throughput(Throughput::Bytes(test_string.len() as u64)); 86 | group.bench_function("compress", |b| { 87 | let mut compressor = CompressorBuilder::new(); 88 | assert!(compressor.insert(Symbol::from_slice(b"abcdefgh"), 8)); 89 | let compressor = compressor.build(); 90 | 91 | b.iter(|| unsafe { 92 | compressor.compress_into(&test_string, &mut output_buf); 93 | }) 94 | }); 95 | 96 | group.bench_function("decompress", |b| { 97 | let mut compressor = CompressorBuilder::new(); 98 | assert!(compressor.insert(Symbol::from_slice(b"abcdefgh"), 8)); 99 | let compressor = compressor.build(); 100 | let compressed = compressor.compress(&test_string); 101 | 102 | let decompressor = compressor.decompressor(); 103 | 104 | b.iter(|| decompressor.decompress(&compressed)) 105 | }); 106 | group.finish(); 107 | 108 | let _ = std::hint::black_box(output_buf); 109 
| } 110 | 111 | criterion_group!(bench_micro, bench_compress); 112 | criterion_main!(bench_micro); 113 | -------------------------------------------------------------------------------- /examples/file_compressor.rs: -------------------------------------------------------------------------------- 1 | #![allow(missing_docs, clippy::use_debug)] 2 | 3 | //! This is a command line program that expects an input file as an argument, 4 | //! and trains a symbol table that it then uses to compress the file in-memory. 5 | //! 6 | //! Example: 7 | //! 8 | //! ``` 9 | //! cargo run --release --example file_compressor -- lineitem.tbl 10 | //! ``` 11 | use std::{ 12 | fs::File, 13 | io::Read, 14 | // io::{Read, Write}, 15 | path::Path, 16 | }; 17 | 18 | use fsst::Compressor; 19 | 20 | fn main() { 21 | let args: Vec<_> = std::env::args().skip(1).collect(); 22 | 23 | let input_path = Path::new(&args[0]); 24 | 25 | let mut string = String::new(); 26 | { 27 | let mut f = File::open(input_path).unwrap(); 28 | f.read_to_string(&mut string).unwrap(); 29 | } 30 | let uncompressed_size = string.len(); 31 | let lines: Vec<&[u8]> = string.lines().map(|line| line.as_bytes()).collect(); 32 | 33 | // let mut output = File::create(output_path).unwrap(); 34 | let start = std::time::Instant::now(); 35 | let compressor = Compressor::train(&lines); 36 | let duration = std::time::Instant::now().duration_since(start); 37 | println!("train took {}µs", duration.as_micros()); 38 | let mut compressed_size = 0; 39 | 40 | let mut buffer = Vec::with_capacity(8 * 1024 * 1024); 41 | 42 | let start = std::time::Instant::now(); 43 | for text in lines { 44 | unsafe { compressor.compress_into(text, &mut buffer) }; 45 | compressed_size += buffer.len(); 46 | } 47 | let duration = std::time::Instant::now().duration_since(start); 48 | println!("compression took {}µs", duration.as_micros()); 49 | println!( 50 | "compressed {} -> {} ({}%)", 51 | uncompressed_size, 52 | compressed_size, 53 | 100.0 * (compressed_size as f64) / (uncompressed_size as f64) 54 | ); 55 | } 56 | -------------------------------------------------------------------------------- /examples/round_trip.rs: -------------------------------------------------------------------------------- 1 | //! Simple example where we show round-tripping a string through the static symbol table. 2 | 3 | use core::str; 4 | 5 | use fsst::Compressor; 6 | 7 | fn main() { 8 | // Train on a sample. 9 | let sample = "the quick brown fox jumped over the lazy dog"; 10 | let trained = Compressor::train(&vec![sample.as_bytes()]); 11 | let compressed = trained.compress(sample.as_bytes()); 12 | println!("compressed: {} => {}", sample.len(), compressed.len()); 13 | // decompress now 14 | let decode = trained.decompressor().decompress(&compressed); 15 | let output = str::from_utf8(&decode).unwrap(); 16 | println!( 17 | "decoded to the original: len={} text='{}'", 18 | decode.len(), 19 | output 20 | ); 21 | } 22 | -------------------------------------------------------------------------------- /fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | corpus 3 | artifacts 4 | coverage 5 | -------------------------------------------------------------------------------- /fuzz/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 4 4 | 5 | [[package]] 6 | name = "arbitrary" 7 | version = "1.4.1" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223" 10 | 11 | [[package]] 12 | name = "bitflags" 13 | version = "2.9.1" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" 16 | 17 | [[package]] 18 | name = "cc" 19 | version = "1.2.24" 20 | source = "registry+https://github.com/rust-lang/crates.io-index" 21 | checksum = "16595d3be041c03b09d08d0858631facccee9221e579704070e6e9e4915d3bc7" 22 | dependencies = [ 23 | "jobserver", 24 | "libc", 25 | "shlex", 26 | ] 27 | 28 | [[package]] 29 | name = "cfg-if" 30 | version = "1.0.0" 31 | source = "registry+https://github.com/rust-lang/crates.io-index" 32 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 33 | 34 | [[package]] 35 | name = "fsst-rs" 36 | version = "0.5.2" 37 | 38 | [[package]] 39 | name = "fsst-rs-fuzz" 40 | version = "0.0.0" 41 | dependencies = [ 42 | "fsst-rs", 43 | "libfuzzer-sys", 44 | ] 45 | 46 | [[package]] 47 | name = "getrandom" 48 | version = "0.3.3" 49 | source = "registry+https://github.com/rust-lang/crates.io-index" 50 | checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" 51 | dependencies = [ 52 | "cfg-if", 53 | "libc", 54 | "r-efi", 55 | "wasi", 56 | ] 57 | 58 | [[package]] 59 | name = "jobserver" 60 | version = "0.1.33" 61 | source = "registry+https://github.com/rust-lang/crates.io-index" 62 | checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" 63 | dependencies = [ 64 | "getrandom", 65 | "libc", 66 | ] 67 | 68 | [[package]] 69 | name = "libc" 70 | version = "0.2.172" 71 | source = "registry+https://github.com/rust-lang/crates.io-index" 72 | checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" 73 | 74 | [[package]] 75 | name = "libfuzzer-sys" 76 | version = "0.4.9" 77 | source = "registry+https://github.com/rust-lang/crates.io-index" 78 | checksum = "cf78f52d400cf2d84a3a973a78a592b4adc535739e0a5597a0da6f0c357adc75" 79 | dependencies = [ 80 | "arbitrary", 81 | "cc", 82 | ] 83 | 84 | [[package]] 85 | name = "r-efi" 86 | version = "5.2.0" 87 | source = "registry+https://github.com/rust-lang/crates.io-index" 88 | checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" 89 | 90 | [[package]] 91 | name = "shlex" 92 | version = "1.3.0" 93 | source = "registry+https://github.com/rust-lang/crates.io-index" 94 | checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" 95 | 96 | [[package]] 97 | name = "wasi" 98 | version = "0.14.2+wasi-0.2.4" 99 | source = "registry+https://github.com/rust-lang/crates.io-index" 100 | checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" 101 | dependencies = [ 102 | "wit-bindgen-rt", 103 | ] 104 | 105 | [[package]] 106 | name = "wit-bindgen-rt" 107 | version = "0.39.0" 108 | source = "registry+https://github.com/rust-lang/crates.io-index" 109 | checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" 110 | dependencies = [ 111 | "bitflags", 112 | ] 113 | -------------------------------------------------------------------------------- /fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "fsst-rs-fuzz" 3 | version = "0.0.0" 4 | publish = false 5 | edition = "2024" 6 
| 7 | [package.metadata] 8 | cargo-fuzz = true 9 | 10 | [dependencies] 11 | libfuzzer-sys = "0.4" 12 | 13 | [dependencies.fsst-rs] 14 | path = ".." 15 | 16 | [[bin]] 17 | name = "fuzz_train" 18 | path = "fuzz_targets/fuzz_train.rs" 19 | test = false 20 | doc = false 21 | bench = false 22 | 23 | [[bin]] 24 | name = "fuzz_compress" 25 | path = "fuzz_targets/fuzz_compress.rs" 26 | test = false 27 | doc = false 28 | bench = false 29 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/fuzz_compress.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use libfuzzer_sys::fuzz_target; 4 | 5 | fuzz_target!(|data: &[u8]| { 6 | let compressor = fsst::Compressor::train(&vec![data]); 7 | let compressed = compressor.compress(data); 8 | let decompressed = compressor.decompressor().decompress(&compressed); 9 | assert_eq!(&decompressed, data); 10 | 11 | // Rebuild a compressor using the symbol table, and assert that it compresses and roundtrips 12 | // identically. 13 | let recompressor = 14 | fsst::Compressor::rebuild_from(compressor.symbol_table(), compressor.symbol_lengths()); 15 | let recompressed = recompressor.compress(data); 16 | assert_eq!( 17 | &compressed, 18 | &recompressed, 19 | "failed comparison with data {:?} symbols: {:?}", 20 | data, 21 | compressor.symbol_table(), 22 | ); 23 | }); 24 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/fuzz_train.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use libfuzzer_sys::fuzz_target; 4 | 5 | fuzz_target!(|data: &[u8]| { 6 | let _ = fsst::Compressor::train(&vec![data]); 7 | }); 8 | -------------------------------------------------------------------------------- /logo.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spiraldb/fsst/eb53aa0bda54a4e411d6ca90afe9ad566432dca4/logo.webp -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "local>spiraldb/renovate-config" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | [toolchain] 2 | channel = "nightly-2025-02-24" 3 | components = ["rust-src", "rustfmt", "clippy"] 4 | profile = "minimal" 5 | -------------------------------------------------------------------------------- /src/builder.rs: -------------------------------------------------------------------------------- 1 | //! Functions and types used for building a [`Compressor`] from a corpus of text. 2 | //! 3 | //! This module implements the logic from Algorithm 3 of the [FSST Paper]. 4 | //! 5 | //! 
[FSST Paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf 6 | 7 | use crate::{ 8 | Code, Compressor, FSST_CODE_BASE, FSST_CODE_MASK, Symbol, advance_8byte_word, compare_masked, 9 | lossy_pht::LossyPHT, 10 | }; 11 | use std::cmp::Ordering; 12 | use std::collections::BinaryHeap; 13 | 14 | /// Bitmap that only works for values up to 512 15 | #[derive(Clone, Copy, Debug, Default)] 16 | struct CodesBitmap { 17 | codes: [u64; 8], 18 | } 19 | 20 | assert_sizeof!(CodesBitmap => 64); 21 | 22 | impl CodesBitmap { 23 | /// Set the indicated bit. Must be between 0 and [`FSST_CODE_MASK`][crate::FSST_CODE_MASK]. 24 | pub(crate) fn set(&mut self, index: usize) { 25 | debug_assert!( 26 | index <= FSST_CODE_MASK as usize, 27 | "code cannot exceed {FSST_CODE_MASK}" 28 | ); 29 | 30 | let map = index >> 6; 31 | self.codes[map] |= 1 << (index % 64); 32 | } 33 | 34 | /// Check if `index` is present in the bitmap 35 | pub(crate) fn is_set(&self, index: usize) -> bool { 36 | debug_assert!( 37 | index <= FSST_CODE_MASK as usize, 38 | "code cannot exceed {FSST_CODE_MASK}" 39 | ); 40 | 41 | let map = index >> 6; 42 | self.codes[map] & 1 << (index % 64) != 0 43 | } 44 | 45 | /// Get all codes set in this bitmap 46 | pub(crate) fn codes(&self) -> CodesIterator { 47 | CodesIterator { 48 | inner: self, 49 | index: 0, 50 | block: self.codes[0], 51 | reference: 0, 52 | } 53 | } 54 | 55 | /// Clear the bitmap of all entries. 56 | pub(crate) fn clear(&mut self) { 57 | self.codes[0] = 0; 58 | self.codes[1] = 0; 59 | self.codes[2] = 0; 60 | self.codes[3] = 0; 61 | self.codes[4] = 0; 62 | self.codes[5] = 0; 63 | self.codes[6] = 0; 64 | self.codes[7] = 0; 65 | } 66 | } 67 | 68 | struct CodesIterator<'a> { 69 | inner: &'a CodesBitmap, 70 | index: usize, 71 | block: u64, 72 | reference: usize, 73 | } 74 | 75 | impl Iterator for CodesIterator<'_> { 76 | type Item = u16; 77 | 78 | fn next(&mut self) -> Option { 79 | // If current is zero, advance to next non-zero block 80 | while self.block == 0 { 81 | self.index += 1; 82 | if self.index >= 8 { 83 | return None; 84 | } 85 | self.block = self.inner.codes[self.index]; 86 | self.reference = self.index * 64; 87 | } 88 | 89 | // Find the next set bit in the current block. 90 | let position = self.block.trailing_zeros() as usize; 91 | let code = self.reference + position; 92 | 93 | if code >= 511 { 94 | return None; 95 | } 96 | 97 | // The next iteration will calculate with reference to the returned code + 1 98 | self.reference = code + 1; 99 | self.block = if position == 63 { 100 | 0 101 | } else { 102 | self.block >> (1 + position) 103 | }; 104 | 105 | Some(code as u16) 106 | } 107 | } 108 | 109 | #[derive(Debug, Clone)] 110 | struct Counter { 111 | /// Frequency count for each code. 112 | counts1: Vec, 113 | 114 | /// Frequency count for each code-pair. 115 | counts2: Vec, 116 | 117 | /// Bitmap index for codes that appear in counts1 118 | code1_index: CodesBitmap, 119 | 120 | /// Bitmap index of pairs that have been set. 121 | /// 122 | /// `pair_index[code1].codes()` yields an iterator that can 123 | /// be used to find all possible codes that follow `codes1`. 124 | pair_index: Vec, 125 | } 126 | 127 | const COUNTS1_SIZE: usize = (FSST_CODE_MASK + 1) as usize; 128 | 129 | // NOTE: in Rust, creating a 1D vector of length N^2 is ~4x faster than creating a 2-D vector, 130 | // because `vec!` has a specialization for zero. 131 | // 132 | // We also include +1 extra row at the end so that we can do writes into the counters without a branch 133 | // for the first iteration. 
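// As a concrete illustration of the flat layout (indices are hypothetical): `record_count2`
// below addresses the pair (code1, code2) at offset `code1 * COUNTS1_SIZE + code2`, so the
// pair (256, 300) lands at 256 * 512 + 300 = 131_372 within the `counts2` vector.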
134 | const COUNTS2_SIZE: usize = COUNTS1_SIZE * COUNTS1_SIZE; 135 | 136 | impl Counter { 137 | fn new() -> Self { 138 | let mut counts1 = Vec::with_capacity(COUNTS1_SIZE); 139 | let mut counts2 = Vec::with_capacity(COUNTS2_SIZE); 140 | // SAFETY: all accesses to the vector go through the bitmap to ensure no uninitialized 141 | // data is ever read from these vectors. 142 | unsafe { 143 | counts1.set_len(COUNTS1_SIZE); 144 | counts2.set_len(COUNTS2_SIZE); 145 | } 146 | 147 | Self { 148 | counts1, 149 | counts2, 150 | code1_index: CodesBitmap::default(), 151 | pair_index: vec![CodesBitmap::default(); COUNTS1_SIZE], 152 | } 153 | } 154 | 155 | #[inline] 156 | fn record_count1(&mut self, code1: u16) { 157 | // If not set, we want to start at one. 158 | let base = if self.code1_index.is_set(code1 as usize) { 159 | self.counts1[code1 as usize] 160 | } else { 161 | 0 162 | }; 163 | 164 | self.counts1[code1 as usize] = base + 1; 165 | self.code1_index.set(code1 as usize); 166 | } 167 | 168 | #[inline] 169 | fn record_count2(&mut self, code1: u16, code2: u16) { 170 | debug_assert!(code1 == FSST_CODE_MASK || self.code1_index.is_set(code1 as usize)); 171 | debug_assert!(self.code1_index.is_set(code2 as usize)); 172 | 173 | let idx = (code1 as usize) * COUNTS1_SIZE + (code2 as usize); 174 | if self.pair_index[code1 as usize].is_set(code2 as usize) { 175 | self.counts2[idx] += 1; 176 | } else { 177 | self.counts2[idx] = 1; 178 | } 179 | self.pair_index[code1 as usize].set(code2 as usize); 180 | } 181 | 182 | #[inline] 183 | fn count1(&self, code1: u16) -> usize { 184 | debug_assert!(self.code1_index.is_set(code1 as usize)); 185 | 186 | self.counts1[code1 as usize] 187 | } 188 | 189 | #[inline] 190 | fn count2(&self, code1: u16, code2: u16) -> usize { 191 | debug_assert!(self.code1_index.is_set(code1 as usize)); 192 | debug_assert!(self.code1_index.is_set(code2 as usize)); 193 | debug_assert!(self.pair_index[code1 as usize].is_set(code2 as usize)); 194 | 195 | let idx = (code1 as usize) * 512 + (code2 as usize); 196 | self.counts2[idx] 197 | } 198 | 199 | /// Returns an ordered iterator over the codes that were observed 200 | /// in a call to [`Self::count1`]. 201 | fn first_codes(&self) -> CodesIterator { 202 | self.code1_index.codes() 203 | } 204 | 205 | /// Returns an iterator over the codes that have been observed 206 | /// to follow `code1`. 207 | /// 208 | /// This is the set of all values `code2` where there was 209 | /// previously a call to `self.record_count2(code1, code2)`. 210 | fn second_codes(&self, code1: u16) -> CodesIterator { 211 | self.pair_index[code1 as usize].codes() 212 | } 213 | 214 | /// Clear the counters. 215 | /// Note that this just touches the bitmaps and sets them all to invalid. 216 | fn clear(&mut self) { 217 | self.code1_index.clear(); 218 | for index in &mut self.pair_index { 219 | index.clear(); 220 | } 221 | } 222 | } 223 | 224 | /// Entrypoint for building a new `Compressor`. 225 | pub struct CompressorBuilder { 226 | /// Table mapping codes to symbols. 227 | /// 228 | /// The entries 0-255 are setup in some other way here 229 | symbols: Vec, 230 | 231 | /// The number of entries in the symbol table that have been populated, not counting 232 | /// the escape values. 233 | n_symbols: u8, 234 | 235 | /// Counts for number of symbols of each length. 236 | /// 237 | /// `len_histogram[len-1]` = count of the symbols of length `len`. 238 | len_histogram: [u8; 8], 239 | 240 | /// Inverted index mapping 1-byte symbols to codes. 
241 | /// 242 | /// This is only used for building, not used by the final `Compressor`. 243 | codes_one_byte: Vec, 244 | 245 | /// Inverted index mapping 2-byte symbols to codes 246 | codes_two_byte: Vec, 247 | 248 | /// Lossy perfect hash table for looking up codes to symbols that are 3 bytes or more 249 | lossy_pht: LossyPHT, 250 | } 251 | 252 | impl CompressorBuilder { 253 | /// Create a new builder. 254 | pub fn new() -> Self { 255 | // NOTE: `vec!` has a specialization for building a new vector of `0u64`. Because Symbol and u64 256 | // have the same bit pattern, we can allocate as u64 and transmute. If we do `vec![Symbol::EMPTY; N]`, 257 | // that will create a new Vec and call `Symbol::EMPTY.clone()` `N` times which is considerably slower. 258 | let symbols = vec![0u64; 511]; 259 | 260 | // SAFETY: transmute safety assured by the compiler. 261 | let symbols: Vec = unsafe { std::mem::transmute(symbols) }; 262 | 263 | let mut table = Self { 264 | symbols, 265 | n_symbols: 0, 266 | len_histogram: [0; 8], 267 | codes_two_byte: Vec::with_capacity(65_536), 268 | codes_one_byte: Vec::with_capacity(512), 269 | lossy_pht: LossyPHT::new(), 270 | }; 271 | 272 | // Populate the escape byte entries. 273 | for byte in 0..=255 { 274 | let symbol = Symbol::from_u8(byte); 275 | table.symbols[byte as usize] = symbol; 276 | } 277 | 278 | // Fill codes_one_byte with pseudocodes for each byte. 279 | for byte in 0..=255 { 280 | // Push pseudocode for single-byte escape. 281 | table.codes_one_byte.push(Code::new_escape(byte)); 282 | } 283 | 284 | // Fill codes_two_byte with pseudocode of first byte 285 | for idx in 0..=65_535 { 286 | table.codes_two_byte.push(Code::new_escape(idx as u8)); 287 | } 288 | 289 | table 290 | } 291 | } 292 | 293 | impl Default for CompressorBuilder { 294 | fn default() -> Self { 295 | Self::new() 296 | } 297 | } 298 | 299 | impl CompressorBuilder { 300 | /// Attempt to insert a new symbol at the end of the table. 301 | /// 302 | /// # Panics 303 | /// 304 | /// Panics if the table is already full. 305 | /// 306 | /// # Returns 307 | /// 308 | /// Returns true if the symbol was inserted successfully, or false if it conflicted 309 | /// with an existing symbol. 310 | pub fn insert(&mut self, symbol: Symbol, len: usize) -> bool { 311 | assert!(self.n_symbols < 255, "cannot insert into full symbol table"); 312 | assert_eq!(len, symbol.len(), "provided len must equal symbol.len()"); 313 | 314 | if len == 2 { 315 | // shortCodes 316 | self.codes_two_byte[symbol.first2() as usize] = 317 | Code::new_symbol_building(self.n_symbols, 2); 318 | } else if len == 1 { 319 | // byteCodes 320 | self.codes_one_byte[symbol.first_byte() as usize] = 321 | Code::new_symbol_building(self.n_symbols, 1); 322 | } else { 323 | // Symbols of 3 or more bytes go into the hash table 324 | if !self.lossy_pht.insert(symbol, len, self.n_symbols) { 325 | return false; 326 | } 327 | } 328 | 329 | // Increment length histogram. 330 | self.len_histogram[len - 1] += 1; 331 | 332 | // Insert successfully stored symbol at end of the symbol table 333 | // Note the rescaling from range [0-254] -> [256, 510]. 334 | self.symbols[256 + (self.n_symbols as usize)] = symbol; 335 | self.n_symbols += 1; 336 | true 337 | } 338 | 339 | /// Clear all set items from the compressor. 340 | /// 341 | /// This is considerably faster than building a new Compressor from scratch for each 342 | /// iteration of the `train` loop. 343 | fn clear(&mut self) { 344 | // Eliminate every observed code from the table. 
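// Codes 0..256 are the single-byte escape entries installed by `new`, so resetting them simply
// restores their original pseudocodes; codes 256..(256 + n_symbols) are the symbols learned so
// far. Touching only these observed entries, rather than rebuilding the indexes and the hash
// table from scratch, is what keeps `clear` cheap between training generations.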
345 | for code in 0..(256 + self.n_symbols as usize) { 346 | let symbol = self.symbols[code]; 347 | if symbol.len() == 1 { 348 | // Reset the entry from the codes_one_byte array. 349 | self.codes_one_byte[symbol.first_byte() as usize] = 350 | Code::new_escape(symbol.first_byte()); 351 | } else if symbol.len() == 2 { 352 | // Reset the entry from the codes_two_byte array. 353 | self.codes_two_byte[symbol.first2() as usize] = 354 | Code::new_escape(symbol.first_byte()); 355 | } else { 356 | // Clear the hashtable entry 357 | self.lossy_pht.remove(symbol); 358 | } 359 | } 360 | 361 | // Reset len histogram 362 | for i in 0..=7 { 363 | self.len_histogram[i] = 0; 364 | } 365 | 366 | self.n_symbols = 0; 367 | } 368 | 369 | /// Finalizing the table is done once building is complete to prepare for efficient 370 | /// compression. 371 | /// 372 | /// When we finalize the table, the following modifications are made in-place: 373 | /// 374 | /// 1. The codes are renumbered so that all symbols are ordered by length (order 23456781). 375 | /// During this process, a `byte_lim` marks where the one-byte codes start and a `suffix_lim` splits the two-byte codes, 376 | /// so that during compression we know which two-byte codes never require a longer-match check. 377 | /// 2. The 1-byte symbols index is merged into the 2-byte symbols index to allow for use of only 378 | /// a single index in front of the hash table. 379 | /// 380 | /// # Returns 381 | /// 382 | /// Returns the `suffix_lim`: every two-byte code below this value is guaranteed to have no 383 | /// longer symbol in the table sharing its two-byte prefix. 384 | /// 385 | /// Also returns the lengths vector, which is of length `n_symbols` and contains the 386 | /// length for each of the values. 387 | fn finalize(&mut self) -> (u8, Vec<u8>) { 388 | // Build a cumulative sum over the length histogram to find the starting code for 389 | // each symbol length. 390 | // Regroup symbols based on their lengths. 391 | // Space at the end of the symbol table reserved for the one-byte codes. 392 | let byte_lim = self.n_symbols - self.len_histogram[0]; 393 | 394 | // Start code for each length. 395 | // Length 1: at the end of symbol table. 396 | // Length 2: starts at 0. Split into before/after suffixLim. 397 | let mut codes_by_length = [0u8; 8]; 398 | codes_by_length[0] = byte_lim; 399 | codes_by_length[1] = 0; 400 | 401 | // codes for lengths 3..=8 start where the previous ones end. 402 | for i in 1..7 { 403 | codes_by_length[i + 1] = codes_by_length[i] + self.len_histogram[i]; 404 | } 405 | 406 | // no_suffix_code is the lowest code for a symbol that does not have a longer 3+ byte 407 | // suffix in the table. 408 | // This value starts at 0 and extends up. 409 | let mut no_suffix_code = 0; 410 | 411 | // The codes for two-byte symbols that do have a longer suffix are assigned downward, starting just below the range of the 3-byte codes.
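// Worked example (hypothetical table): with two 1-byte symbols, three 2-byte symbols of which
// one is the prefix of a longer symbol, and one 4-byte symbol, n_symbols = 6 and
// len_histogram = [2, 3, 0, 1, 0, 0, 0, 0]. Then byte_lim = 4 and codes_by_length[2] = 3, so
// the two suffix-free 2-byte symbols receive codes 0 and 1, the 2-byte symbol with a longer
// extension receives code 2, the 4-byte symbol receives code 3, and the 1-byte symbols receive
// codes 4 and 5. The returned `has_suffix_code` is 2: during compression, any 2-byte code below
// it can be emitted without checking the hash table for a longer match.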
412 | let mut has_suffix_code = codes_by_length[2]; 413 | 414 | // Assign each symbol a new code ordered by lengths, in the order 415 | // 2(no suffix) | 2 (suffix) | 3 | 4 | 5 | 6 | 7 | 8 | 1 416 | let mut new_codes = [0u8; FSST_CODE_BASE as usize]; 417 | 418 | let mut symbol_lens = [0u8; FSST_CODE_BASE as usize]; 419 | 420 | for i in 0..(self.n_symbols as usize) { 421 | let symbol = self.symbols[256 + i]; 422 | let len = symbol.len(); 423 | if len == 2 { 424 | let has_suffix = self 425 | .symbols 426 | .iter() 427 | .skip(FSST_CODE_BASE as usize) 428 | .enumerate() 429 | .any(|(k, other)| i != k && symbol.first2() == other.first2()); 430 | 431 | if has_suffix { 432 | // Symbols that have a longer suffix are inserted at the end of the 2-byte range 433 | has_suffix_code -= 1; 434 | new_codes[i] = has_suffix_code; 435 | } else { 436 | // Symbols that do not have a longer suffix are inserted at the start of 437 | // the 2-byte range. 438 | new_codes[i] = no_suffix_code; 439 | no_suffix_code += 1; 440 | } 441 | } else { 442 | // Assign new code based on the next code available for the given length symbol 443 | new_codes[i] = codes_by_length[len - 1]; 444 | codes_by_length[len - 1] += 1; 445 | } 446 | 447 | // Write the symbol into the front half of the symbol table. 448 | // We are reusing the space that was previously occupied by escapes. 449 | self.symbols[new_codes[i] as usize] = symbol; 450 | symbol_lens[new_codes[i] as usize] = len as u8; 451 | } 452 | 453 | // Truncate the symbol table to only include the "true" symbols. 454 | self.symbols.truncate(self.n_symbols as usize); 455 | 456 | // Rewrite the codes_one_byte table to point at the new code values. 457 | // Replace pseudocodes with escapes. 458 | for byte in 0..=255 { 459 | let one_byte = self.codes_one_byte[byte]; 460 | if one_byte.extended_code() >= FSST_CODE_BASE { 461 | let new_code = new_codes[one_byte.code() as usize]; 462 | self.codes_one_byte[byte] = Code::new_symbol(new_code, 1); 463 | } else { 464 | // After finalize: codes_one_byte contains the unused value 465 | self.codes_one_byte[byte] = Code::UNUSED; 466 | } 467 | } 468 | 469 | // Rewrite the codes_two_byte table to point at the new code values. 470 | // Replace pseudocodes with escapes. 471 | for two_bytes in 0..=65_535 { 472 | let two_byte = self.codes_two_byte[two_bytes]; 473 | if two_byte.extended_code() >= FSST_CODE_BASE { 474 | let new_code = new_codes[two_byte.code() as usize]; 475 | self.codes_two_byte[two_bytes] = Code::new_symbol(new_code, 2); 476 | } else { 477 | // The one-byte code for the given code number here... 478 | self.codes_two_byte[two_bytes] = self.codes_one_byte[two_bytes & 0xFF]; 479 | } 480 | } 481 | 482 | // Reset values in the hash table as well. 483 | self.lossy_pht.renumber(&new_codes); 484 | 485 | // Pre-compute the lengths 486 | let mut lengths = Vec::with_capacity(self.n_symbols as usize); 487 | for symbol in &self.symbols { 488 | lengths.push(symbol.len() as u8); 489 | } 490 | 491 | (has_suffix_code, lengths) 492 | } 493 | 494 | /// Build into the final hash table. 495 | pub fn build(mut self) -> Compressor { 496 | // finalize the symbol table by inserting the codes_twobyte values into 497 | // the relevant parts of the `codes_onebyte` set. 
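// `finalize` renumbers every code by symbol length and returns the suffix limit together with
// the per-code symbol lengths that the finished `Compressor` stores alongside its symbols.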
498 | 499 | let (has_suffix_code, lengths) = self.finalize(); 500 | 501 | Compressor { 502 | symbols: self.symbols, 503 | lengths, 504 | n_symbols: self.n_symbols, 505 | has_suffix_code, 506 | codes_two_byte: self.codes_two_byte, 507 | lossy_pht: self.lossy_pht, 508 | } 509 | } 510 | } 511 | 512 | /// The number of generations used for training. This is taken from the [FSST paper]. 513 | /// 514 | /// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf 515 | #[cfg(not(miri))] 516 | const GENERATIONS: [usize; 5] = [8usize, 38, 68, 98, 128]; 517 | #[cfg(miri)] 518 | const GENERATIONS: [usize; 3] = [8usize, 38, 128]; 519 | 520 | const FSST_SAMPLETARGET: usize = 1 << 14; 521 | const FSST_SAMPLEMAX: usize = 1 << 15; 522 | const FSST_SAMPLELINE: usize = 512; 523 | 524 | /// Create a sample from a set of strings in the input. 525 | /// 526 | /// Sample is constructing by copying "chunks" from the `str_in`s into the `sample_buf`, the 527 | /// returned slices are pointers into the `sample_buf`. 528 | /// 529 | /// SAFETY: sample_buf must be >= FSST_SAMPLEMAX bytes long. Providing something less may cause unexpected failures. 530 | #[allow(clippy::ptr_arg)] 531 | fn make_sample<'a, 'b: 'a>(sample_buf: &'a mut Vec, str_in: &Vec<&'b [u8]>) -> Vec<&'a [u8]> { 532 | assert!( 533 | sample_buf.capacity() >= FSST_SAMPLEMAX, 534 | "sample_buf.len() < FSST_SAMPLEMAX" 535 | ); 536 | 537 | let mut sample: Vec<&[u8]> = Vec::new(); 538 | 539 | let tot_size: usize = str_in.iter().map(|s| s.len()).sum(); 540 | if tot_size < FSST_SAMPLETARGET { 541 | return str_in.clone(); 542 | } 543 | 544 | let mut sample_rnd = fsst_hash(4637947); 545 | let sample_lim = FSST_SAMPLETARGET; 546 | let mut sample_buf_offset: usize = 0; 547 | 548 | while sample_buf_offset < sample_lim { 549 | sample_rnd = fsst_hash(sample_rnd); 550 | let line_nr = (sample_rnd as usize) % str_in.len(); 551 | 552 | // Find the first non-empty chunk starting at line_nr, wrapping around if 553 | // necessary. 554 | let Some(line) = (line_nr..str_in.len()) 555 | .chain(0..line_nr) 556 | .map(|line_nr| str_in[line_nr]) 557 | .find(|line| !line.is_empty()) 558 | else { 559 | return sample; 560 | }; 561 | 562 | let chunks = 1 + ((line.len() - 1) / FSST_SAMPLELINE); 563 | sample_rnd = fsst_hash(sample_rnd); 564 | let chunk = FSST_SAMPLELINE * ((sample_rnd as usize) % chunks); 565 | 566 | let len = FSST_SAMPLELINE.min(line.len() - chunk); 567 | 568 | sample_buf.extend_from_slice(&line[chunk..chunk + len]); 569 | 570 | // SAFETY: this is the data we just placed into `sample_buf` in the line above. 571 | let slice = 572 | unsafe { std::slice::from_raw_parts(sample_buf.as_ptr().add(sample_buf_offset), len) }; 573 | 574 | sample.push(slice); 575 | 576 | sample_buf_offset += len; 577 | } 578 | 579 | sample 580 | } 581 | 582 | /// Hash function used in various components of the library. 583 | /// 584 | /// This is equivalent to the FSST_HASH macro from the C++ implementation. 585 | #[inline] 586 | pub(crate) fn fsst_hash(value: u64) -> u64 { 587 | value.wrapping_mul(2971215073) ^ value.wrapping_shr(15) 588 | } 589 | 590 | impl Compressor { 591 | /// Build and train a `Compressor` from a sample corpus of text. 592 | /// 593 | /// This function implements the generational algorithm described in the [FSST paper] Section 594 | /// 4.3. Starting with an empty symbol table, it iteratively compresses the corpus, then attempts 595 | /// to merge symbols when doing so would yield better compression than leaving them unmerged. 
The 596 | /// resulting table will have at most 255 symbols (the 256th symbol is reserved for the escape 597 | /// code). 598 | /// 599 | /// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf 600 | pub fn train(values: &Vec<&[u8]>) -> Self { 601 | let mut builder = CompressorBuilder::new(); 602 | 603 | if values.is_empty() { 604 | return builder.build(); 605 | } 606 | 607 | let mut counters = Counter::new(); 608 | let mut sample_memory = Vec::with_capacity(FSST_SAMPLEMAX); 609 | let sample = make_sample(&mut sample_memory, values); 610 | for sample_frac in GENERATIONS { 611 | for (i, line) in sample.iter().enumerate() { 612 | if sample_frac < 128 && ((fsst_hash(i as u64) & 127) as usize) > sample_frac { 613 | continue; 614 | } 615 | 616 | builder.compress_count(line, &mut counters); 617 | } 618 | 619 | builder.optimize(&counters, sample_frac); 620 | counters.clear(); 621 | } 622 | 623 | builder.build() 624 | } 625 | } 626 | 627 | impl CompressorBuilder { 628 | /// Find the longest symbol using the hash table and the codes_one_byte and codes_two_byte indexes. 629 | fn find_longest_symbol(&self, word: u64) -> Code { 630 | // Probe the hash table first to see if we have a long match 631 | let entry = self.lossy_pht.lookup(word); 632 | let ignored_bits = entry.ignored_bits; 633 | 634 | // If the entry is valid, return the code 635 | if !entry.is_unused() && compare_masked(word, entry.symbol.as_u64(), ignored_bits) { 636 | return entry.code; 637 | } 638 | 639 | // Try and match first two bytes 640 | let twobyte = self.codes_two_byte[word as u16 as usize]; 641 | if twobyte.extended_code() >= FSST_CODE_BASE { 642 | return twobyte; 643 | } 644 | 645 | // Fall back to single-byte match 646 | self.codes_one_byte[word as u8 as usize] 647 | } 648 | 649 | /// Compress the text using the current symbol table. Count the code occurrences 650 | /// and code-pair occurrences, calculating total gain using the current compressor. 651 | /// 652 | /// NOTE: this is largely an unfortunate amount of copy-paste from `compress`, just to make sure 653 | /// we can do all the counting in a single pass. 654 | fn compress_count(&self, sample: &[u8], counter: &mut Counter) -> usize { 655 | let mut gain = 0; 656 | if sample.is_empty() { 657 | return gain; 658 | } 659 | 660 | let mut in_ptr = sample.as_ptr(); 661 | 662 | // SAFETY: `end` will point just after the end of the `plaintext` slice. 663 | let in_end = unsafe { in_ptr.byte_add(sample.len()) }; 664 | let in_end_sub8 = in_end as usize - 8; 665 | 666 | let mut prev_code: u16 = FSST_CODE_MASK; 667 | 668 | while (in_ptr as usize) < (in_end_sub8) { 669 | // SAFETY: ensured in-bounds by loop condition. 670 | let word: u64 = unsafe { std::ptr::read_unaligned(in_ptr as *const u64) }; 671 | let code = self.find_longest_symbol(word); 672 | let code_u16 = code.extended_code(); 673 | 674 | // Gain increases by the symbol length if a symbol matches, or 0 675 | // if an escape is emitted. 676 | gain += (code.len() as usize) - ((code_u16 < 256) as usize); 677 | 678 | // Record the single and pair counts 679 | counter.record_count1(code_u16); 680 | counter.record_count2(prev_code, code_u16); 681 | 682 | // Also record the count for just extending by a single byte, but only if 683 | // the symbol is not itself a single byte. 
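// (Recording the first byte here is presumably what keeps the shorter single-byte alternative,
// and the pair ending in it, visible to `optimize` as candidates for the next generation even
// though the longer symbol won this particular match.)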
684 | if code.len() > 1 { 685 | let code_first_byte = self.symbols[code_u16 as usize].first_byte() as u16; 686 | counter.record_count1(code_first_byte); 687 | counter.record_count2(prev_code, code_first_byte); 688 | } 689 | 690 | // SAFETY: pointer bound is checked in loop condition before any access is made. 691 | in_ptr = unsafe { in_ptr.byte_add(code.len() as usize) }; 692 | 693 | prev_code = code_u16; 694 | } 695 | 696 | let remaining_bytes = unsafe { in_end.byte_offset_from(in_ptr) }; 697 | assert!( 698 | remaining_bytes.is_positive(), 699 | "in_ptr exceeded in_end, should not be possible" 700 | ); 701 | let remaining_bytes = remaining_bytes as usize; 702 | 703 | // Load the last `remaining_byte`s of data into a final world. We then replicate the loop above, 704 | // but shift data out of this word rather than advancing an input pointer and potentially reading 705 | // unowned memory 706 | let mut bytes = [0u8; 8]; 707 | unsafe { 708 | // SAFETY: it is safe to read up to remaining_bytes from in_ptr, and remaining_bytes 709 | // will be <= 8 bytes. 710 | std::ptr::copy_nonoverlapping(in_ptr, bytes.as_mut_ptr(), remaining_bytes); 711 | } 712 | let mut last_word = u64::from_le_bytes(bytes); 713 | 714 | let mut remaining_bytes = remaining_bytes; 715 | 716 | while remaining_bytes > 0 { 717 | // SAFETY: ensured in-bounds by loop condition. 718 | let code = self.find_longest_symbol(last_word); 719 | let code_u16 = code.extended_code(); 720 | 721 | // Gain increases by the symbol length if a symbol matches, or 0 722 | // if an escape is emitted. 723 | gain += (code.len() as usize) - ((code_u16 < 256) as usize); 724 | 725 | // Record the single and pair counts 726 | counter.record_count1(code_u16); 727 | counter.record_count2(prev_code, code_u16); 728 | 729 | // Also record the count for just extending by a single byte, but only if 730 | // the symbol is not itself a single byte. 731 | if code.len() > 1 { 732 | let code_first_byte = self.symbols[code_u16 as usize].first_byte() as u16; 733 | counter.record_count1(code_first_byte); 734 | counter.record_count2(prev_code, code_first_byte); 735 | } 736 | 737 | // Advance our last_word "input pointer" by shifting off the covered values. 738 | let advance = code.len() as usize; 739 | remaining_bytes -= advance; 740 | last_word = advance_8byte_word(last_word, advance); 741 | 742 | prev_code = code_u16; 743 | } 744 | 745 | gain 746 | } 747 | 748 | /// Using a set of counters and the existing set of symbols, build a new 749 | /// set of symbols/codes that optimizes the gain over the distribution in `counter`. 750 | fn optimize(&mut self, counters: &Counter, sample_frac: usize) { 751 | let mut pqueue = BinaryHeap::with_capacity(65_536); 752 | 753 | for code1 in counters.first_codes() { 754 | let symbol1 = self.symbols[code1 as usize]; 755 | let symbol1_len = symbol1.len(); 756 | let count = counters.count1(code1); 757 | 758 | // From the c++ impl: 759 | // "improves both compression speed (less candidates), but also quality!!" 760 | if count < (5 * sample_frac / 128) { 761 | continue; 762 | } 763 | 764 | let mut gain = count * symbol1_len; 765 | // NOTE: use heuristic from C++ implementation to boost the gain of single-byte symbols. 766 | // This helps to reduce exception counts. 767 | if code1 < 256 { 768 | gain *= 8; 769 | } 770 | 771 | pqueue.push(Candidate { 772 | symbol: symbol1, 773 | gain, 774 | }); 775 | 776 | // Skip merges on last round, or when symbol cannot be extended. 
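// (A `symbol1` that is already 8 bytes cannot be extended, since `Symbol::concat` refuses to
// build symbols longer than 8 bytes; and on the final round there is no subsequent generation
// left in which a newly merged candidate could be evaluated.)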
777 | if sample_frac >= 128 || symbol1_len == 8 { 778 | continue; 779 | } 780 | 781 | for code2 in counters.second_codes(code1) { 782 | let symbol2 = self.symbols[code2 as usize]; 783 | 784 | // If merging would yield a symbol of length greater than 8, skip. 785 | if symbol1_len + symbol2.len() > 8 { 786 | continue; 787 | } 788 | let new_symbol = symbol1.concat(symbol2); 789 | let gain = counters.count2(code1, code2) * new_symbol.len(); 790 | 791 | pqueue.push(Candidate { 792 | symbol: new_symbol, 793 | gain, 794 | }) 795 | } 796 | } 797 | 798 | // clear self in advance of inserting the symbols. 799 | self.clear(); 800 | 801 | // Pop the 255 best symbols. 802 | let mut n_symbols = 0; 803 | while !pqueue.is_empty() && n_symbols < 255 { 804 | let candidate = pqueue.pop().unwrap(); 805 | if self.insert(candidate.symbol, candidate.symbol.len()) { 806 | n_symbols += 1; 807 | } 808 | } 809 | } 810 | } 811 | 812 | /// A candidate for inclusion in a symbol table. 813 | /// 814 | /// This is really only useful for the `optimize` step of training. 815 | #[derive(Copy, Clone, Debug)] 816 | struct Candidate { 817 | gain: usize, 818 | symbol: Symbol, 819 | } 820 | 821 | impl Candidate { 822 | fn comparable_form(&self) -> (usize, usize) { 823 | (self.gain, self.symbol.len()) 824 | } 825 | } 826 | 827 | impl Eq for Candidate {} 828 | 829 | impl PartialEq for Candidate { 830 | fn eq(&self, other: &Self) -> bool { 831 | self.comparable_form().eq(&other.comparable_form()) 832 | } 833 | } 834 | 835 | impl PartialOrd for Candidate { 836 | fn partial_cmp(&self, other: &Self) -> Option { 837 | Some(self.cmp(other)) 838 | } 839 | } 840 | 841 | impl Ord for Candidate { 842 | fn cmp(&self, other: &Self) -> Ordering { 843 | let self_ord = (self.gain, self.symbol.len()); 844 | let other_ord = (other.gain, other.symbol.len()); 845 | 846 | self_ord.cmp(&other_ord) 847 | } 848 | } 849 | 850 | #[cfg(test)] 851 | mod test { 852 | use crate::{Compressor, ESCAPE_CODE, builder::CodesBitmap}; 853 | 854 | #[test] 855 | fn test_builder() { 856 | // Train a Compressor on the toy string 857 | let text = b"hello hello hello hello hello"; 858 | 859 | // count of 5 is the cutoff for including a symbol in the table. 
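// (See `optimize`: candidates with `count < 5 * sample_frac / 128` are discarded, so with the
// full sample (sample_frac = 128) a symbol must be observed at least five times; hence the five
// copies of the text passed to `train` below.)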
860 | let table = Compressor::train(&vec![text, text, text, text, text]); 861 | 862 | // Use the table to compress a string, see the values 863 | let compressed = table.compress(text); 864 | 865 | // Ensure that the compressed string has no escape bytes 866 | assert!(compressed.iter().all(|b| *b != ESCAPE_CODE)); 867 | 868 | // Ensure that we can compress a string with no values seen at training time, with escape bytes 869 | let compressed = table.compress("xyz123".as_bytes()); 870 | let decompressed = table.decompressor().decompress(&compressed); 871 | assert_eq!(&decompressed, b"xyz123"); 872 | assert_eq!( 873 | compressed, 874 | vec![ 875 | ESCAPE_CODE, 876 | b'x', 877 | ESCAPE_CODE, 878 | b'y', 879 | ESCAPE_CODE, 880 | b'z', 881 | ESCAPE_CODE, 882 | b'1', 883 | ESCAPE_CODE, 884 | b'2', 885 | ESCAPE_CODE, 886 | b'3', 887 | ] 888 | ); 889 | } 890 | 891 | #[test] 892 | fn test_bitmap() { 893 | let mut map = CodesBitmap::default(); 894 | map.set(10); 895 | map.set(100); 896 | map.set(500); 897 | 898 | let codes: Vec = map.codes().collect(); 899 | assert_eq!(codes, vec![10u16, 100, 500]); 900 | 901 | // empty case 902 | let map = CodesBitmap::default(); 903 | assert!(map.codes().collect::>().is_empty()); 904 | 905 | // edge case: first bit in each block is set 906 | let mut map = CodesBitmap::default(); 907 | (0..8).for_each(|i| map.set(64 * i)); 908 | assert_eq!( 909 | map.codes().collect::>(), 910 | (0u16..8).map(|i| 64 * i).collect::>(), 911 | ); 912 | 913 | // Full bitmap case. There are only 512 values, so test them all 914 | let mut map = CodesBitmap::default(); 915 | for i in 0..512 { 916 | map.set(i); 917 | } 918 | assert_eq!( 919 | map.codes().collect::>(), 920 | (0u16..511u16).collect::>() 921 | ); 922 | } 923 | 924 | #[test] 925 | #[should_panic(expected = "code cannot exceed")] 926 | fn test_bitmap_invalid() { 927 | let mut map = CodesBitmap::default(); 928 | map.set(512); 929 | } 930 | } 931 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![doc = include_str!("../README.md")] 2 | #![cfg(target_endian = "little")] 3 | 4 | /// Throw a compiler error if a type isn't guaranteed to have a specific size in bytes. 5 | macro_rules! assert_sizeof { 6 | ($typ:ty => $size_in_bytes:expr) => { 7 | const _: [u8; $size_in_bytes] = [0; std::mem::size_of::<$typ>()]; 8 | }; 9 | } 10 | 11 | use lossy_pht::LossyPHT; 12 | use std::fmt::{Debug, Formatter}; 13 | use std::mem::MaybeUninit; 14 | 15 | mod builder; 16 | mod lossy_pht; 17 | 18 | pub use builder::*; 19 | 20 | /// `Symbol`s are small (up to 8-byte) segments of strings, stored in a [`Compressor`][`crate::Compressor`] and 21 | /// identified by an 8-bit code. 22 | #[derive(Copy, Clone)] 23 | pub struct Symbol(u64); 24 | 25 | assert_sizeof!(Symbol => 8); 26 | 27 | impl Symbol { 28 | /// Zero value for `Symbol`. 29 | pub const ZERO: Self = Self::zero(); 30 | 31 | /// Constructor for a `Symbol` from an 8-element byte slice. 32 | pub fn from_slice(slice: &[u8; 8]) -> Self { 33 | let num: u64 = u64::from_le_bytes(*slice); 34 | 35 | Self(num) 36 | } 37 | 38 | /// Return a zero symbol 39 | const fn zero() -> Self { 40 | Self(0) 41 | } 42 | 43 | /// Create a new single-byte symbol 44 | pub fn from_u8(value: u8) -> Self { 45 | Self(value as u64) 46 | } 47 | } 48 | 49 | impl Symbol { 50 | /// Calculate the length of the symbol in bytes. Always a value between 1 and 8. 
51 | /// 52 | /// Each symbol has the capacity to hold up to 8 bytes of data, but the symbols 53 | /// can contain fewer bytes, padded with 0x00. There is a special case of a symbol 54 | /// that holds the byte 0x00. In that case, the symbol contains `0x0000000000000000` 55 | /// but we want to interpret that as a one-byte symbol containing `0x00`. 56 | #[allow(clippy::len_without_is_empty)] 57 | pub fn len(self) -> usize { 58 | let numeric = self.0; 59 | // For little-endian platforms, this counts the number of *trailing* zeros 60 | let null_bytes = (numeric.leading_zeros() >> 3) as usize; 61 | 62 | // Special case handling of a symbol with all-zeros. This is actually 63 | // a 1-byte symbol containing 0x00. 64 | let len = size_of::() - null_bytes; 65 | if len == 0 { 1 } else { len } 66 | } 67 | 68 | #[inline] 69 | fn as_u64(self) -> u64 { 70 | self.0 71 | } 72 | 73 | /// Get the first byte of the symbol as a `u8`. 74 | /// 75 | /// If the symbol is empty, this will return the zero byte. 76 | #[inline] 77 | pub fn first_byte(self) -> u8 { 78 | self.0 as u8 79 | } 80 | 81 | /// Get the first two bytes of the symbol as a `u16`. 82 | /// 83 | /// If the Symbol is one or zero bytes, this will return `0u16`. 84 | #[inline] 85 | pub fn first2(self) -> u16 { 86 | self.0 as u16 87 | } 88 | 89 | /// Get the first three bytes of the symbol as a `u64`. 90 | /// 91 | /// If the Symbol is one or zero bytes, this will return `0u64`. 92 | #[inline] 93 | pub fn first3(self) -> u64 { 94 | self.0 & 0xFF_FF_FF 95 | } 96 | 97 | /// Return a new `Symbol` by logically concatenating ourselves with another `Symbol`. 98 | pub fn concat(self, other: Self) -> Self { 99 | assert!( 100 | self.len() + other.len() <= 8, 101 | "cannot build symbol with length > 8" 102 | ); 103 | 104 | let self_len = self.len(); 105 | 106 | Self((other.0 << (8 * self_len)) | self.0) 107 | } 108 | } 109 | 110 | impl Debug for Symbol { 111 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 112 | write!(f, "[")?; 113 | 114 | let slice = &self.0.to_le_bytes()[0..self.len()]; 115 | for c in slice.iter().map(|c| *c as char) { 116 | if ('!'..='~').contains(&c) { 117 | write!(f, "{c}")?; 118 | } else if c == '\n' { 119 | write!(f, " \\n ")?; 120 | } else if c == '\t' { 121 | write!(f, " \\t ")?; 122 | } else if c == ' ' { 123 | write!(f, " SPACE ")?; 124 | } else { 125 | write!(f, " 0x{:X?} ", c as u8)? 126 | } 127 | } 128 | 129 | write!(f, "]") 130 | } 131 | } 132 | 133 | /// A packed type containing a code value, as well as metadata about the symbol referred to by 134 | /// the code. 135 | /// 136 | /// Logically, codes can range from 0-255 inclusive. This type holds both the 8-bit code as well as 137 | /// other metadata bit-packed into a `u16`. 138 | /// 139 | /// The bottom 8 bits contain EITHER a code for a symbol stored in the table, OR a raw byte. 140 | /// 141 | /// The interpretation depends on the 9th bit: when toggled off, the value stores a raw byte, and when 142 | /// toggled on, it stores a code. Thus if you examine the bottom 9 bits of the `u16`, you have an extended 143 | /// code range, where the values 0-255 are raw bytes, and the values 256-510 represent codes 0-254. 511 is 144 | /// a placeholder for the invalid code here. 145 | /// 146 | /// Bits 12-15 store the length of the symbol (values ranging from 0-8). 147 | #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] 148 | struct Code(u16); 149 | 150 | /// Code used to indicate bytes that are not in the symbol table. 
151 | /// 152 | /// When compressing a string that cannot fully be expressed with the symbol table, the compressed 153 | /// output will contain an `ESCAPE` byte followed by a raw byte. At decompression time, the presence 154 | /// of `ESCAPE` indicates that the next byte should be appended directly to the result instead of 155 | /// being looked up in the symbol table. 156 | pub const ESCAPE_CODE: u8 = 255; 157 | 158 | /// Number of bits in the `ExtendedCode` that are used to dictate a code value. 159 | pub const FSST_CODE_BITS: usize = 9; 160 | 161 | /// First bit of the "length" portion of an extended code. 162 | pub const FSST_LEN_BITS: usize = 12; 163 | 164 | /// Maximum code value in the extended code range. 165 | pub const FSST_CODE_MAX: u16 = 1 << FSST_CODE_BITS; 166 | 167 | /// Maximum value for the extended code range. 168 | /// 169 | /// When truncated to u8 this is code 255, which is equivalent to [`ESCAPE_CODE`]. 170 | pub const FSST_CODE_MASK: u16 = FSST_CODE_MAX - 1; 171 | 172 | /// First code in the symbol table that corresponds to a non-escape symbol. 173 | pub const FSST_CODE_BASE: u16 = 256; 174 | 175 | #[allow(clippy::len_without_is_empty)] 176 | impl Code { 177 | /// Code for an unused slot in a symbol table or index. 178 | /// 179 | /// This corresponds to the maximum code with a length of 1. 180 | pub const UNUSED: Self = Code(FSST_CODE_MASK + (1 << 12)); 181 | 182 | /// Create a new code for a symbol of given length. 183 | fn new_symbol(code: u8, len: usize) -> Self { 184 | Self(code as u16 + ((len as u16) << FSST_LEN_BITS)) 185 | } 186 | 187 | /// Code for a new symbol during the building phase. 188 | /// 189 | /// The code is remapped from 0..254 to 256...510. 190 | fn new_symbol_building(code: u8, len: usize) -> Self { 191 | Self(code as u16 + 256 + ((len as u16) << FSST_LEN_BITS)) 192 | } 193 | 194 | /// Create a new code corresponding for an escaped byte. 195 | fn new_escape(byte: u8) -> Self { 196 | Self((byte as u16) + (1 << FSST_LEN_BITS)) 197 | } 198 | 199 | #[inline] 200 | fn code(self) -> u8 { 201 | self.0 as u8 202 | } 203 | 204 | #[inline] 205 | fn extended_code(self) -> u16 { 206 | self.0 & 0b111_111_111 207 | } 208 | 209 | #[inline] 210 | fn len(self) -> u16 { 211 | self.0 >> FSST_LEN_BITS 212 | } 213 | } 214 | 215 | impl Debug for Code { 216 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 217 | f.debug_struct("TrainingCode") 218 | .field("code", &(self.0 as u8)) 219 | .field("is_escape", &(self.0 < 256)) 220 | .field("len", &(self.0 >> 12)) 221 | .finish() 222 | } 223 | } 224 | 225 | /// Decompressor uses a symbol table to take a stream of 8-bit codes into a string. 226 | #[derive(Clone)] 227 | pub struct Decompressor<'a> { 228 | /// Slice mapping codes to symbols. 229 | pub(crate) symbols: &'a [Symbol], 230 | 231 | /// Slice containing the length of each symbol in the `symbols` slice. 232 | pub(crate) lengths: &'a [u8], 233 | } 234 | 235 | impl<'a> Decompressor<'a> { 236 | /// Returns a new decompressor that uses the provided symbol table. 237 | /// 238 | /// # Panics 239 | /// 240 | /// If the provided symbol table has length greater than 256 241 | pub fn new(symbols: &'a [Symbol], lengths: &'a [u8]) -> Self { 242 | assert!( 243 | symbols.len() < FSST_CODE_BASE as usize, 244 | "symbol table cannot have size exceeding 255" 245 | ); 246 | 247 | Self { symbols, lengths } 248 | } 249 | 250 | /// Returns an upper bound on the size of the decompressed data. 
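///
/// For example (an illustrative sketch; the exact figure follows from the 8-byte `Symbol` size,
/// since every code can expand to at most one full symbol):
///
/// ```
/// use fsst::{CompressorBuilder, Symbol};
/// let compressor = {
///     let mut builder = CompressorBuilder::new();
///     builder.insert(Symbol::from_slice(b"abcdefgh"), 8);
///     builder.build()
/// };
/// let decompressor = compressor.decompressor();
/// // Three codes can decode to at most (3 + 1) * 8 = 32 bytes.
/// assert_eq!(decompressor.max_decompression_capacity(&[0u8, 0, 0]), 32);
/// ```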
251 | pub fn max_decompression_capacity(&self, compressed: &[u8]) -> usize { 252 | size_of::() * (compressed.len() + 1) 253 | } 254 | 255 | /// Decompress a slice of codes into a provided buffer. 256 | /// 257 | /// The provided `decoded` buffer must be at least the size of the decoded data, plus 258 | /// an additional 7 bytes. 259 | /// 260 | /// ## Panics 261 | /// 262 | /// If the caller fails to provide sufficient capacity in the decoded buffer. An upper bound 263 | /// on the required capacity can be obtained by calling [`Self::max_decompression_capacity`]. 264 | /// 265 | /// ## Example 266 | /// 267 | /// ``` 268 | /// use fsst::{Symbol, Compressor, CompressorBuilder}; 269 | /// let compressor = { 270 | /// let mut builder = CompressorBuilder::new(); 271 | /// builder.insert(Symbol::from_slice(&[b'h', b'e', b'l', b'l', b'o', b'o', b'o', b'o']), 8); 272 | /// builder.build() 273 | /// }; 274 | /// 275 | /// let decompressor = compressor.decompressor(); 276 | /// 277 | /// let mut decompressed = Vec::with_capacity(8 + 7); 278 | /// 279 | /// let len = decompressor.decompress_into(&[0], decompressed.spare_capacity_mut()); 280 | /// assert_eq!(len, 8); 281 | /// unsafe { decompressed.set_len(len) }; 282 | /// assert_eq!(&decompressed, "helloooo".as_bytes()); 283 | /// ``` 284 | pub fn decompress_into(&self, compressed: &[u8], decoded: &mut [MaybeUninit]) -> usize { 285 | // Ensure the target buffer is at least half the size of the input buffer. 286 | // This is the theortical smallest a valid target can be, and occurs when 287 | // every input code is an escape. 288 | assert!( 289 | decoded.len() >= compressed.len() / 2, 290 | "decoded is smaller than lower-bound decompressed size" 291 | ); 292 | 293 | unsafe { 294 | let mut in_ptr = compressed.as_ptr(); 295 | let _in_begin = in_ptr; 296 | let in_end = in_ptr.add(compressed.len()); 297 | 298 | let mut out_ptr: *mut u8 = decoded.as_mut_ptr().cast(); 299 | let out_begin = out_ptr.cast_const(); 300 | let out_end = decoded.as_ptr().add(decoded.len()).cast::(); 301 | 302 | macro_rules! store_next_symbol { 303 | ($code:expr) => {{ 304 | out_ptr 305 | .cast::() 306 | .write_unaligned(self.symbols.get_unchecked($code as usize).as_u64()); 307 | out_ptr = out_ptr.add(*self.lengths.get_unchecked($code as usize) as usize); 308 | }}; 309 | } 310 | 311 | // First we try loading 8 bytes at a time. 312 | if decoded.len() >= 8 * size_of::() && compressed.len() >= 8 { 313 | // Extract the loop condition since the compiler fails to do so 314 | let block_out_end = out_end.sub(8 * size_of::()); 315 | let block_in_end = in_end.sub(8); 316 | 317 | while out_ptr.cast_const() <= block_out_end && in_ptr < block_in_end { 318 | // Note that we load a little-endian u64 here. 319 | let next_block = in_ptr.cast::().read_unaligned(); 320 | let escape_mask = (next_block & 0x8080808080808080) 321 | & ((((!next_block) & 0x7F7F7F7F7F7F7F7F) + 0x7F7F7F7F7F7F7F7F) 322 | ^ 0x8080808080808080); 323 | 324 | // If there are no escape codes, we write each symbol one by one. 
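// (`escape_mask` is a SWAR check applied lane-wise across the eight bytes of `next_block`: a
// byte's high bit survives only when that byte equals 0xFF, i.e. ESCAPE_CODE. The first operand
// keeps bytes whose top bit is set, while `((!b & 0x7F) + 0x7F) ^ 0x80` only leaves the top bit
// set when the low seven bits of the byte are all ones.)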
325 | if escape_mask == 0 { 326 | let code = (next_block & 0xFF) as u8; 327 | store_next_symbol!(code); 328 | let code = ((next_block >> 8) & 0xFF) as u8; 329 | store_next_symbol!(code); 330 | let code = ((next_block >> 16) & 0xFF) as u8; 331 | store_next_symbol!(code); 332 | let code = ((next_block >> 24) & 0xFF) as u8; 333 | store_next_symbol!(code); 334 | let code = ((next_block >> 32) & 0xFF) as u8; 335 | store_next_symbol!(code); 336 | let code = ((next_block >> 40) & 0xFF) as u8; 337 | store_next_symbol!(code); 338 | let code = ((next_block >> 48) & 0xFF) as u8; 339 | store_next_symbol!(code); 340 | let code = ((next_block >> 56) & 0xFF) as u8; 341 | store_next_symbol!(code); 342 | in_ptr = in_ptr.add(8); 343 | } else { 344 | // Otherwise, find the first escape code and write the symbols up to that point. 345 | let first_escape_pos = escape_mask.trailing_zeros() >> 3; // Divide bits to bytes 346 | debug_assert!(first_escape_pos < 8); 347 | match first_escape_pos { 348 | 7 => { 349 | let code = (next_block & 0xFF) as u8; 350 | store_next_symbol!(code); 351 | let code = ((next_block >> 8) & 0xFF) as u8; 352 | store_next_symbol!(code); 353 | let code = ((next_block >> 16) & 0xFF) as u8; 354 | store_next_symbol!(code); 355 | let code = ((next_block >> 24) & 0xFF) as u8; 356 | store_next_symbol!(code); 357 | let code = ((next_block >> 32) & 0xFF) as u8; 358 | store_next_symbol!(code); 359 | let code = ((next_block >> 40) & 0xFF) as u8; 360 | store_next_symbol!(code); 361 | let code = ((next_block >> 48) & 0xFF) as u8; 362 | store_next_symbol!(code); 363 | 364 | in_ptr = in_ptr.add(7); 365 | } 366 | 6 => { 367 | let code = (next_block & 0xFF) as u8; 368 | store_next_symbol!(code); 369 | let code = ((next_block >> 8) & 0xFF) as u8; 370 | store_next_symbol!(code); 371 | let code = ((next_block >> 16) & 0xFF) as u8; 372 | store_next_symbol!(code); 373 | let code = ((next_block >> 24) & 0xFF) as u8; 374 | store_next_symbol!(code); 375 | let code = ((next_block >> 32) & 0xFF) as u8; 376 | store_next_symbol!(code); 377 | let code = ((next_block >> 40) & 0xFF) as u8; 378 | store_next_symbol!(code); 379 | 380 | let escaped = ((next_block >> 56) & 0xFF) as u8; 381 | out_ptr.write(escaped); 382 | out_ptr = out_ptr.add(1); 383 | 384 | in_ptr = in_ptr.add(8); 385 | } 386 | 5 => { 387 | let code = (next_block & 0xFF) as u8; 388 | store_next_symbol!(code); 389 | let code = ((next_block >> 8) & 0xFF) as u8; 390 | store_next_symbol!(code); 391 | let code = ((next_block >> 16) & 0xFF) as u8; 392 | store_next_symbol!(code); 393 | let code = ((next_block >> 24) & 0xFF) as u8; 394 | store_next_symbol!(code); 395 | let code = ((next_block >> 32) & 0xFF) as u8; 396 | store_next_symbol!(code); 397 | 398 | let escaped = ((next_block >> 48) & 0xFF) as u8; 399 | out_ptr.write(escaped); 400 | out_ptr = out_ptr.add(1); 401 | 402 | in_ptr = in_ptr.add(7); 403 | } 404 | 4 => { 405 | let code = (next_block & 0xFF) as u8; 406 | store_next_symbol!(code); 407 | let code = ((next_block >> 8) & 0xFF) as u8; 408 | store_next_symbol!(code); 409 | let code = ((next_block >> 16) & 0xFF) as u8; 410 | store_next_symbol!(code); 411 | let code = ((next_block >> 24) & 0xFF) as u8; 412 | store_next_symbol!(code); 413 | 414 | let escaped = ((next_block >> 40) & 0xFF) as u8; 415 | out_ptr.write(escaped); 416 | out_ptr = out_ptr.add(1); 417 | 418 | in_ptr = in_ptr.add(6); 419 | } 420 | 3 => { 421 | let code = (next_block & 0xFF) as u8; 422 | store_next_symbol!(code); 423 | let code = ((next_block >> 8) & 0xFF) as u8; 424 | 
store_next_symbol!(code); 425 | let code = ((next_block >> 16) & 0xFF) as u8; 426 | store_next_symbol!(code); 427 | 428 | let escaped = ((next_block >> 32) & 0xFF) as u8; 429 | out_ptr.write(escaped); 430 | out_ptr = out_ptr.add(1); 431 | 432 | in_ptr = in_ptr.add(5); 433 | } 434 | 2 => { 435 | let code = (next_block & 0xFF) as u8; 436 | store_next_symbol!(code); 437 | let code = ((next_block >> 8) & 0xFF) as u8; 438 | store_next_symbol!(code); 439 | 440 | let escaped = ((next_block >> 24) & 0xFF) as u8; 441 | out_ptr.write(escaped); 442 | out_ptr = out_ptr.add(1); 443 | 444 | in_ptr = in_ptr.add(4); 445 | } 446 | 1 => { 447 | let code = (next_block & 0xFF) as u8; 448 | store_next_symbol!(code); 449 | 450 | let escaped = ((next_block >> 16) & 0xFF) as u8; 451 | out_ptr.write(escaped); 452 | out_ptr = out_ptr.add(1); 453 | 454 | in_ptr = in_ptr.add(3); 455 | } 456 | 0 => { 457 | // Otherwise, we actually need to decompress the next byte 458 | // Extract the second byte from the u32 459 | let escaped = ((next_block >> 8) & 0xFF) as u8; 460 | in_ptr = in_ptr.add(2); 461 | out_ptr.write(escaped); 462 | out_ptr = out_ptr.add(1); 463 | } 464 | _ => unreachable!(), 465 | } 466 | } 467 | } 468 | } 469 | 470 | // Otherwise, fall back to 1-byte reads. 471 | while out_end.offset_from(out_ptr) > size_of::() as isize && in_ptr < in_end { 472 | let code = in_ptr.read(); 473 | in_ptr = in_ptr.add(1); 474 | 475 | if code == ESCAPE_CODE { 476 | out_ptr.write(in_ptr.read()); 477 | in_ptr = in_ptr.add(1); 478 | out_ptr = out_ptr.add(1); 479 | } else { 480 | store_next_symbol!(code); 481 | } 482 | } 483 | 484 | assert_eq!( 485 | in_ptr, in_end, 486 | "decompression should exhaust input before output" 487 | ); 488 | 489 | out_ptr.offset_from(out_begin) as usize 490 | } 491 | } 492 | 493 | /// Decompress a byte slice that was previously returned by a compressor using the same symbol 494 | /// table into a new vector of bytes. 495 | pub fn decompress(&self, compressed: &[u8]) -> Vec { 496 | let mut decoded = Vec::with_capacity(self.max_decompression_capacity(compressed) + 7); 497 | 498 | let len = self.decompress_into(compressed, decoded.spare_capacity_mut()); 499 | // SAFETY: len bytes have now been initialized by the decompressor. 500 | unsafe { decoded.set_len(len) }; 501 | decoded 502 | } 503 | } 504 | 505 | /// A compressor that uses a symbol table to greedily compress strings. 506 | /// 507 | /// The `Compressor` is the central component of FSST. You can create a compressor either by 508 | /// default (i.e. an empty compressor), or by [training][`Self::train`] it on an input corpus of text. 509 | /// 510 | /// Example usage: 511 | /// 512 | /// ``` 513 | /// use fsst::{Symbol, Compressor, CompressorBuilder}; 514 | /// let compressor = { 515 | /// let mut builder = CompressorBuilder::new(); 516 | /// builder.insert(Symbol::from_slice(&[b'h', b'e', b'l', b'l', b'o', 0, 0, 0]), 5); 517 | /// builder.build() 518 | /// }; 519 | /// 520 | /// let compressed = compressor.compress("hello".as_bytes()); 521 | /// assert_eq!(compressed, vec![0u8]); 522 | /// ``` 523 | #[derive(Clone)] 524 | pub struct Compressor { 525 | /// Table mapping codes to symbols. 526 | pub(crate) symbols: Vec, 527 | 528 | /// Length of each symbol, values range from 1-8. 529 | pub(crate) lengths: Vec, 530 | 531 | /// The number of entries in the symbol table that have been populated, not counting 532 | /// the escape values. 
533 | pub(crate) n_symbols: u8, 534 | 535 | /// Inverted index mapping 2-byte symbols to codes 536 | codes_two_byte: Vec, 537 | 538 | /// Limit of no suffixes. 539 | has_suffix_code: u8, 540 | 541 | /// Lossy perfect hash table for looking up codes to symbols that are 3 bytes or more 542 | lossy_pht: LossyPHT, 543 | } 544 | 545 | /// The core structure of the FSST codec, holding a mapping between `Symbol`s and `Code`s. 546 | /// 547 | /// The symbol table is trained on a corpus of data in the form of a single byte array, building up 548 | /// a mapping of 1-byte "codes" to sequences of up to 8 plaintext bytes, or "symbols". 549 | impl Compressor { 550 | /// Using the symbol table, runs a single cycle of compression on an input word, writing 551 | /// the output into `out_ptr`. 552 | /// 553 | /// # Returns 554 | /// 555 | /// This function returns a tuple of (advance_in, advance_out) with the number of bytes 556 | /// for the caller to advance the input and output pointers. 557 | /// 558 | /// `advance_in` is the number of bytes to advance the input pointer before the next call. 559 | /// 560 | /// `advance_out` is the number of bytes to advance `out_ptr` before the next call. 561 | /// 562 | /// # Safety 563 | /// 564 | /// `out_ptr` must never be NULL or otherwise point to invalid memory. 565 | pub unsafe fn compress_word(&self, word: u64, out_ptr: *mut u8) -> (usize, usize) { 566 | // Speculatively write the first byte of `word` at offset 1. This is necessary if it is an escape, and 567 | // if it isn't, it will be overwritten anyway. 568 | // 569 | // SAFETY: caller ensures out_ptr is not null 570 | let first_byte = word as u8; 571 | // SAFETY: out_ptr is not null 572 | unsafe { out_ptr.byte_add(1).write_unaligned(first_byte) }; 573 | 574 | // First, check the two_bytes table 575 | let code_twobyte = self.codes_two_byte[word as u16 as usize]; 576 | 577 | if code_twobyte.code() < self.has_suffix_code { 578 | // 2 byte code without having to worry about longer matches. 579 | // SAFETY: out_ptr is not null. 580 | unsafe { std::ptr::write(out_ptr, code_twobyte.code()) }; 581 | 582 | // Advance input by symbol length (2) and output by a single code byte 583 | (2, 1) 584 | } else { 585 | // Probe the hash table 586 | let entry = self.lossy_pht.lookup(word); 587 | 588 | // Now, downshift the `word` and the `entry` to see if they align. 589 | let ignored_bits = entry.ignored_bits; 590 | if entry.code != Code::UNUSED 591 | && compare_masked(word, entry.symbol.as_u64(), ignored_bits) 592 | { 593 | // Advance the input by the symbol length (variable) and the output by one code byte 594 | // SAFETY: out_ptr is not null. 595 | unsafe { std::ptr::write(out_ptr, entry.code.code()) }; 596 | (entry.code.len() as usize, 1) 597 | } else { 598 | // SAFETY: out_ptr is not null 599 | unsafe { std::ptr::write(out_ptr, code_twobyte.code()) }; 600 | 601 | // Advance the input by the symbol length (variable) and the output by either 1 602 | // byte (if was one-byte code) or two bytes (escape). 603 | ( 604 | code_twobyte.len() as usize, 605 | // Predicated version of: 606 | // 607 | // if entry.code >= 256 { 608 | // 2 609 | // } else { 610 | // 1 611 | // } 612 | 1 + (code_twobyte.extended_code() >> 8) as usize, 613 | ) 614 | } 615 | } 616 | } 617 | 618 | /// Compress many lines in bulk. 
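///
/// A small usage sketch (the training corpus and inputs below are arbitrary):
///
/// ```
/// use fsst::Compressor;
///
/// let compressor = Compressor::train(&vec!["hello world, hello world".as_bytes()]);
/// let compressed = compressor.compress_bulk(&vec!["hello".as_bytes(), "world".as_bytes()]);
/// assert_eq!(compressed.len(), 2);
/// ```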
619 | pub fn compress_bulk(&self, lines: &Vec<&[u8]>) -> Vec<Vec<u8>> { 620 | let mut res = Vec::new(); 621 | 622 | for line in lines { 623 | res.push(self.compress(line)); 624 | } 625 | 626 | res 627 | } 628 | 629 | /// Compress a string, writing its result into a target buffer. 630 | /// 631 | /// The target buffer is a byte vector that must have capacity large enough 632 | /// to hold the encoded data. 633 | /// 634 | /// When this call returns, `values` will hold the compressed bytes and have 635 | /// its length set to the length of the compressed text. 636 | /// 637 | /// ``` 638 | /// use fsst::{Compressor, CompressorBuilder, Symbol}; 639 | /// 640 | /// let mut compressor = CompressorBuilder::new(); 641 | /// assert!(compressor.insert(Symbol::from_slice(b"aaaaaaaa"), 8)); 642 | /// 643 | /// let compressor = compressor.build(); 644 | /// 645 | /// let mut compressed_values = Vec::with_capacity(1_024); 646 | /// 647 | /// // SAFETY: we have over-sized compressed_values. 648 | /// unsafe { 649 | /// compressor.compress_into(b"aaaaaaaa", &mut compressed_values); 650 | /// } 651 | /// 652 | /// assert_eq!(compressed_values, vec![0u8]); 653 | /// ``` 654 | /// 655 | /// # Safety 656 | /// 657 | /// It is up to the caller to ensure the provided buffer is large enough to hold 658 | /// all encoded data. 659 | pub unsafe fn compress_into(&self, plaintext: &[u8], values: &mut Vec<u8>) { 660 | let mut in_ptr = plaintext.as_ptr(); 661 | let mut out_ptr = values.as_mut_ptr(); 662 | 663 | // SAFETY: `end` will point just after the end of the `plaintext` slice. 664 | let in_end = unsafe { in_ptr.byte_add(plaintext.len()) }; 665 | let in_end_sub8 = in_end as usize - 8; 666 | // SAFETY: `end` will point just after the end of the `values` allocation. 667 | let out_end = unsafe { out_ptr.byte_add(values.capacity()) }; 668 | 669 | while (in_ptr as usize) <= in_end_sub8 && out_ptr < out_end { 670 | // SAFETY: pointer ranges are checked in the loop condition 671 | unsafe { 672 | // Load a full 8-byte word of data from in_ptr. 673 | // SAFETY: the loop condition leaves at least 8 readable bytes in `plaintext`, so this unaligned load stays in bounds. 674 | let word: u64 = std::ptr::read_unaligned(in_ptr as *const u64); 675 | let (advance_in, advance_out) = self.compress_word(word, out_ptr); 676 | in_ptr = in_ptr.byte_add(advance_in); 677 | out_ptr = out_ptr.byte_add(advance_out); 678 | }; 679 | } 680 | 681 | let remaining_bytes = unsafe { in_end.byte_offset_from(in_ptr) }; 682 | assert!( 683 | out_ptr < out_end || remaining_bytes == 0, 684 | "output buffer sized too small" 685 | ); 686 | 687 | let remaining_bytes = remaining_bytes as usize; 688 | 689 | // Load the last `remaining_bytes` bytes of data into a final word. We then replicate the loop above, 690 | // but shift data out of this word rather than advancing an input pointer and potentially reading 691 | // unowned memory. 692 | let mut bytes = [0u8; 8]; 693 | // SAFETY: remaining_bytes <= 8 694 | unsafe { std::ptr::copy_nonoverlapping(in_ptr, bytes.as_mut_ptr(), remaining_bytes) }; 695 | let mut last_word = u64::from_le_bytes(bytes); 696 | 697 | while in_ptr < in_end && out_ptr < out_end { 698 | // Compress the next word of input, held in `last_word`.
699 | // SAFETY: caller asserts in_ptr is not null 700 | let (advance_in, advance_out) = unsafe { self.compress_word(last_word, out_ptr) }; 701 | // SAFETY: pointer ranges are checked in the loop condition 702 | unsafe { 703 | in_ptr = in_ptr.add(advance_in); 704 | out_ptr = out_ptr.add(advance_out); 705 | } 706 | 707 | last_word = advance_8byte_word(last_word, advance_in); 708 | } 709 | 710 | // in_ptr should have exceeded in_end 711 | assert!( 712 | in_ptr >= in_end, 713 | "exhausted output buffer before exhausting input, there is a bug in SymbolTable::compress()" 714 | ); 715 | 716 | assert!(out_ptr <= out_end, "output buffer sized too small"); 717 | 718 | // SAFETY: out_ptr is derived from the `values` allocation. 719 | let bytes_written = unsafe { out_ptr.offset_from(values.as_ptr()) }; 720 | assert!( 721 | bytes_written >= 0, 722 | "out_ptr ended before it started, not possible" 723 | ); 724 | 725 | // SAFETY: we have initialized `bytes_written` values in the output buffer. 726 | unsafe { values.set_len(bytes_written as usize) }; 727 | } 728 | 729 | /// Use the symbol table to compress the plaintext into a sequence of codes and escapes. 730 | pub fn compress(&self, plaintext: &[u8]) -> Vec { 731 | if plaintext.is_empty() { 732 | return Vec::new(); 733 | } 734 | 735 | let mut buffer = Vec::with_capacity(plaintext.len() * 2); 736 | 737 | // SAFETY: the largest compressed size would be all escapes == 2*plaintext_len 738 | unsafe { self.compress_into(plaintext, &mut buffer) }; 739 | 740 | buffer 741 | } 742 | 743 | /// Access the decompressor that can be used to decompress strings emitted from this 744 | /// `Compressor` instance. 745 | pub fn decompressor(&self) -> Decompressor { 746 | Decompressor::new(self.symbol_table(), self.symbol_lengths()) 747 | } 748 | 749 | /// Returns a readonly slice of the current symbol table. 750 | /// 751 | /// The returned slice will have length of `n_symbols`. 752 | pub fn symbol_table(&self) -> &[Symbol] { 753 | &self.symbols[0..self.n_symbols as usize] 754 | } 755 | 756 | /// Returns a readonly slice where index `i` contains the 757 | /// length of the symbol represented by code `i`. 758 | /// 759 | /// Values range from 1-8. 760 | pub fn symbol_lengths(&self) -> &[u8] { 761 | &self.lengths[0..self.n_symbols as usize] 762 | } 763 | 764 | /// Rebuild a compressor from an existing symbol table. 765 | /// 766 | /// This will not attempt to optimize or re-order the codes. 767 | pub fn rebuild_from(symbols: impl AsRef<[Symbol]>, symbol_lens: impl AsRef<[u8]>) -> Self { 768 | let symbols = symbols.as_ref(); 769 | let symbol_lens = symbol_lens.as_ref(); 770 | 771 | assert_eq!( 772 | symbols.len(), 773 | symbol_lens.len(), 774 | "symbols and lengths differ" 775 | ); 776 | assert!( 777 | symbols.len() <= 255, 778 | "symbol table len must be <= 255, was {}", 779 | symbols.len() 780 | ); 781 | validate_symbol_order(symbol_lens); 782 | 783 | // Insert the symbols in their given order into the FSST lookup structures. 784 | let symbols = symbols.to_vec(); 785 | let lengths = symbol_lens.to_vec(); 786 | let mut lossy_pht = LossyPHT::new(); 787 | 788 | let mut codes_one_byte = vec![Code::UNUSED; 256]; 789 | 790 | // Insert all of the one byte symbols first. 
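// `codes_one_byte` is only a scratch table here: it is folded into `codes_two_byte` further
// down, so a two-byte slot with no dedicated 2-byte symbol falls back to the 1-byte code for
// its first byte (or stays UNUSED and is emitted as an escape).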
791 | for (code, (&symbol, &len)) in symbols.iter().zip(lengths.iter()).enumerate() { 792 | if len == 1 { 793 | codes_one_byte[symbol.first_byte() as usize] = Code::new_symbol(code as u8, 1); 794 | } 795 | } 796 | 797 | // Initialize the codes_two_byte table to be all escapes 798 | let mut codes_two_byte = vec![Code::UNUSED; 65_536]; 799 | 800 | // Insert the two byte symbols, possibly overwriting slots for one-byte symbols and escapes. 801 | for (code, (&symbol, &len)) in symbols.iter().zip(lengths.iter()).enumerate() { 802 | match len { 803 | 2 => { 804 | codes_two_byte[symbol.first2() as usize] = Code::new_symbol(code as u8, 2); 805 | } 806 | 3.. => { 807 | assert!( 808 | lossy_pht.insert(symbol, len as usize, code as u8), 809 | "rebuild symbol insertion into PHT must succeed" 810 | ); 811 | } 812 | _ => { /* Covered by the 1-byte loop above. */ } 813 | } 814 | } 815 | 816 | // Build the finished codes_two_byte table, subbing in unused positions with the 817 | // codes_one_byte value similar to what we do in CompressBuilder::finalize. 818 | for (symbol, code) in codes_two_byte.iter_mut().enumerate() { 819 | if *code == Code::UNUSED { 820 | *code = codes_one_byte[symbol & 0xFF]; 821 | } 822 | } 823 | 824 | // Find the position of the first 2-byte code that has a suffix later in the table 825 | let mut has_suffix_code = 0u8; 826 | for (code, (&symbol, &len)) in symbols.iter().zip(lengths.iter()).enumerate() { 827 | if len != 2 { 828 | break; 829 | } 830 | let rest = &symbols[code..]; 831 | if rest 832 | .iter() 833 | .any(|&other| other.len() > 2 && symbol.first2() == other.first2()) 834 | { 835 | has_suffix_code = code as u8; 836 | break; 837 | } 838 | } 839 | 840 | Compressor { 841 | n_symbols: symbols.len() as u8, 842 | symbols, 843 | lengths, 844 | codes_two_byte, 845 | lossy_pht, 846 | has_suffix_code, 847 | } 848 | } 849 | } 850 | 851 | #[inline] 852 | pub(crate) fn advance_8byte_word(word: u64, bytes: usize) -> u64 { 853 | // shift the word off the low-end, because little endian means the first 854 | // char is stored in the LSB. 855 | // 856 | // Note that even though this looks like it branches, Rust compiles this to a 857 | // conditional move instruction. See `` 858 | if bytes == 8 { 0 } else { word >> (8 * bytes) } 859 | } 860 | 861 | fn validate_symbol_order(symbol_lens: &[u8]) { 862 | // Ensure that the symbol table is ordered by length, 23456781 863 | let mut expected = 2; 864 | for (idx, &len) in symbol_lens.iter().enumerate() { 865 | if expected == 1 { 866 | assert_eq!( 867 | len, 1, 868 | "symbol code={idx} should be one byte, was {len} bytes" 869 | ); 870 | } else { 871 | if len == 1 { 872 | expected = 1; 873 | } 874 | 875 | // we're in the non-zero portion. 
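// Within the multi-byte run the lengths must be non-decreasing (2, then 3, ... up to 8);
// the first 1-byte symbol flips `expected` to 1, and from then on only 1-byte symbols are
// allowed (enforced by the `expected == 1` branch above).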
876 | assert!( 877 | len >= expected, 878 | "symbol code={idx} breaks violates FSST symbol table ordering" 879 | ); 880 | expected = len; 881 | } 882 | } 883 | } 884 | 885 | #[inline] 886 | pub(crate) fn compare_masked(left: u64, right: u64, ignored_bits: u16) -> bool { 887 | let mask = u64::MAX >> ignored_bits; 888 | (left & mask) == right 889 | } 890 | 891 | #[cfg(test)] 892 | mod test { 893 | use super::*; 894 | use std::{iter, mem}; 895 | #[test] 896 | fn test_stuff() { 897 | let compressor = { 898 | let mut builder = CompressorBuilder::new(); 899 | builder.insert(Symbol::from_slice(b"helloooo"), 8); 900 | builder.build() 901 | }; 902 | 903 | let decompressor = compressor.decompressor(); 904 | 905 | let mut decompressed = Vec::with_capacity(8 + 7); 906 | 907 | let len = decompressor.decompress_into(&[0], decompressed.spare_capacity_mut()); 908 | assert_eq!(len, 8); 909 | unsafe { decompressed.set_len(len) }; 910 | assert_eq!(&decompressed, "helloooo".as_bytes()); 911 | } 912 | 913 | #[test] 914 | fn test_symbols_good() { 915 | let symbols_u64: &[u64] = &[ 916 | 24931, 25698, 25442, 25699, 25186, 25444, 24932, 25188, 25185, 25441, 25697, 25700, 917 | 24929, 24930, 25443, 25187, 6513249, 6512995, 6578786, 6513761, 6513507, 6382434, 918 | 6579042, 6512994, 6447460, 6447969, 6382178, 6579041, 6512993, 6448226, 6513250, 919 | 6579297, 6513506, 6447459, 6513764, 6447458, 6578529, 6382180, 6513762, 6447714, 920 | 6579299, 6513508, 6382436, 6513763, 6578532, 6381924, 6448228, 6579300, 6381921, 921 | 6382690, 6382179, 6447713, 6447972, 6513505, 6447457, 6382692, 6513252, 6578785, 922 | 6578787, 6578531, 6448225, 6382177, 6382433, 6578530, 6448227, 6381922, 6578788, 923 | 6579044, 6382691, 6512996, 6579043, 6579298, 6447970, 6447716, 6447971, 6381923, 924 | 6447715, 97, 98, 100, 99, 97, 98, 99, 100, 925 | ]; 926 | let symbols: &[Symbol] = unsafe { mem::transmute(symbols_u64) }; 927 | let lens: Vec = iter::repeat_n(2u8, 16) 928 | .chain(iter::repeat_n(3u8, 61)) 929 | .chain(iter::repeat_n(1u8, 8)) 930 | .collect(); 931 | 932 | let compressor = Compressor::rebuild_from(symbols, lens); 933 | let built_symbols: &[u64] = unsafe { mem::transmute(compressor.symbol_table()) }; 934 | assert_eq!(built_symbols, symbols_u64); 935 | } 936 | 937 | #[should_panic(expected = "assertion `left == right` failed")] 938 | #[test] 939 | fn test_symbols_bad() { 940 | let symbols: &[u64] = &[ 941 | 24931, 25698, 25442, 25699, 25186, 25444, 24932, 25188, 25185, 25441, 25697, 25700, 942 | 24929, 24930, 25443, 25187, 6513249, 6512995, 6578786, 6513761, 6513507, 6382434, 943 | 6579042, 6512994, 6447460, 6447969, 6382178, 6579041, 6512993, 6448226, 6513250, 944 | 6579297, 6513506, 6447459, 6513764, 6447458, 6578529, 6382180, 6513762, 6447714, 945 | 6579299, 6513508, 6382436, 6513763, 6578532, 6381924, 6448228, 6579300, 6381921, 946 | 6382690, 6382179, 6447713, 6447972, 6513505, 6447457, 6382692, 6513252, 6578785, 947 | 6578787, 6578531, 6448225, 6382177, 6382433, 6578530, 6448227, 6381922, 6578788, 948 | 6579044, 6382691, 6512996, 6579043, 6579298, 6447970, 6447716, 6447971, 6381923, 949 | 6447715, 97, 98, 100, 99, 97, 98, 99, 100, 950 | ]; 951 | let lens: Vec = iter::repeat_n(2u8, 16) 952 | .chain(iter::repeat_n(3u8, 61)) 953 | .chain(iter::repeat_n(1u8, 8)) 954 | .collect(); 955 | 956 | let mut builder = CompressorBuilder::new(); 957 | for (symbol, len) in symbols.iter().zip(lens.iter()) { 958 | let symbol = Symbol::from_slice(&symbol.to_le_bytes()); 959 | builder.insert(symbol, *len as usize); 960 | } 961 | let 
compressor = builder.build(); 962 | let built_symbols: &[u64] = unsafe { mem::transmute(compressor.symbol_table()) }; 963 | assert_eq!(built_symbols, symbols); 964 | } 965 | } 966 | -------------------------------------------------------------------------------- /src/lossy_pht.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::Debug; 2 | 3 | use crate::Code; 4 | use crate::Symbol; 5 | use crate::builder::fsst_hash; 6 | 7 | /// Size of the perfect hash table. 8 | /// 9 | /// NOTE: this differs from the paper, which recommends a 64KB total 10 | /// table size. The paper does not account for the fact that most 11 | /// vendors split the L1 cache into 32KB of instruction and 32KB of data. 12 | pub const HASH_TABLE_SIZE: usize = 1 << 11; 13 | 14 | /// A single entry in the [Lossy Perfect Hash Table][`LossyPHT`]. 15 | /// 16 | /// `TableEntry` is based on the `Symbol` class outlined in Algorithm 4 of the FSST paper. See 17 | /// the module documentation for a link to the paper. 18 | #[derive(Clone, Debug)] 19 | #[repr(C)] 20 | pub(crate) struct TableEntry { 21 | /// Symbol, piece of a string, 8 bytes or fewer. 22 | pub(crate) symbol: Symbol, 23 | 24 | /// Code and associated metadata for the symbol 25 | pub(crate) code: Code, 26 | 27 | /// Number of ignored bits in `symbol`. 28 | /// 29 | /// This is equivalent to `64 - 8 * code.len()` but is pre-computed to save a few instructions in 30 | /// the compression loop. 31 | pub(crate) ignored_bits: u16, 32 | } 33 | 34 | assert_sizeof!(TableEntry => 16); 35 | 36 | impl TableEntry { 37 | pub(crate) fn is_unused(&self) -> bool { 38 | self.code == Code::UNUSED 39 | } 40 | } 41 | 42 | /// Lossy Perfect Hash Table implementation for compression. 43 | /// 44 | /// This implements the "Lossy Perfect Hash Table" described in Section 5 of the paper. 45 | /// 46 | /// It is so-called because the `insert` operation for a symbol may fail, if another symbol is 47 | /// already occupying the slot. 48 | /// 49 | /// If insertions are made from highest-gain to lowest and from longest-symbol to shortest, then 50 | /// we can say that any failed insert is not a big loss, because its slot is being held by a higher-gain 51 | /// symbol. Note that because other code in this crate calls `insert` in the pop-order of a max heap, 52 | /// this holds. 53 | #[derive(Clone, Debug)] 54 | pub(crate) struct LossyPHT { 55 | /// Hash table slots. Used for strings that are 3 bytes or more. 56 | slots: Vec, 57 | } 58 | 59 | impl LossyPHT { 60 | /// Construct a new empty lossy perfect hash table 61 | pub(crate) fn new() -> Self { 62 | let slots = vec![ 63 | TableEntry { 64 | symbol: Symbol::ZERO, 65 | code: Code::UNUSED, 66 | ignored_bits: 64, 67 | }; 68 | HASH_TABLE_SIZE 69 | ]; 70 | 71 | Self { slots } 72 | } 73 | 74 | /// Try and insert the (symbol, code) pair into the table. 75 | /// 76 | /// If there is a collision, we keep the current thing and reject the write. 77 | /// 78 | /// # Returns 79 | /// 80 | /// True if the symbol was inserted into the table, false if it was rejected due to collision. 
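///
/// The slot is chosen by hashing the symbol's first three bytes with `fsst_hash` and masking
/// down to `HASH_TABLE_SIZE`; an existing occupant is never evicted, so callers should insert
/// higher-gain symbols first (see the notes on `LossyPHT` above).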
81 | pub(crate) fn insert(&mut self, symbol: Symbol, len: usize, code: u8) -> bool { 82 | let prefix_3bytes = symbol.as_u64() & 0xFF_FF_FF; 83 | let slot = fsst_hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); 84 | let entry = &mut self.slots[slot]; 85 | if !entry.is_unused() { 86 | false 87 | } else { 88 | entry.symbol = symbol; 89 | entry.code = Code::new_symbol_building(code, len); 90 | entry.ignored_bits = (64 - 8 * symbol.len()) as u16; 91 | true 92 | } 93 | } 94 | 95 | /// Given a new code mapping, rewrite the codes into the new code range. 96 | pub(crate) fn renumber(&mut self, new_codes: &[u8]) { 97 | for slot in self.slots.iter_mut() { 98 | if slot.code != Code::UNUSED { 99 | let old_code = slot.code.code(); 100 | let new_code = new_codes[old_code as usize]; 101 | let len = slot.code.len(); 102 | slot.code = Code::new_symbol(new_code, len as usize); 103 | } 104 | } 105 | } 106 | 107 | /// Remove the symbol from the hashtable, if it exists. 108 | pub(crate) fn remove(&mut self, symbol: Symbol) { 109 | let prefix_3bytes = symbol.as_u64() & 0xFF_FF_FF; 110 | let slot = fsst_hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); 111 | self.slots[slot].code = Code::UNUSED; 112 | } 113 | 114 | #[inline] 115 | pub(crate) fn lookup(&self, word: u64) -> &TableEntry { 116 | let prefix_3bytes = word & 0xFF_FF_FF; 117 | let slot = fsst_hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); 118 | 119 | // SAFETY: the slot is guaranteed to between [0, HASH_TABLE_SIZE). 120 | unsafe { self.slots.get_unchecked(slot) } 121 | } 122 | } 123 | 124 | impl Default for LossyPHT { 125 | fn default() -> Self { 126 | Self::new() 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /tests/correctness.rs: -------------------------------------------------------------------------------- 1 | //! Correctness tests for FSST. 2 | 3 | #![cfg(test)] 4 | 5 | use fsst::{Compressor, CompressorBuilder, Symbol}; 6 | 7 | static PREAMBLE: &str = r#" 8 | When in the Course of human events, it becomes necessary for one people to dissolve 9 | the political bands which have connected them with another, and to assume among the 10 | powers of the earth, the separate and equal station to which the Laws of Nature and 11 | of Nature's God entitle them, a decent respect to the opinions of mankind requires 12 | that they should declare the causes which impel them to the separation."#; 13 | 14 | static DECLARATION: &str = include_str!("./fixtures/declaration.txt"); 15 | 16 | static ART_OF_WAR: &str = include_str!("./fixtures/art_of_war.txt"); 17 | 18 | #[test] 19 | fn test_basic() { 20 | // Roundtrip the declaration 21 | let trained = Compressor::train(&vec![PREAMBLE.as_bytes()]); 22 | let compressed = trained.compress(PREAMBLE.as_bytes()); 23 | let decompressed = trained.decompressor().decompress(&compressed); 24 | assert_eq!(decompressed, PREAMBLE.as_bytes()); 25 | } 26 | 27 | #[test] 28 | fn test_train_on_empty() { 29 | let trained = Compressor::train(&vec![]); 30 | // We can still compress with it, but the symbols are going to be empty. 
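// With an empty symbol table every byte is emitted as an escape pair, so the compressed
// output is expected to be roughly twice the input length, and it must still round-trip.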
31 | let compressed = trained.compress("the quick brown fox jumped over the lazy dog".as_bytes()); 32 | assert_eq!( 33 | trained.decompressor().decompress(&compressed), 34 | "the quick brown fox jumped over the lazy dog".as_bytes() 35 | ); 36 | } 37 | 38 | #[test] 39 | fn test_one_byte() { 40 | let mut empty = CompressorBuilder::new(); 41 | empty.insert(Symbol::from_u8(0x01), 1); 42 | 43 | let empty = empty.build(); 44 | 45 | let compressed = empty.compress(&[0x01]); 46 | assert_eq!(compressed, vec![0u8]); 47 | 48 | assert_eq!(empty.decompressor().decompress(&compressed), vec![0x01]); 49 | } 50 | 51 | #[test] 52 | fn test_zeros() { 53 | let training_data: Vec = vec![0, 1, 2, 3, 4, 0]; 54 | let trained = Compressor::train(&vec![&training_data]); 55 | let compressed = trained.compress(&[4, 0]); 56 | assert_eq!(trained.decompressor().decompress(&compressed), &[4, 0]); 57 | } 58 | 59 | #[cfg_attr(miri, ignore)] 60 | #[test] 61 | fn test_large() { 62 | let corpus: Vec = DECLARATION.bytes().cycle().take(10_240).collect(); 63 | 64 | let trained = Compressor::train(&vec![&corpus]); 65 | let massive: Vec = DECLARATION 66 | .bytes() 67 | .cycle() 68 | .take(16 * 1_024 * 1_024) 69 | .collect(); 70 | 71 | let compressed = trained.compress(&massive); 72 | assert_eq!(trained.decompressor().decompress(&compressed), massive); 73 | } 74 | 75 | #[test] 76 | fn test_chinese() { 77 | let trained = Compressor::train(&vec![ART_OF_WAR.as_bytes()]); 78 | assert_eq!( 79 | ART_OF_WAR.as_bytes(), 80 | trained 81 | .decompressor() 82 | .decompress(&trained.compress(ART_OF_WAR.as_bytes())) 83 | ); 84 | } 85 | 86 | #[test] 87 | fn test_large_with_rebuild() { 88 | let corpus: Vec = DECLARATION.bytes().cycle().take(10_240).collect(); 89 | 90 | let trained = Compressor::train(&vec![&corpus]); 91 | let compressed = trained.compress(DECLARATION.as_bytes()); 92 | 93 | // let compressed = trained.compress(&massive); 94 | let rebuilt = Compressor::rebuild_from(trained.symbol_table(), trained.symbol_lengths()); 95 | let recompressed = rebuilt.compress(DECLARATION.as_bytes()); 96 | 97 | assert_eq!(compressed, recompressed); 98 | 99 | // Ensure round-trip after rebuilding the compressor 100 | let decompressed = rebuilt.decompressor().decompress(&recompressed); 101 | assert_eq!( 102 | unsafe { std::str::from_utf8_unchecked(&decompressed) }, 103 | DECLARATION, 104 | ); 105 | } 106 | -------------------------------------------------------------------------------- /tests/fixtures/art_of_war.txt: -------------------------------------------------------------------------------- 1 | 孫子曰:兵者,國之大事,死生之地,存亡之道,不可不察也。 2 | 故經之以五事,校之以計,而索其情,一曰道,二曰天,三曰地,四曰將,五曰法。 3 | 道者,令民與上同意也,可與之死,可與之生,而不畏危。天者,陰陽,寒暑,時制也。地者,遠近,險易,廣狹,死生也。將者,智,信,仁,勇,嚴也。法者,曲制,官道,主用也。凡此五者,將莫不聞,知之者勝,不知者不勝。 4 | 故校之以計,而索其情。曰:主孰有道,將孰有能,天地孰得,法令孰行,兵眾孰強,士卒孰練,賞罰孰明,吾以此知勝負矣。將聽吾計,用之必勝,留之;將不聽吾計,用之必敗,去之。 5 | 計利以聽,乃為之勢,以佐其外;勢者,因利而制權也。 6 | 兵者,詭道也。故能而示之不能,用而示之不用,近而示之遠,遠而示之近。利而誘之,亂而取之,實而備之,強而避之,怒而撓之,卑而驕之,佚而勞之,親而離之。攻其無備,出其不意,此兵家之勝,不可先傳也。 7 | 夫未戰而廟算勝者,得算多也;未戰而廟算不勝者,得算少也;多算勝,少算不勝,而況於無算乎?吾以此觀之,勝負見矣。 8 | 9 | 孫子曰:凡用兵之法,馳車千駟,革車千乘,帶甲十萬;千里饋糧,則內外之費賓客之用,膠漆之材,車甲之奉,日費千金,然後十萬之師舉矣。 10 | 其用戰也,勝久則鈍兵挫銳,攻城則力屈,久暴師則國用不足。夫鈍兵,挫銳,屈力,殫貨,則諸侯乘其弊而起,雖有智者,不能善其後矣!故兵聞拙速,未睹巧之久也;夫兵久而國利者,未之有也。 11 | 故不盡知用兵之害者,則不能盡知用兵之利也。善用兵者,役不再籍,糧不三載,取用于國,因糧于敵,故軍食可足也。國之貧于師者遠輸,遠輸則百姓貧,近于師者貴賣,貴賣則百姓財竭,財竭則急于丘役,力屈財殫,中原內虛于家,百姓之費,十去其七,公家之費,破車罷馬,甲冑矢弩,戟楯蔽櫓,丘牛大車,十去其六。 12 | 故智將務食於敵,食敵一鍾,當吾二十鍾,𦮼秆一石,當我二十石。故殺敵者怒也,取敵之利者貨也。故車戰,得車十乘以上,賞其先得者,而更其旌旗,車雜而乘之,卒善而養之,是謂勝敵而益強。 13 | 故兵貴勝,不貴久;故知兵之將,民之司命,國家安危之主也。 14 | 
孫子曰:凡用兵之法,全國為上,破國次之;全旅為上,破旅次之;全卒為上,破卒次之;全伍為上,破伍次之。是故百戰百勝,非善之善者也;不戰而屈人之兵,善之善者也。 15 | 故上兵伐謀,其次伐交,其次伐兵,其下攻城。攻城之法,為不得已;修櫓轒轀,具器械,三月而後成;距闉,又三月而後已;將不勝其忿,而蟻附之,殺士卒三分之一,而城不拔者,此攻之災也。 16 | 故善用兵者,屈人之兵,而非戰也;拔人之城,而非攻也;毀人之國,而非久也。必以全爭于天下,故兵不頓,利可全,此謀攻之法也。故用兵之法,十則圍之,五則攻之,倍則分之,敵則能戰之,少則能守之,不若則能避之。故小敵之堅,大敵之擒也。 17 | 夫將者,國之輔也,輔周則國必強,輔隙則國必弱。故軍之所以患于君者三:不知三軍之不可以進,而謂之進;不知三軍之不可以退,而謂之退;是謂縻軍。不知三軍之事,而同三軍之政,則軍士惑矣。不知三軍之權,而同三軍之任,則軍士疑矣。三軍既惑且疑,則諸侯之難至矣,是謂亂軍引勝。 18 | 故知勝者有五:知可以戰與不可以戰者勝,識眾寡之用者勝,上下同欲者勝,以虞待不虞者勝,將能而君不御者勝;此五者,知勝之道也。 19 | 故曰:知彼知己,百戰不殆;不知彼而知己,一勝一負;不知彼,不知己,每戰必敗。 20 | 21 | 孫子曰:昔之善戰者,先為不可勝,以待敵之可勝,不可勝在己,可勝在敵。故善戰者,能為不可勝,不能使敵必可勝。故曰:勝可知,而不可為。 22 | 不可勝者,守也;可勝者,攻也。守則不足,攻則有餘。善守者,藏于九地之下;善攻者,動于九天之上,故能自保而全勝也。 23 | 見勝,不過眾人之所知,非善之善者也。戰勝,而天下曰善,非善之善者也。故舉秋毫,不為多力;見日月,不為明目;聞雷霆,不為聰耳。古之善戰者,勝于易勝者;故善戰者之勝也,無智名,無勇功。故其戰勝不忒,不忒者,其措必勝,勝已敗者也。故善戰者,立于不敗之地,而不失敵之敗也。是故勝兵先勝,而後求戰;敗兵先戰,而後求勝。 24 | 善用兵者,修道而保法,故能為勝敗之政。兵法:「一曰度,二曰量,三曰數,四曰稱,五曰勝;地生度,度生量,量生數,數生稱,稱生勝。」故勝兵若以鎰稱銖,敗兵若以銖稱鎰。勝者之戰民也,若決積水于千仞之谿,形也。 25 | 26 | 孫子曰:凡治眾如治寡,分數是也。鬥眾如鬥寡,形名是也。三軍之眾,可使必受敵而無敗者,奇正是也。兵之所加,如以碬投卵者,虛實是也。 27 | 凡戰者,以正合,以奇勝。故善出奇者,無窮如天地,不竭如江河,終而復始,日月是也;死而復生,四時是也。聲不過五,五聲之變,不可勝聽也。色不過五,五色之變,不可勝觀也。味不過五,五味之變,不可勝嘗也。戰勢不過奇正,奇正之變,不可勝窮也。奇正相生,如循環之無端,孰能窮之哉! 28 | 激水之疾,至于漂石者,勢也。鷙鳥之擊,至于毀折者,節也。是故善戰者,其勢險,其節短,勢如張弩,節如機發。 29 | 紛紛紜紜,鬥亂,而不可亂也。渾渾沌沌,形圓,而不可敗也。亂生于治,怯生于勇,弱生于強。治亂,數也。勇怯,勢也。強弱,形也。故善動敵者,形之,敵必從之;予之,敵必取之;以利動之,以實待之。 30 | 故善戰者,求之于勢,不責于人,故能擇人任勢;任勢者,其戰人也,如轉木石,木石之性,安則靜,危則動,方則止,圓則行。故善戰人之勢,如轉圓石于千仞之山者,勢也。 31 | 32 | 孫子曰:凡先處戰地而待敵者佚,後處戰地而趨戰者勞。故善戰者,致人而不致于人。能使敵人自至者,利之也;能使敵不得至者,害之也。故敵佚能勞之,飽能飢之,安能動之。 33 | 出其所不趨,趨其所不意;行千里而不勞者,行于無人之地也;攻而必取者,攻其所不守也;守而必固者,守其所不攻也。故善攻者,敵不知其所守;善守者,敵不知其所攻。微乎微乎!至于無形;神乎神乎!至于無聲,故能為敵之司命。進而不可禦者,衝其虛也;退而不可追者,速而不可及也。故我欲戰,敵雖高壘深溝,不得不與我戰者,攻其所必救也;我不欲戰,雖劃地而守之,敵不得與我戰者,乖其所之也。 34 | 故形人而我無形,則我專而敵分,我專為一,敵分為十,是以十攻其一也。則我眾而敵寡,能以眾擊寡,則我之所與戰者,約矣。 35 | 吾所與戰之地不可知,不可知,則敵所備者多,敵所備者多,則我所與戰者寡矣。故備前則後寡,備後則前寡,備左則右寡,備右則左寡,無所不備,則無所不寡。寡者,備人者也;眾者,使人備己者也。 36 | 故知戰之地,知戰之日,則可千里而會戰。不知戰地,不知戰日,則左不能救右,右不能救左,前不能救後,後不能救前,而況遠者數十里,近者數里乎?以吾度之,越人之兵雖多,亦奚益于勝哉?故曰:勝可為也,敵雖眾,可使無鬥。 37 | 故策之而知得失之計,作之而知動靜之理,形之而知死生之地,角之而知有餘不足之處。故形兵之極,至于無形;無形,則深間不能窺,智者不能謀。因形而措勝于眾,眾不能知,人皆知我所以勝之形,而莫知吾所以制勝之形;故其戰勝不復,而應形於無窮。 38 | 夫兵形象水,水之形,避高而趨下:兵之形,避實而擊虛;水因地而制流,兵因敵而制勝。故兵無常勢,水無常形;能因敵變化而取勝,謂之神。故五行無常勝,四時無常位,日有短長,月有死生。 39 | 40 | 孫子曰:凡用兵之法,將受命於君,合軍聚眾,交和而舍,莫難於軍爭。軍爭之難者,以迂為直,以患為利。故迂其途,而誘之以利,後人發,先人至,此知迂直之計者也。故軍爭為利,軍爭為危。 41 | 舉軍而爭利,則不及;委軍而爭利,則輜重捐。是故卷甲而趨,日夜不處,倍道兼行,百里而爭利,則擒三將軍,勁者先,疲者後,其法十一而至;五十里而爭利,則蹶上將軍,其法半至;卅里而爭利,則三分之二至。是故軍無輜重則亡,無糧食則亡,無委積則亡。故不知諸侯之謀者,不能豫交;不知山林、險阻、沮澤之形者,不能行軍,不能鄉導者,不能得地利。 42 | 故兵以詐立,以利動,以分合為變者也,故其疾如風,其徐如林,侵掠如火,不動如山,難知如陰,動如雷霆。掠鄉分眾,廓地分利,懸權而動,先知迂直之計者勝,此軍爭之法也。 43 | 軍政曰:「言不相聞,故為金鼓;視不相見,故為旌旗。」夫金鼓旌旗者,所以一人之耳目也;人既專一,則勇者不得獨進,怯者不得獨退,此用眾之法也。故夜戰多火鼓,晝戰多旌旗,所以變人之耳目也。 44 | 故三軍可奪氣,將軍可奪心。是故朝氣銳,晝氣惰,暮氣歸;故善用兵者,避其銳氣,擊其惰歸,此治氣者也。以治待亂,以靜待譁,此治心者也。以近待遠,以佚待勞,以飽待飢,此治力者也。 45 | 無邀正正之旗,勿擊堂堂之陣,此治變者也;故用兵之法,高陵勿向,背邱勿逆,佯北勿從,銳卒勿攻,餌兵勿食,歸師勿遏,圍師必闕,窮寇勿迫,此用兵之法也。 46 | -------------------------------------------------------------------------------- /tests/fixtures/declaration.txt: -------------------------------------------------------------------------------- 1 | The unanimous Declaration of the thirteen united States of America, When in the Course of human events, it becomes necessary for one people to dissolve the political bands which have connected them with another, and to assume among the powers of the earth, the separate and equal station to which the Laws of Nature and of Nature's God entitle them, a decent respect to the opinions of mankind requires that they should declare the causes which impel them to the separation. 
2 | 3 | We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness.--That to secure these rights, Governments are instituted among Men, deriving their just powers from the consent of the governed, --That whenever any Form of Government becomes destructive of these ends, it is the Right of the People to alter or to abolish it, and to institute new Government, laying its foundation on such principles and organizing its powers in such form, as to them shall seem most likely to effect their Safety and Happiness. Prudence, indeed, will dictate that Governments long established should not be changed for light and transient causes; and accordingly all experience hath shewn, that mankind are more disposed to suffer, while evils are sufferable, than to right themselves by abolishing the forms to which they are accustomed. But when a long train of abuses and usurpations, pursuing invariably the same Object evinces a design to reduce them under absolute Despotism, it is their right, it is their duty, to throw off such Government, and to provide new Guards for their future security.--Such has been the patient sufferance of these Colonies; and such is now the necessity which constrains them to alter their former Systems of Government. The history of the present King of Great Britain is a history of repeated injuries and usurpations, all having in direct object the establishment of an absolute Tyranny over these States. To prove this, let Facts be submitted to a candid world. 4 | 5 | He has refused his Assent to Laws, the most wholesome and necessary for the public good. 6 | 7 | He has forbidden his Governors to pass Laws of immediate and pressing importance, unless suspended in their operation till his Assent should be obtained; and when so suspended, he has utterly neglected to attend to them. 8 | 9 | He has refused to pass other Laws for the accommodation of large districts of people, unless those people would relinquish the right of Representation in the Legislature, a right inestimable to them and formidable to tyrants only. 10 | 11 | He has called together legislative bodies at places unusual, uncomfortable, and distant from the depository of their public Records, for the sole purpose of fatiguing them into compliance with his measures. 12 | 13 | He has dissolved Representative Houses repeatedly, for opposing with manly firmness his invasions on the rights of the people. 14 | 15 | He has refused for a long time, after such dissolutions, to cause others to be elected; whereby the Legislative powers, incapable of Annihilation, have returned to the People at large for their exercise; the State remaining in the mean time exposed to all the dangers of invasion from without, and convulsions within. 16 | 17 | He has endeavoured to prevent the population of these States; for that purpose obstructing the Laws for Naturalization of Foreigners; refusing to pass others to encourage their migrations hither, and raising the conditions of new Appropriations of Lands. 18 | 19 | He has obstructed the Administration of Justice, by refusing his Assent to Laws for establishing Judiciary powers. 20 | 21 | He has made Judges dependent on his Will alone, for the tenure of their offices, and the amount and payment of their salaries. 22 | 23 | He has erected a multitude of New Offices, and sent hither swarms of Officers to harrass our people, and eat out their substance. 
24 | 25 | He has kept among us, in times of peace, Standing Armies without the Consent of our legislatures. 26 | 27 | He has affected to render the Military independent of and superior to the Civil power. 28 | 29 | He has combined with others to subject us to a jurisdiction foreign to our constitution, and unacknowledged by our laws; giving his Assent to their Acts of pretended Legislation: 30 | 31 | For Quartering large bodies of armed troops among us: 32 | 33 | For protecting them, by a mock Trial, from punishment for any Murders which they should commit on the Inhabitants of these States: 34 | 35 | For cutting off our Trade with all parts of the world: 36 | 37 | For imposing Taxes on us without our Consent: 38 | 39 | For depriving us in many cases, of the benefits of Trial by Jury: 40 | 41 | For transporting us beyond Seas to be tried for pretended offences 42 | 43 | For abolishing the free System of English Laws in a neighbouring Province, establishing therein an Arbitrary government, and enlarging its Boundaries so as to render it at once an example and fit instrument for introducing the same absolute rule into these Colonies: 44 | 45 | For taking away our Charters, abolishing our most valuable Laws, and altering fundamentally the Forms of our Governments: 46 | 47 | For suspending our own Legislatures, and declaring themselves invested with power to legislate for us in all cases whatsoever. 48 | 49 | He has abdicated Government here, by declaring us out of his Protection and waging War against us. 50 | 51 | He has plundered our seas, ravaged our Coasts, burnt our towns, and destroyed the lives of our people. 52 | 53 | He is at this time transporting large Armies of foreign Mercenaries to compleat the works of death, desolation and tyranny, already begun with circumstances of Cruelty & perfidy scarcely paralleled in the most barbarous ages, and totally unworthy the Head of a civilized nation. 54 | 55 | He has constrained our fellow Citizens taken Captive on the high Seas to bear Arms against their Country, to become the executioners of their friends and Brethren, or to fall themselves by their Hands. 56 | 57 | He has excited domestic insurrections amongst us, and has endeavoured to bring on the inhabitants of our frontiers, the merciless Indian Savages, whose known rule of warfare, is an undistinguished destruction of all ages, sexes and conditions. 58 | 59 | In every stage of these Oppressions We have Petitioned for Redress in the most humble terms: Our repeated Petitions have been answered only by repeated injury. A Prince whose character is thus marked by every act which may define a Tyrant, is unfit to be the ruler of a free people. 60 | 61 | Nor have We been wanting in attentions to our Brittish brethren. We have warned them from time to time of attempts by their legislature to extend an unwarrantable jurisdiction over us. We have reminded them of the circumstances of our emigration and settlement here. We have appealed to their native justice and magnanimity, and we have conjured them by the ties of our common kindred to disavow these usurpations, which, would inevitably interrupt our connections and correspondence. They too have been deaf to the voice of justice and of consanguinity. We must, therefore, acquiesce in the necessity, which denounces our Separation, and hold them, as we hold the rest of mankind, Enemies in War, in Peace Friends. 
62 | 63 | We, therefore, the Representatives of the united States of America, in General Congress, Assembled, appealing to the Supreme Judge of the world for the rectitude of our intentions, do, in the Name, and by Authority of the good People of these Colonies, solemnly publish and declare, That these United Colonies are, and of Right ought to be Free and Independent States; that they are Absolved from all Allegiance to the British Crown, and that all political connection between them and the State of Great Britain, is and ought to be totally dissolved; and that as Free and Independent States, they have full Power to levy War, conclude Peace, contract Alliances, establish Commerce, and to do all other Acts and Things which Independent States may of right do. And for the support of this Declaration, with a firm reliance on the protection of divine Providence, we mutually pledge to each other our Lives, our Fortunes and our sacred Honor. 64 | --------------------------------------------------------------------------------