├── .cargo └── config.toml ├── .github └── workflows │ ├── cicd.yml │ └── pypi_release.yml ├── .gitignore ├── .vscode └── settings.json ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── clippy.toml ├── doc ├── 2022-07-31-19-27-57.png └── sqlitebrowser.png ├── python ├── README.md ├── lib │ ├── __init__.py │ └── datasette.py ├── pyproject.toml └── setup.py └── src ├── add_functions.rs ├── basic.rs ├── bin ├── benchmark.rs └── create_test_db.rs ├── create_extension.rs ├── dict_management.rs ├── dict_training.rs ├── lib.rs ├── transparent.rs └── util.rs /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.aarch64-linux-android] 2 | linker = "aarch64-linux-android23-clang" 3 | -------------------------------------------------------------------------------- /.github/workflows/cicd.yml: -------------------------------------------------------------------------------- 1 | name: CICD 2 | 3 | # adapted from https://github.com/bootandy/dust/blob/master/.github/workflows/CICD.yml 4 | 5 | env: 6 | PROJECT_NAME: sqlite_zstd 7 | on: [push, pull_request] 8 | 9 | jobs: 10 | style: 11 | name: Tests 12 | runs-on: ${{ matrix.job.os }} 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | job: 17 | - { os: ubuntu-latest } 18 | steps: 19 | - uses: actions/checkout@v1 20 | - name: Initialize workflow variables 21 | id: vars 22 | shell: bash 23 | run: | 24 | # target-specific options 25 | # * CARGO_FEATURES_OPTION 26 | CARGO_FEATURES_OPTION='' ; 27 | if [ -n "${{ matrix.job.features }}" ]; then CARGO_FEATURES_OPTION='--features "${{ matrix.job.features }}"' ; fi 28 | echo set-output name=CARGO_FEATURES_OPTION::${CARGO_FEATURES_OPTION} 29 | echo ::set-output name=CARGO_FEATURES_OPTION::${CARGO_FEATURES_OPTION} 30 | - name: Install `rust` toolchain 31 | uses: actions-rs/toolchain@v1 32 | with: 33 | toolchain: stable 34 | override: true 35 | profile: minimal # minimal component installation (ie, no documentation) 36 | components: rustfmt, clippy 37 | - name: "`fmt` testing" 38 | uses: actions-rs/cargo@v1 39 | with: 40 | command: fmt 41 | args: --all -- --check 42 | - name: "`clippy` testing" 43 | if: success() || failure() # run regardless of prior step ("`fmt` testing") success/failure 44 | uses: actions-rs/cargo@v1 45 | with: 46 | command: clippy 47 | args: ${{ matrix.job.cargo-options }} ${{ steps.vars.outputs.CARGO_FEATURES_OPTION }} --all-targets -- -D warnings 48 | - name: "tests" 49 | if: success() || failure() # run regardless of prior step 50 | uses: actions-rs/cargo@v1 51 | with: 52 | command: test 53 | 54 | build: 55 | name: Build 56 | runs-on: ${{ matrix.job.os }} 57 | strategy: 58 | fail-fast: false 59 | matrix: 60 | job: 61 | # { os, target, cargo-options, features, use-cross, toolchain } 62 | - { 63 | os: ubuntu-latest, 64 | target: x86_64-unknown-linux-gnu, 65 | features: build_extension, 66 | } 67 | - { 68 | os: ubuntu-latest, 69 | target: arm-unknown-linux-gnueabihf, 70 | use-cross: use-cross, 71 | features: build_extension, 72 | } 73 | - { 74 | os: macos-latest, 75 | target: x86_64-apple-darwin, 76 | features: build_extension, 77 | } 78 | - { 79 | os: windows-latest, 80 | target: x86_64-pc-windows-gnu, 81 | features: build_extension, 82 | } ## !maint: [rivy; 2020-01-21] may break due to rust bug; follow possible solution from GH:rust-lang/rust#47048 (refs: GH:rust-lang/rust#47048 , GH:rust-lang/rust#53454 , GH:bike-barn/hermit#172 ) 83 | - { 84 | os: windows-latest, 85 | target: x86_64-pc-windows-msvc, 86 | features: 
build_extension, 87 | } 88 | steps: 89 | - uses: actions/checkout@v1 90 | - name: Install any prerequisites 91 | shell: bash 92 | run: | 93 | case ${{ matrix.job.target }} in 94 | arm-unknown-linux-gnueabihf) sudo apt-get -y update ; sudo apt-get -y install gcc-arm-linux-gnueabihf ;; 95 | esac 96 | - name: Initialize workflow variables 97 | id: vars 98 | shell: bash 99 | run: | 100 | # toolchain 101 | TOOLCHAIN="stable" ## default to "stable" toolchain 102 | # * specify alternate TOOLCHAIN for *-pc-windows-gnu targets; gnu targets on Windows are broken for the standard *-pc-windows-msvc toolchain (refs: , , ) 103 | case ${{ matrix.job.target }} in *-pc-windows-gnu) TOOLCHAIN="stable-${{ matrix.job.target }}" ;; esac; 104 | # * use requested TOOLCHAIN if specified 105 | if [ -n "${{ matrix.job.toolchain }}" ]; then TOOLCHAIN="${{ matrix.job.toolchain }}" ; fi 106 | echo set-output name=TOOLCHAIN::${TOOLCHAIN} 107 | echo ::set-output name=TOOLCHAIN::${TOOLCHAIN} 108 | # staging directory 109 | STAGING='_staging' 110 | echo set-output name=STAGING::${STAGING} 111 | echo ::set-output name=STAGING::${STAGING} 112 | # determine EXE suffix 113 | case ${{ matrix.job.target }} in 114 | *-linux-*) EXE_suffix=".so"; EXE_prefix="lib" ;; 115 | *-apple-*) EXE_suffix=".dylib"; EXE_prefix="lib" ;; 116 | *-windows-*) EXE_suffix=".dll"; EXE_prefix="" ;; esac; 117 | echo set-output name=LIB_FNAME::${EXE_prefix}${{ env.PROJECT_NAME }}${EXE_suffix} 118 | echo ::set-output name=LIB_FNAME::${EXE_prefix}${{ env.PROJECT_NAME }}${EXE_suffix} 119 | # parse commit reference info 120 | REF_NAME=${GITHUB_REF#refs/*/} 121 | unset REF_BRANCH ; case ${GITHUB_REF} in refs/heads/*) REF_BRANCH=${GITHUB_REF#refs/heads/} ;; esac; 122 | unset REF_TAG ; case ${GITHUB_REF} in refs/tags/*) REF_TAG=${GITHUB_REF#refs/tags/} ;; esac; 123 | REF_SHAS=${GITHUB_SHA:0:8} 124 | echo set-output name=REF_NAME::${REF_NAME} 125 | echo set-output name=REF_BRANCH::${REF_BRANCH} 126 | echo set-output name=REF_TAG::${REF_TAG} 127 | echo set-output name=REF_SHAS::${REF_SHAS} 128 | echo ::set-output name=REF_NAME::${REF_NAME} 129 | echo ::set-output name=REF_BRANCH::${REF_BRANCH} 130 | echo ::set-output name=REF_TAG::${REF_TAG} 131 | echo ::set-output name=REF_SHAS::${REF_SHAS} 132 | # parse target 133 | unset TARGET_ARCH ; case ${{ matrix.job.target }} in arm-unknown-linux-gnueabihf) TARGET_ARCH=arm ;; i686-*) TARGET_ARCH=i686 ;; x86_64-*) TARGET_ARCH=x86_64 ;; esac; 134 | echo set-output name=TARGET_ARCH::${TARGET_ARCH} 135 | echo ::set-output name=TARGET_ARCH::${TARGET_ARCH} 136 | unset TARGET_OS ; case ${{ matrix.job.target }} in *-linux-*) TARGET_OS=linux ;; *-apple-*) TARGET_OS=macos ;; *-windows-*) TARGET_OS=windows ;; esac; 137 | echo set-output name=TARGET_OS::${TARGET_OS} 138 | echo ::set-output name=TARGET_OS::${TARGET_OS} 139 | # package name 140 | PKG_suffix=".tar.gz" ; case ${{ matrix.job.target }} in *-pc-windows-*) PKG_suffix=".zip" ;; esac; 141 | PKG_BASENAME=${PROJECT_NAME}-${REF_TAG:-$REF_SHAS}-${{ matrix.job.target }} 142 | PKG_NAME=${PKG_BASENAME}${PKG_suffix} 143 | echo set-output name=PKG_suffix::${PKG_suffix} 144 | echo set-output name=PKG_BASENAME::${PKG_BASENAME} 145 | echo set-output name=PKG_NAME::${PKG_NAME} 146 | echo ::set-output name=PKG_suffix::${PKG_suffix} 147 | echo ::set-output name=PKG_BASENAME::${PKG_BASENAME} 148 | echo ::set-output name=PKG_NAME::${PKG_NAME} 149 | # deployable tag? 
(ie, leading "vM" or "M"; M == version number) 150 | unset DEPLOY ; if [[ $REF_TAG =~ ^[vV]?[0-9].* ]]; then DEPLOY='true' ; fi 151 | echo set-output name=DEPLOY::${DEPLOY:-/false} 152 | echo ::set-output name=DEPLOY::${DEPLOY} 153 | # target-specific options 154 | # * CARGO_FEATURES_OPTION 155 | CARGO_FEATURES_OPTION='' ; 156 | if [ -n "${{ matrix.job.features }}" ]; then CARGO_FEATURES_OPTION='--features "${{ matrix.job.features }}"' ; fi 157 | echo set-output name=CARGO_FEATURES_OPTION::${CARGO_FEATURES_OPTION} 158 | echo ::set-output name=CARGO_FEATURES_OPTION::${CARGO_FEATURES_OPTION} 159 | # * CARGO_USE_CROSS (truthy) 160 | CARGO_USE_CROSS='true' ; case '${{ matrix.job.use-cross }}' in ''|0|f|false|n|no) unset CARGO_USE_CROSS ;; esac; 161 | echo set-output name=CARGO_USE_CROSS::${CARGO_USE_CROSS:-/false} 162 | echo ::set-output name=CARGO_USE_CROSS::${CARGO_USE_CROSS} 163 | # * strip executable? 164 | STRIP="strip" ; STRIP_PARAMS="" ; case ${{ matrix.job.target }} in arm-unknown-linux-gnueabihf) STRIP="arm-linux-gnueabihf-strip" ;; *-pc-windows-msvc) STRIP="" ;; *-apple-darwin) STRIP_PARAMS="-x" ;; esac; 165 | echo set-output name=STRIP::${STRIP} 166 | echo ::set-output name=STRIP::${STRIP} 167 | echo set-output name=STRIP_PARAMS::${STRIP_PARAMS} 168 | echo ::set-output name=STRIP_PARAMS::${STRIP_PARAMS} 169 | - name: Create all needed build/work directories 170 | shell: bash 171 | run: | 172 | mkdir -p '${{ steps.vars.outputs.STAGING }}' 173 | mkdir -p '${{ steps.vars.outputs.STAGING }}/${{ steps.vars.outputs.PKG_BASENAME }}' 174 | - name: rust toolchain ~ install 175 | uses: actions-rs/toolchain@v1 176 | with: 177 | toolchain: ${{ steps.vars.outputs.TOOLCHAIN }} 178 | target: ${{ matrix.job.target }} 179 | override: true 180 | profile: minimal # minimal component installation (ie, no documentation) 181 | - name: Info 182 | shell: bash 183 | run: | 184 | gcc --version || true 185 | rustup -V 186 | rustup toolchain list 187 | rustup default 188 | cargo -V 189 | rustc -V 190 | - name: Build 191 | uses: actions-rs/cargo@v1 192 | with: 193 | use-cross: ${{ steps.vars.outputs.CARGO_USE_CROSS }} 194 | command: build 195 | args: --release --target=${{ matrix.job.target }} ${{ matrix.job.cargo-options }} ${{ steps.vars.outputs.CARGO_FEATURES_OPTION }} 196 | - name: Archive executable artifacts 197 | uses: actions/upload-artifact@master 198 | with: 199 | name: ${{ env.PROJECT_NAME }}-${{ matrix.job.target }} 200 | path: target/${{ matrix.job.target }}/release/${{ steps.vars.outputs.LIB_FNAME }} 201 | - name: Package 202 | shell: bash 203 | run: | 204 | # binary 205 | cp 'target/${{ matrix.job.target }}/release/${{ steps.vars.outputs.LIB_FNAME }}' '${{ steps.vars.outputs.STAGING }}/${{ steps.vars.outputs.PKG_BASENAME }}/' 206 | # `strip` binary (if needed) 207 | if [ -n "${{ steps.vars.outputs.STRIP }}" ]; then "${{ steps.vars.outputs.STRIP }}" ${{ steps.vars.outputs.STRIP_PARAMS }} '${{ steps.vars.outputs.STAGING }}/${{ steps.vars.outputs.PKG_BASENAME }}/${{ steps.vars.outputs.LIB_FNAME }}' ; fi 208 | # README and LICENSE 209 | cp README.md '${{ steps.vars.outputs.STAGING }}/${{ steps.vars.outputs.PKG_BASENAME }}/' 210 | cp LICENSE '${{ steps.vars.outputs.STAGING }}/${{ steps.vars.outputs.PKG_BASENAME }}/' 211 | # base compressed package 212 | pushd '${{ steps.vars.outputs.STAGING }}/' >/dev/null 213 | case ${{ matrix.job.target }} in 214 | *-pc-windows-*) 7z -y a '${{ steps.vars.outputs.PKG_NAME }}' '${{ steps.vars.outputs.PKG_BASENAME }}'/* | tail -2 ;; 215 | *) tar czf '${{ 
steps.vars.outputs.PKG_NAME }}' '${{ steps.vars.outputs.PKG_BASENAME }}'/* ;; 216 | esac; 217 | popd >/dev/null 218 | - name: Publish 219 | uses: softprops/action-gh-release@v1 220 | if: steps.vars.outputs.DEPLOY 221 | with: 222 | files: | 223 | ${{ steps.vars.outputs.STAGING }}/${{ steps.vars.outputs.PKG_NAME }} 224 | env: 225 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 226 | -------------------------------------------------------------------------------- /.github/workflows/pypi_release.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | build_wheels: 8 | name: Build wheels on ${{ matrix.os }} for ${{ matrix.cibw_python }} on ${{ matrix.cibw_arch }} 9 | runs-on: ${{ matrix.os }} 10 | strategy: 11 | matrix: 12 | os: [ubuntu-latest] 13 | cibw_arch: ["x86_64", "aarch64"] 14 | env: 15 | CIBW_BEFORE_ALL_LINUX: "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain stable -y" 16 | CIBW_BUILD_VERBOSITY: "1" 17 | CIBW_ENVIRONMENT: 'PATH="$PATH:$HOME/.cargo/bin"' 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | 22 | - name: Set up QEMU 23 | if: matrix.os == 'ubuntu-latest' && matrix.cibw_arch == 'aarch64' 24 | uses: docker/setup-qemu-action@68827325e0b33c7199eb31dd4e31fbe9023e06e3 # v3.0.0 25 | with: 26 | platforms: arm64 27 | 28 | - name: Build wheels 29 | uses: pypa/cibuildwheel@v2.16.2 30 | env: 31 | CIBW_BUILD_VERBOSITY: 1 32 | CIBW_BUILD: ${{ matrix.cibw_python }} 33 | CIBW_ARCHS: ${{ matrix.cibw_arch }} 34 | CIBW_TEST_SKIP: "*universal2:arm64" 35 | with: 36 | package-dir: ./python 37 | output-dir: ./python/wheelhouse 38 | 39 | - uses: actions/upload-artifact@v3 40 | with: 41 | name: dist 42 | path: ./python/wheelhouse/*.whl 43 | 44 | build_sdist: 45 | name: Build source distribution 46 | runs-on: ubuntu-latest 47 | steps: 48 | - uses: actions/checkout@v2 49 | 50 | - name: Install rust 51 | uses: actions-rs/toolchain@v1 52 | with: 53 | toolchain: stable 54 | profile: minimal 55 | 56 | - uses: actions/setup-python@v2 57 | name: Install Python 58 | with: 59 | python-version: "3.12" 60 | 61 | - name: Build sdist 62 | run: | 63 | python -m pip install setuptools-rust setuptools wheel 64 | cd python/ 65 | python setup.py sdist 66 | 67 | - uses: actions/upload-artifact@v2 68 | with: 69 | name: dist 70 | path: python/dist/*.tar.* 71 | 72 | release: 73 | needs: [build_wheels, build_sdist] 74 | runs-on: ubuntu-latest 75 | steps: 76 | - uses: actions/download-artifact@v3 77 | with: 78 | name: dist 79 | path: python/dist/ 80 | 81 | - uses: pypa/gh-action-pypi-publish@v1.8.10 82 | with: 83 | repository-url: https://pypi.org/project/sqlite-zstd-build 84 | user: __token__ 85 | password: ${{ secrets.PYPI_API_TOKEN }} 86 | packages-dir: python/dist 87 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /python/*.egg-info/ 2 | /python/build/ 3 | /python/dist/ 4 | /target 5 | *.sqlite3* 6 | private* 7 | /bench* 8 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.formatOnSave": true, 3 | "rust-analyzer.checkOnSave.command": "clippy", 4 | "rust-analyzer.cargo.features": ["benchmark"] 5 | } 6 | -------------------------------------------------------------------------------- 
/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 4 4 | 5 | [[package]] 6 | name = "adler" 7 | version = "1.0.2" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" 10 | 11 | [[package]] 12 | name = "aho-corasick" 13 | version = "0.7.18" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" 16 | dependencies = [ 17 | "memchr", 18 | ] 19 | 20 | [[package]] 21 | name = "ansi_term" 22 | version = "0.12.1" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" 25 | dependencies = [ 26 | "winapi", 27 | ] 28 | 29 | [[package]] 30 | name = "anyhow" 31 | version = "1.0.58" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "bb07d2053ccdbe10e2af2995a2f116c1330396493dc1269f6a91d0ae82e19704" 34 | 35 | [[package]] 36 | name = "atty" 37 | version = "0.2.14" 38 | source = "registry+https://github.com/rust-lang/crates.io-index" 39 | checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" 40 | dependencies = [ 41 | "hermit-abi", 42 | "libc", 43 | "winapi", 44 | ] 45 | 46 | [[package]] 47 | name = "autocfg" 48 | version = "1.1.0" 49 | source = "registry+https://github.com/rust-lang/crates.io-index" 50 | checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" 51 | 52 | [[package]] 53 | name = "bitflags" 54 | version = "1.3.2" 55 | source = "registry+https://github.com/rust-lang/crates.io-index" 56 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 57 | 58 | [[package]] 59 | name = "bitflags" 60 | version = "2.9.0" 61 | source = "registry+https://github.com/rust-lang/crates.io-index" 62 | checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" 63 | 64 | [[package]] 65 | name = "bstr" 66 | version = "0.2.17" 67 | source = "registry+https://github.com/rust-lang/crates.io-index" 68 | checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" 69 | dependencies = [ 70 | "lazy_static", 71 | "memchr", 72 | "regex-automata", 73 | "serde", 74 | ] 75 | 76 | [[package]] 77 | name = "byteorder" 78 | version = "1.4.3" 79 | source = "registry+https://github.com/rust-lang/crates.io-index" 80 | checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" 81 | 82 | [[package]] 83 | name = "cc" 84 | version = "1.2.22" 85 | source = "registry+https://github.com/rust-lang/crates.io-index" 86 | checksum = "32db95edf998450acc7881c932f94cd9b05c87b4b2599e8bab064753da4acfd1" 87 | dependencies = [ 88 | "jobserver", 89 | "libc", 90 | "shlex", 91 | ] 92 | 93 | [[package]] 94 | name = "cfg-if" 95 | version = "1.0.0" 96 | source = "registry+https://github.com/rust-lang/crates.io-index" 97 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 98 | 99 | [[package]] 100 | name = "chrono" 101 | version = "0.4.19" 102 | source = "registry+https://github.com/rust-lang/crates.io-index" 103 | checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" 104 | dependencies = [ 105 | "libc", 106 | "num-integer", 107 | "num-traits", 108 | "time", 109 | "winapi", 110 | ] 111 | 112 | [[package]] 113 | 
name = "clap" 114 | version = "2.34.0" 115 | source = "registry+https://github.com/rust-lang/crates.io-index" 116 | checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" 117 | dependencies = [ 118 | "ansi_term", 119 | "atty", 120 | "bitflags 1.3.2", 121 | "strsim 0.8.0", 122 | "textwrap 0.11.0", 123 | "unicode-width", 124 | "vec_map", 125 | ] 126 | 127 | [[package]] 128 | name = "clap" 129 | version = "3.2.12" 130 | source = "registry+https://github.com/rust-lang/crates.io-index" 131 | checksum = "ab8b79fe3946ceb4a0b1c080b4018992b8d27e9ff363644c1c9b6387c854614d" 132 | dependencies = [ 133 | "atty", 134 | "bitflags 1.3.2", 135 | "clap_derive", 136 | "clap_lex", 137 | "indexmap", 138 | "once_cell", 139 | "strsim 0.10.0", 140 | "termcolor", 141 | "textwrap 0.15.0", 142 | ] 143 | 144 | [[package]] 145 | name = "clap_derive" 146 | version = "3.2.7" 147 | source = "registry+https://github.com/rust-lang/crates.io-index" 148 | checksum = "759bf187376e1afa7b85b959e6a664a3e7a95203415dba952ad19139e798f902" 149 | dependencies = [ 150 | "heck 0.4.0", 151 | "proc-macro-error", 152 | "proc-macro2", 153 | "quote", 154 | "syn 1.0.98", 155 | ] 156 | 157 | [[package]] 158 | name = "clap_lex" 159 | version = "0.2.4" 160 | source = "registry+https://github.com/rust-lang/crates.io-index" 161 | checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5" 162 | dependencies = [ 163 | "os_str_bytes", 164 | ] 165 | 166 | [[package]] 167 | name = "crc32fast" 168 | version = "1.3.2" 169 | source = "registry+https://github.com/rust-lang/crates.io-index" 170 | checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" 171 | dependencies = [ 172 | "cfg-if", 173 | ] 174 | 175 | [[package]] 176 | name = "csv" 177 | version = "1.1.6" 178 | source = "registry+https://github.com/rust-lang/crates.io-index" 179 | checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" 180 | dependencies = [ 181 | "bstr", 182 | "csv-core", 183 | "itoa 0.4.8", 184 | "ryu", 185 | "serde", 186 | ] 187 | 188 | [[package]] 189 | name = "csv-core" 190 | version = "0.1.10" 191 | source = "registry+https://github.com/rust-lang/crates.io-index" 192 | checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" 193 | dependencies = [ 194 | "memchr", 195 | ] 196 | 197 | [[package]] 198 | name = "ctor" 199 | version = "0.1.22" 200 | source = "registry+https://github.com/rust-lang/crates.io-index" 201 | checksum = "f877be4f7c9f246b183111634f75baa039715e3f46ce860677d3b19a69fb229c" 202 | dependencies = [ 203 | "quote", 204 | "syn 1.0.98", 205 | ] 206 | 207 | [[package]] 208 | name = "diff" 209 | version = "0.1.13" 210 | source = "registry+https://github.com/rust-lang/crates.io-index" 211 | checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" 212 | 213 | [[package]] 214 | name = "env_logger" 215 | version = "0.9.0" 216 | source = "registry+https://github.com/rust-lang/crates.io-index" 217 | checksum = "0b2cf0344971ee6c64c31be0d530793fba457d322dfec2810c453d0ef228f9c3" 218 | dependencies = [ 219 | "atty", 220 | "humantime", 221 | "log", 222 | "regex", 223 | "termcolor", 224 | ] 225 | 226 | [[package]] 227 | name = "errno" 228 | version = "0.2.8" 229 | source = "registry+https://github.com/rust-lang/crates.io-index" 230 | checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" 231 | dependencies = [ 232 | "errno-dragonfly", 233 | "libc", 234 | "winapi", 235 | ] 236 | 237 | [[package]] 238 | name = "errno-dragonfly" 239 
| version = "0.1.2" 240 | source = "registry+https://github.com/rust-lang/crates.io-index" 241 | checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" 242 | dependencies = [ 243 | "cc", 244 | "libc", 245 | ] 246 | 247 | [[package]] 248 | name = "fallible-iterator" 249 | version = "0.3.0" 250 | source = "registry+https://github.com/rust-lang/crates.io-index" 251 | checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" 252 | 253 | [[package]] 254 | name = "fallible-streaming-iterator" 255 | version = "0.1.9" 256 | source = "registry+https://github.com/rust-lang/crates.io-index" 257 | checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" 258 | 259 | [[package]] 260 | name = "flate2" 261 | version = "1.0.24" 262 | source = "registry+https://github.com/rust-lang/crates.io-index" 263 | checksum = "f82b0f4c27ad9f8bfd1f3208d882da2b09c301bc1c828fd3a00d0216d2fbbff6" 264 | dependencies = [ 265 | "crc32fast", 266 | "miniz_oxide", 267 | ] 268 | 269 | [[package]] 270 | name = "foldhash" 271 | version = "0.1.5" 272 | source = "registry+https://github.com/rust-lang/crates.io-index" 273 | checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" 274 | 275 | [[package]] 276 | name = "getrandom" 277 | version = "0.2.7" 278 | source = "registry+https://github.com/rust-lang/crates.io-index" 279 | checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6" 280 | dependencies = [ 281 | "cfg-if", 282 | "libc", 283 | "wasi 0.11.0+wasi-snapshot-preview1", 284 | ] 285 | 286 | [[package]] 287 | name = "hashbrown" 288 | version = "0.12.2" 289 | source = "registry+https://github.com/rust-lang/crates.io-index" 290 | checksum = "607c8a29735385251a339424dd462993c0fed8fa09d378f259377df08c126022" 291 | 292 | [[package]] 293 | name = "hashbrown" 294 | version = "0.15.3" 295 | source = "registry+https://github.com/rust-lang/crates.io-index" 296 | checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" 297 | dependencies = [ 298 | "foldhash", 299 | ] 300 | 301 | [[package]] 302 | name = "hashlink" 303 | version = "0.10.0" 304 | source = "registry+https://github.com/rust-lang/crates.io-index" 305 | checksum = "7382cf6263419f2d8df38c55d7da83da5c18aef87fc7a7fc1fb1e344edfe14c1" 306 | dependencies = [ 307 | "hashbrown 0.15.3", 308 | ] 309 | 310 | [[package]] 311 | name = "heck" 312 | version = "0.3.3" 313 | source = "registry+https://github.com/rust-lang/crates.io-index" 314 | checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" 315 | dependencies = [ 316 | "unicode-segmentation", 317 | ] 318 | 319 | [[package]] 320 | name = "heck" 321 | version = "0.4.0" 322 | source = "registry+https://github.com/rust-lang/crates.io-index" 323 | checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" 324 | 325 | [[package]] 326 | name = "hermit-abi" 327 | version = "0.1.19" 328 | source = "registry+https://github.com/rust-lang/crates.io-index" 329 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" 330 | dependencies = [ 331 | "libc", 332 | ] 333 | 334 | [[package]] 335 | name = "hex" 336 | version = "0.4.3" 337 | source = "registry+https://github.com/rust-lang/crates.io-index" 338 | checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" 339 | 340 | [[package]] 341 | name = "humantime" 342 | version = "2.1.0" 343 | source = "registry+https://github.com/rust-lang/crates.io-index" 344 | checksum = 
"9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" 345 | 346 | [[package]] 347 | name = "indexmap" 348 | version = "1.9.1" 349 | source = "registry+https://github.com/rust-lang/crates.io-index" 350 | checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" 351 | dependencies = [ 352 | "autocfg", 353 | "hashbrown 0.12.2", 354 | ] 355 | 356 | [[package]] 357 | name = "io-lifetimes" 358 | version = "0.7.2" 359 | source = "registry+https://github.com/rust-lang/crates.io-index" 360 | checksum = "24c3f4eff5495aee4c0399d7b6a0dc2b6e81be84242ffbfcf253ebacccc1d0cb" 361 | 362 | [[package]] 363 | name = "itoa" 364 | version = "0.4.8" 365 | source = "registry+https://github.com/rust-lang/crates.io-index" 366 | checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" 367 | 368 | [[package]] 369 | name = "itoa" 370 | version = "1.0.2" 371 | source = "registry+https://github.com/rust-lang/crates.io-index" 372 | checksum = "112c678d4050afce233f4f2852bb2eb519230b3cf12f33585275537d7e41578d" 373 | 374 | [[package]] 375 | name = "jobserver" 376 | version = "0.1.32" 377 | source = "registry+https://github.com/rust-lang/crates.io-index" 378 | checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" 379 | dependencies = [ 380 | "libc", 381 | ] 382 | 383 | [[package]] 384 | name = "lazy_static" 385 | version = "1.4.0" 386 | source = "registry+https://github.com/rust-lang/crates.io-index" 387 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 388 | 389 | [[package]] 390 | name = "libc" 391 | version = "0.2.126" 392 | source = "registry+https://github.com/rust-lang/crates.io-index" 393 | checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836" 394 | 395 | [[package]] 396 | name = "libsqlite3-sys" 397 | version = "0.33.0" 398 | source = "registry+https://github.com/rust-lang/crates.io-index" 399 | checksum = "947e6816f7825b2b45027c2c32e7085da9934defa535de4a6a46b10a4d5257fa" 400 | dependencies = [ 401 | "cc", 402 | "pkg-config", 403 | "prettyplease", 404 | "quote", 405 | "syn 2.0.101", 406 | "vcpkg", 407 | ] 408 | 409 | [[package]] 410 | name = "linux-raw-sys" 411 | version = "0.0.46" 412 | source = "registry+https://github.com/rust-lang/crates.io-index" 413 | checksum = "d4d2456c373231a208ad294c33dc5bff30051eafd954cd4caae83a712b12854d" 414 | 415 | [[package]] 416 | name = "log" 417 | version = "0.4.17" 418 | source = "registry+https://github.com/rust-lang/crates.io-index" 419 | checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" 420 | dependencies = [ 421 | "cfg-if", 422 | ] 423 | 424 | [[package]] 425 | name = "lru_time_cache" 426 | version = "0.11.11" 427 | source = "registry+https://github.com/rust-lang/crates.io-index" 428 | checksum = "9106e1d747ffd48e6be5bb2d97fa706ed25b144fbee4d5c02eae110cd8d6badd" 429 | 430 | [[package]] 431 | name = "memchr" 432 | version = "2.5.0" 433 | source = "registry+https://github.com/rust-lang/crates.io-index" 434 | checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" 435 | 436 | [[package]] 437 | name = "miniz_oxide" 438 | version = "0.5.3" 439 | source = "registry+https://github.com/rust-lang/crates.io-index" 440 | checksum = "6f5c75688da582b8ffc1f1799e9db273f32133c49e048f614d22ec3256773ccc" 441 | dependencies = [ 442 | "adler", 443 | ] 444 | 445 | [[package]] 446 | name = "names" 447 | version = "0.14.0" 448 | source = "registry+https://github.com/rust-lang/crates.io-index" 449 | checksum = 
"7bddcd3bf5144b6392de80e04c347cd7fab2508f6df16a85fc496ecd5cec39bc" 450 | dependencies = [ 451 | "clap 3.2.12", 452 | "rand", 453 | ] 454 | 455 | [[package]] 456 | name = "num-integer" 457 | version = "0.1.45" 458 | source = "registry+https://github.com/rust-lang/crates.io-index" 459 | checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" 460 | dependencies = [ 461 | "autocfg", 462 | "num-traits", 463 | ] 464 | 465 | [[package]] 466 | name = "num-traits" 467 | version = "0.2.15" 468 | source = "registry+https://github.com/rust-lang/crates.io-index" 469 | checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" 470 | dependencies = [ 471 | "autocfg", 472 | ] 473 | 474 | [[package]] 475 | name = "once_cell" 476 | version = "1.13.0" 477 | source = "registry+https://github.com/rust-lang/crates.io-index" 478 | checksum = "18a6dbe30758c9f83eb00cbea4ac95966305f5a7772f3f42ebfc7fc7eddbd8e1" 479 | 480 | [[package]] 481 | name = "os_str_bytes" 482 | version = "6.2.0" 483 | source = "registry+https://github.com/rust-lang/crates.io-index" 484 | checksum = "648001efe5d5c0102d8cea768e348da85d90af8ba91f0bea908f157951493cd4" 485 | 486 | [[package]] 487 | name = "output_vt100" 488 | version = "0.1.3" 489 | source = "registry+https://github.com/rust-lang/crates.io-index" 490 | checksum = "628223faebab4e3e40667ee0b2336d34a5b960ff60ea743ddfdbcf7770bcfb66" 491 | dependencies = [ 492 | "winapi", 493 | ] 494 | 495 | [[package]] 496 | name = "owning_ref" 497 | version = "0.4.1" 498 | source = "registry+https://github.com/rust-lang/crates.io-index" 499 | checksum = "6ff55baddef9e4ad00f88b6c743a2a8062d4c6ade126c2a528644b8e444d52ce" 500 | dependencies = [ 501 | "stable_deref_trait", 502 | ] 503 | 504 | [[package]] 505 | name = "pkg-config" 506 | version = "0.3.25" 507 | source = "registry+https://github.com/rust-lang/crates.io-index" 508 | checksum = "1df8c4ec4b0627e53bdf214615ad287367e482558cf84b109250b37464dc03ae" 509 | 510 | [[package]] 511 | name = "ppv-lite86" 512 | version = "0.2.16" 513 | source = "registry+https://github.com/rust-lang/crates.io-index" 514 | checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" 515 | 516 | [[package]] 517 | name = "pretty_assertions" 518 | version = "1.2.1" 519 | source = "registry+https://github.com/rust-lang/crates.io-index" 520 | checksum = "c89f989ac94207d048d92db058e4f6ec7342b0971fc58d1271ca148b799b3563" 521 | dependencies = [ 522 | "ansi_term", 523 | "ctor", 524 | "diff", 525 | "output_vt100", 526 | ] 527 | 528 | [[package]] 529 | name = "prettyplease" 530 | version = "0.2.32" 531 | source = "registry+https://github.com/rust-lang/crates.io-index" 532 | checksum = "664ec5419c51e34154eec046ebcba56312d5a2fc3b09a06da188e1ad21afadf6" 533 | dependencies = [ 534 | "proc-macro2", 535 | "syn 2.0.101", 536 | ] 537 | 538 | [[package]] 539 | name = "proc-macro-error" 540 | version = "1.0.4" 541 | source = "registry+https://github.com/rust-lang/crates.io-index" 542 | checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" 543 | dependencies = [ 544 | "proc-macro-error-attr", 545 | "proc-macro2", 546 | "quote", 547 | "syn 1.0.98", 548 | "version_check", 549 | ] 550 | 551 | [[package]] 552 | name = "proc-macro-error-attr" 553 | version = "1.0.4" 554 | source = "registry+https://github.com/rust-lang/crates.io-index" 555 | checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" 556 | dependencies = [ 557 | "proc-macro2", 558 | "quote", 559 | "version_check", 560 | ] 561 | 562 | 
[[package]] 563 | name = "proc-macro2" 564 | version = "1.0.95" 565 | source = "registry+https://github.com/rust-lang/crates.io-index" 566 | checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" 567 | dependencies = [ 568 | "unicode-ident", 569 | ] 570 | 571 | [[package]] 572 | name = "procfs" 573 | version = "0.13.2" 574 | source = "registry+https://github.com/rust-lang/crates.io-index" 575 | checksum = "979e5cb47caafb8e14653bb083358e19917ca8c9c4c2648932eccd935f5c4d80" 576 | dependencies = [ 577 | "bitflags 1.3.2", 578 | "byteorder", 579 | "chrono", 580 | "flate2", 581 | "hex", 582 | "lazy_static", 583 | "rustix", 584 | ] 585 | 586 | [[package]] 587 | name = "quote" 588 | version = "1.0.40" 589 | source = "registry+https://github.com/rust-lang/crates.io-index" 590 | checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" 591 | dependencies = [ 592 | "proc-macro2", 593 | ] 594 | 595 | [[package]] 596 | name = "rand" 597 | version = "0.8.5" 598 | source = "registry+https://github.com/rust-lang/crates.io-index" 599 | checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" 600 | dependencies = [ 601 | "libc", 602 | "rand_chacha", 603 | "rand_core", 604 | ] 605 | 606 | [[package]] 607 | name = "rand_chacha" 608 | version = "0.3.1" 609 | source = "registry+https://github.com/rust-lang/crates.io-index" 610 | checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" 611 | dependencies = [ 612 | "ppv-lite86", 613 | "rand_core", 614 | ] 615 | 616 | [[package]] 617 | name = "rand_core" 618 | version = "0.6.3" 619 | source = "registry+https://github.com/rust-lang/crates.io-index" 620 | checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" 621 | dependencies = [ 622 | "getrandom", 623 | ] 624 | 625 | [[package]] 626 | name = "regex" 627 | version = "1.6.0" 628 | source = "registry+https://github.com/rust-lang/crates.io-index" 629 | checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b" 630 | dependencies = [ 631 | "aho-corasick", 632 | "memchr", 633 | "regex-syntax", 634 | ] 635 | 636 | [[package]] 637 | name = "regex-automata" 638 | version = "0.1.10" 639 | source = "registry+https://github.com/rust-lang/crates.io-index" 640 | checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" 641 | 642 | [[package]] 643 | name = "regex-syntax" 644 | version = "0.6.27" 645 | source = "registry+https://github.com/rust-lang/crates.io-index" 646 | checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" 647 | 648 | [[package]] 649 | name = "rusqlite" 650 | version = "0.35.0" 651 | source = "registry+https://github.com/rust-lang/crates.io-index" 652 | checksum = "a22715a5d6deef63c637207afbe68d0c72c3f8d0022d7cf9714c442d6157606b" 653 | dependencies = [ 654 | "bitflags 2.9.0", 655 | "fallible-iterator", 656 | "fallible-streaming-iterator", 657 | "hashlink", 658 | "libsqlite3-sys", 659 | "smallvec", 660 | ] 661 | 662 | [[package]] 663 | name = "rustix" 664 | version = "0.35.7" 665 | source = "registry+https://github.com/rust-lang/crates.io-index" 666 | checksum = "d51cc38aa10f6bbb377ed28197aa052aa4e2b762c22be9d3153d01822587e787" 667 | dependencies = [ 668 | "bitflags 1.3.2", 669 | "errno", 670 | "io-lifetimes", 671 | "libc", 672 | "linux-raw-sys", 673 | "windows-sys", 674 | ] 675 | 676 | [[package]] 677 | name = "ryu" 678 | version = "1.0.10" 679 | source = "registry+https://github.com/rust-lang/crates.io-index" 680 | checksum = 
"f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695" 681 | 682 | [[package]] 683 | name = "serde" 684 | version = "1.0.139" 685 | source = "registry+https://github.com/rust-lang/crates.io-index" 686 | checksum = "0171ebb889e45aa68b44aee0859b3eede84c6f5f5c228e6f140c0b2a0a46cad6" 687 | dependencies = [ 688 | "serde_derive", 689 | ] 690 | 691 | [[package]] 692 | name = "serde_derive" 693 | version = "1.0.139" 694 | source = "registry+https://github.com/rust-lang/crates.io-index" 695 | checksum = "dc1d3230c1de7932af58ad8ffbe1d784bd55efd5a9d84ac24f69c72d83543dfb" 696 | dependencies = [ 697 | "proc-macro2", 698 | "quote", 699 | "syn 1.0.98", 700 | ] 701 | 702 | [[package]] 703 | name = "serde_json" 704 | version = "1.0.82" 705 | source = "registry+https://github.com/rust-lang/crates.io-index" 706 | checksum = "82c2c1fdcd807d1098552c5b9a36e425e42e9fbd7c6a37a8425f390f781f7fa7" 707 | dependencies = [ 708 | "itoa 1.0.2", 709 | "ryu", 710 | "serde", 711 | ] 712 | 713 | [[package]] 714 | name = "shlex" 715 | version = "1.3.0" 716 | source = "registry+https://github.com/rust-lang/crates.io-index" 717 | checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" 718 | 719 | [[package]] 720 | name = "smallvec" 721 | version = "1.9.0" 722 | source = "registry+https://github.com/rust-lang/crates.io-index" 723 | checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" 724 | 725 | [[package]] 726 | name = "sqlite-zstd" 727 | version = "0.3.5" 728 | dependencies = [ 729 | "anyhow", 730 | "chrono", 731 | "csv", 732 | "env_logger", 733 | "lazy_static", 734 | "log", 735 | "lru_time_cache", 736 | "names", 737 | "owning_ref", 738 | "pretty_assertions", 739 | "procfs", 740 | "rand", 741 | "rusqlite", 742 | "serde", 743 | "serde_json", 744 | "structopt", 745 | "zstd", 746 | ] 747 | 748 | [[package]] 749 | name = "stable_deref_trait" 750 | version = "1.2.0" 751 | source = "registry+https://github.com/rust-lang/crates.io-index" 752 | checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" 753 | 754 | [[package]] 755 | name = "strsim" 756 | version = "0.8.0" 757 | source = "registry+https://github.com/rust-lang/crates.io-index" 758 | checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" 759 | 760 | [[package]] 761 | name = "strsim" 762 | version = "0.10.0" 763 | source = "registry+https://github.com/rust-lang/crates.io-index" 764 | checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" 765 | 766 | [[package]] 767 | name = "structopt" 768 | version = "0.3.26" 769 | source = "registry+https://github.com/rust-lang/crates.io-index" 770 | checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" 771 | dependencies = [ 772 | "clap 2.34.0", 773 | "lazy_static", 774 | "structopt-derive", 775 | ] 776 | 777 | [[package]] 778 | name = "structopt-derive" 779 | version = "0.4.18" 780 | source = "registry+https://github.com/rust-lang/crates.io-index" 781 | checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" 782 | dependencies = [ 783 | "heck 0.3.3", 784 | "proc-macro-error", 785 | "proc-macro2", 786 | "quote", 787 | "syn 1.0.98", 788 | ] 789 | 790 | [[package]] 791 | name = "syn" 792 | version = "1.0.98" 793 | source = "registry+https://github.com/rust-lang/crates.io-index" 794 | checksum = "c50aef8a904de4c23c788f104b7dddc7d6f79c647c7c8ce4cc8f73eb0ca773dd" 795 | dependencies = [ 796 | "proc-macro2", 797 | "quote", 798 | "unicode-ident", 799 | ] 800 | 801 | [[package]] 
802 | name = "syn" 803 | version = "2.0.101" 804 | source = "registry+https://github.com/rust-lang/crates.io-index" 805 | checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" 806 | dependencies = [ 807 | "proc-macro2", 808 | "quote", 809 | "unicode-ident", 810 | ] 811 | 812 | [[package]] 813 | name = "termcolor" 814 | version = "1.1.3" 815 | source = "registry+https://github.com/rust-lang/crates.io-index" 816 | checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" 817 | dependencies = [ 818 | "winapi-util", 819 | ] 820 | 821 | [[package]] 822 | name = "textwrap" 823 | version = "0.11.0" 824 | source = "registry+https://github.com/rust-lang/crates.io-index" 825 | checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" 826 | dependencies = [ 827 | "unicode-width", 828 | ] 829 | 830 | [[package]] 831 | name = "textwrap" 832 | version = "0.15.0" 833 | source = "registry+https://github.com/rust-lang/crates.io-index" 834 | checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb" 835 | 836 | [[package]] 837 | name = "time" 838 | version = "0.1.44" 839 | source = "registry+https://github.com/rust-lang/crates.io-index" 840 | checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" 841 | dependencies = [ 842 | "libc", 843 | "wasi 0.10.0+wasi-snapshot-preview1", 844 | "winapi", 845 | ] 846 | 847 | [[package]] 848 | name = "unicode-ident" 849 | version = "1.0.2" 850 | source = "registry+https://github.com/rust-lang/crates.io-index" 851 | checksum = "15c61ba63f9235225a22310255a29b806b907c9b8c964bcbd0a2c70f3f2deea7" 852 | 853 | [[package]] 854 | name = "unicode-segmentation" 855 | version = "1.9.0" 856 | source = "registry+https://github.com/rust-lang/crates.io-index" 857 | checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99" 858 | 859 | [[package]] 860 | name = "unicode-width" 861 | version = "0.1.9" 862 | source = "registry+https://github.com/rust-lang/crates.io-index" 863 | checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" 864 | 865 | [[package]] 866 | name = "vcpkg" 867 | version = "0.2.15" 868 | source = "registry+https://github.com/rust-lang/crates.io-index" 869 | checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" 870 | 871 | [[package]] 872 | name = "vec_map" 873 | version = "0.8.2" 874 | source = "registry+https://github.com/rust-lang/crates.io-index" 875 | checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" 876 | 877 | [[package]] 878 | name = "version_check" 879 | version = "0.9.4" 880 | source = "registry+https://github.com/rust-lang/crates.io-index" 881 | checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" 882 | 883 | [[package]] 884 | name = "wasi" 885 | version = "0.10.0+wasi-snapshot-preview1" 886 | source = "registry+https://github.com/rust-lang/crates.io-index" 887 | checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" 888 | 889 | [[package]] 890 | name = "wasi" 891 | version = "0.11.0+wasi-snapshot-preview1" 892 | source = "registry+https://github.com/rust-lang/crates.io-index" 893 | checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" 894 | 895 | [[package]] 896 | name = "winapi" 897 | version = "0.3.9" 898 | source = "registry+https://github.com/rust-lang/crates.io-index" 899 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 900 | dependencies = [ 901 | 
"winapi-i686-pc-windows-gnu", 902 | "winapi-x86_64-pc-windows-gnu", 903 | ] 904 | 905 | [[package]] 906 | name = "winapi-i686-pc-windows-gnu" 907 | version = "0.4.0" 908 | source = "registry+https://github.com/rust-lang/crates.io-index" 909 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 910 | 911 | [[package]] 912 | name = "winapi-util" 913 | version = "0.1.5" 914 | source = "registry+https://github.com/rust-lang/crates.io-index" 915 | checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" 916 | dependencies = [ 917 | "winapi", 918 | ] 919 | 920 | [[package]] 921 | name = "winapi-x86_64-pc-windows-gnu" 922 | version = "0.4.0" 923 | source = "registry+https://github.com/rust-lang/crates.io-index" 924 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 925 | 926 | [[package]] 927 | name = "windows-sys" 928 | version = "0.36.1" 929 | source = "registry+https://github.com/rust-lang/crates.io-index" 930 | checksum = "ea04155a16a59f9eab786fe12a4a450e75cdb175f9e0d80da1e17db09f55b8d2" 931 | dependencies = [ 932 | "windows_aarch64_msvc", 933 | "windows_i686_gnu", 934 | "windows_i686_msvc", 935 | "windows_x86_64_gnu", 936 | "windows_x86_64_msvc", 937 | ] 938 | 939 | [[package]] 940 | name = "windows_aarch64_msvc" 941 | version = "0.36.1" 942 | source = "registry+https://github.com/rust-lang/crates.io-index" 943 | checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47" 944 | 945 | [[package]] 946 | name = "windows_i686_gnu" 947 | version = "0.36.1" 948 | source = "registry+https://github.com/rust-lang/crates.io-index" 949 | checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6" 950 | 951 | [[package]] 952 | name = "windows_i686_msvc" 953 | version = "0.36.1" 954 | source = "registry+https://github.com/rust-lang/crates.io-index" 955 | checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024" 956 | 957 | [[package]] 958 | name = "windows_x86_64_gnu" 959 | version = "0.36.1" 960 | source = "registry+https://github.com/rust-lang/crates.io-index" 961 | checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1" 962 | 963 | [[package]] 964 | name = "windows_x86_64_msvc" 965 | version = "0.36.1" 966 | source = "registry+https://github.com/rust-lang/crates.io-index" 967 | checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680" 968 | 969 | [[package]] 970 | name = "zstd" 971 | version = "0.11.2+zstd.1.5.2" 972 | source = "registry+https://github.com/rust-lang/crates.io-index" 973 | checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" 974 | dependencies = [ 975 | "zstd-safe", 976 | ] 977 | 978 | [[package]] 979 | name = "zstd-safe" 980 | version = "5.0.2+zstd.1.5.2" 981 | source = "registry+https://github.com/rust-lang/crates.io-index" 982 | checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" 983 | dependencies = [ 984 | "libc", 985 | "zstd-sys", 986 | ] 987 | 988 | [[package]] 989 | name = "zstd-sys" 990 | version = "2.0.1+zstd.1.5.2" 991 | source = "registry+https://github.com/rust-lang/crates.io-index" 992 | checksum = "9fd07cbbc53846d9145dbffdf6dd09a7a0aa52be46741825f5c97bdd4f73f12b" 993 | dependencies = [ 994 | "cc", 995 | "libc", 996 | ] 997 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["phiresky "] 3 | 
description = "Extension for sqlite that provides transparent dictionary-based row-level compression for sqlite" 4 | edition = "2024" 5 | license = "LGPL-2.0-or-later" 6 | name = "sqlite-zstd" 7 | repository = "https://github.com/phiresky/sqlite-zstd" 8 | version = "0.3.5" 9 | readme = "README.md" 10 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 11 | 12 | [features] 13 | default = [] 14 | 15 | build_extension = ["rusqlite/loadable_extension"] 16 | 17 | # debug_zstd = ["zstd/debug"] 18 | 19 | benchmark = ["structopt", "procfs", "rusqlite/backup", "rusqlite/load_extension", "csv"] 20 | 21 | all = ["benchmark"] 22 | 23 | [[bin]] 24 | name = "benchmark" 25 | required-features = ["benchmark"] 26 | 27 | [[bin]] 28 | name = "create_test_db" 29 | required-features = ["benchmark"] 30 | 31 | [lib] 32 | crate-type = ["cdylib"] 33 | 34 | [dependencies] 35 | zstd = {version = "0.11.2", features = ["experimental"]} 36 | #zstd = {version = "0.5.3", path="../zstd-rs"} 37 | #zstd = {version = "=0.5.4"} 38 | anyhow = "1.0.44" 39 | serde = {version = "1.0.130", features = ["derive"]} 40 | serde_json = "1.0.68" 41 | 42 | csv = {version = "1.1.6", optional = true} 43 | env_logger = "0.9.0" 44 | lazy_static = "1.4.0" 45 | log = "0.4.14" 46 | lru_time_cache = "0.11.11" 47 | owning_ref = "0.4.1" 48 | procfs = {version = "0.13.2", optional = true} 49 | rand = "0.8.4" 50 | structopt = {version = "0.3.23", optional = true} 51 | 52 | [dependencies.rusqlite] 53 | features = ["functions", "blob", "bundled", "array"] 54 | package = "rusqlite" 55 | version = "0.35.0" 56 | 57 | [dev-dependencies] 58 | chrono = "0.4.19" 59 | names = "0.14.0" 60 | pretty_assertions = "1.2.1" 61 | 62 | [profile.release] 63 | lto = "fat" 64 | 65 | # cargo-deb configuration 66 | # https://github.com/kornelski/cargo-deb 67 | [package.metadata.deb] 68 | # Debianized package name, conveniently matches the name of the shared library file 69 | name = "libsqlite-zstd" 70 | # $auto fills in the automatically calculated dependencies (namely libc) 71 | # libsqlite3-0 is added because this library isn't very useful without SQLite 72 | depends = "$auto, libsqlite3-0" 73 | # This feature is required to build the shared library extension 74 | features = ["build_extension"] 75 | assets = [ 76 | # Install the shared library extension to /usr/lib, where SQLite can find it 77 | ["target/release/libsqlite_zstd.so", "usr/lib/", "744"], 78 | # It's good practice to install the README file into /usr/share/doc for every package 79 | ["README.md", "usr/share/doc/libsqlite-zstd/README", "644"], 80 | ] 81 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 
18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 
90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. 
If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 | 
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # sqlite-zstd
2 | 
3 | Extension for sqlite that provides transparent dictionary-based row-level compression. This basically allows you to compress entries in a sqlite database almost as well as if you were compressing the whole DB file, while still retaining random access.
4 | 
5 | See also the announcement blog post for some motivation, benchmarks and ramblings: https://phiresky.github.io/blog/2022/sqlite-zstd
6 | 
7 | [![size comparison chart](doc/2022-07-31-19-27-57.png)](https://phiresky.github.io/blog/2022/sqlite-zstd)
8 | 
9 | Depending on the data, this can reduce the size of the database by 80% while keeping performance mostly the same (or even improving it, since the data to be read from disk is smaller).
10 | 
11 | Note that a compression VFS such as https://github.com/mlin/sqlite_zstd_vfs might be better suited depending on the use case. It has very different tradeoffs and capabilities, but the end result is similar.
12 | 
13 | ## Transparent Compression
14 | 
15 | - `zstd_enable_transparent(config)`
16 | 
17 |   Enable transparent row-level compression of the given column on the given table.
18 | 
19 |   You can call this function several times on the same table with different columns to compress.
20 | 
21 |   ```sql
22 |   SELECT
23 |     zstd_enable_transparent('{"table": "objects", "column": "data1", "compression_level": 19, "dict_chooser": "''a''"}'),
24 |     zstd_enable_transparent('{"table": "objects", "column": "data2", "compression_level": 19, "dict_chooser": "''a''"}')
25 | 
26 |   ```
27 | 
28 |   The data will be moved to `_table_name_zstd`, while `table_name` will be a view that can be queried as normal, including SELECT, INSERT, UPDATE, and DELETE queries. This function will not compress any data by itself; you need to call `zstd_incremental_maintenance` afterwards.
29 | 
30 |   `config` is a JSON object describing the configuration. See [TransparentCompressConfig](src/transparent.rs#L34) for details.
31 | 
32 |   The following differences apply when compression is active:
33 | 
34 |   - The compressed column may only contain `blob` or `text` data, depending on the affinity of the declared data type (e.g. `VARCHAR(10)` is fine, but `int` is not).
35 |   - The primary key must not be null for any row, otherwise updates may not work as expected.
36 |   - sqlite3_changes() will return 0 for modifying queries ([see here](https://sqlite.org/c3ref/changes.html)).
37 |   - The SQLite streaming blob reading API will be somewhat useless since the blob is fully copied into memory anyway.
38 |   - Attaching a database containing compressed tables using `ATTACH 'foo.db'` is not supported.
40 |
41 | - `zstd_incremental_maintenance(duration_seconds: float | null, db_load: float) -> bool`
42 |
43 | Perform an incremental maintenance operation taking around the given amount of time.
44 | This will train dictionaries and compress data based on the grouping given in the TransparentCompressConfig.
45 |
46 | **In order for the size of your database file to actually shrink, you also have to call `VACUUM`**. Otherwise SQLite just marks pages as free (and reuses them for new data).
47 |
48 | `duration_seconds`: If the given amount of time is 0, do a single step and exit as soon as possible. If it is null, run until all pending maintenance is complete.
49 |
50 | `db_load`: specifies the ratio of time the db will be locked with write queries. For example: if set to 0.5, after each write operation taking 2 seconds the maintenance function will sleep for 2 seconds so other processes have time to run write operations against the database. If set to 1, the maintenance will not sleep. Note that this is only useful if you run the incremental maintenance function in a separate thread or process from your other logic. Note that both the duration and the db load are best-effort: there is no exact guarantee about the amount of time the database will stay locked at a time.
51 |
52 | _Returns_ 1 if there is more work to be done, 0 if everything is compressed as it should be.
53 |
54 | Note that each call of this function has a start-up cost equivalent to `select * from table where dictid is null`, so longer durations are more efficient.
55 |
56 | This function can safely be interrupted at any time; each chunk of compression work is done as an atomic operation.
57 |
58 | Examples:
59 |
60 | - `zstd_incremental_maintenance(null, 1)`: Compresses everything, as fast as possible. Useful if the db is not currently in use.
61 | - `zstd_incremental_maintenance(60, 0.5)`: Spend 60 seconds compressing pending stuff, while allowing other queries to run 50% of the time.
62 |
63 | Example output:
64 |
65 | ```
66 | sqlite> select zstd_incremental_maintenance(null, 1);
67 | [2020-12-23T21:11:31Z WARN sqlite_zstd::transparent] Warning: It is recommended to set `pragma busy_timeout=2000;` or higher
68 | [2020-12-23T21:11:40Z INFO sqlite_zstd::transparent] events.data: Total 5.20GB to potentially compress.
69 | [2020-12-23T21:13:22Z INFO sqlite_zstd::transparent] Compressed 6730 rows with dictid=109. Total size of entries before: 163.77MB, afterwards: 2.12MB, (average: before=24.33kB, after=315B)
70 | [2020-12-23T21:13:43Z INFO sqlite_zstd::transparent] Compressed 4505 rows with dictid=110. Total size of entries before: 69.28MB, afterwards: 1.60MB, (average: before=15.38kB, after=355B)
71 | [2020-12-23T21:14:06Z INFO sqlite_zstd::transparent] Compressed 5228 rows with dictid=111. Total size of entries before: 91.97MB, afterwards: 1.41MB, (average: before=17.59kB, after=268B)
72 | ```
73 |
74 | - `zstd_train_dict_and_save(agg, dict_size: int, sample_count: int, dict_chooser_key: text) -> int`
75 |
76 | This function is used internally by zstd_incremental_maintenance. Same as `zstd_train_dict`, but the dictionary is saved to the `_zstd_dicts` table and its id is returned. The dict_chooser_key is used to identify the dictionary during compression, while during decompression only the integer primary key is used (that way, changing the dict chooser expression is safe).
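Putting the pieces together, a typical full compression pass (table and column names are the illustrative ones from above) looks like this:

```sql
SELECT zstd_enable_transparent('{"table": "objects", "column": "data1", "compression_level": 19, "dict_chooser": "''a''"}');
-- train dictionaries and compress all pending rows, then reclaim the freed pages
SELECT zstd_incremental_maintenance(null, 1);
VACUUM;
```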
77 |
78 | ## Basic Functionality
79 |
80 | - `zstd_compress(data: text|blob, level: int = 3, dictionary: blob | int | null = null, compact: bool = false) -> blob`
81 |
82 | Compresses the given data with the given compression level (1 - 22, default 3).
83 |
84 | - If dictionary is a blob, it will be used directly.
85 | - If dictionary is an int i, it is functionally equivalent to `zstd_compress(data, level, (select dict from _zstd_dicts where id = i))`.
86 | - If dictionary is not present, null, or -1, the data is compressed without a dictionary.
87 |
88 | If compact is true, the output will be without magic header, without checksums, and without dictids. This will save 4 bytes when not using dictionaries and 8 bytes when using dictionaries. It also means the data will not be decodable as a normal zstd archive with the standard tools.
89 | The same compact argument must also be passed to the decompress function.
90 |
91 | - `zstd_decompress(data: blob, is_text: bool, dictionary: blob | int | null = null, compact: bool = false) -> text|blob`
92 |
93 | Decompresses the given data. If the dictionary is wrong, the result is undefined.
94 |
95 | - If dictionary is a blob, it will be used directly.
96 | - If dictionary is an int i, it is functionally equivalent to `zstd_decompress(data, is_text, (select dict from _zstd_dicts where id = i))`.
97 | - If dictionary is not present, null, or -1, it is assumed the data was compressed without a dictionary.
98 |
99 | Note that passing the dictionary as an int is recommended, since then the dictionary only has to be prepared once.
100 |
101 | is_text specifies whether to output the data as text or as a blob. Note that when outputting as text the encoding depends on the sqlite database encoding. sqlite-zstd is only tested with UTF-8.
102 |
103 | compact must be specified when the compress function was also called with compact.
104 |
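As a quick sanity check, the two functions round-trip (the literal value here is just an example):

```sql
SELECT zstd_decompress(zstd_compress('hello world', 19), true); -- returns 'hello world'
```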
105 | - `zstd_train_dict(agg, dict_size: int, sample_count: int) -> blob`
106 |
107 | Aggregate function (like sum() or count()) to train a zstd dictionary on sample_count randomly selected samples of the given aggregate data.
108 |
109 | Example use: `select zstd_train_dict(tbl.data, 100000, 1000) from tbl` will return a dictionary of size 100kB trained on 1000 random samples from `tbl`.
110 |
111 | The recommended number of samples is 100x the target dictionary size. As an example, you can train a dict of 100kB with the "optimal" sample count as follows:
112 |
113 | ```sql
114 | select zstd_train_dict(data, 100000, (select (100000 * 100 / avg(length(data))) as sample_count from tbl))
115 | as dict from tbl
116 | ```
117 |
118 | Note that dict_size and sample_count are assumed to be constants.
119 |
120 | # Compiling
121 |
122 | This project can be built in two modes: (a) as a Rust library and (b) as a pure SQLite extension (with `--features build_extension`).
123 |
124 | You can get the SQLite extension binaries from the GitHub releases. Alternatively, you can build the extension by hand:
125 |
126 | ```
127 | cargo build --release --features build_extension
128 | # should give you target/release/libsqlite_zstd.so
129 | ```
130 |
131 | ## Cross Compiling
132 |
133 | For cross-compiling to `aarch64-linux-android`, you need to
134 | 1. Add the target we need to cross-compile for:
135 | ```bash
136 | rustup target add aarch64-linux-android
137 | ```
138 |
139 | 2. Prepare the [Android NDK](https://developer.android.com/ndk) (binutils is deprecated and was removed in NDK 23+, so you need to download an older version of the NDK)
140 |
141 | 3. Set up the NDK binary path:
142 | ```bash
143 | export PATH="$PATH:/toolchains/llvm/prebuilt/linux-x86_64/bin"
144 | ```
145 |
146 | 4. Specify the linker in the [cargo configuration file](https://doc.rust-lang.org/cargo/reference/config.html):
147 | ```toml
148 | [target.aarch64-linux-android]
149 | linker = "aarch64-linux-android23-clang"
150 | ```
151 |
152 | 5. Specify the `target` accordingly when building:
153 | ```bash
154 | cargo build -r --features build_extension --target aarch64-linux-android
155 | ```
156 |
157 | ## As a Python "extension"
158 |
159 | If you want to use this project as an SQLite extension inside a Python project,
160 | you can install it as a Python package (you still need to have a Rust compiler
161 | to actually build the binary):
162 |
163 | ```bash
164 | pip install 'git+https://github.com/phiresky/sqlite-zstd.git#egg=sqlite_zstd&subdirectory=python'
165 | ```
166 |
167 | This installs the extension as a Python package, with some support code to make
168 | it easy to use from Python code or [Datasette](https://datasette.io/).
169 |
170 | # Usage
171 |
172 | You can load this library either as an SQLite extension or as a Rust library. Note that SQLite extensions are not persistent, so you need to load the extension each time you connect to the database.
173 |
174 | **Is this library production ready?**
175 |
176 | I wouldn't trust it with my data (yet). Make sure you have backups of everything. I'm also not making any guarantees for backwards compatibility of future updates, though migrating by copying over the uncompressed data should of course work fine.
177 |
178 | **SQLite CLI**
179 |
180 | Either load it in the REPL:
181 |
182 | ```sh
183 | $ sqlite3 file.db
184 | SQLite version 3.34.0 2020-12-01 16:14:00
185 | sqlite> .load .../libsqlite_zstd.so
186 | [2020-12-23T21:30:02Z INFO sqlite_zstd::create_extension] [sqlite-zstd] initialized
187 | sqlite>
188 | ```
189 |
190 | Or alternatively:
191 |
192 | `sqlite3 -cmd '.load libsqlite_zstd.so' 'select * from foo'`
193 |
194 | **C API**
195 |
196 | ```c
197 | int success = sqlite3_load_extension(db, "libsqlite_zstd.so", NULL, NULL);
198 | ```
199 |
200 | See [here](https://www.sqlite.org/loadext.html) for more information.
201 |
202 | **Rust**
203 |
204 | The recommended method is to add `sqlite_zstd` as a dependency to your project, then load it using
205 |
206 | ```rust
207 | let conn: rusqlite::Connection;
208 | sqlite_zstd::load(&conn)?;
209 | ```
210 |
211 | Alternatively, you can load the extension like any other extension:
212 |
213 | ```rust
214 | let conn: rusqlite::Connection;
215 | conn.load_extension("libsqlite_zstd.so", None)?;
216 | ```
217 |
218 | See [here](https://docs.rs/rusqlite/0.24.2/rusqlite/struct.Connection.html#method.load_extension) for more information.
219 |
220 | **Python**
221 |
222 | If you have installed this as a Python module as described above, you can load
223 | the extension into an existing SQLite connection like this:
224 |
225 | ```python
226 | import sqlite3
227 | import sqlite_zstd
228 |
229 | conn = sqlite3.connect(':memory:')
230 | sqlite_zstd.load(conn)
231 | ```
232 |
233 | When using Datasette, this extension is loaded automatically into every
234 | connection.
235 |
236 | # Verbosity / Debugging
237 |
238 | You can change the log level by setting the environment variable `SQLITE_ZSTD_LOG=error` for less logging and `SQLITE_ZSTD_LOG=debug` for more logging.
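When loading from Rust, the default log level can also be set programmatically; `load_with_loglevel` is exported alongside `load`, with `LogLevel` re-exporting `log::LevelFilter` (same sketch style as the Rust snippets above):

```rust
let conn: rusqlite::Connection;
sqlite_zstd::load_with_loglevel(&conn, sqlite_zstd::LogLevel::Debug)?;
```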
239 |
240 | # Future Work / Ideas / Todo
241 |
242 | - investigate startup cost without dictionary
243 | - correctly handle indices over compressed columns (try generated columns instead of views, maybe vtables, ask the sqlite devs)
244 | - do compression in different thread(s) for performance (e.g. using .multithread(1) in zstd?)
245 | - type affinity interferes with int pass through - `insert into compressed (col) values (1)` will result in typeof(col) = text instead of integer if the type of the column was declared as text - which in turn causes decompression to fail with "got string, but zstd compressed data is always blob"
246 | - either change the type of the compressed column to blob or similar or disallow integer passthrough
247 |
-------------------------------------------------------------------------------- /clippy.toml: --------------------------------------------------------------------------------
1 | allow-print-in-tests = true
2 |
-------------------------------------------------------------------------------- /doc/2022-07-31-19-27-57.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/sqlite-zstd/ebc5d418dd2057099c9d0d36df155e8637a04748/doc/2022-07-31-19-27-57.png -------------------------------------------------------------------------------- /doc/sqlitebrowser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/sqlite-zstd/ebc5d418dd2057099c9d0d36df155e8637a04748/doc/sqlitebrowser.png -------------------------------------------------------------------------------- /python/README.md: --------------------------------------------------------------------------------
1 | # sqlite-zstd
2 | An SQLite extension that provides transparent dictionary-based row-level compression. This basically allows you to compress entries in an SQLite database almost as well as if you were compressing the whole DB file, while retaining random access.
3 |
4 | See also the announcement blog post for some motivation, benchmarks and ramblings: https://phiresky.github.io/blog/2022/sqlite-zstd
5 |
6 | Depending on the data, this can reduce the size of the database by 80% while keeping performance mostly the same (or even improving it, since the data to be read from disk is smaller).
7 |
8 | Note that a compression VFS such as https://github.com/mlin/sqlite_zstd_vfs might be better suited depending on the use case. That has very different tradeoffs and capabilities, but the end result is similar.
9 |
10 | ## Install
11 | ```bash
12 | pip install sqlite-zstd
13 | ```
14 |
15 | ## Usage
16 | ```python
17 | import sqlite3
18 | import sqlite_zstd
19 |
20 | conn = sqlite3.connect(':memory:')
21 | sqlite_zstd.load(conn)
22 | ```
23 |
-------------------------------------------------------------------------------- /python/lib/__init__.py: --------------------------------------------------------------------------------
1 | import sqlite3
2 | from importlib.resources import files, as_file
3 |
4 | def load(conn: sqlite3.Connection) -> None:
5 | lib = next(x for x in files(__name__).iterdir() if x.name.startswith('lib'))
6 | with as_file(lib) as ext:
7 | conn.load_extension(str(ext))
8 |
-------------------------------------------------------------------------------- /python/lib/datasette.py: --------------------------------------------------------------------------------
1 | from datasette import hookimpl
2 | from .
import load 3 | 4 | 5 | @hookimpl 6 | def prepare_connection(conn): 7 | conn.enable_load_extension(True) 8 | load(conn) 9 | conn.enable_load_extension(False) 10 | -------------------------------------------------------------------------------- /python/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.2", "setuptools_scm>=6.2", "setuptools_rust"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "sqlite-zstd" 7 | requires-python = ">=3.9" 8 | dynamic = ["version"] 9 | 10 | [project.readme] 11 | file = "README.md" 12 | content-type = "text/markdown" 13 | 14 | [project.entry-points.datasette] 15 | sqlite_zstd = "sqlite_zstd.datasette" 16 | 17 | [tool.setuptools.package-dir] 18 | sqlite_zstd = "lib" 19 | 20 | [tool.setuptools_scm] 21 | root = ".." 22 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from setuptools_rust import Binding, RustExtension 3 | 4 | setup( 5 | rust_extensions=[RustExtension('sqlite_zstd.libsqlite_zstd', 6 | path='../Cargo.toml', 7 | binding=Binding.NoBinding, 8 | features=['build_extension'], 9 | py_limited_api=True, 10 | )], 11 | ) 12 | -------------------------------------------------------------------------------- /src/add_functions.rs: -------------------------------------------------------------------------------- 1 | use crate::dict_training::ZstdTrainDictAggregate; 2 | use crate::util::*; 3 | use crate::{basic::zstd_decompress_fn, transparent::*}; 4 | 5 | use crate::basic::zstd_compress_fn; 6 | use rusqlite::functions::{Context, FunctionFlags}; 7 | 8 | pub fn add_functions(db: &rusqlite::Connection) -> anyhow::Result<()> { 9 | let nondeterministic = FunctionFlags::SQLITE_UTF8 | FunctionFlags::SQLITE_DIRECTONLY; 10 | let deterministic = FunctionFlags::SQLITE_UTF8 | FunctionFlags::SQLITE_DETERMINISTIC; 11 | 12 | let zstd_compress = |ctx: &Context| zstd_compress_fn(ctx, false).map_err(ah); 13 | let zstd_compress_col = |ctx: &Context| zstd_compress_fn(ctx, true).map_err(ah); 14 | 15 | let zstd_decompress = |ctx: &Context| zstd_decompress_fn(ctx, false).map_err(ah); 16 | let zstd_decompress_col = |ctx: &Context| zstd_decompress_fn(ctx, true).map_err(ah); 17 | // 18 | db.create_scalar_function("zstd_compress", 1, deterministic, zstd_compress)?; 19 | db.create_scalar_function("zstd_compress", 2, deterministic, zstd_compress)?; 20 | db.create_scalar_function("zstd_compress", 3, deterministic, zstd_compress)?; 21 | db.create_scalar_function("zstd_compress", 4, deterministic, zstd_compress)?; 22 | db.create_scalar_function("zstd_compress_col", 4, deterministic, zstd_compress_col)?; 23 | db.create_scalar_function("zstd_decompress", 2, deterministic, zstd_decompress)?; 24 | db.create_scalar_function("zstd_decompress", 3, deterministic, zstd_decompress)?; 25 | db.create_scalar_function("zstd_decompress", 4, deterministic, zstd_decompress)?; 26 | db.create_scalar_function("zstd_decompress_col", 4, deterministic, zstd_decompress_col)?; 27 | 28 | db.create_aggregate_function( 29 | "zstd_train_dict", 30 | 3, 31 | nondeterministic, 32 | ZstdTrainDictAggregate { 33 | return_save_id: false, 34 | }, 35 | )?; 36 | db.create_aggregate_function( 37 | "zstd_train_dict_and_save", 38 | 4, 39 | nondeterministic, 40 | ZstdTrainDictAggregate { 41 | return_save_id: true, 42 | }, 43 | )?; 44 | 45 | 
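// The transparent-compression entry points below are registered with the
// `nondeterministic` flags (SQLITE_UTF8 | SQLITE_DIRECTONLY defined above),
// so they can only be called from direct top-level SQL, not from views,
// triggers or schema structures.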
db.create_scalar_function("zstd_enable_transparent", 1, nondeterministic, |ctx| { 46 | zstd_enable_transparent(ctx).map_err(ah) 47 | })?; 48 | 49 | db.create_scalar_function("zstd_incremental_maintenance", 2, nondeterministic, |ctx| { 50 | zstd_incremental_maintenance(ctx).map_err(ah) 51 | })?; 52 | 53 | Ok(()) 54 | } 55 | 56 | #[cfg(test)] 57 | pub mod tests { 58 | use super::*; 59 | use anyhow::Context; 60 | use chrono::TimeZone; 61 | pub use pretty_assertions::assert_eq; 62 | 63 | use rusqlite::{Connection, params}; 64 | use serde::{Deserialize, Serialize}; 65 | use std::collections::BTreeMap; 66 | 67 | // the point of this is that it's something you might store in a DB that has lots of redundant data 68 | #[derive(Serialize, Deserialize, Debug)] 69 | #[serde(tag = "type")] 70 | enum EventData { 71 | OpenApplication { 72 | id: i32, 73 | app_name: String, 74 | app_type: String, 75 | properties: BTreeMap, 76 | }, 77 | CloseApplication { 78 | id: i32, 79 | }, 80 | Shutdown, 81 | } 82 | 83 | pub fn create_example_db(seed: Option, eles: i32) -> anyhow::Result { 84 | let seed = seed.unwrap_or_else(|| thread_rng().r#gen()); 85 | lazy_static::lazy_static! { 86 | // people use maybe 100 different apps 87 | static ref APP_NAMES: Vec = names::Generator::with_naming(names::Name::Plain) 88 | .take(100) 89 | .collect(); 90 | // of maybe 10 different categories 91 | static ref APP_TYPES: Vec = names::Generator::with_naming(names::Name::Plain) 92 | .take(10) 93 | .collect(); 94 | }; 95 | let mut db = if std::env::var("TEST_TO_FILE").is_ok() { 96 | let db_fname = format!( 97 | "/tmp/foo.{}.sqlite3", 98 | rand::distributions::Uniform::from(0..10000).sample(&mut rand::thread_rng()) 99 | ); 100 | log::debug!("writing temp db to {}", db_fname); 101 | Connection::open(db_fname)? 102 | } else { 103 | Connection::open_in_memory().context("opening memory db")? 
56 | #[cfg(test)]
57 | pub mod tests {
58 | use super::*;
59 | use anyhow::Context;
60 | use chrono::TimeZone;
61 | pub use pretty_assertions::assert_eq;
62 |
63 | use rusqlite::{Connection, params};
64 | use serde::{Deserialize, Serialize};
65 | use std::collections::BTreeMap;
66 |
67 | // the point of this is that it's something you might store in a DB that has lots of redundant data
68 | #[derive(Serialize, Deserialize, Debug)]
69 | #[serde(tag = "type")]
70 | enum EventData {
71 | OpenApplication {
72 | id: i32,
73 | app_name: String,
74 | app_type: String,
75 | properties: BTreeMap<String, String>,
76 | },
77 | CloseApplication {
78 | id: i32,
79 | },
80 | Shutdown,
81 | }
82 |
83 | pub fn create_example_db(seed: Option<u64>, eles: i32) -> anyhow::Result<Connection> {
84 | let seed = seed.unwrap_or_else(|| thread_rng().r#gen());
85 | lazy_static::lazy_static! {
86 | // people use maybe 100 different apps
87 | static ref APP_NAMES: Vec<String> = names::Generator::with_naming(names::Name::Plain)
88 | .take(100)
89 | .collect();
90 | // of maybe 10 different categories
91 | static ref APP_TYPES: Vec<String> = names::Generator::with_naming(names::Name::Plain)
92 | .take(10)
93 | .collect();
94 | };
95 | let mut db = if std::env::var("TEST_TO_FILE").is_ok() {
96 | let db_fname = format!(
97 | "/tmp/foo.{}.sqlite3",
98 | rand::distributions::Uniform::from(0..10000).sample(&mut rand::thread_rng())
99 | );
100 | log::debug!("writing temp db to {}", db_fname);
101 | Connection::open(db_fname)?
102 | } else {
103 | Connection::open_in_memory().context("opening memory db")?
104 | };
105 | add_functions(&db).context("adding functions")?;
106 | db.execute_batch(
107 | "
108 | create table events (
109 | id integer primary key not null,
110 | timestamp text not null,
111 | data text not null,
112 | another_col text
113 | );
114 | ",
115 | )?;
116 |
117 | use rand::distributions::WeightedIndex;
118 | use rand::prelude::*;
119 |
120 | let window_properties = &[
121 | (30, "_GTK_APPLICATION_ID"),
122 | (30, "_GTK_APPLICATION_OBJECT_PATH"),
123 | (30, "_GTK_UNIQUE_BUS_NAME"),
124 | (30, "_GTK_WINDOW_OBJECT_PATH"),
125 | (40, "_NET_WM_USER_TIME_WINDOW"),
126 | (41, "WM_CLIENT_LEADER"),
127 | (50, "_NET_WM_BYPASS_COMPOSITOR"),
128 | (60, "WM_WINDOW_ROLE"),
129 | (61, "_MOTIF_WM_HINTS"),
130 | (90, "_GTK_THEME_VARIANT"),
131 | (91, "_NET_WM_SYNC_REQUEST_COUNTER"),
132 | (91, "_NET_WM_USER_TIME"),
133 | (139, "_NET_STARTUP_ID"),
134 | (170, "_NET_WM_ICON_NAME"),
135 | (180, "WM_HINTS"),
136 | (220, "_NET_WM_WINDOW_TYPE"),
137 | (220, "XdndAware"),
138 | (229, "WM_LOCALE_NAME"),
139 | (230, "_NET_WM_NAME"),
140 | (230, "_NET_WM_PID"),
141 | (230, "WM_CLIENT_MACHINE"),
142 | (240, "_NET_WM_DESKTOP"),
143 | (240, "_NET_WM_STATE"),
144 | (240, "WM_CLASS"),
145 | (240, "WM_NORMAL_HINTS"),
146 | (240, "WM_PROTOCOLS"),
147 | (240, "WM_STATE"),
148 | ];
149 |
150 | let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
151 | let event_type_dist = WeightedIndex::new([10, 10, 1])?;
152 | let window_properties_dist = WeightedIndex::new(window_properties.iter().map(|e| e.0))?;
153 | let app_id_dist = rand::distributions::Uniform::from(0..100);
154 | let data = (0..eles).map(|_| match event_type_dist.sample(&mut rng) {
155 | 0 => {
156 | let mut properties = BTreeMap::new();
157 | for _i in 1..rand::distributions::Uniform::from(100..1000).sample(&mut rng) {
158 | let p = window_properties[window_properties_dist.sample(&mut rng)].1;
159 | properties.insert(p.to_string(), "1".to_string());
160 | }
161 | EventData::OpenApplication {
162 | id: app_id_dist.sample(&mut rng),
163 | app_name: APP_NAMES.choose(&mut rng).unwrap().clone(),
164 | app_type: APP_TYPES.choose(&mut rng).unwrap().clone(),
165 | properties,
166 | }
167 | }
168 | 1 => EventData::CloseApplication {
169 | id: app_id_dist.sample(&mut rng),
170 | },
171 | 2 => EventData::Shutdown,
172 | _ => panic!("impossible"),
173 | });
174 | {
175 | let tx = db.transaction()?;
176 | {
177 | let mut insert = tx.prepare(
178 | "insert into events (timestamp, data, another_col) values (?, ?, ?)",
179 | )?;
180 | let date = chrono::Utc.ymd(2021, 1, 1).and_hms(0, 0, 0);
181 | for (i, d) in data.enumerate() {
182 | insert.execute(params![
183 | (date + chrono::Duration::seconds(30) * (i as i32)).to_rfc3339(),
184 | serde_json::to_string_pretty(&d)?,
185 | "rustacean"
186 | ])?;
187 | }
188 | }
189 | tx.commit()?;
190 | }
191 | Ok(db)
192 | }
193 |
194 | #[test]
195 | fn sanity() -> anyhow::Result<()> {
196 | let _db = create_example_db(None, 10).context("create eg db")?;
197 | Ok(())
198 | }
199 |
200 | fn test_strings() -> anyhow::Result<Vec<String>> {
201 | let data = [
202 | "hello this is a test",
203 | "foobar",
204 | "looooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooong",
205 | "nope",
206 | ];
207 | Ok(data.iter().map(|e| e.to_string()).collect())
208 | }
209 |
210 | #[test]
211 | fn compress_is_deterministic() -> anyhow::Result<()> {
212 | let db = create_example_db(None, 0)?;
213 |
214 | for eg in test_strings()?
{
215 | let compressed1: Vec<u8> =
216 | db.query_row("select zstd_compress(?)", params![eg], |r| r.get(0))?;
217 | let compressed2: Vec<u8> =
218 | db.query_row("select zstd_compress(?)", params![eg], |r| r.get(0))?;
219 | assert_eq!(compressed1, compressed2)
220 | }
221 |
222 | Ok(())
223 | }
224 |
225 | #[test]
226 | fn compress_decompress_roundtrip() -> anyhow::Result<()> {
227 | let db = create_example_db(None, 0)?;
228 |
229 | for eg in test_strings()? {
230 | let compressed: Vec<u8> = db
231 | .query_row("select zstd_compress(?)", params![eg], |r| r.get(0))
232 | .context("compressing")?;
233 | let decompressed: String = db
234 | .query_row(
235 | "select zstd_decompress(?, true)",
236 | params![compressed],
237 | |r| r.get(0),
238 | )
239 | .context("decompressing")?;
240 | assert_eq!(eg, decompressed)
241 | }
242 |
243 | Ok(())
244 | }
245 |
246 | #[test]
247 | fn decompress_type() -> anyhow::Result<()> {
248 | let db = create_example_db(None, 0)?;
249 |
250 | for eg in test_strings()? {
251 | let compressed: Vec<u8> =
252 | db.query_row("select zstd_compress(?)", params![eg], |r| r.get(0))?;
253 | let decompressed_text: String = db.query_row(
254 | "select zstd_decompress(?, true)",
255 | params![compressed],
256 | |r| r.get(0),
257 | )?;
258 |
259 | let decompressed_blob: Vec<u8> = db.query_row(
260 | "select zstd_decompress(?, false)",
261 | params![compressed],
262 | |r| r.get(0),
263 | )?;
264 | assert_eq!(decompressed_text.as_bytes(), decompressed_blob)
265 | }
266 |
267 | Ok(())
268 | }
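// A dictionary trained on similar rows should let a single row compress
// noticeably better than dictionary-less compression at the same level.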
269 | #[test]
270 | fn compress_with_dict_smaller() -> anyhow::Result<()> {
271 | let db = create_example_db(None, 100)?;
272 |
273 | let compressed1: Vec<u8> = db.query_row(
274 | "select zstd_compress((select data from events where id = 1), 5)",
275 | params![],
276 | |r| r.get(0),
277 | )?;
278 |
279 | let dict: Vec<u8> = db
280 | .query_row(
281 | "select zstd_train_dict(data, 1000, 100) from events",
282 | params![],
283 | |r| r.get(0),
284 | )
285 | .context("train dict")?;
286 |
287 | let compressed2: Vec<u8> = db
288 | .query_row(
289 | "select zstd_compress((select data from events where id = 1), 5, ?)",
290 | params![dict],
291 | |r| r.get(0),
292 | )
293 | .context("compress with dict")?;
294 |
295 | assert!(compressed1.len() > compressed2.len());
296 |
297 | let decompressed1: String = db
298 | .query_row("select zstd_decompress(?, 1)", params![compressed1], |r| {
299 | r.get(0)
300 | })
301 | .context("decompress 1")?;
302 |
303 | let decompressed2: String = db
304 | .query_row(
305 | "select zstd_decompress(?, 1, ?)",
306 | params![compressed2, dict],
307 | |r| r.get(0),
308 | )
309 | .context("decompress 2")?;
310 |
311 | assert_eq!(decompressed1, decompressed2);
312 |
313 | Ok(())
314 | }
315 |
316 | #[test]
317 | fn dict_saving_works() -> anyhow::Result<()> {
318 | let db = create_example_db(None, 100)?;
319 |
320 | let dict: i32 = db
321 | .query_row(
322 | "select zstd_train_dict_and_save(data, 1000, 100, null) from events",
323 | params![],
324 | |r| r.get(0),
325 | )
326 | .context("train dict")?;
327 |
328 | let uncompressed: String = db
329 | .query_row("select data from events where id = 1", params![], |r| {
330 | r.get(0)
331 | })
332 | .context("get data")?;
333 |
334 | let compressed2: Vec<u8> = db
335 | .query_row(
336 | "select zstd_compress((select data from events where id = 1), 5, ?)",
337 | params![dict],
338 | |r| r.get(0),
339 | )
340 | .context("compress with dict")?;
341 |
342 | let decompressed2: String = db
343 | .query_row(
344 | "select zstd_decompress(?, 1, ?)",
345 | params![compressed2, dict],
346 | |r| r.get(0),
347 | )
348 | .context("decompress 2")?;
349 |
350 | assert_eq!(uncompressed, decompressed2);
351 |
352 | Ok(())
353 | }
354 |
355 | #[test]
356 | fn levels() -> anyhow::Result<()> {
357 | let db = create_example_db(None, 5)?;
358 | /*db.prepare("select * from events")?
359 | .query_map(params![], |r| Ok(debug_row(r)))?
360 | .count();*/
361 |
362 | let mut st = db.prepare("select data from events")?;
363 | let eles: Vec<String> = st
364 | .query_map(params![], |r| r.get(0))
365 | .context("get sample")?
366 | .collect::<Result<_, _>>()?;
367 |
368 | for ele in eles {
369 | // let mut last_size = usize::MAX;
370 | for level in 1..24 {
371 | let compressed1: Vec<u8> = db
372 | .query_row("select zstd_compress(?, ?)", params![ele, level], |r| {
373 | r.get(0)
374 | })
375 | .context("compress")?;
376 | let decompressed1: String = db
377 | .query_row(
378 | "select zstd_decompress(?, ?)",
379 | params![compressed1, 1],
380 | |r| r.get(0),
381 | )
382 | .context("decompress")?;
383 |
384 | assert_eq!(ele, decompressed1);
385 | println!("l={}, size={}", level, compressed1.len());
386 | // assert!(compressed1.len() <= last_size);
387 | // last_size = compressed1.len();
388 | }
389 | }
390 |
391 | Ok(())
392 | }
393 | }
-------------------------------------------------------------------------------- /src/basic.rs: --------------------------------------------------------------------------------
1 | use crate::dict_management::*;
2 | use anyhow::Context as AContext;
3 |
4 | use rusqlite::functions::Context;
5 |
6 | use rusqlite::types::ToSqlOutput;
7 | use rusqlite::types::{Value, ValueRef};
8 | use std::{io::Write, sync::Arc};
9 | use zstd::bulk::Compressor;
10 | use zstd::dict::DecoderDictionary;
11 |
12 | /// null_dict_is_passthrough is only true when called through the `zstd_compress_col` function (for transparent compression)
13 | /// with null_dict_is_passthrough, the behaviour is slightly changed: When dict is null, the data is passed through without compression.
14 | pub(crate) fn zstd_compress_fn<'a>(
15 | ctx: &Context,
16 | null_dict_is_passthrough: bool,
17 | ) -> anyhow::Result<ToSqlOutput<'a>> {
18 | let arg_data = 0;
19 | let arg_level = 1;
20 | let arg_dict = 2;
21 | let arg_is_compact = 3;
22 |
23 | let input_value = match ctx.get_raw(arg_data) {
24 | ValueRef::Blob(b) => b,
25 | ValueRef::Text(b) => b,
26 | ValueRef::Null => return Ok(ToSqlOutput::Owned(Value::Null)), // pass through null
27 | e => {
28 | anyhow::bail!(
29 | "zstd_compress expects blob or text as input, got {}",
30 | e.data_type()
31 | )
32 | }
33 | };
34 |
35 | if null_dict_is_passthrough && ctx.len() >= arg_dict {
36 | // if the dict id is null, pass through data
37 | if let ValueRef::Null = ctx.get_raw(arg_dict) {
38 | // TODO: figure out if sqlite3_result_blob can be passed a pointer into sqlite3_context to avoid copying??
39 | // return Ok(ToSqlOutput::Borrowed(ctx.get_raw(arg_data)));
40 | return Ok(ToSqlOutput::Owned(Value::Blob(input_value.to_vec())));
41 | }
42 | }
43 |
44 | let level: i32 = if ctx.len() <= arg_level {
45 | // no level given, use default (currently 3)
46 | 0
47 | } else {
48 | ctx.get(arg_level).context("level argument")?
49 | };
50 | let compact: bool = if ctx.len() <= arg_is_compact {
51 | false
52 | } else {
53 | ctx.get(arg_is_compact).context("is_compact argument")?
54 | };
55 |
56 | if ctx.len() <= arg_dict {
57 | zstd_compress_fn_tail(compact, input_value, Compressor::new(level))
58 | } else {
59 | match ctx.get_raw(arg_dict) {
60 | ValueRef::Integer(-1) | ValueRef::Null => {
61 | zstd_compress_fn_tail(compact, input_value, Compressor::new(level))
62 | }
63 | ValueRef::Blob(d) => {
64 | zstd_compress_fn_tail(compact, input_value, Compressor::with_dictionary(level, d))
65 | }
66 | //Some(Arc::new(wrap_encoder_dict(d.to_vec(), level))),
67 | ValueRef::Integer(_) => {
68 | let dict = encoder_dict_from_ctx(ctx, arg_dict, level)
69 | .context("loading dictionary from int")?;
70 |
71 | let enc = Compressor::with_prepared_dictionary(&dict);
72 | zstd_compress_fn_tail(compact, input_value, enc)
73 | }
74 | other => anyhow::bail!(
75 | "dict argument must be int or blob, got {}",
76 | other.data_type()
77 | ),
78 | }
79 | }
80 | }
81 |
82 | // separate fn purely for borrowship simplicity
83 | fn zstd_compress_fn_tail<'a>(
84 | compact: bool,
85 | input_value: &[u8],
86 | encoder: Result<Compressor<'a>, std::io::Error>,
87 | ) -> anyhow::Result<ToSqlOutput<'a>> {
88 | let mut encoder = encoder.context("creating zstd encoder")?;
89 | {
90 | // pledge source size (benchmarking shows this doesn't help any tho)
91 | let cctx = encoder.context_mut();
92 | cctx.set_pledged_src_size(input_value.len() as u64)
93 | .map_err(|c| anyhow::anyhow!("setting pledged source size (code {c})"))?;
94 | // cctx.set_parameter(zstd::zstd_safe::CParameter::BlockDelimiters(false))
95 | // .map_err(|_| anyhow::anyhow!("no"))?;
96 | }
97 | if compact {
98 | encoder
99 | .include_checksum(false)
100 | .context("disable checksums")?;
101 | encoder.include_contentsize(false).context("disable contentsize")?;
102 | encoder.include_dictid(false).context("disable dictid")?;
103 | encoder.include_magicbytes(false).context("disable magicbytes")?;
104 | }
105 | let res = encoder
106 | .compress(input_value)
107 | .context("writing data to zstd encoder")?;
108 |
109 | Ok(ToSqlOutput::Owned(Value::Blob(res)))
110 | }
111 |
112 | pub(crate) fn zstd_decompress_fn<'a>(
113 | ctx: &Context,
114 | null_dict_is_passthrough: bool,
115 | ) -> anyhow::Result<ToSqlOutput<'a>> {
116 | let arg_data = 0;
117 | let arg_output_text = 1;
118 | let arg_dict = 2;
119 | let arg_is_compact = 3;
120 |
121 | if null_dict_is_passthrough && ctx.len() >= arg_dict {
122 | // if the dict id is null, pass through data
123 |
124 | if let ValueRef::Null = ctx.get_raw(arg_dict) {
125 | // TODO: figure out if sqlite3_result_blob can be passed a pointer into sqlite3_context to avoid copying??
126 | // return Ok(ToSqlOutput::Borrowed(ctx.get_raw(arg_data)));
127 | return Ok(ToSqlOutput::Owned(ctx.get_raw(arg_data).into()));
128 | }
129 | }
130 |
131 | let output_text: bool = ctx
132 | .get(arg_output_text)
133 | .context("output_text arg invalid")?;
134 |
135 | let input_value = match ctx.get_raw(arg_data) {
136 | ValueRef::Blob(b) => b,
137 | ValueRef::Null => return Ok(ToSqlOutput::Owned(Value::Null)), // pass through null
138 | e => {
139 | anyhow::bail!(
140 | "zstd_decompress expects blob as input, got {}",
141 | e.data_type()
142 | )
143 | }
144 | };
145 |
146 | let dict = if ctx.len() <= arg_dict {
147 | None
148 | } else {
149 | match ctx.get_raw(arg_dict) {
150 | ValueRef::Integer(-1) | ValueRef::Null => None,
151 | ValueRef::Blob(d) => Some(Arc::new(DecoderDictionary::copy(d))),
152 | ValueRef::Integer(_) => {
153 | Some(decoder_dict_from_ctx(ctx, arg_dict).context("load dict")?)
154 | }
155 | other => anyhow::bail!(
156 | "dict argument must be int or blob, got {}",
157 | other.data_type()
158 | ),
159 | }
160 | };
161 |
162 | let compact = if ctx.len() <= arg_is_compact {
163 | false
164 | } else {
165 | ctx.get(arg_is_compact).context("argument 'compact'")?
166 | };
167 | let dict_ref = dict.as_ref().map(|e| -> &DecoderDictionary { e });
168 |
169 | zstd_decompress_inner(input_value, dict_ref, output_text, compact)
170 | }
171 |
172 | fn zstd_decompress_inner<'a>(
173 | input_value: &[u8],
174 | dict: Option<&DecoderDictionary>,
175 | output_text: bool,
176 | compact: bool,
177 | ) -> anyhow::Result<ToSqlOutput<'a>> {
178 | let vec = {
179 | // todo: use zstd::bulk api maybe (but we don't know the output size)
180 | let out = Vec::new();
181 | let mut decoder = match &dict {
182 | Some(dict) => zstd::stream::write::Decoder::with_prepared_dictionary(out, dict),
183 | None => zstd::stream::write::Decoder::new(out),
184 | }
185 | .context("dict load doesn't work")?;
186 | if compact {
187 | decoder.include_magicbytes(false)?;
188 | }
189 | decoder.write_all(input_value).context("decoding")?;
190 | decoder.flush().context("decoder flushing")?;
191 | decoder.into_inner()
192 | };
193 |
194 | // dict; // to make sure the dict is still in scope because of https://github.com/gyscos/zstd-rs/issues/55
195 | if output_text {
196 | Ok(ToSqlOutput::Owned(Value::Text(
197 | // converted right back to &u8 in https://docs.rs/rusqlite/0.21.0/src/rusqlite/types/value_ref.rs.html#107
198 | // so we don't want the overhead of checking utf8. also db encoding might not be utf8 so ??
199 | unsafe { String::from_utf8_unchecked(vec) },
200 | )))
201 | } else {
202 | Ok(ToSqlOutput::Owned(Value::Blob(vec)))
203 | }
204 | }
205 |
-------------------------------------------------------------------------------- /src/bin/benchmark.rs: --------------------------------------------------------------------------------
1 | #![cfg(feature = "benchmark")]
2 |
3 | use anyhow::Context;
4 | use anyhow::Result;
5 | use rand::seq::SliceRandom;
6 | use rusqlite::{Connection, OpenFlags, params};
7 | use std::path::{Path, PathBuf};
8 | use std::{io::Write, time::Instant};
9 | use structopt::StructOpt;
10 | #[derive(Debug, StructOpt)]
11 | struct Config {
12 | #[structopt(short, long)]
13 | input_db: Vec<String>,
14 | #[structopt(short, long)]
15 | location: Vec<String>,
16 | #[structopt(short, long)]
17 | zstd_lib: String,
18 | #[structopt(short, long)]
19 | hot_cache: bool,
20 | #[structopt(short, long)]
21 | iterations: i32,
22 | }
23 |
24 | fn pragmas(db: &Connection) -> Result<()> {
25 | //let want_page_size = 32768;
26 | //db.execute(&format!("pragma page_size = {};", want_page_size))
27 | // .context("setup pragma 1")?;
28 | db.execute_batch(
29 | "
30 | pragma journal_mode = WAL;
31 | pragma foreign_keys = on;
32 | pragma temp_store = memory;
33 | pragma wal_autocheckpoint = 20;
34 | pragma synchronous = normal;
35 | pragma mmap_size = 30000000000;
36 | ",
37 | )?;
38 | let jm: String = db.pragma_query_value(None, "journal_mode", |r| r.get(0))?;
39 | if &jm != "wal" {
40 | anyhow::bail!("journal mode is not wal");
41 | }
42 | Ok(())
43 | }
44 | trait Bench {
45 | fn name(&self) -> &str;
46 | fn execute(&self, conn: &Connection) -> Result<i64>;
47 | }
48 |
49 | type DbId = i64;
50 | struct SelectBench {
51 | name: &'static str,
52 | ids: Vec<DbId>,
53 | }
54 |
55 | impl SelectBench {
56 | fn prepare_sequential(conn: &Connection) -> Result<Box<dyn Bench>> {
57 | Ok(Box::new(SelectBench {
58 | name: "Select 1000 sequential (compressed) values",
59 | ids:
conn.prepare("select id from title_basics where id >= (select id from title_basics order by random() limit 1) order by id asc limit 1000")?.query_map(params![], |r| r.get(0))?.collect::>()? 60 | })) 61 | } 62 | fn prepare_random(conn: &Connection) -> Result> { 63 | Ok(Box::new(SelectBench { 64 | name: "Select 1000 random (compressed) values", 65 | ids: conn 66 | .prepare("select id from title_basics order by random() limit 1000")? 67 | .query_map(params![], |r| r.get(0))? 68 | .collect::>()?, 69 | })) 70 | } 71 | } 72 | 73 | impl Bench for SelectBench { 74 | fn name(&self) -> &str { 75 | self.name 76 | } 77 | fn execute(&self, conn: &Connection) -> Result { 78 | let mut stmt = conn.prepare("select data from title_basics where id = ?")?; 79 | let mut _total_len = 0; 80 | for id in &self.ids { 81 | let data: String = stmt.query_row(params![id], |r| r.get(0))?; 82 | _total_len += data.len(); 83 | } 84 | 85 | // eprintln!("total bytes got: {}", _total_len); 86 | Ok(self.ids.len() as i64) 87 | } 88 | } 89 | 90 | struct UpdateBench { 91 | name: &'static str, 92 | values: Vec<(DbId, String)>, 93 | } 94 | impl UpdateBench { 95 | fn prepare_random(conn: &Connection) -> Result> { 96 | let ids: Vec = conn 97 | .prepare("select id from title_basics order by random() limit 1000")? 98 | .query_map(params![], |r| r.get(0))? 99 | .collect::>()?; 100 | 101 | let values: Vec = conn 102 | .prepare("select data from title_basics order by random() limit 1000")? 103 | .query_map(params![], |r| r.get(0))? 104 | .collect::>()?; 105 | Ok(Box::new(UpdateBench { 106 | name: "Update 1000 random (compressed) values", 107 | values: ids.into_iter().zip(values).collect(), 108 | })) 109 | } 110 | fn prepare_sequential(conn: &Connection) -> Result> { 111 | let ids: Vec = conn 112 | .prepare("select id from title_basics where id >= (select id from title_basics order by random() limit 1) order by id asc limit 1000")? 113 | .query_map(params![], |r| r.get(0))? 114 | .collect::>()?; 115 | 116 | let values: Vec = conn 117 | .prepare("select data from title_basics order by random() limit ?")? 118 | .query_map(params![ids.len()], |r| r.get(0))? 119 | .collect::>()?; 120 | Ok(Box::new(UpdateBench { 121 | name: "Update 1000 sequential (compressed) values", 122 | values: ids.into_iter().zip(values).collect(), 123 | })) 124 | } 125 | } 126 | impl Bench for UpdateBench { 127 | fn name(&self) -> &str { 128 | self.name 129 | } 130 | fn execute(&self, conn: &Connection) -> Result { 131 | conn.execute("begin", params![])?; 132 | let mut stmt = conn.prepare("update title_basics set data = ? where id = ?")?; 133 | for (id, value) in &self.values { 134 | stmt.execute(params![value, id])?; 135 | } 136 | conn.execute("commit", params![])?; 137 | Ok(self.values.len() as i64) 138 | } 139 | } 140 | struct InsertBench { 141 | name: &'static str, 142 | values: Vec, 143 | } 144 | impl InsertBench { 145 | fn prepare_random(conn: &Connection) -> Result> { 146 | let values: Vec = conn 147 | .prepare("select data from title_basics order by random() limit 1000")? 148 | .query_map(params![], |r| r.get(0))? 
140 | struct InsertBench {
141 | name: &'static str,
142 | values: Vec<String>,
143 | }
144 | impl InsertBench {
145 | fn prepare_random(conn: &Connection) -> Result<Box<dyn Bench>> {
146 | let values: Vec<String> = conn
147 | .prepare("select data from title_basics order by random() limit 1000")?
148 | .query_map(params![], |r| r.get(0))?
149 | .collect::<Result<_, _>>()?;
150 | Ok(Box::new(InsertBench {
151 | name: "Insert 1000 new values",
152 | values,
153 | }))
154 | }
155 | }
156 | impl Bench for InsertBench {
157 | fn name(&self) -> &str {
158 | self.name
159 | }
160 | fn execute(&self, conn: &Connection) -> Result<i64> {
161 | conn.execute("begin", params![])?;
162 | let mut stmt = conn.prepare("insert into title_basics (data) values (?)")?;
163 | for value in &self.values {
164 | stmt.execute(params![value])?;
165 | }
166 | conn.execute("commit", params![])?;
167 | Ok(self.values.len() as i64)
168 | }
169 | }
170 |
171 | fn drop_caches() -> Result<()> {
172 | eprintln!("dropping caches");
173 | assert!(std::process::Command::new("sync").status()?.success());
174 | std::fs::OpenOptions::new()
175 | .read(false)
176 | .write(true)
177 | .open("/proc/sys/vm/drop_caches")
178 | .context("Could not open drop caches")?
179 | .write_all(b"3")
180 | .context("Could not drop caches")?;
181 | Ok(())
182 | }
183 |
184 | struct BenchTarget {
185 | total_count: i64,
186 | total_duration_s: f64,
187 | path: PathBuf,
188 | }
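// Benchmark driver: copy each input database to every storage location once,
// then run each prepared bench `iterations` times per database, optionally
// dropping the OS page cache first and shuffling the target order to avoid
// crosstalk between the databases under test.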
189 | fn main() -> Result<()> {
190 | if cfg!(debug_assertions) {
191 | panic!("benching must be done in prod mode, otherwise the results are useless");
192 | }
193 | let config = Config::from_args();
194 | //let input_db = Connection::open_with_flags(config.input_db)?;
195 |
196 | let its_per_bench = config.iterations;
197 |
198 | println!("location,db filename,test name,iterations/s,number of samples");
199 |
200 | let benches: Vec<Vec<Box<dyn Bench>>> = {
201 | let db1 =
202 | Connection::open_with_flags(&config.input_db[0], OpenFlags::SQLITE_OPEN_READ_ONLY)?;
203 | type Preparer = Box<dyn Fn(&Connection) -> Result<Box<dyn Bench>>>;
204 | let preparers: Vec<Preparer> = vec![
205 | Box::new(SelectBench::prepare_random),
206 | Box::new(SelectBench::prepare_sequential),
207 | Box::new(UpdateBench::prepare_random),
208 | Box::new(UpdateBench::prepare_sequential),
209 | Box::new(InsertBench::prepare_random),
210 | ];
211 | preparers
212 | .iter()
213 | .map(|preparer| {
214 | eprintln!("running preparer {its_per_bench} times");
215 | (0..its_per_bench)
216 | .map(|_i| preparer(&db1))
217 | .collect::<Result<Vec<_>>>()
218 | .context("preparing benches")
219 | })
220 | .collect::<Result<Vec<_>>>()?
221 | };
222 |
223 | for locjoi in config.location {
224 | let (location_name, location) = {
225 | let vec: Vec<_> = locjoi.splitn(2, ':').collect();
226 | (vec[0], vec[1])
227 | };
228 | eprintln!("{} at {}", location_name, location);
229 |
230 | let db_paths = config
231 | .input_db
232 | .iter()
233 | .map(|input_db| {
234 | let pb = PathBuf::from(input_db);
235 | let file_name = pb.file_name().unwrap();
236 |
237 | let db_path = Path::new(&location).join(file_name);
238 | if !db_path.exists() {
239 | eprintln!("copying {} -> {}", input_db, db_path.to_string_lossy());
240 | std::fs::copy(input_db, &db_path)?;
241 | } else {
242 | eprintln!(
243 | "{} already exists, assuming it's the same",
244 | file_name.to_string_lossy()
245 | );
246 | }
247 | Ok(db_path)
248 | })
249 | .collect::<Result<Vec<_>>>()?;
250 | for bench_its in &benches {
251 | // eprintln!("{locjoi} benchmark {}", bench_its[0].name());
252 | let mut targets: Vec<_> = db_paths
253 | .iter()
254 | .map(|path| BenchTarget {
255 | total_count: 0,
256 | total_duration_s: 0.0,
257 | path: path.clone(),
258 | })
259 | .collect();
260 | for (i, bench) in bench_its.iter().enumerate() {
261 | eprintln!(
262 | "{locjoi} benchmark {} iteration {i} / {its_per_bench}",
263 | bench.name()
264 | );
265 | if !config.hot_cache {
266 | drop_caches()?;
267 | }
268 | // shuffle to make sure there is no crosstalk
269 | targets.shuffle(&mut rand::thread_rng());
270 |
271 | for target in targets.iter_mut() {
272 | let db = Connection::open(&target.path)?;
273 | pragmas(&db).context("Could not set pragmas")?;
274 | db.load_extension(&config.zstd_lib, None)?;
275 | let before = Instant::now();
276 | target.total_count += bench.execute(&db).context("executing bench")?;
277 | target.total_duration_s += before.elapsed().as_secs_f64();
278 | }
279 | }
280 | targets.sort_by_key(|e| e.path.clone());
281 | for target in &targets {
282 | println!(
283 | "{},{},{},{:.0},{}",
284 | location_name,
285 | target.path.file_name().unwrap().to_string_lossy(),
286 | bench_its[0].name(),
287 | target.total_count as f64 / target.total_duration_s,
288 | target.total_count
289 | );
290 | }
291 | }
292 | }
293 |
294 | Ok(())
295 | }
296 |
-------------------------------------------------------------------------------- /src/bin/create_test_db.rs: --------------------------------------------------------------------------------
1 | use std::fs::File;
2 |
3 | use anyhow::Context;
4 | use anyhow::Result;
5 | use rusqlite::Connection;
6 | use rusqlite::params;
7 | use serde::Deserialize;
8 | use serde::Serialize;
9 | use serde_json::json;
10 | use structopt::StructOpt;
11 |
12 | #[derive(Serialize, Deserialize)]
13 | #[allow(non_snake_case)]
14 | struct Title {
15 | tconst: String,
16 | titleType: String,
17 | primaryTitle: String,
18 | originalTitle: String,
19 | isAdult: i32,
20 | startYear: String,
21 | endYear: String,
22 | runtimeMinutes: String,
23 | genres: String,
24 | }
25 |
26 | fn pragmas(db: &Connection) -> Result<()> {
27 | //let want_page_size = 32768;
28 | //db.execute(&format!("pragma page_size = {};", want_page_size))
29 | // .context("setup pragma 1")?;
30 | db.execute_batch(
31 | "
32 | pragma journal_mode = WAL;
33 | pragma foreign_keys = on;
34 | pragma temp_store = memory;
35 | pragma wal_autocheckpoint = 20;
36 | pragma synchronous = normal;
37 | pragma mmap_size = 30000000000;
38 | ",
39 | )?;
40 | let jm: String = db.pragma_query_value(None, "journal_mode", |r| r.get(0))?;
41 | if &jm != "wal" {
42 | anyhow::bail!("journal mode is not wal");
43 | }
44 | Ok(())
45 | }
46 |
47 | #[derive(Debug, StructOpt)]
48 | struct Config {
49 | #[structopt(short, long)]
50 | zstd_lib: String,
51 | }
52 |
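// Builds the benchmark databases from the IMDb title.basics dataset: a
// columnar variant, a JSON variant, and vacuum-copies of the JSON variant
// that are then compressed transparently (with and without a dictionary).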
53 | fn main() -> Result<()> {
54 | let config = Config::from_args();
55 | env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
56 | std::env::set_var("SQLITE_ZSTD_LOG", "debug");
57 | // before running, download https://datasets.imdbws.com/title.basics.tsv.gz
58 | // loads title_basics.tsv.gz, creates a json database and a database in normal form
59 |
60 | log::info!("loading csv");
61 | let data: Vec<Title> = csv::ReaderBuilder::new()
62 | .delimiter(b'\t')
63 | .quoting(false)
64 | .from_reader(File::open("benchmark/title.basics.tsv")?)
65 | .deserialize()
66 | .collect::<std::result::Result<Vec<Title>, csv::Error>>()
67 | .context("reading title.basics.tsv")?;
68 | {
69 | log::info!("creating columnar db");
70 | let mut columnar = Connection::open("benchmark/imdb-columnar.sqlite3")?;
71 | pragmas(&columnar)?;
72 |
73 | columnar.execute("create table title_basics(
74 | id integer primary key, tconst text, titleType text, primaryTitle text, originalTitle text, isAdult int, startYear text, endYear text, runtimeMinutes text, genres text)", params![])?;
75 | let db = columnar.transaction()?;
76 | let mut stmt = db.prepare("insert into title_basics values (?,?,?,?,?,?,?,?,?,?)")?;
77 | for ele in &data {
78 | stmt.execute(params![
79 | &Option::<String>::None,
80 | &ele.tconst,
81 | &ele.titleType,
82 | &ele.primaryTitle,
83 | &ele.originalTitle,
84 | &ele.isAdult,
85 | &ele.startYear,
86 | &ele.endYear,
87 | &ele.runtimeMinutes,
88 | &ele.genres
89 | ])?;
90 | }
91 | drop(stmt);
92 | db.commit()?;
93 | }
94 | {
95 | log::info!("creating json db");
96 | let mut jsondb = Connection::open("benchmark/imdb-json.sqlite3").unwrap();
97 | pragmas(&jsondb)?;
98 | jsondb.execute(
99 | "create table title_basics(
100 | id integer primary key, data text)",
101 | params![],
102 | )?;
103 | let tx = jsondb.transaction()?;
104 | let mut stmt = tx.prepare("insert into title_basics values (?, ?)")?;
105 | for ele in &data {
106 | stmt.execute(params![
107 | &Option::<String>::None,
108 | &serde_json::to_string(ele)?
109 | ])?; 110 | } 111 | drop(stmt); 112 | tx.commit()?; 113 | log::info!("vacuum-copying dbs"); 114 | jsondb.execute( 115 | "vacuum into 'benchmark/imdb-json-nocompress.sqlite3'", 116 | params![], 117 | )?; 118 | jsondb.execute( 119 | "vacuum into 'benchmark/imdb-json-zstd-transparent.sqlite3'", 120 | params![], 121 | )?; 122 | jsondb.execute( 123 | "vacuum into 'benchmark/imdb-json-zstd-nodict.sqlite3'", 124 | params![], 125 | )?; 126 | } 127 | { 128 | log::info!("doing transparent compression"); 129 | let db = Connection::open("benchmark/imdb-json-zstd-transparent.sqlite3").unwrap(); 130 | pragmas(&db)?; 131 | db.load_extension(&config.zstd_lib, None)?; 132 | let config = json!({ 133 | "table": "title_basics", 134 | "column": "data", 135 | "compression_level": 19, 136 | "dict_chooser": "'i' || (id/3000000)" 137 | }); 138 | db.query_row( 139 | "select zstd_enable_transparent(?)", 140 | params![&serde_json::to_string(&config)?], 141 | |_| Ok(()), 142 | )?; 143 | db.query_row( 144 | "select zstd_incremental_maintenance(null, 1)", 145 | params![], 146 | |_| Ok(()), 147 | )?; 148 | db.execute("vacuum", params![])?; 149 | } 150 | { 151 | log::info!("doing nodict compression"); 152 | let db = Connection::open("benchmark/imdb-json-zstd-nodict.sqlite3").unwrap(); 153 | pragmas(&db)?; 154 | db.load_extension(&config.zstd_lib, None)?; 155 | let config = json!({ 156 | "table": "title_basics", 157 | "column": "data", 158 | "compression_level": 19, 159 | "dict_chooser": "'[nodict]'" 160 | }); 161 | db.query_row( 162 | "select zstd_enable_transparent(?)", 163 | params![&serde_json::to_string(&config)?], 164 | |_| Ok(()), 165 | )?; 166 | db.query_row( 167 | "select zstd_incremental_maintenance(null, 1)", 168 | params![], 169 | |_| Ok(()), 170 | )?; 171 | db.execute("vacuum", params![])?; 172 | } 173 | Ok(()) 174 | } 175 | -------------------------------------------------------------------------------- /src/create_extension.rs: -------------------------------------------------------------------------------- 1 | // https://www.sqlite.org/loadext.html 2 | // https://github.com/jgallagher/rusqlite/issues/524#issuecomment-507787350 3 | 4 | use rusqlite::Connection; 5 | use rusqlite::ffi; 6 | use std::os::raw::c_int; 7 | 8 | #[expect(clippy::not_unsafe_ptr_arg_deref)] 9 | #[unsafe(no_mangle)] 10 | pub extern "C" fn sqlite3_sqlitezstd_init( 11 | db: *mut ffi::sqlite3, 12 | pz_err_msg: *mut *mut std::os::raw::c_char, 13 | p_api: *mut ffi::sqlite3_api_routines, 14 | ) -> c_int { 15 | /* Insert here calls to 16 | ** sqlite3_create_function_v2(), 17 | ** sqlite3_create_collation_v2(), 18 | ** sqlite3_create_module_v2(), and/or 19 | ** sqlite3_vfs_register() 20 | ** to register the new features that your extension adds. 
21 | */ 22 | unsafe { Connection::extension_init2(db, pz_err_msg, p_api, init) } 23 | } 24 | 25 | fn init(db: Connection) -> rusqlite::Result<bool> { 26 | match crate::load(&db) { 27 | Ok(()) => { 28 | log::info!("[sqlite-zstd] initialized"); 29 | Ok(false) 30 | } 31 | Err(e) => { 32 | log::error!("[sqlite-zstd] init error: {:?}", e); 33 | Err(rusqlite::Error::ModuleError(format!("{:?}", e))) 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/dict_management.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Context as AContext; 2 | use lru_time_cache::LruCache; 3 | use rusqlite::Connection; 4 | use rusqlite::{functions::Context, params}; 5 | use std::sync::LazyLock; 6 | use std::sync::{Arc, Mutex}; 7 | use std::time::Duration; 8 | 9 | use zstd::dict::{DecoderDictionary, EncoderDictionary}; 10 | 11 | type EncoderCache = LruCache<(usize, i32, i32), Arc<EncoderDictionary<'static>>>; 12 | // we cache the instantiated encoder dictionaries keyed by (DbConnection, dict_id, compression_level) 13 | // DbConnection would ideally be db.path() because it's the same for multiple connections to the same db, but that would be less robust (e.g. in-memory databases) 14 | // we use a Mutex and not a RwLock because even the .get() methods on LruCache need to write (to update expiry and least recently used time) 15 | static ENCODER_DICTS: LazyLock<Mutex<EncoderCache>> = 16 | LazyLock::new(|| Mutex::new(LruCache::with_expiry_duration(Duration::from_secs(10)))); 17 | 18 | type DecoderCache = LruCache<(usize, i32), Arc<DecoderDictionary<'static>>>; 19 | static DECODER_DICTS: LazyLock<Mutex<DecoderCache>> = 20 | LazyLock::new(|| Mutex::new(LruCache::with_expiry_duration(Duration::from_secs(10)))); 21 | 22 | /// when we open a new connection, it may reuse the same pointer location as an old connection, so we need to invalidate parts of the dict cache 23 | pub(crate) fn invalidate_caches(_db: &Connection) { 24 | // (theoretically we only need to clear caches with key db_handle_pointer but it likely doesn't matter much, 25 | // how often are you going to open a new connection?) 26 | // let db_handle_pointer = unsafe { db.handle() } as usize; 27 | log::debug!("Invalidating dict caches"); 28 | { 29 | let mut cache = ENCODER_DICTS.lock().unwrap(); 30 | cache.clear(); 31 | } 32 | { 33 | let mut cache = DECODER_DICTS.lock().unwrap(); 34 | cache.clear(); 35 | } 36 | } 37 | // TODO: the rust interface currently requires a level when preparing a dictionary, but the zstd interface (ZSTD_CCtx_loadDictionary) does not. 38 | // TODO: Using LruCache here isn't very smart 39 | pub fn encoder_dict_from_ctx( 40 | ctx: &Context, 41 | arg_index: usize, 42 | level: i32, 43 | ) -> anyhow::Result<Arc<EncoderDictionary<'static>>> { 44 | let id: i32 = ctx.get(arg_index)?; 45 | let db = unsafe { ctx.get_connection()? }; // SAFETY: This might be unsafe depending on how the connection is used. 
See https://github.com/rusqlite/rusqlite/issues/643#issuecomment-640181213 46 | let db_handle_pointer = unsafe { db.handle() } as usize; // SAFETY: We're only getting the pointer as an int, not using the raw connection 47 | 48 | let mut dicts_write = ENCODER_DICTS.lock().unwrap(); 49 | let entry = dicts_write.entry((db_handle_pointer, id, level)); 50 | let res = match entry { 51 | lru_time_cache::Entry::Vacant(e) => e.insert({ 52 | log::debug!( 53 | "loading encoder dictionary {} level {} (should only happen once per 10s)", 54 | id, 55 | level 56 | ); 57 | 58 | let dict_raw: Vec<u8> = db 59 | .query_row( 60 | "select dict from _zstd_dicts where id = ?", 61 | params![id], 62 | |r| r.get(0), 63 | ) 64 | .with_context(|| format!("getting dict with id={id} from _zstd_dicts"))?; 65 | let dict = EncoderDictionary::copy(&dict_raw, level); 66 | Arc::new(dict) 67 | }), 68 | lru_time_cache::Entry::Occupied(o) => o.into_mut(), 69 | } 70 | .clone(); 71 | Ok(res) 72 | } 73 | 74 | pub fn decoder_dict_from_ctx( 75 | ctx: &Context, 76 | arg_index: usize, 77 | ) -> anyhow::Result<Arc<DecoderDictionary<'static>>> { 78 | // we cache the instantiated decoder dictionaries keyed by (DbConnection, dict_id) 79 | // DbConnection would ideally be db.path() because it's the same for multiple connections to the same db, but that would be less robust (e.g. in-memory databases) 80 | let id: i32 = ctx.get(arg_index)?; 81 | let db = unsafe { ctx.get_connection()? }; // SAFETY: This might be unsafe depending on how the connection is used. See https://github.com/rusqlite/rusqlite/issues/643#issuecomment-640181213 82 | let db_handle_pointer = unsafe { db.handle() } as usize; // SAFETY: We're only getting the pointer as an int, not using the raw connection 83 | log::trace!("Using DB Handle pointer {db_handle_pointer} as cache key"); 84 | let cache_key = (db_handle_pointer, id); 85 | // since the get() function on lru cache also writes (updates last used time and expiry), 86 | // we can not use DICTS.read() (RwLock) for perf 87 | let mut dicts_write = DECODER_DICTS.lock().unwrap(); 88 | let entry = dicts_write.entry(cache_key); 89 | let res = match entry { 90 | lru_time_cache::Entry::Vacant(e) => e.insert({ 91 | log::debug!( 92 | "loading decoder dictionary {} (should only happen once per 10s)", 93 | id 94 | ); 95 | let db = unsafe { ctx.get_connection()? 
}; 96 | let dict_raw: Vec<u8> = db 97 | .query_row( 98 | "select dict from _zstd_dicts where id = ?", 99 | params![id], 100 | |r| r.get(0), 101 | ) 102 | .with_context(|| format!("getting dict with id={id} from _zstd_dicts"))?; 103 | let dict = DecoderDictionary::copy(&dict_raw); 104 | Arc::new(dict) 105 | }), 106 | lru_time_cache::Entry::Occupied(o) => o.into_mut(), 107 | } 108 | .clone(); 109 | Ok(res) 110 | } 111 | 112 | /* 113 | 114 | 115 | use rusqlite::{functions::Context, params, types::ValueRef}; 116 | 117 | /// load a dict from sqlite function parameters 118 | /// 119 | /// sqlite sadly does not do auxdata caching for subqueries like `zstd_compress(data, 3, (select dict from _zstd_dicts where id = 4))` 120 | /// so instead we support the syntax `zstd_compress(data, 3, 4)` as an alias to the above 121 | /// if the dict parameter is a number, the dict will be queried from the _zstd_dicts table and cached in sqlite auxdata 122 | /// so it is only constructed once per query 123 | /// 124 | /// this function is not 100% correct because the level is passed separately from the dictionary but the dictionary is cached in the aux data of the dictionary parameter 125 | /// e.g. `select zstd_compress(tbl.data, tbl.row_compression_level, 123) from tbl` will probably compress all the data with the same compression ratio instead of a random one 126 | /// as a workaround `select zstd_compress(tbl.data, tbl.row_compression_level, (select 123)) from tbl` probably works 127 | /// to fix this the level parameter would need to be checked against the constructed dictionary and the dict discarded on mismatch 128 | pub fn encoder_dict_from_ctx<'a>( 129 | ctx: &'a Context, 130 | arg_index: usize, 131 | level: i32, 132 | ) -> rusqlite::Result<Arc<OwnedEncoderDict<'a>>> { 133 | Ok(match ctx.get_aux::<OwnedEncoderDict>(arg_index as i32)? { 134 | Some(d) => d, 135 | None => { 136 | log::debug!("loading dictionary (should only happen once per query)"); 137 | let dict_raw = match ctx.get_raw(arg_index) { 138 | ValueRef::Blob(b) => b.to_vec(), 139 | ValueRef::Integer(i) => { 140 | let db = unsafe { ctx.get_connection()? }; 141 | let res: Vec<u8> = db.query_row( 142 | "select dict from _zstd_dicts where id = ?", 143 | params![i], 144 | |r| r.get(0), 145 | )?; 146 | res 147 | } 148 | e => { 149 | return Err(rusqlite::Error::InvalidFunctionParameterType( 150 | arg_index, 151 | e.data_type(), 152 | )) 153 | } 154 | }; 155 | let dict = wrap_encoder_dict(dict_raw, level); 156 | ctx.set_aux(arg_index as i32, dict)?; 157 | ctx.get_aux::<OwnedEncoderDict>(arg_index as i32)?.unwrap() 158 | } 159 | }) 160 | } 161 | 162 | 163 | /// same as above 164 | pub fn decoder_dict_from_ctx<'a>( 165 | ctx: &'a Context, 166 | arg_index: usize, 167 | ) -> rusqlite::Result<Arc<OwnedDecoderDict<'a>>> { 168 | Ok(match ctx.get_aux::<OwnedDecoderDict>(arg_index as i32)? { 169 | Some(d) => d, 170 | None => { 171 | log::debug!("loading dictionary (should only happen once per query)"); 172 | let dict_raw = /*ctx.get::<Vec<u8>>(arg_index)?;*/ 173 | match ctx.get_raw(arg_index) { 174 | ValueRef::Blob(b) => b.to_vec(), 175 | ValueRef::Integer(i) => { 176 | let db = unsafe { ctx.get_connection()? 
}; 177 | let res: Vec<u8> = db.query_row( 178 | "select dict from _zstd_dicts where id = ?", 179 | params![i], 180 | |r| r.get(0), 181 | )?; 182 | res 183 | } 184 | e => return Err(rusqlite::Error::InvalidFunctionParameterType( 185 | arg_index, 186 | e.data_type(), 187 | )), 188 | }; 189 | let dict = wrap_decoder_dict(dict_raw); 190 | ctx.set_aux(arg_index as i32, dict)?; 191 | ctx.get_aux::<OwnedDecoderDict>(arg_index as i32)?.unwrap() 192 | } 193 | }) 194 | } 195 | */ 196 | -------------------------------------------------------------------------------- /src/dict_training.rs: -------------------------------------------------------------------------------- 1 | use crate::transparent::pretty_bytes; 2 | use crate::util::*; 3 | use anyhow::Context as AContext; 4 | use rand::Rng; 5 | use rusqlite::functions::Context; 6 | 7 | use rusqlite::params; 8 | use rusqlite::types::{Value, ValueRef}; 9 | 10 | pub struct ZstdTrainDictAggregate { 11 | /// if return_save_id is false, return the trained dict blob directly; if true, insert it into the _zstd_dicts table with the chooser_key given as fourth arg and return its id 12 | /// if false expects 3 args, if true expects 4 args 13 | pub return_save_id: bool, 14 | } 15 | pub struct ZstdTrainDictState { 16 | reservoir: Vec<Vec<u8>>, 17 | wanted_item_count: usize, 18 | total_count: usize, 19 | wanted_dict_size: usize, 20 | chooser_key: Option<Option<String>>, 21 | } 22 | 23 | impl rusqlite::functions::Aggregate<ZstdTrainDictState, Value> for ZstdTrainDictAggregate { 24 | fn init(&self, ctx: &mut Context) -> rusqlite::Result<ZstdTrainDictState> { 25 | let arg_dict_size_bytes = 1; 26 | let arg_sample_count = 2; 27 | let arg_chooser_key = 3; 28 | let wanted_item_count = ctx.get::<f64>(arg_sample_count)? as usize; 29 | log::debug!("sampling {} values", wanted_item_count); 30 | Ok(ZstdTrainDictState { 31 | reservoir: vec![], 32 | wanted_item_count, 33 | wanted_dict_size: ctx.get::<i64>(arg_dict_size_bytes)? as usize, 34 | total_count: 0, 35 | chooser_key: if self.return_save_id { 36 | Some(ctx.get(arg_chooser_key)?)
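// Note: chooser_key is doubly optional; the outer Option is None when
// return_save_id is false (nothing to save), the inner Option is None when the
// SQL argument itself is null.
// For illustration, the two variants would be invoked from SQL roughly like:
//   select zstd_train_dict_and_save(data, 100000, 10000, 'mykey') from tbl; -- returns the _zstd_dicts id
//   select zstd_train_dict(data, 100000, 10000) from tbl;                   -- returns the dict blob
// (zstd_train_dict_and_save is the name used by transparent.rs below; the
// 3-arg name is an assumption in this sketch.)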
37 | } else { 38 | None 39 | }, 40 | }) 41 | } 42 | fn step(&self, ctx: &mut Context, state: &mut ZstdTrainDictState) -> rusqlite::Result<()> { 43 | let arg_sample = 0; 44 | 45 | let cur = match ctx.get_raw(arg_sample) { 46 | ValueRef::Blob(b) => b, 47 | ValueRef::Text(b) => b, 48 | ValueRef::Real(_f) => return Ok(()), 49 | ValueRef::Integer(_i) => return Ok(()), 50 | ValueRef::Null => return Ok(()), 51 | }; 52 | let i = state.total_count; 53 | let k = state.wanted_item_count; 54 | // https://en.wikipedia.org/wiki/Reservoir_sampling#Simple_algorithm 55 | 56 | if i < k { 57 | state.reservoir.push(Vec::from(cur)); 58 | state.total_count += 1; 59 | return Ok(()); 60 | } 61 | state.total_count += 1; 62 | let j = rand::thread_rng().gen_range(0..=i); // inclusive upper bound: the current item is the (i+1)-th seen, so it must replace a reservoir slot with probability k/(i+1) 63 | if j < k { 64 | state.reservoir[j] = Vec::from(cur); 65 | } 66 | Ok(()) 67 | } 68 | 69 | fn finalize( 70 | &self, 71 | ctx: &mut Context, 72 | state: Option<ZstdTrainDictState>, 73 | ) -> rusqlite::Result<Value> { 74 | let state = 75 | state.ok_or_else(|| ah(anyhow::anyhow!("tried to train zstd dict on zero rows")))?; 76 | log::debug!( 77 | "training dict of max size {}kB with {} samples of total size {}kB (of {} samples seen)", 78 | state.wanted_dict_size / 1000, 79 | state.reservoir.len(), 80 | state.reservoir.iter().map(|x| x.len()).sum::<usize>() / 1000, 81 | state.total_count 82 | ); 83 | let dict = zstd::dict::from_samples(&state.reservoir, state.wanted_dict_size) 84 | .context("Training dictionary failed") 85 | .map_err(ah)?; 86 | log::debug!( 87 | "resulting dict has size {}", 88 | pretty_bytes(dict.len() as i64) 89 | ); 90 | if let Some(key) = state.chooser_key { 91 | let db = unsafe { ctx.get_connection()? }; 92 | ensure_dicts_table_exists(&db)?; 93 | db.execute( 94 | "insert into _zstd_dicts (chooser_key,dict) values (?, ?);", 95 | params![key, dict], 96 | )?; 97 | let id = db.last_insert_rowid(); 98 | log::debug!( 99 | "inserted dict into _zstd_dicts with key {}, id {}", 100 | key.as_deref().unwrap_or("null"), 101 | id 102 | ); 103 | Ok(Value::Integer(id)) 104 | } else { 105 | Ok(Value::Blob(dict)) 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::print_stdout)] 2 | 3 | use rusqlite::Connection; 4 | use util::init_logging; 5 | 6 | #[cfg(feature = "build_extension")] 7 | mod create_extension; 8 | 9 | mod add_functions; 10 | mod basic; 11 | mod dict_management; 12 | mod dict_training; 13 | mod transparent; 14 | mod util; 15 | 16 | pub use log::LevelFilter as LogLevel; 17 | 18 | /// Loads the sqlite extension with the default log level (INFO) 19 | pub fn load(connection: &Connection) -> anyhow::Result<()> { 20 | load_with_loglevel(connection, LogLevel::Info) 21 | } 22 | 23 | /// Loads the sqlite extension with the given log level 24 | pub fn load_with_loglevel( 25 | connection: &Connection, 26 | default_log_level: LogLevel, 27 | ) -> anyhow::Result<()> { 28 | init_logging(default_log_level); 29 | crate::dict_management::invalidate_caches(connection); 30 | crate::add_functions::add_functions(connection) 31 | } 32 | -------------------------------------------------------------------------------- /src/transparent.rs: -------------------------------------------------------------------------------- 1 | use crate::{util::*, *}; 2 | use anyhow::Context as AContext; 3 | use rusqlite::OptionalExtension; 4 | use rusqlite::functions::Context; 5 | use
rusqlite::types::ToSqlOutput; 6 | use rusqlite::types::Value; 7 | use rusqlite::{named_params, params}; 8 | use std::collections::HashMap; 9 | use std::time::{Duration, Instant}; 10 | 11 | // the output will be without magic header, without checksums, and without dictids. This will save 4 bytes when not using dictionaries and 8 bytes when using dictionaries. 12 | // this also means the data will not be decodable as a normal zstd archive with the standard tools 13 | static COMPACT: bool = true; 14 | #[derive(Debug)] 15 | struct ColumnInfo { 16 | name: String, 17 | coltype: String, 18 | is_primary_key: bool, 19 | to_compress: bool, 20 | is_dict_id: bool, 21 | } 22 | 23 | fn def_min_dict_size() -> i64 { 24 | 5000 25 | } 26 | fn def_dict_size_ratio() -> f32 { 27 | 0.01 28 | } 29 | fn def_train_dict_samples_ratio() -> f32 { 30 | 100.0 31 | } 32 | fn def_incremental_compression_step_bytes() -> i64 { 33 | // https://github.com/facebook/zstd/blob/dev/doc/images/CSpeed2.png 34 | // about 5MB/s at level 19 35 | 5_000_000 / 3 36 | } 37 | 38 | /// This is the configuration of the transparent compression for one column of one table. 39 | /// It is safe to change every property of this configuration at any time except for table and column, but data that is already compressed will not be recompressed with the new settings. 40 | /// You can update the config e.g. using SQL: `update _zstd_configs set config = json_patch(config, '{"target_db_load": 1}');` 41 | /// 42 | /// Note that the configuration is assumed to be trusted. For example, dict_chooser can probably be used for SQL injection. 43 | #[derive(serde::Serialize, serde::Deserialize)] 44 | pub struct TransparentCompressConfig { 45 | /// the name of the table to which the transparent compression will be applied. It will be renamed to _tblname_zstd and replaced with an editable view. 46 | pub table: String, 47 | /// the name of the column 48 | pub column: String, 49 | /// The compression level. Valid levels are 1-19. 50 | /// Compression will be significantly slower when the level is increased, but decompression speed should stay about the same regardless of compression level. 51 | /// That means the cost of a higher level is paid during zstd_incremental_maintenance, not during SELECT queries. 52 | pub compression_level: i8, 53 | /// An SQL expression that chooses which dict to use, or returns null if the data should stay uncompressed for now. 54 | /// Examples: 55 | /// 56 | /// * `'a'` 57 | /// This will cause a single dictionary to be trained for everything. 58 | /// 59 | /// * `strftime(created, '%Y-%m')` 60 | /// This will cause every month of data to be compressed with its own dictionary. 61 | /// 62 | /// * `nullif(strftime(created, '%Y-%m'), strftime('now', '%Y-%m'))` 63 | /// 64 | /// The same as above, but if the entry is from the current month it will stay uncompressed. 65 | /// This is handy because it means that the dictionary for the month will only be created when the month is over 66 | /// and can thus be optimized the most for the given data. 67 | /// * `case when date(timestamp, ''weekday 0'') < date(''now'', ''weekday 0'') then data_type || ''.'' || date(timestamp, ''weekday 0'') else null end` 68 | /// 69 | /// This one uses keys like `data_type.2020-11-01` where the date is the first day of the week, except for the current week which stays uncompressed. 70 | /// This means that every distinct data_type will be compressed with its own dictionary, separately for each week.
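///
/// For reference, a complete configuration as passed to zstd_enable_transparent could look
/// roughly like this (a minimal sketch reusing the table/column names from the tests in this
/// file; note the doubled single quotes needed to embed an SQL string in JSON inside SQL):
///
/// `select zstd_enable_transparent('{"table": "events", "column": "data", "compression_level": 3, "dict_chooser": "''a''"}');`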
71 | /// 72 | /// You can return the special string `[nodict]` to compress the given data without a dictionary. 73 | /// Note that the compression key is global for all tables. So if you want your dict to only apply to this table, return 74 | /// something like `"tablename." || strftime(...)`. 75 | pub dict_chooser: String, 76 | #[serde(default = "def_min_dict_size")] 77 | /// if the dictionary would be smaller than this, then no dict will be trained, and if no dict exists yet the data will stay uncompressed 78 | pub min_dict_size_bytes_for_training: i64, 79 | #[serde(default = "def_dict_size_ratio")] 80 | /// The target size of the dictionary relative to the data seen so far. 81 | /// For example, if we see 10MB of data for a specific group, the dict will target a size of ratio * 10MB (default 0.01) 82 | pub dict_size_ratio: f32, 83 | /// for training, we sample data totalling roughly this factor times the target dictionary size (default 100) 84 | /// the defaults of 100 and 0.01 together mean that by default the dict will be trained on all of the data 85 | #[serde(default = "def_train_dict_samples_ratio")] 86 | pub train_dict_samples_ratio: f32, 87 | /// how many bytes (approximately) to compress at once. By default tuned so that at compression level 19 it locks the database for about 0.3s per step. 88 | #[serde(default = "def_incremental_compression_step_bytes")] 89 | pub incremental_compression_step_bytes: i64, 90 | } 91 | 92 | pub fn pretty_bytes(bytes: i64) -> String { 93 | if bytes >= 1_000_000_000 { 94 | format!("{:.2}GB", bytes as f64 / 1e9) 95 | } else if bytes >= 1_000_000 { 96 | format!("{:.2}MB", bytes as f64 / 1e6) 97 | } else if bytes >= 1_000 { 98 | format!("{:.2}kB", bytes as f64 / 1e3) 99 | } else { 100 | format!("{bytes}B") 101 | } 102 | } 103 | 104 | #[derive(Debug)] 105 | enum SqliteAffinity { 106 | Integer, 107 | Text, 108 | Blob, 109 | Real, 110 | Numeric, 111 | } 112 | /// determine affinity, algorithm described at https://www.sqlite.org/draft/datatype3.html#determination_of_column_affinity 113 | fn get_column_affinity(declared_type: &str) -> SqliteAffinity { 114 | use SqliteAffinity::*; 115 | let typ = declared_type.to_ascii_lowercase(); 116 | if typ.contains("int") { 117 | Integer 118 | } else if typ.contains("char") || typ.contains("clob") || typ.contains("text") { 119 | Text 120 | } else if typ.contains("blob") || typ.is_empty() { 121 | Blob 122 | } else if typ.contains("real") || typ.contains("floa") || typ.contains("doub") { 123 | Real 124 | } else { 125 | Numeric 126 | } 127 | } 128 | 129 | fn show_warnings(db: &Connection) -> anyhow::Result<()> { 130 | // warnings 131 | let journal_mode: String = db 132 | .query_row("pragma journal_mode;", params![], |r| r.get(0)) 133 | .context("querying journal mode")?; 134 | if journal_mode != "wal" { 135 | log::warn!("Warning: It is recommended to set `pragma journal_mode=WAL;`"); 136 | } 137 | let vacuum_mode: i32 = db 138 | .query_row("pragma auto_vacuum;", params![], |r| r.get(0)) 139 | .context("querying vacuum mode")?; 140 | if vacuum_mode != 1 { 141 | log::warn!("Warning: It is recommended to set `pragma auto_vacuum=full;`"); 142 | } 143 | let busy_timeout: i32 = db 144 | .query_row("pragma busy_timeout;", params![], |r| r.get(0)) 145 | .context("querying busy timeout")?; 146 | if busy_timeout == 0 { 147 | log::warn!("Warning: It is recommended to set `pragma busy_timeout=2000;` or higher"); 148 | } 149 | Ok(()) 150 | } 151 | /// 152 | /// enables transparent row-level compression for a table with the following steps: 153 | /// 154 | /// 1.
renames tablename to _tablename_zstd if table is not already enabled 155 | /// 2. creates a view called tablename that mirrors _tablename_zstd except it decompresses the compressed column on the fly 156 | /// 3. creates INSERT, UPDATE and DELETE triggers on the view so they affect the backing table instead 157 | /// 158 | /// Warning: this function assumes trusted input, it is not sql injection safe! 159 | pub fn zstd_enable_transparent<'a>(ctx: &Context) -> anyhow::Result<ToSqlOutput<'a>> { 160 | let arg_config = 0; 161 | 162 | let config_str: String = ctx.get(arg_config)?; 163 | let config: TransparentCompressConfig = serde_json::from_str(&config_str) 164 | .with_context(|| format!("parsing json config '{config_str}'"))?; 165 | let db = &mut unsafe { ctx.get_connection()? }; 166 | let db = db 167 | .unchecked_transaction() 168 | .context("Could not start transaction")?; 169 | let table_name = &config.table; 170 | let new_table_name = format!("_{table_name}_zstd"); 171 | 172 | let configs = get_configs(&db)?; 173 | let already_compressed_columns = configs 174 | .iter() 175 | .filter(|c| &c.table == table_name) 176 | .map(|c| &c.column[..]) 177 | .collect::<Vec<&str>>(); 178 | 179 | log::debug!( 180 | "already compressed columns={:?}", 181 | already_compressed_columns 182 | ); 183 | 184 | if already_compressed_columns.contains(&&config.column[..]) { 185 | anyhow::bail!( 186 | "Column {} is already enabled for compression.", 187 | &config.column 188 | ); 189 | } 190 | 191 | let table_already_enabled = !already_compressed_columns.is_empty(); 192 | 193 | let dict_id_columns: Vec<String> = if table_already_enabled { 194 | let query = format!( 195 | r#"select "from" 196 | from pragma_foreign_key_list('{}') 197 | where "table" = '_zstd_dicts'"#, 198 | &new_table_name 199 | ); 200 | log::debug!("dict_id_columns query {:?}", query); 201 | db.prepare(&query)? 202 | .query_map(params![], |row| row.get("from")) 203 | .context("Could not get dicts ids info")? 204 | .collect::<Result<Vec<String>, _>>()? 205 | } else { 206 | vec![] 207 | }; 208 | 209 | log::debug!("dict_id columns={:?}", dict_id_columns); 210 | 211 | if !check_table_exists( 212 | &db, 213 | if table_already_enabled { 214 | &new_table_name 215 | } else { 216 | table_name 217 | }, 218 | ) { 219 | anyhow::bail!("Table {} doesn't exist", table_name); 220 | } 221 | 222 | let columns_info: Vec<ColumnInfo> = db 223 | .prepare(&format_sqlite!( 224 | r#"pragma table_info({})"#, 225 | if table_already_enabled { 226 | &new_table_name 227 | } else { 228 | table_name 229 | } 230 | ))? 231 | .query_map(params![], |row| { 232 | let col_name: String = row.get("name")?; 233 | let to_compress = (col_name == config.column) 234 | || (already_compressed_columns.contains(&&col_name[..])); 235 | let is_dict_id = dict_id_columns.contains(&col_name); 236 | Ok(ColumnInfo { 237 | name: col_name, 238 | is_primary_key: row.get("pk")?, 239 | coltype: row.get("type")?, 240 | to_compress, 241 | is_dict_id, 242 | }) 243 | }) 244 | .context("Could not query table_info")? 245 | .collect::<Result<_, rusqlite::Error>>()?; 246 | 247 | show_warnings(&db)?; 248 | 249 | // primary key columns. 
these will be used to index the table in the modifying triggers 250 | let primary_key_columns: Vec<&ColumnInfo> = 251 | columns_info.iter().filter(|e| e.is_primary_key).collect(); 252 | 253 | if columns_info.is_empty() { 254 | anyhow::bail!("Table {} does not exist", table_name); 255 | } 256 | if primary_key_columns.is_empty() { 257 | anyhow::bail!( 258 | "Table {} does not have a primary key, sqlite-zstd only works on tables with primary keys, since rowids can change on VACUUM.", 259 | table_name 260 | ); 261 | } 262 | 263 | let column_name = &config.column; 264 | 265 | let to_compress_column = columns_info 266 | .iter() 267 | .find(|e| &e.name == column_name) 268 | .with_context(|| format!("Column {column_name} does not exist in {table_name}"))?; 269 | if to_compress_column.is_primary_key { 270 | anyhow::bail!( 271 | "Can't compress column {} since it is part of primary key (this could probably be supported, but currently isn't)", 272 | column_name 273 | ); 274 | } 275 | 276 | check_columns_to_compress_are_not_indexed(&db, &columns_info, table_name)?; 277 | 278 | let dict_id_column_name = get_dict_id(&to_compress_column.name); 279 | log::debug!("cols={:?}", columns_info); 280 | 281 | { 282 | let query = format!( 283 | "select ({}) as dict_chooser from {} limit 1", 284 | config.dict_chooser, 285 | escape_sqlite_identifier(table_name) 286 | ); 287 | // small sanity check of chooser statement 288 | db.query_row(&query, params![], |row| row.get::<_, String>(0)) 289 | .optional() 290 | .with_context(|| format!("Tried to execute:\n{query}")) 291 | .context(r#"Dict chooser expression does not seem to be valid. Make sure you return a string and get your escaping right: If you want an sqlite string inside a json string inside a sqlite string you need to do '{"foo": "''bar''"}'"#)?; 292 | } 293 | { 294 | // can't use prepared statement at these positions 295 | if !table_already_enabled { 296 | let rename_query = 297 | format_sqlite!("alter table {} rename to {}", table_name, &new_table_name); 298 | log::debug!("[run] {}", &rename_query); 299 | db.execute(&rename_query, params![]) 300 | .context("Could not rename table")?; 301 | } 302 | 303 | util::ensure_dicts_table_exists(&db)?; 304 | 305 | db.execute( 306 | " 307 | create table if not exists _zstd_configs ( 308 | id integer primary key autoincrement, 309 | config json not null 310 | );", 311 | params![], 312 | ) 313 | .context("Could not create _zstd_configs")?; 314 | 315 | db.execute( 316 | "insert into _zstd_configs (config) values (?)", 317 | params![config_str], 318 | ) 319 | .context("Could not insert config")?; 320 | 321 | db.execute( 322 | &format_sqlite!( 323 | "alter table {} add column {} integer default null references _zstd_dicts(id)", 324 | &new_table_name, 325 | &dict_id_column_name 326 | ), 327 | params![], 328 | ) 329 | .context("Could not add dictid column")?; 330 | 331 | // this index is needed since the maintenance function queries by the dictionary id to find rows that are not compressed 332 | db.execute( 333 | &format_sqlite!( 334 | "create index {} on {} ({})", 335 | &format!("{}_idx", &dict_id_column_name), 336 | &new_table_name, 337 | &dict_id_column_name 338 | ), 339 | params![], 340 | ) 341 | .context("Could not create index on dictid")?; 342 | } 343 | 344 | create_or_replace_view( 345 | &db, 346 | &columns_info, 347 | table_name, 348 | &new_table_name, 349 | table_already_enabled, 350 | )?; 351 | 352 | create_insert_trigger(&db, &columns_info, table_name, &new_table_name, &config)?; 353 | 354 | // a WHERE statement 
that selects a row based on the primary key 355 | let primary_key_condition = primary_key_columns 356 | .iter() 357 | .map(|c| format_sqlite!("old.{0} = {0}", &c.name)) 358 | .collect::<Vec<String>>() 359 | .join(" and "); 360 | 361 | // add delete trigger 362 | create_delete_trigger(&db, table_name, &new_table_name, &primary_key_condition)?; 363 | 364 | // update trigger 365 | create_update_triggers( 366 | &db, 367 | &columns_info, 368 | table_name, 369 | &new_table_name, 370 | &primary_key_condition, 371 | &config, 372 | )?; 373 | 374 | db.commit().context("Could not commit transaction")?; 375 | Ok(ToSqlOutput::Owned(Value::Text("Done!".to_string()))) 376 | } 377 | 378 | fn get_dict_id(column_name: &str) -> String { 379 | format!("_{column_name}_dict") 380 | } 381 | 382 | fn check_table_exists(db: &rusqlite::Connection, table_name: &str) -> bool { 383 | let table_count: u32 = db 384 | .query_row( 385 | "select count(`type`) from sqlite_master where name = ? and type = 'table'", 386 | params![table_name], 387 | |r| r.get(0), 388 | ) 389 | .unwrap_or(0); 390 | table_count != 0 391 | } 392 | 393 | fn check_columns_to_compress_are_not_indexed( 394 | db: &rusqlite::Connection, 395 | columns_info: &[ColumnInfo], 396 | table_name: &str, 397 | ) -> anyhow::Result<()> { 398 | let indexed_columns: HashMap<String, String> = db 399 | .prepare( 400 | " 401 | select distinct ii.name as column_name, il.name as index_name 402 | from sqlite_master as m, 403 | pragma_index_list(m.name) as il, 404 | pragma_index_info(il.name) as ii 405 | where m.type='table' AND m.name=?", 406 | )? 407 | .query_map(params![table_name], |row| { 408 | Ok((row.get("column_name")?, row.get("index_name")?)) 409 | }) 410 | .context("could not get indices info")? 411 | .collect::<Result<_, rusqlite::Error>>()?; 412 | 413 | let indexed_columns_to_compress = columns_info 414 | .iter() 415 | .filter(|c| match indexed_columns.get(&c.name) { 416 | Some(_) => c.to_compress, 417 | None => false, 418 | }) 419 | .collect::<Vec<&ColumnInfo>>(); 420 | 421 | if !indexed_columns_to_compress.is_empty() { 422 | let columns_indices = indexed_columns_to_compress 423 | .iter() 424 | .map(|c| format!("{} ({})", c.name, indexed_columns.get(&c.name).unwrap())) 425 | .collect::<Vec<String>>() 426 | .join(", "); 427 | anyhow::bail!( 428 | "Can't compress column(s): {} - used as part of index (this could probably be supported, but currently isn't)", 429 | columns_indices 430 | ); 431 | }; 432 | Ok(()) 433 | } 434 | 435 | fn create_or_replace_view( 436 | db: &rusqlite::Connection, 437 | columns_info: &[ColumnInfo], 438 | table_name: &str, 439 | internal_table_name: &str, 440 | table_already_enabled: bool, 441 | ) -> anyhow::Result<()> { 442 | if table_already_enabled { 443 | // this drops the existing triggers as well 444 | let dropview_query = format!(r#"drop view {}"#, escape_sqlite_identifier(table_name)); 445 | log::debug!("[run] {}", &dropview_query); 446 | db.execute(&dropview_query, params![]) 447 | .context("Could not drop view")?; 448 | } 449 | 450 | // create view 451 | let select_columns_escaped = columns_info 452 | .iter() 453 | .filter(|c| !c.is_dict_id ) 454 | .map(|c| { 455 | if c.to_compress { 456 | let affinity_is_text = match get_column_affinity(&c.coltype) { 457 | SqliteAffinity::Blob => false, 458 | SqliteAffinity::Text => true, 459 | other => anyhow::bail!("the to-compress column has type {} which has affinity {:?}, but affinity must be text or blob. 
See https://www.sqlite.org/draft/datatype3.html#determination_of_column_affinity", c.coltype, other) 460 | }; 461 | Ok(format!( 462 | // prepared statement parameters not allowed in view 463 | "zstd_decompress_col({}, {}, {}, {}) as {0}", 464 | &escape_sqlite_identifier(&c.name), 465 | if affinity_is_text { 1 } else { 0 }, 466 | &escape_sqlite_identifier(&get_dict_id(&c.name)), 467 | COMPACT 468 | )) 469 | } else { 470 | Ok(format_sqlite!("{}", &c.name)) 471 | } 472 | }) 473 | .collect::<Result<Vec<String>, _>>() 474 | .context("could not construct select in view")? 475 | .join(", "); 476 | let createview_query = format!( 477 | r#" 478 | create view {} as 479 | select {} 480 | from {} 481 | "#, 482 | escape_sqlite_identifier(table_name), 483 | select_columns_escaped, 484 | escape_sqlite_identifier(internal_table_name) 485 | ); 486 | log::debug!("[run] {}", &createview_query); 487 | db.execute(&createview_query, params![]) 488 | .context("Could not create view")?; 489 | Ok(()) 490 | } 491 | 492 | fn create_insert_trigger( 493 | db: &rusqlite::Connection, 494 | columns_info: &[ColumnInfo], 495 | table_name: &str, 496 | internal_table_name: &str, 497 | _config: &TransparentCompressConfig, 498 | ) -> anyhow::Result<()> { 499 | let trigger_name = format!("{table_name}_insert_trigger"); 500 | 501 | // expressions that map backing table columns to view columns 502 | let mut insert_selection = vec![]; 503 | // names of the columns to be inserted 504 | let mut columns_selection = vec![]; 505 | 506 | for c in columns_info { 507 | if c.is_dict_id { 508 | continue; 509 | } 510 | columns_selection.push(String::from(&c.name)); 511 | if c.to_compress { 512 | let dict_id = get_dict_id(&c.name); 513 | insert_selection.push(format!( 514 | // prepared statement parameters not allowed in view 515 | "new.{col} as {col}, null as {dictcol}", 516 | col = escape_sqlite_identifier(&c.name), 517 | dictcol = escape_sqlite_identifier(&dict_id) 518 | )); 519 | columns_selection.push(String::from(&dict_id)); 520 | } else { 521 | insert_selection.push(format_sqlite!("new.{}", &c.name)); 522 | } 523 | } 524 | 525 | let createtrigger_query = format!( 526 | " 527 | create trigger {} 528 | instead of insert on {} 529 | for each row 530 | begin 531 | insert into {}({}) select {}; 532 | end; 533 | ", 534 | escape_sqlite_identifier(&trigger_name), 535 | escape_sqlite_identifier(table_name), 536 | escape_sqlite_identifier(internal_table_name), 537 | columns_selection.join(", "), 538 | insert_selection.join(",\n"), 539 | ); 540 | log::debug!("[run] {}", &createtrigger_query); 541 | db.execute(&createtrigger_query, params![]) 542 | .context("Could not create insert trigger")?; 543 | Ok(()) 544 | } 545 | 546 | fn create_delete_trigger( 547 | db: &rusqlite::Connection, 548 | table_name: &str, 549 | internal_table_name: &str, 550 | primary_key_condition: &str, 551 | ) -> anyhow::Result<()> { 552 | let trigger_name = format!("{table_name}_delete_trigger"); 553 | 554 | let deletetrigger_query = format!( 555 | " 556 | create trigger {trg_name} 557 | instead of delete on {view} 558 | for each row 559 | begin 560 | delete from {backing_table} where {primary_key_condition}; 561 | end; 562 | ", 563 | trg_name = escape_sqlite_identifier(&trigger_name), 564 | view = escape_sqlite_identifier(table_name), 565 | backing_table = escape_sqlite_identifier(internal_table_name), 566 | primary_key_condition = primary_key_condition 567 | ); 568 | log::debug!("[run] {}", &deletetrigger_query); 569 | db.execute(&deletetrigger_query, params![]) 570 | 
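// For a hypothetical view `events` backed by `_events_zstd` with primary key `id`
// (the names used in the tests below), the generated statement renders roughly as:
//   create trigger `events_delete_trigger`
//   instead of delete on `events`
//   for each row
//   begin
//     delete from `_events_zstd` where old.`id` = `id`;
//   end;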
.context("could not create delete trigger")?; 571 | Ok(()) 572 | } 573 | 574 | fn create_update_triggers( 575 | db: &rusqlite::Connection, 576 | columns_info: &[ColumnInfo], 577 | table_name: &str, 578 | internal_table_name: &str, 579 | primary_key_condition: &str, 580 | _config: &TransparentCompressConfig, 581 | ) -> anyhow::Result<()> { 582 | for col in columns_info { 583 | if col.is_dict_id { 584 | continue; 585 | } 586 | 587 | let trigger_name = format!("{}_update_{}_trigger", table_name, col.name); 588 | 589 | let update = if col.to_compress { 590 | format!( 591 | "{col} = new.{col}, {dictcol} = null", 592 | col = escape_sqlite_identifier(&col.name), 593 | dictcol = escape_sqlite_identifier(&get_dict_id(&col.name)), 594 | ) 595 | } else { 596 | format_sqlite!("{} = new.{}", &col.name, &col.name) 597 | }; 598 | // update triggers 599 | let updatetrigger_query = format!( 600 | " 601 | create trigger {trg_name} 602 | instead of update of {upd_col} on {view_name} 603 | for each row 604 | begin 605 | update {backing_table} set {update} where {primary_key_condition}; 606 | end; 607 | ", 608 | trg_name = escape_sqlite_identifier(&trigger_name), 609 | view_name = escape_sqlite_identifier(table_name), 610 | backing_table = escape_sqlite_identifier(internal_table_name), 611 | upd_col = escape_sqlite_identifier(&col.name), 612 | update = update, 613 | primary_key_condition = primary_key_condition 614 | ); 615 | log::debug!("[run] {}", &updatetrigger_query); 616 | db.execute(&updatetrigger_query, params![]) 617 | .with_context(|| format!("Could not create update of {} trigger", col.name))?; 618 | } 619 | Ok(()) 620 | } 621 | 622 | fn get_configs(db: &rusqlite::Connection) -> Result<Vec<TransparentCompressConfig>, anyhow::Error> { 623 | // if the table `_zstd_configs` does not exist yet, transparent compression hasn't been used yet, so return an empty array 624 | if !check_table_exists(db, "_zstd_configs") { 625 | return Ok(vec![]); 626 | } 627 | 628 | let configs = db 629 | .prepare("select config from _zstd_configs")? 630 | .query_map(params![], |row| { 631 | serde_json::from_str(row.get_ref_unwrap("config").as_str()?) 632 | .context("parsing config") 633 | .map_err(ah) 634 | }) 635 | .context("Couldn't fetch configs")? 636 | .collect::<Result<Vec<TransparentCompressConfig>, rusqlite::Error>>()?; 637 | Ok(configs) 638 | } 639 | 640 | #[derive(Debug)] 641 | struct TodoInfo { 642 | dict_choice: Option<String>, 643 | count: i64, 644 | total_bytes: i64, 645 | } 646 | 647 | struct IncrementalMaintenanceArgs { 648 | end_limit: Instant, 649 | target_db_load: f32, 650 | time_limit: f64, 651 | } 652 | pub fn zstd_incremental_maintenance<'a>(ctx: &Context) -> Result<ToSqlOutput<'a>, anyhow::Error> { 653 | let args = { 654 | let arg_time_limit_seconds = 0; 655 | let arg_target_db_load = 1; 656 | let time_limit: Option<f64> = ctx 657 | .get(arg_time_limit_seconds) 658 | .context("could not get time limit argument")?; 659 | let time_limit = time_limit.unwrap_or(100000000.0); 660 | let target_db_load: f32 = ctx 661 | .get(arg_target_db_load) 662 | .context("could not get target db load argument")?; 663 | if !(0.0..=1e100).contains(&time_limit) { 664 | anyhow::bail!("time too large"); 665 | } 666 | let end_limit = Instant::now() + Duration::from_secs_f64(time_limit); 667 | IncrementalMaintenanceArgs { 668 | end_limit, 669 | target_db_load, 670 | time_limit, 671 | } 672 | }; 673 | let db = unsafe { ctx.get_connection()? 
}; 674 | show_warnings(&db)?; 675 | let configs = get_configs(&db)?; 676 | 677 | for config in configs { 678 | match maintenance_for_config(&db, config, &args)? { 679 | MaintRet::TimeLimitReached => { 680 | log::info!( 681 | "time limit of {:.1}s reached, stopping with more maintenance work pending", 682 | args.time_limit 683 | ); 684 | return Ok(1.into()); 685 | } 686 | MaintRet::Completed => {} 687 | } 688 | } 689 | log::info!("All maintenance work completed!"); 690 | Ok(0.into()) 691 | } 692 | 693 | enum MaintRet { 694 | TimeLimitReached, 695 | Completed, 696 | } 697 | 698 | struct EscapedNames { 699 | compressed_tablename: String, 700 | data_colname: String, 701 | dict_colname: String, 702 | } 703 | impl From<&TransparentCompressConfig> for EscapedNames { 704 | fn from(config: &TransparentCompressConfig) -> EscapedNames { 705 | EscapedNames { 706 | compressed_tablename: escape_sqlite_identifier(&format!("_{}_zstd", config.table)), 707 | data_colname: escape_sqlite_identifier(&config.column), 708 | dict_colname: escape_sqlite_identifier(&format!("_{}_dict", config.column)), 709 | } 710 | } 711 | } 712 | 713 | fn maintenance_for_config( 714 | db: &Connection, 715 | config: TransparentCompressConfig, 716 | args: &IncrementalMaintenanceArgs, 717 | ) -> anyhow::Result<MaintRet> { 718 | let esc_names = EscapedNames::from(&config); 719 | 720 | let todos = db 721 | .prepare(&format!( 722 | "select 723 | ({chooser}) as dict_choice, 724 | count(*) as count, 725 | sum(length({datacol})) as total_bytes 726 | from {tbl} where {dictcol} is null group by dict_choice", 727 | tbl = esc_names.compressed_tablename, 728 | dictcol = esc_names.dict_colname, 729 | datacol = esc_names.data_colname, 730 | chooser = config.dict_chooser 731 | ))? 732 | .query_map(params![], |row| { 733 | Ok(TodoInfo { 734 | dict_choice: row.get("dict_choice")?, 735 | count: row.get("count")?, 736 | total_bytes: row.get("total_bytes")?, 737 | }) 738 | })? 
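// Rendered with the example config from the tests (table `events`, column `data`,
// dict_chooser `'1'`), this query would be roughly:
//   select ('1') as dict_choice, count(*) as count, sum(length(`data`)) as total_bytes
//   from `_events_zstd` where `_data_dict` is null group by dict_choice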
739 | .collect::<Result<Vec<_>, _>>()?; 740 | 741 | let total_bytes_to_compress: i64 = todos 742 | .iter() 743 | .filter(|e| e.dict_choice.is_some()) 744 | .map(|e| e.total_bytes) 745 | .sum(); 746 | let mut rows_compressed_so_far: i64 = 0; 747 | let mut bytes_compressed_so_far: i64 = 0; 748 | let total_rows_to_compress: i64 = todos 749 | .iter() 750 | .filter(|e| e.dict_choice.is_some()) 751 | .map(|e| e.count) 752 | .sum(); 753 | log::info!( 754 | "{}.{}: Total {} rows ({}) to potentially compress (split in {} groups).", 755 | config.table, 756 | config.column, 757 | total_rows_to_compress, 758 | pretty_bytes(total_bytes_to_compress), 759 | todos.len() 760 | ); 761 | for todo in todos.into_iter() { 762 | let rows_handled = maintenance_for_todo(db, &config, &todo, &esc_names, args)?; 763 | rows_compressed_so_far += rows_handled; 764 | // estimate bytes compressed 765 | bytes_compressed_so_far += 766 | ((rows_handled as f64 / todo.count as f64) * todo.total_bytes as f64) as i64; 767 | if rows_handled > 0 { 768 | log::info!( 769 | "Handled {} / {} rows ({} / {})", 770 | rows_compressed_so_far, 771 | total_rows_to_compress, 772 | pretty_bytes(bytes_compressed_so_far), 773 | pretty_bytes(total_bytes_to_compress) 774 | ); 775 | } 776 | if Instant::now() > args.end_limit { 777 | return Ok(MaintRet::TimeLimitReached); 778 | } 779 | } 780 | Ok(MaintRet::Completed) 781 | } 782 | 783 | fn maintenance_for_todo( 784 | db: &Connection, 785 | config: &TransparentCompressConfig, 786 | todo: &TodoInfo, 787 | esc_names: &EscapedNames, 788 | args: &IncrementalMaintenanceArgs, 789 | ) -> anyhow::Result<i64> { 790 | let avg_sample_bytes = todo.total_bytes / todo.count; 791 | let dict_choice = todo.dict_choice.as_deref().unwrap_or("[null]"); 792 | log::debug!( 793 | "looking at group={}, has {} rows with {} average size ({} total)", 794 | dict_choice, 795 | todo.count, 796 | pretty_bytes(avg_sample_bytes), 797 | pretty_bytes(todo.total_bytes) 798 | ); 799 | 800 | let (dict_id, dict_is_new) = 801 | match get_or_train_dict(db, config, todo, esc_names).context("getting dict")? { 802 | TrainDictReturn::Skip => return Ok(0), 803 | TrainDictReturn::Done { 804 | dict_id, 805 | dict_is_new, 806 | } => (dict_id, dict_is_new), 807 | }; 808 | 809 | let mut total_updated: i64 = 0; 810 | let mut chunk_size = config.incremental_compression_step_bytes / avg_sample_bytes; 811 | if chunk_size < 1 { 812 | chunk_size = 1; 813 | } 814 | log::debug!( 815 | "Compressing {} samples with key {} and level {}, chunksize {}", 816 | todo.count, 817 | dict_choice, 818 | config.compression_level, 819 | chunk_size 820 | ); 821 | loop { 822 | let update_start = Instant::now(); 823 | let q = &format!( 824 | "update {tbl} set {datacol} = zstd_compress_col({datacol}, :lvl, :dict, :compact), {dictcol} = :dict where rowid in (select rowid from {tbl} where {dictcol} is null and :dictchoice = ({chooser}) limit :chunksize)", 825 | tbl = esc_names.compressed_tablename, 826 | datacol = esc_names.data_colname, 827 | dictcol = esc_names.dict_colname, 828 | chooser = config.dict_chooser 829 | ); 830 | log::trace!("executing {}", q); 831 | let updated = db 832 | .execute( 833 | q, 834 | named_params! 
{ 835 | ":lvl": config.compression_level, 836 | ":dict": dict_id, 837 | ":dictchoice": &dict_choice, 838 | ":chunksize": chunk_size, 839 | ":compact": COMPACT 840 | }, 841 | ) 842 | .with_context(|| format!("while compressing chunk for key {dict_choice}"))?; 843 | 844 | total_updated += updated as i64; 845 | log::debug!("Compressed {} / {}", total_updated, todo.count); 846 | if Instant::now() > args.end_limit { 847 | break; 848 | } 849 | let elapsed = update_start.elapsed(); 850 | if elapsed.div_f32(args.target_db_load) > elapsed { 851 | let sleep_duration = elapsed.div_f32(args.target_db_load) - elapsed; 852 | if sleep_duration > Duration::from_millis(1) { 853 | log::debug!( 854 | "Sleeping {}s to keep write load at {}", 855 | sleep_duration.as_secs_f32(), 856 | args.target_db_load 857 | ); 858 | std::thread::sleep(sleep_duration); 859 | } 860 | } 861 | 862 | if updated == 0 { 863 | break; 864 | } 865 | } 866 | 867 | let (total_size_after, total_count_after): (i64, i64) = db.query_row( 868 | &format!( 869 | "select sum(length({datacol})), count(*) from {tbl} where {dictcol} = ?", 870 | tbl = esc_names.compressed_tablename, 871 | datacol = esc_names.data_colname, 872 | dictcol = esc_names.dict_colname 873 | ), 874 | params![dict_id], 875 | |row| Ok((row.get(0)?, row.get(1)?)), 876 | )?; 877 | if dict_is_new { 878 | log::info!( 879 | "Compressed {} rows with dict_choice={} (dict_id={}). Total size of entries before: {}, afterwards: {}, (average: before={}, after={})", 880 | total_updated, 881 | dict_choice, 882 | dict_id, 883 | pretty_bytes(todo.total_bytes), 884 | pretty_bytes(total_size_after), 885 | pretty_bytes(avg_sample_bytes), 886 | pretty_bytes(total_size_after / total_count_after), 887 | ); 888 | } 889 | Ok(total_updated) 890 | } 891 | 892 | enum TrainDictReturn { 893 | Skip, 894 | Done { dict_id: i32, dict_is_new: bool }, 895 | } 896 | fn get_or_train_dict( 897 | db: &Connection, 898 | config: &TransparentCompressConfig, 899 | todo: &TodoInfo, 900 | esc_names: &EscapedNames, 901 | ) -> anyhow::Result<TrainDictReturn> { 902 | let dict_choice = match &todo.dict_choice { 903 | None => { 904 | log::debug!("Skipping group, no dict chosen"); 905 | return Ok(TrainDictReturn::Skip); 906 | } 907 | Some(e) => e, 908 | }; 909 | if dict_choice == "[nodict]" { 910 | return Ok(TrainDictReturn::Done { 911 | dict_id: -1, 912 | dict_is_new: false, 913 | }); 914 | } 915 | 916 | let avg_sample_bytes = todo.total_bytes / todo.count; 917 | let dict_id: Option<i32> = db 918 | .query_row( 919 | "select id from _zstd_dicts where chooser_key = ?", 920 | params![dict_choice], 921 | |row| row.get("id"), 922 | ) 923 | .optional()?; 924 | Ok(match dict_id { 925 | Some(dict_id) => { 926 | log::debug!( 927 | "Found existing dictionary id={} for key={}", 928 | dict_id, 929 | dict_choice 930 | ); 931 | TrainDictReturn::Done { 932 | dict_id, 933 | dict_is_new: false, 934 | } 935 | } 936 | None => { 937 | let dict_target_size = (todo.total_bytes as f32 * config.dict_size_ratio) as i64; 938 | 939 | if dict_target_size < config.min_dict_size_bytes_for_training { 940 | log::debug!( 941 | "Dictionary for group '{}' would be smaller than minimum ({} * {:.3} = {} < {}), ignoring", 942 | dict_choice, 943 | pretty_bytes(todo.total_bytes), 944 | config.dict_size_ratio, 945 | pretty_bytes(dict_target_size), 946 | pretty_bytes(config.min_dict_size_bytes_for_training) 947 | ); 948 | return Ok(TrainDictReturn::Skip); 949 | } 950 | let target_samples = (dict_target_size as f32 * config.train_dict_samples_ratio 951 | / 
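// worked example with the defaults: if a group has 10MB of data, then
// dict_target_size = 10MB * 0.01 = 100kB, and with 1kB average samples
// target_samples = 100_000 * 100 / 1000 = 10_000 samples, i.e. all 10MB of data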
avg_sample_bytes as f32) as i64; // use roughly 100x the size of the dictionary as data 952 | 953 | log::debug!( 954 | "Training dict for key {} of max size {}", 955 | dict_choice, 956 | pretty_bytes(dict_target_size) 957 | ); 958 | let dict_id = db.query_row(&format!( 959 | "select zstd_train_dict_and_save({datacol}, ?, ?, ?) as dictid from {tbl} where {dictcol} is null and ? = ({chooser})", 960 | datacol=esc_names.data_colname, 961 | tbl=esc_names.compressed_tablename, 962 | dictcol=esc_names.dict_colname, 963 | chooser=config.dict_chooser 964 | ), params![dict_target_size, target_samples, dict_choice, dict_choice], |row| row.get("dictid"))?; 965 | TrainDictReturn::Done { 966 | dict_id, 967 | dict_is_new: true, 968 | } 969 | } 970 | }) 971 | } 972 | #[cfg(test)] 973 | mod tests { 974 | use super::add_functions::tests::create_example_db; 975 | use super::*; 976 | use pretty_assertions::assert_eq; 977 | use rand::prelude::SliceRandom; 978 | use rusqlite::params; 979 | use rusqlite::{Connection, Row}; 980 | 981 | fn row_to_thong(r: &Row) -> anyhow::Result<Vec<Value>> { 982 | Ok((0..r.as_ref().column_count()) 983 | .map(|i| r.get_ref(i).map(|e| e.into())) 984 | .collect::<Result<_, _>>()?) 985 | } 986 | 987 | fn get_whole_table(db: &Connection, tbl_name: &str) -> anyhow::Result<Vec<Vec<Value>>> { 988 | let mut stmt = db.prepare(&format!("select * from {tbl_name} ORDER BY id"))?; 989 | let q1: Vec<Vec<Value>> = stmt 990 | .query_map(params![], |e| row_to_thong(e).map_err(ah))? 991 | .collect::<Result<_, rusqlite::Error>>()?; 992 | Ok(q1) 993 | } 994 | 995 | fn check_table_rows_same(db1: &Connection, db2: &Connection) -> anyhow::Result<()> { 996 | let tbl1 = get_whole_table(db1, "events").context("Could not get whole table db 1")?; 997 | let tbl2 = get_whole_table(db2, "events").context("Could not get whole table db 2")?; 998 | assert_eq!(tbl1, tbl2); 999 | 1000 | Ok(()) 1001 | } 1002 | 1003 | #[test] 1004 | fn sanity() -> anyhow::Result<()> { 1005 | let db1 = create_example_db(Some(123), 100)?; 1006 | let db2 = create_example_db(Some(123), 100)?; 1007 | 1008 | check_table_rows_same(&db1, &db2)?; 1009 | 1010 | Ok(()) 1011 | } 1012 | 1013 | #[test] 1014 | fn no_configs() -> anyhow::Result<()> { 1015 | let db = create_example_db(Some(123), 100)?; 1016 | 1017 | assert_eq!(get_configs(&db)?.len(), 0); 1018 | Ok(()) 1019 | } 1020 | 1021 | fn get_two_dbs(seed: Option<u64>) -> anyhow::Result<(Connection, Connection)> { 1022 | if std::env::var("RUST_LOG").is_err() { 1023 | // TODO: Audit that the environment access only happens in single-threaded code. 
1024 | unsafe { std::env::set_var("RUST_LOG", "info") }; 1025 | } 1026 | env_logger::try_init().ok(); 1027 | 1028 | let db1 = create_example_db(seed, 2000)?; 1029 | let db2 = create_example_db(seed, 2000)?; 1030 | 1031 | db2.query_row( 1032 | r#"select zstd_enable_transparent(?)"#, 1033 | params![r#"{"table": "events", "column": "data", "compression_level": 3, "dict_chooser": "'1'"}"#], 1034 | |_| Ok(()) 1035 | ).context("enable transparent")?; 1036 | 1037 | Ok((db1, db2)) 1038 | } 1039 | #[test] 1040 | fn enable_transparent() -> anyhow::Result<()> { 1041 | let (db1, db2) = get_two_dbs(Some(123))?; 1042 | check_table_rows_same(&db1, &db2)?; 1043 | 1044 | Ok(()) 1045 | } 1046 | 1047 | fn get_rand_id(db: &Connection) -> anyhow::Result<i64> { 1048 | db.query_row( 1049 | "select id from events order by random() limit 1", 1050 | params![], 1051 | |r| r.get(0), 1052 | ) 1053 | .context("Could not get random id") 1054 | } 1055 | 1056 | fn insert(db: &Connection, _id: i64, _id2: i64) -> anyhow::Result<()> { 1057 | let query = r#"insert into events (timestamp, data) values ('2020-12-20T00:00:00Z', '{"foo": "bar"}')"#; 1058 | 1059 | db.execute(query, params![])?; 1060 | 1061 | Ok(()) 1062 | } 1063 | 1064 | fn insert_both_columns(db: &Connection, _id: i64, _id2: i64) -> anyhow::Result<()> { 1065 | let query = r#"insert into events (timestamp, data, another_col) values ('2020-12-20T00:00:00Z', '{"foo": "bar"}', 'rustacean')"#; 1066 | 1067 | db.execute(query, params![])?; 1068 | 1069 | Ok(()) 1070 | } 1071 | 1072 | fn update_comp_col(db: &Connection, id: i64, _id2: i64) -> anyhow::Result<()> { 1073 | let _updc = db.execute("update events set data='fooooooooooooooooooooooooooooooooooooooooooooobar' where id = ?", params![id]).context("updating compressed column")?; 1074 | 1075 | //assert_eq!(updc, 1); 1076 | Ok(()) 1077 | } 1078 | 1079 | fn update_other_col(db: &Connection, id: i64, _id2: i64) -> anyhow::Result<()> { 1080 | let _updc = db 1081 | .execute( 1082 | "update events set timestamp = '2020-02-01' where id = ?", 1083 | params![id], 1084 | ) 1085 | .context("updating other column")?; 1086 | //assert_eq!(updc, 1); 1087 | Ok(()) 1088 | } 1089 | 1090 | fn update_other_two_col(db: &Connection, id: i64, id2: i64) -> anyhow::Result<()> { 1091 | //thread::rand 1092 | delete_one(db, id2, id)?; 1093 | let _updc = db 1094 | .execute( 1095 | "update events set timestamp = '2020-02-01', id=? where id = ?", 1096 | params![id2, id], 1097 | ) 1098 | .context("updating other two column")?; 1099 | //assert_eq!(updc, 1); 1100 | Ok(()) 1101 | } 1102 | 1103 | fn update_comp_col_and_other_two_col(db: &Connection, id: i64, id2: i64) -> anyhow::Result<()> { 1104 | //thread::rand 1105 | delete_one(db, id2, id)?; 1106 | let _updc = db.execute("update events set timestamp = '2020-02-01', id=?, data='fooooooooooooooooooooooooooooooooooooooooooooobar' where id = ?", params![id2,id]).context("updating three column")?; 1107 | //assert_eq!(updc, 1); 1108 | Ok(()) 1109 | } 1110 | 1111 | fn update_two_rows(db: &Connection, id: i64, id2: i64) -> anyhow::Result<()> { 1112 | //thread::rand 1113 | let _updc = db.execute("update events set timestamp = '2020-02-01', data='fooooooooooooooooooooooooooooooooooooooooooooobar' where id in (:a, :b)", 1114 | named_params! 
{":a": id, ":b": id2}).context("updating two rows")?; 1115 | //assert_eq!(updc, 2); 1116 | Ok(()) 1117 | } 1118 | 1119 | fn update_two_rows_by_compressed(db: &Connection, id: i64, id2: i64) -> anyhow::Result<()> { 1120 | let _updc = db 1121 | .execute( 1122 | "update events set data = 'testingxy' where id in (?, ?)", 1123 | params![id, id2], 1124 | ) 1125 | .context("updating two rows replace compressed")?; 1126 | //assert_eq!(updc, 2); 1127 | //thread::rand 1128 | let _updc = db 1129 | .execute( 1130 | "update events set timestamp='1234' where data = 'testingxy'", 1131 | params![], 1132 | ) 1133 | .context("updating where compressed=...")?; 1134 | //assert_eq!(updc, 2); 1135 | Ok(()) 1136 | } 1137 | 1138 | fn delete_one(db: &Connection, id: i64, _id2: i64) -> anyhow::Result<()> { 1139 | let _updc = db 1140 | .execute("delete from events where id = ?", params![id]) 1141 | .context("deleting from events by id")?; 1142 | //assert_eq!(updc, 1); 1143 | Ok(()) 1144 | } 1145 | 1146 | fn delete_where_other(db: &Connection, id: i64, _id2: i64) -> anyhow::Result<()> { 1147 | let ts: String = db.query_row( 1148 | "select timestamp from events where id = ?", 1149 | params![id], 1150 | |r| r.get(0), 1151 | )?; 1152 | let _updc = db 1153 | .execute("delete from events where timestamp = ?", params![ts]) 1154 | .context("deleting by timestamp")?; 1155 | //assert_eq!(updc, 1); 1156 | Ok(()) 1157 | } 1158 | 1159 | #[test] 1160 | fn test_many() -> anyhow::Result<()> { 1161 | type Executor = dyn Fn(&Connection, i64, i64) -> anyhow::Result<()>; 1162 | let posses: Vec<&Executor> = vec![ 1163 | &insert, 1164 | &update_comp_col, 1165 | &update_other_col, 1166 | &update_other_two_col, 1167 | &update_comp_col_and_other_two_col, 1168 | &update_two_rows, 1169 | &update_two_rows_by_compressed, 1170 | &delete_one, 1171 | &delete_where_other, 1172 | ]; 1173 | 1174 | let mut posses2 = vec![]; 1175 | for _ in 0..100 { 1176 | posses2.push(*posses.choose(&mut rand::thread_rng()).unwrap()); 1177 | } 1178 | for compress_first in [false, true] { 1179 | for operations in &[&posses2] { 1180 | if compress_first { 1181 | let (db1, db2) = 1182 | get_two_dbs(Some(123)).context("Could not create databases")?; 1183 | if compress_first { 1184 | let done: i64 = db2.query_row( 1185 | "select zstd_incremental_maintenance(9999999, 1)", 1186 | params![], 1187 | |r| r.get(0), 1188 | )?; 1189 | 1190 | assert_eq!(done, 0); 1191 | 1192 | let uncompressed_count: i64 = db2 1193 | .query_row( 1194 | "select count(*) from _events_zstd where _data_dict is null", 1195 | params![], 1196 | |r| r.get(0), 1197 | ) 1198 | .context("Could not query uncompressed count")?; 1199 | assert_eq!(uncompressed_count, 0); 1200 | } 1201 | 1202 | for operation in *operations { 1203 | let id = get_rand_id(&db1)?; 1204 | let id2 = get_rand_id(&db2)?; 1205 | operation(&db1, id, id2) 1206 | .context("Could not run operation on uncompressed db")?; 1207 | operation(&db2, id, id2) 1208 | .context("Could not run operation on compressed db")?; 1209 | } 1210 | 1211 | check_table_rows_same(&db1, &db2)?; 1212 | } 1213 | } 1214 | } 1215 | 1216 | Ok(()) 1217 | } 1218 | 1219 | #[test] 1220 | fn columns_of_the_same_table_are_enabled() -> anyhow::Result<()> { 1221 | let (db1, db2) = get_two_dbs(Some(456)).context("Could not create databases")?; 1222 | db2.query_row( 1223 | r#"select zstd_enable_transparent(?)"#, 1224 | params![r#"{"table": "events", "column": "another_col", "compression_level": 3, "dict_chooser": "'1'"}"#], 1225 | |_| Ok(()) 1226 | ).context("enable transparent")?; 
1227 | 1228 | let done: i64 = db2.query_row( 1229 | "select zstd_incremental_maintenance(9999999, 1)", 1230 | params![], 1231 | |r| r.get(0), 1232 | )?; 1233 | 1234 | assert_eq!(done, 0); 1235 | 1236 | let uncompressed_count: i64 = db2 1237 | .query_row( 1238 | "select count(*) from _events_zstd where _data_dict is null", 1239 | params![], 1240 | |r| r.get(0), 1241 | ) 1242 | .context("Could not query uncompressed count")?; 1243 | assert_eq!(uncompressed_count, 0); 1244 | 1245 | let id = get_rand_id(&db1)?; 1246 | let id2 = get_rand_id(&db2)?; 1247 | insert_both_columns(&db1, id, id2).context("Could not run operation on uncompressed db")?; 1248 | insert_both_columns(&db2, id, id2).context("Could not run operation on compressed db")?; 1249 | 1250 | check_table_rows_same(&db1, &db2)?; 1251 | 1252 | Ok(()) 1253 | } 1254 | 1255 | #[test] 1256 | #[should_panic(expected = "another_col (another_col_idx) - used as part of index")] 1257 | fn indexed_column_cannot_be_enabled() { 1258 | let db = create_example_db(None, 1100).unwrap(); 1259 | 1260 | // When column of original table is indexed 1261 | db.execute( 1262 | "create index another_col_idx on events (another_col)", 1263 | params![], 1264 | ) 1265 | .unwrap(); 1266 | 1267 | db.query_row( 1268 | r#"select zstd_enable_transparent(?)"#, 1269 | params![r#"{"table": "events", "column": "another_col", "compression_level": 3, "dict_chooser": "'1'"}"#], 1270 | |_| Ok(()) 1271 | ).unwrap(); 1272 | } 1273 | 1274 | #[test] 1275 | #[should_panic(expected = "another_col is already enabled for compression")] 1276 | fn same_column_is_not_allowed_to_be_enabled_multiple_times() { 1277 | let db = create_example_db(None, 1100).unwrap(); 1278 | 1279 | db.query_row( 1280 | r#"select zstd_enable_transparent(?)"#, 1281 | params![r#"{"table": "events", "column": "another_col", "compression_level": 3, "dict_chooser": "'1'"}"#], 1282 | |_| Ok(()) 1283 | ).unwrap(); 1284 | 1285 | db.query_row( 1286 | r#"select zstd_enable_transparent(?)"#, 1287 | params![r#"{"table": "events", "column": "another_col", "compression_level": 3, "dict_chooser": "'1'"}"#], 1288 | |_| Ok(()) 1289 | ).unwrap(); 1290 | } 1291 | } 1292 | -------------------------------------------------------------------------------- /src/util.rs: -------------------------------------------------------------------------------- 1 | use log::LevelFilter; 2 | 3 | pub fn ensure_dicts_table_exists(db: &rusqlite::Connection) -> rusqlite::Result<()> { 4 | db.execute_batch( 5 | " 6 | create table if not exists _zstd_dicts ( 7 | id integer primary key autoincrement, 8 | chooser_key text unique, 9 | dict blob not null 10 | ); 11 | insert or ignore into _zstd_dicts values (-1, '[nodict]', ''); -- only added so foreign key is fulfilled 12 | ", 13 | )?; 14 | Ok(()) 15 | } 16 | 17 | /// format an expression while escaping given values as sqlite identifiers 18 | /// needed since prepared query parameters can't be used in identifier position 19 | #[doc(hidden)] 20 | #[macro_export] 21 | macro_rules! 
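// for illustration: format_sqlite!("alter table {} rename to {}", "events", "_events_zstd")
// escapes both arguments as identifiers and yields the string:
//   alter table `events` rename to `_events_zstd`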
format_sqlite { 22 | ($x:expr_2021, $($y:expr_2021),*) => { 23 | format!($x, $(escape_sqlite_identifier($y),)*) 24 | }; 25 | } 26 | 27 | pub fn ah(e: anyhow::Error) -> rusqlite::Error { 28 | rusqlite::Error::UserFunctionError(format!("{e:?}").into()) 29 | } 30 | 31 | /*pub fn debug_row(r: &rusqlite::Row) { 32 | let cols = r.column_names(); 33 | for (i, name) in cols.iter().enumerate() { 34 | print!("{}={} ", name, format_blob(r.get_ref_unwrap(i))) 35 | } 36 | println!(); 37 | } 38 | 39 | fn format_blob(b: ValueRef) -> String { 40 | use ValueRef::*; 41 | match b { 42 | Null => "NULL".to_owned(), 43 | Integer(i) => format!("{}", i), 44 | Real(i) => format!("{}", i), 45 | Text(i) => format!("'{}'", String::from_utf8_lossy(i).replace("'", "''")), 46 | Blob(b) => format!("[blob {}B]", b.len()), 47 | } 48 | }*/ 49 | 50 | /// 51 | /// adapted from https://github.com/jgallagher/rusqlite/blob/022266239233857faa7f0b415c1a3d5095d96a53/src/vtab/mod.rs#L629 52 | /// sql injection safe? investigate 53 | /// hello -> `hello` 54 | /// he`lo -> `he``lo` 55 | /// 56 | /// we intentionally use the `e` syntax instead of "e" because of 57 | /// "a misspelled double-quoted identifier will be interpreted as a string literal, rather than generating an error" 58 | /// see https://www.sqlite.org/quirks.html#double_quoted_string_literals_are_accepted 59 | /// 60 | pub fn escape_sqlite_identifier(identifier: &str) -> String { 61 | format!("`{}`", identifier.replace('`', "``")) 62 | } 63 | 64 | pub fn init_logging(default_level: LevelFilter) { 65 | if std::env::var("SQLITE_ZSTD_LOG").is_err() { 66 | // TODO: Audit that the environment access only happens in single-threaded code. 67 | unsafe { std::env::set_var("SQLITE_ZSTD_LOG", format!("{default_level}")) }; 68 | } 69 | env_logger::try_init_from_env(env_logger::Env::new().filter("SQLITE_ZSTD_LOG")).ok(); 70 | } 71 | --------------------------------------------------------------------------------