├── .cargo └── config.toml ├── .github └── workflows │ ├── cicd.yml │ └── pypi_release.yml ├── .gitignore ├── .vscode └── settings.json ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── clippy.toml ├── doc ├── 2022-07-31-19-27-57.png └── sqlitebrowser.png ├── python ├── README.md ├── lib │ ├── __init__.py │ └── datasette.py ├── pyproject.toml └── setup.py └── src ├── add_functions.rs ├── basic.rs ├── bin ├── benchmark.rs └── create_test_db.rs ├── create_extension.rs ├── dict_management.rs ├── dict_training.rs ├── lib.rs ├── transparent.rs └── util.rs /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.aarch64-linux-android] 2 | linker = "aarch64-linux-android23-clang" 3 | -------------------------------------------------------------------------------- /.github/workflows/cicd.yml: -------------------------------------------------------------------------------- 1 | name: CICD 2 | 3 | # adapted from https://github.com/bootandy/dust/blob/master/.github/workflows/CICD.yml 4 | 5 | env: 6 | PROJECT_NAME: sqlite_zstd 7 | on: [push, pull_request] 8 | 9 | jobs: 10 | style: 11 | name: Tests 12 | runs-on: ${{ matrix.job.os }} 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | job: 17 | - { os: ubuntu-latest } 18 | steps: 19 | - uses: actions/checkout@v1 20 | - name: Initialize workflow variables 21 | id: vars 22 | shell: bash 23 | run: | 24 | # target-specific options 25 | # * CARGO_FEATURES_OPTION 26 | CARGO_FEATURES_OPTION='' ; 27 | if [ -n "${{ matrix.job.features }}" ]; then CARGO_FEATURES_OPTION='--features "${{ matrix.job.features }}"' ; fi 28 | echo set-output name=CARGO_FEATURES_OPTION::${CARGO_FEATURES_OPTION} 29 | echo ::set-output name=CARGO_FEATURES_OPTION::${CARGO_FEATURES_OPTION} 30 | - name: Install `rust` toolchain 31 | uses: actions-rs/toolchain@v1 32 | with: 33 | toolchain: stable 34 | override: true 35 | profile: minimal # minimal component installation (ie, no documentation) 36 | components: rustfmt, clippy 37 | - name: "`fmt` testing" 38 | uses: actions-rs/cargo@v1 39 | with: 40 | command: fmt 41 | args: --all -- --check 42 | - name: "`clippy` testing" 43 | if: success() || failure() # run regardless of prior step ("`fmt` testing") success/failure 44 | uses: actions-rs/cargo@v1 45 | with: 46 | command: clippy 47 | args: ${{ matrix.job.cargo-options }} ${{ steps.vars.outputs.CARGO_FEATURES_OPTION }} --all-targets -- -D warnings 48 | - name: "tests" 49 | if: success() || failure() # run regardless of prior step 50 | uses: actions-rs/cargo@v1 51 | with: 52 | command: test 53 | 54 | build: 55 | name: Build 56 | runs-on: ${{ matrix.job.os }} 57 | strategy: 58 | fail-fast: false 59 | matrix: 60 | job: 61 | # { os, target, cargo-options, features, use-cross, toolchain } 62 | - { 63 | os: ubuntu-latest, 64 | target: x86_64-unknown-linux-gnu, 65 | features: build_extension, 66 | } 67 | - { 68 | os: ubuntu-latest, 69 | target: arm-unknown-linux-gnueabihf, 70 | use-cross: use-cross, 71 | features: build_extension, 72 | } 73 | - { 74 | os: macos-latest, 75 | target: x86_64-apple-darwin, 76 | features: build_extension, 77 | } 78 | - { 79 | os: windows-latest, 80 | target: x86_64-pc-windows-gnu, 81 | features: build_extension, 82 | } ## !maint: [rivy; 2020-01-21] may break due to rust bug; follow possible solution from GH:rust-lang/rust#47048 (refs: GH:rust-lang/rust#47048 , GH:rust-lang/rust#53454 , GH:bike-barn/hermit#172 ) 83 | - { 84 | os: windows-latest, 85 | target: x86_64-pc-windows-msvc, 86 | features: 
build_extension, 87 | } 88 | steps: 89 | - uses: actions/checkout@v1 90 | - name: Install any prerequisites 91 | shell: bash 92 | run: | 93 | case ${{ matrix.job.target }} in 94 | arm-unknown-linux-gnueabihf) sudo apt-get -y update ; sudo apt-get -y install gcc-arm-linux-gnueabihf ;; 95 | esac 96 | - name: Initialize workflow variables 97 | id: vars 98 | shell: bash 99 | run: | 100 | # toolchain 101 | TOOLCHAIN="stable" ## default to "stable" toolchain 102 | # * specify alternate TOOLCHAIN for *-pc-windows-gnu targets; gnu targets on Windows are broken for the standard *-pc-windows-msvc toolchain (refs: , , ) 103 | case ${{ matrix.job.target }} in *-pc-windows-gnu) TOOLCHAIN="stable-${{ matrix.job.target }}" ;; esac; 104 | # * use requested TOOLCHAIN if specified 105 | if [ -n "${{ matrix.job.toolchain }}" ]; then TOOLCHAIN="${{ matrix.job.toolchain }}" ; fi 106 | echo set-output name=TOOLCHAIN::${TOOLCHAIN} 107 | echo ::set-output name=TOOLCHAIN::${TOOLCHAIN} 108 | # staging directory 109 | STAGING='_staging' 110 | echo set-output name=STAGING::${STAGING} 111 | echo ::set-output name=STAGING::${STAGING} 112 | # determine EXE suffix 113 | case ${{ matrix.job.target }} in 114 | *-linux-*) EXE_suffix=".so"; EXE_prefix="lib" ;; 115 | *-apple-*) EXE_suffix=".dylib"; EXE_prefix="lib" ;; 116 | *-windows-*) EXE_suffix=".dll"; EXE_prefix="" ;; esac; 117 | echo set-output name=LIB_FNAME::${EXE_prefix}${{ env.PROJECT_NAME }}${EXE_suffix} 118 | echo ::set-output name=LIB_FNAME::${EXE_prefix}${{ env.PROJECT_NAME }}${EXE_suffix} 119 | # parse commit reference info 120 | REF_NAME=${GITHUB_REF#refs/*/} 121 | unset REF_BRANCH ; case ${GITHUB_REF} in refs/heads/*) REF_BRANCH=${GITHUB_REF#refs/heads/} ;; esac; 122 | unset REF_TAG ; case ${GITHUB_REF} in refs/tags/*) REF_TAG=${GITHUB_REF#refs/tags/} ;; esac; 123 | REF_SHAS=${GITHUB_SHA:0:8} 124 | echo set-output name=REF_NAME::${REF_NAME} 125 | echo set-output name=REF_BRANCH::${REF_BRANCH} 126 | echo set-output name=REF_TAG::${REF_TAG} 127 | echo set-output name=REF_SHAS::${REF_SHAS} 128 | echo ::set-output name=REF_NAME::${REF_NAME} 129 | echo ::set-output name=REF_BRANCH::${REF_BRANCH} 130 | echo ::set-output name=REF_TAG::${REF_TAG} 131 | echo ::set-output name=REF_SHAS::${REF_SHAS} 132 | # parse target 133 | unset TARGET_ARCH ; case ${{ matrix.job.target }} in arm-unknown-linux-gnueabihf) TARGET_ARCH=arm ;; i686-*) TARGET_ARCH=i686 ;; x86_64-*) TARGET_ARCH=x86_64 ;; esac; 134 | echo set-output name=TARGET_ARCH::${TARGET_ARCH} 135 | echo ::set-output name=TARGET_ARCH::${TARGET_ARCH} 136 | unset TARGET_OS ; case ${{ matrix.job.target }} in *-linux-*) TARGET_OS=linux ;; *-apple-*) TARGET_OS=macos ;; *-windows-*) TARGET_OS=windows ;; esac; 137 | echo set-output name=TARGET_OS::${TARGET_OS} 138 | echo ::set-output name=TARGET_OS::${TARGET_OS} 139 | # package name 140 | PKG_suffix=".tar.gz" ; case ${{ matrix.job.target }} in *-pc-windows-*) PKG_suffix=".zip" ;; esac; 141 | PKG_BASENAME=${PROJECT_NAME}-${REF_TAG:-$REF_SHAS}-${{ matrix.job.target }} 142 | PKG_NAME=${PKG_BASENAME}${PKG_suffix} 143 | echo set-output name=PKG_suffix::${PKG_suffix} 144 | echo set-output name=PKG_BASENAME::${PKG_BASENAME} 145 | echo set-output name=PKG_NAME::${PKG_NAME} 146 | echo ::set-output name=PKG_suffix::${PKG_suffix} 147 | echo ::set-output name=PKG_BASENAME::${PKG_BASENAME} 148 | echo ::set-output name=PKG_NAME::${PKG_NAME} 149 | # deployable tag? 
(ie, leading "vM" or "M"; M == version number) 150 | unset DEPLOY ; if [[ $REF_TAG =~ ^[vV]?[0-9].* ]]; then DEPLOY='true' ; fi 151 | echo set-output name=DEPLOY::${DEPLOY:-/false} 152 | echo ::set-output name=DEPLOY::${DEPLOY} 153 | # target-specific options 154 | # * CARGO_FEATURES_OPTION 155 | CARGO_FEATURES_OPTION='' ; 156 | if [ -n "${{ matrix.job.features }}" ]; then CARGO_FEATURES_OPTION='--features "${{ matrix.job.features }}"' ; fi 157 | echo set-output name=CARGO_FEATURES_OPTION::${CARGO_FEATURES_OPTION} 158 | echo ::set-output name=CARGO_FEATURES_OPTION::${CARGO_FEATURES_OPTION} 159 | # * CARGO_USE_CROSS (truthy) 160 | CARGO_USE_CROSS='true' ; case '${{ matrix.job.use-cross }}' in ''|0|f|false|n|no) unset CARGO_USE_CROSS ;; esac; 161 | echo set-output name=CARGO_USE_CROSS::${CARGO_USE_CROSS:-/false} 162 | echo ::set-output name=CARGO_USE_CROSS::${CARGO_USE_CROSS} 163 | # * strip executable? 164 | STRIP="strip" ; STRIP_PARAMS="" ; case ${{ matrix.job.target }} in arm-unknown-linux-gnueabihf) STRIP="arm-linux-gnueabihf-strip" ;; *-pc-windows-msvc) STRIP="" ;; *-apple-darwin) STRIP_PARAMS="-x" ;; esac; 165 | echo set-output name=STRIP::${STRIP} 166 | echo ::set-output name=STRIP::${STRIP} 167 | echo set-output name=STRIP_PARAMS::${STRIP_PARAMS} 168 | echo ::set-output name=STRIP_PARAMS::${STRIP_PARAMS} 169 | - name: Create all needed build/work directories 170 | shell: bash 171 | run: | 172 | mkdir -p '${{ steps.vars.outputs.STAGING }}' 173 | mkdir -p '${{ steps.vars.outputs.STAGING }}/${{ steps.vars.outputs.PKG_BASENAME }}' 174 | - name: rust toolchain ~ install 175 | uses: actions-rs/toolchain@v1 176 | with: 177 | toolchain: ${{ steps.vars.outputs.TOOLCHAIN }} 178 | target: ${{ matrix.job.target }} 179 | override: true 180 | profile: minimal # minimal component installation (ie, no documentation) 181 | - name: Info 182 | shell: bash 183 | run: | 184 | gcc --version || true 185 | rustup -V 186 | rustup toolchain list 187 | rustup default 188 | cargo -V 189 | rustc -V 190 | - name: Build 191 | uses: actions-rs/cargo@v1 192 | with: 193 | use-cross: ${{ steps.vars.outputs.CARGO_USE_CROSS }} 194 | command: build 195 | args: --release --target=${{ matrix.job.target }} ${{ matrix.job.cargo-options }} ${{ steps.vars.outputs.CARGO_FEATURES_OPTION }} 196 | - name: Archive executable artifacts 197 | uses: actions/upload-artifact@master 198 | with: 199 | name: ${{ env.PROJECT_NAME }}-${{ matrix.job.target }} 200 | path: target/${{ matrix.job.target }}/release/${{ steps.vars.outputs.LIB_FNAME }} 201 | - name: Package 202 | shell: bash 203 | run: | 204 | # binary 205 | cp 'target/${{ matrix.job.target }}/release/${{ steps.vars.outputs.LIB_FNAME }}' '${{ steps.vars.outputs.STAGING }}/${{ steps.vars.outputs.PKG_BASENAME }}/' 206 | # `strip` binary (if needed) 207 | if [ -n "${{ steps.vars.outputs.STRIP }}" ]; then "${{ steps.vars.outputs.STRIP }}" ${{ steps.vars.outputs.STRIP_PARAMS }} '${{ steps.vars.outputs.STAGING }}/${{ steps.vars.outputs.PKG_BASENAME }}/${{ steps.vars.outputs.LIB_FNAME }}' ; fi 208 | # README and LICENSE 209 | cp README.md '${{ steps.vars.outputs.STAGING }}/${{ steps.vars.outputs.PKG_BASENAME }}/' 210 | cp LICENSE '${{ steps.vars.outputs.STAGING }}/${{ steps.vars.outputs.PKG_BASENAME }}/' 211 | # base compressed package 212 | pushd '${{ steps.vars.outputs.STAGING }}/' >/dev/null 213 | case ${{ matrix.job.target }} in 214 | *-pc-windows-*) 7z -y a '${{ steps.vars.outputs.PKG_NAME }}' '${{ steps.vars.outputs.PKG_BASENAME }}'/* | tail -2 ;; 215 | *) tar czf '${{ 
steps.vars.outputs.PKG_NAME }}' '${{ steps.vars.outputs.PKG_BASENAME }}'/* ;; 216 | esac; 217 | popd >/dev/null 218 | - name: Publish 219 | uses: softprops/action-gh-release@v1 220 | if: steps.vars.outputs.DEPLOY 221 | with: 222 | files: | 223 | ${{ steps.vars.outputs.STAGING }}/${{ steps.vars.outputs.PKG_NAME }} 224 | env: 225 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 226 | -------------------------------------------------------------------------------- /.github/workflows/pypi_release.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | build_wheels: 8 | name: Build wheels on ${{ matrix.os }} for ${{ matrix.cibw_python }} on ${{ matrix.cibw_arch }} 9 | runs-on: ${{ matrix.os }} 10 | strategy: 11 | matrix: 12 | os: [ubuntu-latest] 13 | cibw_arch: ["x86_64", "aarch64"] 14 | env: 15 | CIBW_BEFORE_ALL_LINUX: "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain stable -y" 16 | CIBW_BUILD_VERBOSITY: "1" 17 | CIBW_ENVIRONMENT: 'PATH="$PATH:$HOME/.cargo/bin"' 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | 22 | - name: Set up QEMU 23 | if: matrix.os == 'ubuntu-latest' && matrix.cibw_arch == 'aarch64' 24 | uses: docker/setup-qemu-action@68827325e0b33c7199eb31dd4e31fbe9023e06e3 # v3.0.0 25 | with: 26 | platforms: arm64 27 | 28 | - name: Build wheels 29 | uses: pypa/cibuildwheel@v2.16.2 30 | env: 31 | CIBW_BUILD_VERBOSITY: 1 32 | CIBW_BUILD: ${{ matrix.cibw_python }} 33 | CIBW_ARCHS: ${{ matrix.cibw_arch }} 34 | CIBW_TEST_SKIP: "*universal2:arm64" 35 | with: 36 | package-dir: ./python 37 | output-dir: ./python/wheelhouse 38 | 39 | - uses: actions/upload-artifact@v3 40 | with: 41 | name: dist 42 | path: ./python/wheelhouse/*.whl 43 | 44 | build_sdist: 45 | name: Build source distribution 46 | runs-on: ubuntu-latest 47 | steps: 48 | - uses: actions/checkout@v2 49 | 50 | - name: Install rust 51 | uses: actions-rs/toolchain@v1 52 | with: 53 | toolchain: stable 54 | profile: minimal 55 | 56 | - uses: actions/setup-python@v2 57 | name: Install Python 58 | with: 59 | python-version: "3.12" 60 | 61 | - name: Build sdist 62 | run: | 63 | python -m pip install setuptools-rust setuptools wheel 64 | cd python/ 65 | python setup.py sdist 66 | 67 | - uses: actions/upload-artifact@v2 68 | with: 69 | name: dist 70 | path: python/dist/*.tar.* 71 | 72 | release: 73 | needs: [build_wheels, build_sdist] 74 | runs-on: ubuntu-latest 75 | steps: 76 | - uses: actions/download-artifact@v3 77 | with: 78 | name: dist 79 | path: python/dist/ 80 | 81 | - uses: pypa/gh-action-pypi-publish@v1.8.10 82 | with: 83 | repository-url: https://pypi.org/project/sqlite-zstd-build 84 | user: __token__ 85 | password: ${{ secrets.PYPI_API_TOKEN }} 86 | packages-dir: python/dist 87 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /python/*.egg-info/ 2 | /python/build/ 3 | /python/dist/ 4 | /target 5 | *.sqlite3* 6 | private* 7 | /bench* 8 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.formatOnSave": true, 3 | "rust-analyzer.checkOnSave.command": "clippy", 4 | "rust-analyzer.cargo.features": ["benchmark"] 5 | } 6 | -------------------------------------------------------------------------------- 
/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 4 4 | 5 | [[package]] 6 | name = "adler" 7 | version = "1.0.2" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" 10 | 11 | [[package]] 12 | name = "aho-corasick" 13 | version = "0.7.18" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" 16 | dependencies = [ 17 | "memchr", 18 | ] 19 | 20 | [[package]] 21 | name = "ansi_term" 22 | version = "0.12.1" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" 25 | dependencies = [ 26 | "winapi", 27 | ] 28 | 29 | [[package]] 30 | name = "anyhow" 31 | version = "1.0.58" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "bb07d2053ccdbe10e2af2995a2f116c1330396493dc1269f6a91d0ae82e19704" 34 | 35 | [[package]] 36 | name = "atty" 37 | version = "0.2.14" 38 | source = "registry+https://github.com/rust-lang/crates.io-index" 39 | checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" 40 | dependencies = [ 41 | "hermit-abi", 42 | "libc", 43 | "winapi", 44 | ] 45 | 46 | [[package]] 47 | name = "autocfg" 48 | version = "1.1.0" 49 | source = "registry+https://github.com/rust-lang/crates.io-index" 50 | checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" 51 | 52 | [[package]] 53 | name = "bitflags" 54 | version = "1.3.2" 55 | source = "registry+https://github.com/rust-lang/crates.io-index" 56 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 57 | 58 | [[package]] 59 | name = "bitflags" 60 | version = "2.9.0" 61 | source = "registry+https://github.com/rust-lang/crates.io-index" 62 | checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" 63 | 64 | [[package]] 65 | name = "bstr" 66 | version = "0.2.17" 67 | source = "registry+https://github.com/rust-lang/crates.io-index" 68 | checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" 69 | dependencies = [ 70 | "lazy_static", 71 | "memchr", 72 | "regex-automata", 73 | "serde", 74 | ] 75 | 76 | [[package]] 77 | name = "byteorder" 78 | version = "1.4.3" 79 | source = "registry+https://github.com/rust-lang/crates.io-index" 80 | checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" 81 | 82 | [[package]] 83 | name = "cc" 84 | version = "1.2.22" 85 | source = "registry+https://github.com/rust-lang/crates.io-index" 86 | checksum = "32db95edf998450acc7881c932f94cd9b05c87b4b2599e8bab064753da4acfd1" 87 | dependencies = [ 88 | "jobserver", 89 | "libc", 90 | "shlex", 91 | ] 92 | 93 | [[package]] 94 | name = "cfg-if" 95 | version = "1.0.0" 96 | source = "registry+https://github.com/rust-lang/crates.io-index" 97 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 98 | 99 | [[package]] 100 | name = "chrono" 101 | version = "0.4.19" 102 | source = "registry+https://github.com/rust-lang/crates.io-index" 103 | checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" 104 | dependencies = [ 105 | "libc", 106 | "num-integer", 107 | "num-traits", 108 | "time", 109 | "winapi", 110 | ] 111 | 112 | [[package]] 113 | 
name = "clap" 114 | version = "2.34.0" 115 | source = "registry+https://github.com/rust-lang/crates.io-index" 116 | checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" 117 | dependencies = [ 118 | "ansi_term", 119 | "atty", 120 | "bitflags 1.3.2", 121 | "strsim 0.8.0", 122 | "textwrap 0.11.0", 123 | "unicode-width", 124 | "vec_map", 125 | ] 126 | 127 | [[package]] 128 | name = "clap" 129 | version = "3.2.12" 130 | source = "registry+https://github.com/rust-lang/crates.io-index" 131 | checksum = "ab8b79fe3946ceb4a0b1c080b4018992b8d27e9ff363644c1c9b6387c854614d" 132 | dependencies = [ 133 | "atty", 134 | "bitflags 1.3.2", 135 | "clap_derive", 136 | "clap_lex", 137 | "indexmap", 138 | "once_cell", 139 | "strsim 0.10.0", 140 | "termcolor", 141 | "textwrap 0.15.0", 142 | ] 143 | 144 | [[package]] 145 | name = "clap_derive" 146 | version = "3.2.7" 147 | source = "registry+https://github.com/rust-lang/crates.io-index" 148 | checksum = "759bf187376e1afa7b85b959e6a664a3e7a95203415dba952ad19139e798f902" 149 | dependencies = [ 150 | "heck 0.4.0", 151 | "proc-macro-error", 152 | "proc-macro2", 153 | "quote", 154 | "syn 1.0.98", 155 | ] 156 | 157 | [[package]] 158 | name = "clap_lex" 159 | version = "0.2.4" 160 | source = "registry+https://github.com/rust-lang/crates.io-index" 161 | checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5" 162 | dependencies = [ 163 | "os_str_bytes", 164 | ] 165 | 166 | [[package]] 167 | name = "crc32fast" 168 | version = "1.3.2" 169 | source = "registry+https://github.com/rust-lang/crates.io-index" 170 | checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" 171 | dependencies = [ 172 | "cfg-if", 173 | ] 174 | 175 | [[package]] 176 | name = "csv" 177 | version = "1.1.6" 178 | source = "registry+https://github.com/rust-lang/crates.io-index" 179 | checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" 180 | dependencies = [ 181 | "bstr", 182 | "csv-core", 183 | "itoa 0.4.8", 184 | "ryu", 185 | "serde", 186 | ] 187 | 188 | [[package]] 189 | name = "csv-core" 190 | version = "0.1.10" 191 | source = "registry+https://github.com/rust-lang/crates.io-index" 192 | checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" 193 | dependencies = [ 194 | "memchr", 195 | ] 196 | 197 | [[package]] 198 | name = "ctor" 199 | version = "0.1.22" 200 | source = "registry+https://github.com/rust-lang/crates.io-index" 201 | checksum = "f877be4f7c9f246b183111634f75baa039715e3f46ce860677d3b19a69fb229c" 202 | dependencies = [ 203 | "quote", 204 | "syn 1.0.98", 205 | ] 206 | 207 | [[package]] 208 | name = "diff" 209 | version = "0.1.13" 210 | source = "registry+https://github.com/rust-lang/crates.io-index" 211 | checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" 212 | 213 | [[package]] 214 | name = "env_logger" 215 | version = "0.9.0" 216 | source = "registry+https://github.com/rust-lang/crates.io-index" 217 | checksum = "0b2cf0344971ee6c64c31be0d530793fba457d322dfec2810c453d0ef228f9c3" 218 | dependencies = [ 219 | "atty", 220 | "humantime", 221 | "log", 222 | "regex", 223 | "termcolor", 224 | ] 225 | 226 | [[package]] 227 | name = "errno" 228 | version = "0.2.8" 229 | source = "registry+https://github.com/rust-lang/crates.io-index" 230 | checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" 231 | dependencies = [ 232 | "errno-dragonfly", 233 | "libc", 234 | "winapi", 235 | ] 236 | 237 | [[package]] 238 | name = "errno-dragonfly" 239 
| version = "0.1.2" 240 | source = "registry+https://github.com/rust-lang/crates.io-index" 241 | checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" 242 | dependencies = [ 243 | "cc", 244 | "libc", 245 | ] 246 | 247 | [[package]] 248 | name = "fallible-iterator" 249 | version = "0.3.0" 250 | source = "registry+https://github.com/rust-lang/crates.io-index" 251 | checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" 252 | 253 | [[package]] 254 | name = "fallible-streaming-iterator" 255 | version = "0.1.9" 256 | source = "registry+https://github.com/rust-lang/crates.io-index" 257 | checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" 258 | 259 | [[package]] 260 | name = "flate2" 261 | version = "1.0.24" 262 | source = "registry+https://github.com/rust-lang/crates.io-index" 263 | checksum = "f82b0f4c27ad9f8bfd1f3208d882da2b09c301bc1c828fd3a00d0216d2fbbff6" 264 | dependencies = [ 265 | "crc32fast", 266 | "miniz_oxide", 267 | ] 268 | 269 | [[package]] 270 | name = "foldhash" 271 | version = "0.1.5" 272 | source = "registry+https://github.com/rust-lang/crates.io-index" 273 | checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" 274 | 275 | [[package]] 276 | name = "getrandom" 277 | version = "0.2.7" 278 | source = "registry+https://github.com/rust-lang/crates.io-index" 279 | checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6" 280 | dependencies = [ 281 | "cfg-if", 282 | "libc", 283 | "wasi 0.11.0+wasi-snapshot-preview1", 284 | ] 285 | 286 | [[package]] 287 | name = "hashbrown" 288 | version = "0.12.2" 289 | source = "registry+https://github.com/rust-lang/crates.io-index" 290 | checksum = "607c8a29735385251a339424dd462993c0fed8fa09d378f259377df08c126022" 291 | 292 | [[package]] 293 | name = "hashbrown" 294 | version = "0.15.3" 295 | source = "registry+https://github.com/rust-lang/crates.io-index" 296 | checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" 297 | dependencies = [ 298 | "foldhash", 299 | ] 300 | 301 | [[package]] 302 | name = "hashlink" 303 | version = "0.10.0" 304 | source = "registry+https://github.com/rust-lang/crates.io-index" 305 | checksum = "7382cf6263419f2d8df38c55d7da83da5c18aef87fc7a7fc1fb1e344edfe14c1" 306 | dependencies = [ 307 | "hashbrown 0.15.3", 308 | ] 309 | 310 | [[package]] 311 | name = "heck" 312 | version = "0.3.3" 313 | source = "registry+https://github.com/rust-lang/crates.io-index" 314 | checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" 315 | dependencies = [ 316 | "unicode-segmentation", 317 | ] 318 | 319 | [[package]] 320 | name = "heck" 321 | version = "0.4.0" 322 | source = "registry+https://github.com/rust-lang/crates.io-index" 323 | checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" 324 | 325 | [[package]] 326 | name = "hermit-abi" 327 | version = "0.1.19" 328 | source = "registry+https://github.com/rust-lang/crates.io-index" 329 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" 330 | dependencies = [ 331 | "libc", 332 | ] 333 | 334 | [[package]] 335 | name = "hex" 336 | version = "0.4.3" 337 | source = "registry+https://github.com/rust-lang/crates.io-index" 338 | checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" 339 | 340 | [[package]] 341 | name = "humantime" 342 | version = "2.1.0" 343 | source = "registry+https://github.com/rust-lang/crates.io-index" 344 | checksum = 
"9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" 345 | 346 | [[package]] 347 | name = "indexmap" 348 | version = "1.9.1" 349 | source = "registry+https://github.com/rust-lang/crates.io-index" 350 | checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" 351 | dependencies = [ 352 | "autocfg", 353 | "hashbrown 0.12.2", 354 | ] 355 | 356 | [[package]] 357 | name = "io-lifetimes" 358 | version = "0.7.2" 359 | source = "registry+https://github.com/rust-lang/crates.io-index" 360 | checksum = "24c3f4eff5495aee4c0399d7b6a0dc2b6e81be84242ffbfcf253ebacccc1d0cb" 361 | 362 | [[package]] 363 | name = "itoa" 364 | version = "0.4.8" 365 | source = "registry+https://github.com/rust-lang/crates.io-index" 366 | checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" 367 | 368 | [[package]] 369 | name = "itoa" 370 | version = "1.0.2" 371 | source = "registry+https://github.com/rust-lang/crates.io-index" 372 | checksum = "112c678d4050afce233f4f2852bb2eb519230b3cf12f33585275537d7e41578d" 373 | 374 | [[package]] 375 | name = "jobserver" 376 | version = "0.1.32" 377 | source = "registry+https://github.com/rust-lang/crates.io-index" 378 | checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" 379 | dependencies = [ 380 | "libc", 381 | ] 382 | 383 | [[package]] 384 | name = "lazy_static" 385 | version = "1.4.0" 386 | source = "registry+https://github.com/rust-lang/crates.io-index" 387 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 388 | 389 | [[package]] 390 | name = "libc" 391 | version = "0.2.126" 392 | source = "registry+https://github.com/rust-lang/crates.io-index" 393 | checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836" 394 | 395 | [[package]] 396 | name = "libsqlite3-sys" 397 | version = "0.33.0" 398 | source = "registry+https://github.com/rust-lang/crates.io-index" 399 | checksum = "947e6816f7825b2b45027c2c32e7085da9934defa535de4a6a46b10a4d5257fa" 400 | dependencies = [ 401 | "cc", 402 | "pkg-config", 403 | "prettyplease", 404 | "quote", 405 | "syn 2.0.101", 406 | "vcpkg", 407 | ] 408 | 409 | [[package]] 410 | name = "linux-raw-sys" 411 | version = "0.0.46" 412 | source = "registry+https://github.com/rust-lang/crates.io-index" 413 | checksum = "d4d2456c373231a208ad294c33dc5bff30051eafd954cd4caae83a712b12854d" 414 | 415 | [[package]] 416 | name = "log" 417 | version = "0.4.17" 418 | source = "registry+https://github.com/rust-lang/crates.io-index" 419 | checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" 420 | dependencies = [ 421 | "cfg-if", 422 | ] 423 | 424 | [[package]] 425 | name = "lru_time_cache" 426 | version = "0.11.11" 427 | source = "registry+https://github.com/rust-lang/crates.io-index" 428 | checksum = "9106e1d747ffd48e6be5bb2d97fa706ed25b144fbee4d5c02eae110cd8d6badd" 429 | 430 | [[package]] 431 | name = "memchr" 432 | version = "2.5.0" 433 | source = "registry+https://github.com/rust-lang/crates.io-index" 434 | checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" 435 | 436 | [[package]] 437 | name = "miniz_oxide" 438 | version = "0.5.3" 439 | source = "registry+https://github.com/rust-lang/crates.io-index" 440 | checksum = "6f5c75688da582b8ffc1f1799e9db273f32133c49e048f614d22ec3256773ccc" 441 | dependencies = [ 442 | "adler", 443 | ] 444 | 445 | [[package]] 446 | name = "names" 447 | version = "0.14.0" 448 | source = "registry+https://github.com/rust-lang/crates.io-index" 449 | checksum = 
"7bddcd3bf5144b6392de80e04c347cd7fab2508f6df16a85fc496ecd5cec39bc" 450 | dependencies = [ 451 | "clap 3.2.12", 452 | "rand", 453 | ] 454 | 455 | [[package]] 456 | name = "num-integer" 457 | version = "0.1.45" 458 | source = "registry+https://github.com/rust-lang/crates.io-index" 459 | checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" 460 | dependencies = [ 461 | "autocfg", 462 | "num-traits", 463 | ] 464 | 465 | [[package]] 466 | name = "num-traits" 467 | version = "0.2.15" 468 | source = "registry+https://github.com/rust-lang/crates.io-index" 469 | checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" 470 | dependencies = [ 471 | "autocfg", 472 | ] 473 | 474 | [[package]] 475 | name = "once_cell" 476 | version = "1.13.0" 477 | source = "registry+https://github.com/rust-lang/crates.io-index" 478 | checksum = "18a6dbe30758c9f83eb00cbea4ac95966305f5a7772f3f42ebfc7fc7eddbd8e1" 479 | 480 | [[package]] 481 | name = "os_str_bytes" 482 | version = "6.2.0" 483 | source = "registry+https://github.com/rust-lang/crates.io-index" 484 | checksum = "648001efe5d5c0102d8cea768e348da85d90af8ba91f0bea908f157951493cd4" 485 | 486 | [[package]] 487 | name = "output_vt100" 488 | version = "0.1.3" 489 | source = "registry+https://github.com/rust-lang/crates.io-index" 490 | checksum = "628223faebab4e3e40667ee0b2336d34a5b960ff60ea743ddfdbcf7770bcfb66" 491 | dependencies = [ 492 | "winapi", 493 | ] 494 | 495 | [[package]] 496 | name = "owning_ref" 497 | version = "0.4.1" 498 | source = "registry+https://github.com/rust-lang/crates.io-index" 499 | checksum = "6ff55baddef9e4ad00f88b6c743a2a8062d4c6ade126c2a528644b8e444d52ce" 500 | dependencies = [ 501 | "stable_deref_trait", 502 | ] 503 | 504 | [[package]] 505 | name = "pkg-config" 506 | version = "0.3.25" 507 | source = "registry+https://github.com/rust-lang/crates.io-index" 508 | checksum = "1df8c4ec4b0627e53bdf214615ad287367e482558cf84b109250b37464dc03ae" 509 | 510 | [[package]] 511 | name = "ppv-lite86" 512 | version = "0.2.16" 513 | source = "registry+https://github.com/rust-lang/crates.io-index" 514 | checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" 515 | 516 | [[package]] 517 | name = "pretty_assertions" 518 | version = "1.2.1" 519 | source = "registry+https://github.com/rust-lang/crates.io-index" 520 | checksum = "c89f989ac94207d048d92db058e4f6ec7342b0971fc58d1271ca148b799b3563" 521 | dependencies = [ 522 | "ansi_term", 523 | "ctor", 524 | "diff", 525 | "output_vt100", 526 | ] 527 | 528 | [[package]] 529 | name = "prettyplease" 530 | version = "0.2.32" 531 | source = "registry+https://github.com/rust-lang/crates.io-index" 532 | checksum = "664ec5419c51e34154eec046ebcba56312d5a2fc3b09a06da188e1ad21afadf6" 533 | dependencies = [ 534 | "proc-macro2", 535 | "syn 2.0.101", 536 | ] 537 | 538 | [[package]] 539 | name = "proc-macro-error" 540 | version = "1.0.4" 541 | source = "registry+https://github.com/rust-lang/crates.io-index" 542 | checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" 543 | dependencies = [ 544 | "proc-macro-error-attr", 545 | "proc-macro2", 546 | "quote", 547 | "syn 1.0.98", 548 | "version_check", 549 | ] 550 | 551 | [[package]] 552 | name = "proc-macro-error-attr" 553 | version = "1.0.4" 554 | source = "registry+https://github.com/rust-lang/crates.io-index" 555 | checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" 556 | dependencies = [ 557 | "proc-macro2", 558 | "quote", 559 | "version_check", 560 | ] 561 | 562 | 
[[package]] 563 | name = "proc-macro2" 564 | version = "1.0.95" 565 | source = "registry+https://github.com/rust-lang/crates.io-index" 566 | checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" 567 | dependencies = [ 568 | "unicode-ident", 569 | ] 570 | 571 | [[package]] 572 | name = "procfs" 573 | version = "0.13.2" 574 | source = "registry+https://github.com/rust-lang/crates.io-index" 575 | checksum = "979e5cb47caafb8e14653bb083358e19917ca8c9c4c2648932eccd935f5c4d80" 576 | dependencies = [ 577 | "bitflags 1.3.2", 578 | "byteorder", 579 | "chrono", 580 | "flate2", 581 | "hex", 582 | "lazy_static", 583 | "rustix", 584 | ] 585 | 586 | [[package]] 587 | name = "quote" 588 | version = "1.0.40" 589 | source = "registry+https://github.com/rust-lang/crates.io-index" 590 | checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" 591 | dependencies = [ 592 | "proc-macro2", 593 | ] 594 | 595 | [[package]] 596 | name = "rand" 597 | version = "0.8.5" 598 | source = "registry+https://github.com/rust-lang/crates.io-index" 599 | checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" 600 | dependencies = [ 601 | "libc", 602 | "rand_chacha", 603 | "rand_core", 604 | ] 605 | 606 | [[package]] 607 | name = "rand_chacha" 608 | version = "0.3.1" 609 | source = "registry+https://github.com/rust-lang/crates.io-index" 610 | checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" 611 | dependencies = [ 612 | "ppv-lite86", 613 | "rand_core", 614 | ] 615 | 616 | [[package]] 617 | name = "rand_core" 618 | version = "0.6.3" 619 | source = "registry+https://github.com/rust-lang/crates.io-index" 620 | checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" 621 | dependencies = [ 622 | "getrandom", 623 | ] 624 | 625 | [[package]] 626 | name = "regex" 627 | version = "1.6.0" 628 | source = "registry+https://github.com/rust-lang/crates.io-index" 629 | checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b" 630 | dependencies = [ 631 | "aho-corasick", 632 | "memchr", 633 | "regex-syntax", 634 | ] 635 | 636 | [[package]] 637 | name = "regex-automata" 638 | version = "0.1.10" 639 | source = "registry+https://github.com/rust-lang/crates.io-index" 640 | checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" 641 | 642 | [[package]] 643 | name = "regex-syntax" 644 | version = "0.6.27" 645 | source = "registry+https://github.com/rust-lang/crates.io-index" 646 | checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" 647 | 648 | [[package]] 649 | name = "rusqlite" 650 | version = "0.35.0" 651 | source = "registry+https://github.com/rust-lang/crates.io-index" 652 | checksum = "a22715a5d6deef63c637207afbe68d0c72c3f8d0022d7cf9714c442d6157606b" 653 | dependencies = [ 654 | "bitflags 2.9.0", 655 | "fallible-iterator", 656 | "fallible-streaming-iterator", 657 | "hashlink", 658 | "libsqlite3-sys", 659 | "smallvec", 660 | ] 661 | 662 | [[package]] 663 | name = "rustix" 664 | version = "0.35.7" 665 | source = "registry+https://github.com/rust-lang/crates.io-index" 666 | checksum = "d51cc38aa10f6bbb377ed28197aa052aa4e2b762c22be9d3153d01822587e787" 667 | dependencies = [ 668 | "bitflags 1.3.2", 669 | "errno", 670 | "io-lifetimes", 671 | "libc", 672 | "linux-raw-sys", 673 | "windows-sys", 674 | ] 675 | 676 | [[package]] 677 | name = "ryu" 678 | version = "1.0.10" 679 | source = "registry+https://github.com/rust-lang/crates.io-index" 680 | checksum = 
"f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695" 681 | 682 | [[package]] 683 | name = "serde" 684 | version = "1.0.139" 685 | source = "registry+https://github.com/rust-lang/crates.io-index" 686 | checksum = "0171ebb889e45aa68b44aee0859b3eede84c6f5f5c228e6f140c0b2a0a46cad6" 687 | dependencies = [ 688 | "serde_derive", 689 | ] 690 | 691 | [[package]] 692 | name = "serde_derive" 693 | version = "1.0.139" 694 | source = "registry+https://github.com/rust-lang/crates.io-index" 695 | checksum = "dc1d3230c1de7932af58ad8ffbe1d784bd55efd5a9d84ac24f69c72d83543dfb" 696 | dependencies = [ 697 | "proc-macro2", 698 | "quote", 699 | "syn 1.0.98", 700 | ] 701 | 702 | [[package]] 703 | name = "serde_json" 704 | version = "1.0.82" 705 | source = "registry+https://github.com/rust-lang/crates.io-index" 706 | checksum = "82c2c1fdcd807d1098552c5b9a36e425e42e9fbd7c6a37a8425f390f781f7fa7" 707 | dependencies = [ 708 | "itoa 1.0.2", 709 | "ryu", 710 | "serde", 711 | ] 712 | 713 | [[package]] 714 | name = "shlex" 715 | version = "1.3.0" 716 | source = "registry+https://github.com/rust-lang/crates.io-index" 717 | checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" 718 | 719 | [[package]] 720 | name = "smallvec" 721 | version = "1.9.0" 722 | source = "registry+https://github.com/rust-lang/crates.io-index" 723 | checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" 724 | 725 | [[package]] 726 | name = "sqlite-zstd" 727 | version = "0.3.5" 728 | dependencies = [ 729 | "anyhow", 730 | "chrono", 731 | "csv", 732 | "env_logger", 733 | "lazy_static", 734 | "log", 735 | "lru_time_cache", 736 | "names", 737 | "owning_ref", 738 | "pretty_assertions", 739 | "procfs", 740 | "rand", 741 | "rusqlite", 742 | "serde", 743 | "serde_json", 744 | "structopt", 745 | "zstd", 746 | ] 747 | 748 | [[package]] 749 | name = "stable_deref_trait" 750 | version = "1.2.0" 751 | source = "registry+https://github.com/rust-lang/crates.io-index" 752 | checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" 753 | 754 | [[package]] 755 | name = "strsim" 756 | version = "0.8.0" 757 | source = "registry+https://github.com/rust-lang/crates.io-index" 758 | checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" 759 | 760 | [[package]] 761 | name = "strsim" 762 | version = "0.10.0" 763 | source = "registry+https://github.com/rust-lang/crates.io-index" 764 | checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" 765 | 766 | [[package]] 767 | name = "structopt" 768 | version = "0.3.26" 769 | source = "registry+https://github.com/rust-lang/crates.io-index" 770 | checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" 771 | dependencies = [ 772 | "clap 2.34.0", 773 | "lazy_static", 774 | "structopt-derive", 775 | ] 776 | 777 | [[package]] 778 | name = "structopt-derive" 779 | version = "0.4.18" 780 | source = "registry+https://github.com/rust-lang/crates.io-index" 781 | checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" 782 | dependencies = [ 783 | "heck 0.3.3", 784 | "proc-macro-error", 785 | "proc-macro2", 786 | "quote", 787 | "syn 1.0.98", 788 | ] 789 | 790 | [[package]] 791 | name = "syn" 792 | version = "1.0.98" 793 | source = "registry+https://github.com/rust-lang/crates.io-index" 794 | checksum = "c50aef8a904de4c23c788f104b7dddc7d6f79c647c7c8ce4cc8f73eb0ca773dd" 795 | dependencies = [ 796 | "proc-macro2", 797 | "quote", 798 | "unicode-ident", 799 | ] 800 | 801 | [[package]] 
802 | name = "syn" 803 | version = "2.0.101" 804 | source = "registry+https://github.com/rust-lang/crates.io-index" 805 | checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" 806 | dependencies = [ 807 | "proc-macro2", 808 | "quote", 809 | "unicode-ident", 810 | ] 811 | 812 | [[package]] 813 | name = "termcolor" 814 | version = "1.1.3" 815 | source = "registry+https://github.com/rust-lang/crates.io-index" 816 | checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" 817 | dependencies = [ 818 | "winapi-util", 819 | ] 820 | 821 | [[package]] 822 | name = "textwrap" 823 | version = "0.11.0" 824 | source = "registry+https://github.com/rust-lang/crates.io-index" 825 | checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" 826 | dependencies = [ 827 | "unicode-width", 828 | ] 829 | 830 | [[package]] 831 | name = "textwrap" 832 | version = "0.15.0" 833 | source = "registry+https://github.com/rust-lang/crates.io-index" 834 | checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb" 835 | 836 | [[package]] 837 | name = "time" 838 | version = "0.1.44" 839 | source = "registry+https://github.com/rust-lang/crates.io-index" 840 | checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" 841 | dependencies = [ 842 | "libc", 843 | "wasi 0.10.0+wasi-snapshot-preview1", 844 | "winapi", 845 | ] 846 | 847 | [[package]] 848 | name = "unicode-ident" 849 | version = "1.0.2" 850 | source = "registry+https://github.com/rust-lang/crates.io-index" 851 | checksum = "15c61ba63f9235225a22310255a29b806b907c9b8c964bcbd0a2c70f3f2deea7" 852 | 853 | [[package]] 854 | name = "unicode-segmentation" 855 | version = "1.9.0" 856 | source = "registry+https://github.com/rust-lang/crates.io-index" 857 | checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99" 858 | 859 | [[package]] 860 | name = "unicode-width" 861 | version = "0.1.9" 862 | source = "registry+https://github.com/rust-lang/crates.io-index" 863 | checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" 864 | 865 | [[package]] 866 | name = "vcpkg" 867 | version = "0.2.15" 868 | source = "registry+https://github.com/rust-lang/crates.io-index" 869 | checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" 870 | 871 | [[package]] 872 | name = "vec_map" 873 | version = "0.8.2" 874 | source = "registry+https://github.com/rust-lang/crates.io-index" 875 | checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" 876 | 877 | [[package]] 878 | name = "version_check" 879 | version = "0.9.4" 880 | source = "registry+https://github.com/rust-lang/crates.io-index" 881 | checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" 882 | 883 | [[package]] 884 | name = "wasi" 885 | version = "0.10.0+wasi-snapshot-preview1" 886 | source = "registry+https://github.com/rust-lang/crates.io-index" 887 | checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" 888 | 889 | [[package]] 890 | name = "wasi" 891 | version = "0.11.0+wasi-snapshot-preview1" 892 | source = "registry+https://github.com/rust-lang/crates.io-index" 893 | checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" 894 | 895 | [[package]] 896 | name = "winapi" 897 | version = "0.3.9" 898 | source = "registry+https://github.com/rust-lang/crates.io-index" 899 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 900 | dependencies = [ 901 | 
"winapi-i686-pc-windows-gnu", 902 | "winapi-x86_64-pc-windows-gnu", 903 | ] 904 | 905 | [[package]] 906 | name = "winapi-i686-pc-windows-gnu" 907 | version = "0.4.0" 908 | source = "registry+https://github.com/rust-lang/crates.io-index" 909 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 910 | 911 | [[package]] 912 | name = "winapi-util" 913 | version = "0.1.5" 914 | source = "registry+https://github.com/rust-lang/crates.io-index" 915 | checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" 916 | dependencies = [ 917 | "winapi", 918 | ] 919 | 920 | [[package]] 921 | name = "winapi-x86_64-pc-windows-gnu" 922 | version = "0.4.0" 923 | source = "registry+https://github.com/rust-lang/crates.io-index" 924 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 925 | 926 | [[package]] 927 | name = "windows-sys" 928 | version = "0.36.1" 929 | source = "registry+https://github.com/rust-lang/crates.io-index" 930 | checksum = "ea04155a16a59f9eab786fe12a4a450e75cdb175f9e0d80da1e17db09f55b8d2" 931 | dependencies = [ 932 | "windows_aarch64_msvc", 933 | "windows_i686_gnu", 934 | "windows_i686_msvc", 935 | "windows_x86_64_gnu", 936 | "windows_x86_64_msvc", 937 | ] 938 | 939 | [[package]] 940 | name = "windows_aarch64_msvc" 941 | version = "0.36.1" 942 | source = "registry+https://github.com/rust-lang/crates.io-index" 943 | checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47" 944 | 945 | [[package]] 946 | name = "windows_i686_gnu" 947 | version = "0.36.1" 948 | source = "registry+https://github.com/rust-lang/crates.io-index" 949 | checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6" 950 | 951 | [[package]] 952 | name = "windows_i686_msvc" 953 | version = "0.36.1" 954 | source = "registry+https://github.com/rust-lang/crates.io-index" 955 | checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024" 956 | 957 | [[package]] 958 | name = "windows_x86_64_gnu" 959 | version = "0.36.1" 960 | source = "registry+https://github.com/rust-lang/crates.io-index" 961 | checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1" 962 | 963 | [[package]] 964 | name = "windows_x86_64_msvc" 965 | version = "0.36.1" 966 | source = "registry+https://github.com/rust-lang/crates.io-index" 967 | checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680" 968 | 969 | [[package]] 970 | name = "zstd" 971 | version = "0.11.2+zstd.1.5.2" 972 | source = "registry+https://github.com/rust-lang/crates.io-index" 973 | checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" 974 | dependencies = [ 975 | "zstd-safe", 976 | ] 977 | 978 | [[package]] 979 | name = "zstd-safe" 980 | version = "5.0.2+zstd.1.5.2" 981 | source = "registry+https://github.com/rust-lang/crates.io-index" 982 | checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" 983 | dependencies = [ 984 | "libc", 985 | "zstd-sys", 986 | ] 987 | 988 | [[package]] 989 | name = "zstd-sys" 990 | version = "2.0.1+zstd.1.5.2" 991 | source = "registry+https://github.com/rust-lang/crates.io-index" 992 | checksum = "9fd07cbbc53846d9145dbffdf6dd09a7a0aa52be46741825f5c97bdd4f73f12b" 993 | dependencies = [ 994 | "cc", 995 | "libc", 996 | ] 997 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["phiresky "] 3 | 
description = "Extension for sqlite that provides transparent dictionary-based row-level compression for sqlite" 4 | edition = "2024" 5 | license = "LGPL-2.0-or-later" 6 | name = "sqlite-zstd" 7 | repository = "https://github.com/phiresky/sqlite-zstd" 8 | version = "0.3.5" 9 | readme = "README.md" 10 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 11 | 12 | [features] 13 | default = [] 14 | 15 | build_extension = ["rusqlite/loadable_extension"] 16 | 17 | # debug_zstd = ["zstd/debug"] 18 | 19 | benchmark = ["structopt", "procfs", "rusqlite/backup", "rusqlite/load_extension", "csv"] 20 | 21 | all = ["benchmark"] 22 | 23 | [[bin]] 24 | name = "benchmark" 25 | required-features = ["benchmark"] 26 | 27 | [[bin]] 28 | name = "create_test_db" 29 | required-features = ["benchmark"] 30 | 31 | [lib] 32 | crate-type = ["cdylib"] 33 | 34 | [dependencies] 35 | zstd = {version = "0.11.2", features = ["experimental"]} 36 | #zstd = {version = "0.5.3", path="../zstd-rs"} 37 | #zstd = {version = "=0.5.4"} 38 | anyhow = "1.0.44" 39 | serde = {version = "1.0.130", features = ["derive"]} 40 | serde_json = "1.0.68" 41 | 42 | csv = {version = "1.1.6", optional = true} 43 | env_logger = "0.9.0" 44 | lazy_static = "1.4.0" 45 | log = "0.4.14" 46 | lru_time_cache = "0.11.11" 47 | owning_ref = "0.4.1" 48 | procfs = {version = "0.13.2", optional = true} 49 | rand = "0.8.4" 50 | structopt = {version = "0.3.23", optional = true} 51 | 52 | [dependencies.rusqlite] 53 | features = ["functions", "blob", "bundled", "array"] 54 | package = "rusqlite" 55 | version = "0.35.0" 56 | 57 | [dev-dependencies] 58 | chrono = "0.4.19" 59 | names = "0.14.0" 60 | pretty_assertions = "1.2.1" 61 | 62 | [profile.release] 63 | lto = "fat" 64 | 65 | # cargo-deb configuration 66 | # https://github.com/kornelski/cargo-deb 67 | [package.metadata.deb] 68 | # Debianized package name, conveniently matches the name of the shared library file 69 | name = "libsqlite-zstd" 70 | # $auto fills in the automatically calculated dependencies (namely libc) 71 | # libsqlite3-0 is added because this library isn't very useful without SQLite 72 | depends = "$auto, libsqlite3-0" 73 | # This feature is required to build the shared library extension 74 | features = ["build_extension"] 75 | assets = [ 76 | # Install the shared library extension to /usr/lib, where SQLite can find it 77 | ["target/release/libsqlite_zstd.so", "usr/lib/", "744"], 78 | # It's good practice to install the README file into /usr/share/doc for every package 79 | ["README.md", "usr/share/doc/libsqlite-zstd/README", "644"], 80 | ] 81 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 
18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 
90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. 
If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 | 
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # sqlite-zstd
2 | 
3 | Extension for sqlite that provides transparent dictionary-based row-level compression. This basically allows you to compress entries in a sqlite database almost as well as if you were compressing the whole DB file, while still retaining random access.
4 | 
5 | See also the announcement blog post for some motivation, benchmarks and ramblings: https://phiresky.github.io/blog/2022/sqlite-zstd
6 | 
7 | [![size comparison chart](doc/2022-07-31-19-27-57.png)](https://phiresky.github.io/blog/2022/sqlite-zstd)
8 | 
9 | Depending on the data, this can reduce the size of the database by 80% while keeping performance mostly the same (or even improving it, since the data to be read from disk is smaller).
10 | 
11 | Note that a compression VFS such as https://github.com/mlin/sqlite_zstd_vfs might be better suited depending on the use case. It has very different tradeoffs and capabilities, but the end result is similar.
12 | 
13 | ## Transparent Compression
14 | 
15 | - `zstd_enable_transparent(config)`
16 | 
17 |   Enable transparent row-level compression of the given column on the given table.
18 | 
19 |   You can call this function several times on the same table with different columns to compress.
20 | 
21 |   ```sql
22 |   SELECT
23 |     zstd_enable_transparent('{"table": "objects", "column": "data1", "compression_level": 19, "dict_chooser": "''a''"}'),
24 |     zstd_enable_transparent('{"table": "objects", "column": "data2", "compression_level": 19, "dict_chooser": "''a''"}')
25 | 
26 |   ```
27 | 
28 |   The data will be moved to `_table_name_zstd`, while `table_name` will be a view that can be queried as normal, including SELECT, INSERT, UPDATE, and DELETE queries. This function will not compress any data by itself; you need to call `zstd_incremental_maintenance` afterwards.
29 | 
30 |   `config` is a JSON object describing the configuration. See [TransparentCompressConfig](src/transparent.rs#L34) for details.
31 | 
32 |   The following differences apply when compression is active:
33 | 
34 |   - The compressed column may only contain `blob` or `text` data, depending on the affinity of the declared data type (e.g. `VARCHAR(10)` is fine, but `int` is not).
35 |   - The primary key must not be null for any row, otherwise updates may not work as expected.
36 |   - sqlite3_changes() will return 0 for modifying queries ([see here](https://sqlite.org/c3ref/changes.html)).
37 |   - The SQLite streaming blob reading API will be somewhat useless since the blob is fully copied into memory anyway.
38 |   - Attaching a database containing compressed tables using `ATTACH 'foo.db'` is not supported.
40 |
41 | - `zstd_incremental_maintenance(duration_seconds: float | null, db_load: float) -> bool`
42 |
43 | Perform an incremental maintenance operation taking around the given amount of time.
44 | This will train dictionaries and compress data based on the grouping given in the TransparentCompressConfig.
45 |
46 | **In order for the size of your database file to actually shrink, you also have to call `VACUUM`**. Otherwise SQLite just marks pages as free (and reuses them for new data).
47 |
48 | `duration_seconds`: If the given amount of time is 0, do a single step and exit as soon as possible. If it is null, run until all pending maintenance is complete.
49 |
50 | `db_load`: specifies the ratio of time the db will be locked with write queries. For example: if set to 0.5, after each write operation taking 2 seconds the maintenance function will sleep for 2 seconds so other processes have time to run write operations against the database. If set to 1, the maintenance will not sleep. Note that this is only useful if you run the incremental maintenance function in a separate thread or process from your other logic. Note that both the duration and the db load are best-effort: there is no exact guarantee about the amount of time the database will stay locked at a time.
51 |
52 | _Returns_ 1 if there is more work to be done, 0 if everything is compressed as it should be.
53 |
54 | Note that each call of this function has a start-up cost equivalent to `select * from table where dictid is null`, so longer durations are more efficient.
55 |
56 | This function can safely be interrupted at any time; each chunk of compression work is done as an atomic operation.
57 |
58 | Examples:
59 |
60 | - `zstd_incremental_maintenance(null, 1)`: Compresses everything, as fast as possible. Useful if the db is not currently in use.
61 | - `zstd_incremental_maintenance(60, 0.5)`: Spend 60 seconds compressing pending stuff, while allowing other queries to run 50% of the time.
62 |
63 | Example output:
64 |
65 | ```
66 | sqlite> select zstd_incremental_maintenance(null, 1);
67 | [2020-12-23T21:11:31Z WARN sqlite_zstd::transparent] Warning: It is recommended to set `pragma busy_timeout=2000;` or higher
68 | [2020-12-23T21:11:40Z INFO sqlite_zstd::transparent] events.data: Total 5.20GB to potentially compress.
69 | [2020-12-23T21:13:22Z INFO sqlite_zstd::transparent] Compressed 6730 rows with dictid=109. Total size of entries before: 163.77MB, afterwards: 2.12MB, (average: before=24.33kB, after=315B)
70 | [2020-12-23T21:13:43Z INFO sqlite_zstd::transparent] Compressed 4505 rows with dictid=110. Total size of entries before: 69.28MB, afterwards: 1.60MB, (average: before=15.38kB, after=355B)
71 | [2020-12-23T21:14:06Z INFO sqlite_zstd::transparent] Compressed 5228 rows with dictid=111. Total size of entries before: 91.97MB, afterwards: 1.41MB, (average: before=17.59kB, after=268B)
72 | ```
73 |
74 | - `zstd_train_dict_and_save(agg, dict_size: int, sample_count: int, dict_chooser_key: text) -> int`
75 |
76 | This function is used internally by zstd_incremental_maintenance. Same as `zstd_train_dict`, but the dictionary is saved to the `_zstd_dicts` table and its id is returned. The dict_chooser_key is used to identify the dictionary during compression, while during decompression only the integer primary key is used (that way, changing the dict chooser expression is safe).
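Putting the pieces together, a typical full compression pass (table and column names are the illustrative ones from above) looks like this:

```sql
SELECT zstd_enable_transparent('{"table": "objects", "column": "data1", "compression_level": 19, "dict_chooser": "''a''"}');
-- train dictionaries and compress all pending rows, then reclaim the freed pages
SELECT zstd_incremental_maintenance(null, 1);
VACUUM;
```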
77 |
78 | ## Basic Functionality
79 |
80 | - `zstd_compress(data: text|blob, level: int = 3, dictionary: blob | int | null = null, compact: bool = false) -> blob`
81 |
82 | Compresses the given data with the given compression level (1 - 22, default 3).
83 |
84 | - If dictionary is a blob, it will be used directly.
85 | - If dictionary is an int i, it is functionally equivalent to `zstd_compress(data, level, (select dict from _zstd_dicts where id = i))`.
86 | - If dictionary is not present, null, or -1, the data is compressed without a dictionary.
87 |
88 | If compact is true, the output will be without magic header, without checksums, and without dictids. This will save 4 bytes when not using dictionaries and 8 bytes when using dictionaries. It also means the data will not be decodable as a normal zstd archive with the standard tools.
89 | The same compact argument must also be passed to the decompress function.
90 |
91 | - `zstd_decompress(data: blob, is_text: bool, dictionary: blob | int | null = null, compact: bool = false) -> text|blob`
92 |
93 | Decompresses the given data. If the dictionary is wrong, the result is undefined.
94 |
95 | - If dictionary is a blob, it will be used directly.
96 | - If dictionary is an int i, it is functionally equivalent to `zstd_decompress(data, is_text, (select dict from _zstd_dicts where id = i))`.
97 | - If dictionary is not present, null, or -1, it is assumed the data was compressed without a dictionary.
98 |
99 | Note that passing the dictionary as an int is recommended, since then the dictionary only has to be prepared once.
100 |
101 | is_text specifies whether to output the data as text or as a blob. Note that when outputting as text the encoding depends on the sqlite database encoding. sqlite-zstd is only tested with UTF-8.
102 |
103 | compact must be specified when the compress function was also called with compact.
104 |
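As a quick sanity check, the two functions round-trip (the literal value here is just an example):

```sql
SELECT zstd_decompress(zstd_compress('hello world', 19), true); -- returns 'hello world'
```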
105 | - `zstd_train_dict(agg, dict_size: int, sample_count: int) -> blob`
106 |
107 | Aggregate function (like sum() or count()) to train a zstd dictionary on sample_count randomly selected samples of the given aggregate data.
108 |
109 | Example use: `select zstd_train_dict(tbl.data, 100000, 1000) from tbl` will return a dictionary of size 100kB trained on 1000 random samples from `tbl`.
110 |
111 | The recommended number of samples is 100x the target dictionary size. As an example, you can train a dict of 100kB with the "optimal" sample count as follows:
112 |
113 | ```sql
114 | select zstd_train_dict(data, 100000, (select (100000 * 100 / avg(length(data))) as sample_count from tbl))
115 | as dict from tbl
116 | ```
117 |
118 | Note that dict_size and sample_count are assumed to be constants.
119 |
120 | # Compiling
121 |
122 | This project can be built in two modes: (a) as a Rust library and (b) as a pure SQLite extension (with `--features build_extension`).
123 |
124 | You can get the SQLite extension binaries from the GitHub releases. Alternatively, you can build the extension by hand:
125 |
126 | ```
127 | cargo build --release --features build_extension
128 | # should give you target/release/libsqlite_zstd.so
129 | ```
130 |
131 | ## Cross Compiling
132 |
133 | For cross-compiling to `aarch64-linux-android`, you need to
134 | 1. Add the target we need to cross-compile for:
135 | ```bash
136 | rustup target add aarch64-linux-android
137 | ```
138 |
139 | 2. Prepare the [Android NDK](https://developer.android.com/ndk) (binutils is deprecated and was removed in NDK 23+, so you need to download an older version of the NDK)
140 |
141 | 3. Set up the NDK binary path:
142 | ```bash
143 | export PATH="$PATH:/toolchains/llvm/prebuilt/linux-x86_64/bin"
144 | ```
145 |
146 | 4. Specify the linker in the [cargo configuration file](https://doc.rust-lang.org/cargo/reference/config.html):
147 | ```toml
148 | [target.aarch64-linux-android]
149 | linker = "aarch64-linux-android23-clang"
150 | ```
151 |
152 | 5. Specify the `target` accordingly when building:
153 | ```bash
154 | cargo build -r --features build_extension --target aarch64-linux-android
155 | ```
156 |
157 | ## As a Python "extension"
158 |
159 | If you want to use this project as an SQLite extension inside a Python project,
160 | you can install it as a Python package (you still need to have a Rust compiler
161 | to actually build the binary):
162 |
163 | ```bash
164 | pip install 'git+https://github.com/phiresky/sqlite-zstd.git#egg=sqlite_zstd&subdirectory=python'
165 | ```
166 |
167 | This installs the extension as a Python package, with some support code to make
168 | it easy to use from Python code or [Datasette](https://datasette.io/).
169 |
170 | # Usage
171 |
172 | You can load this library either as an SQLite extension or as a Rust library. Note that SQLite extensions are not persistent, so you need to load the extension each time you connect to the database.
173 |
174 | **Is this library production ready?**
175 |
176 | I wouldn't trust it with my data (yet). Make sure you have backups of everything. I'm also not making any guarantees for backwards compatibility of future updates, though migrating by copying over the uncompressed data should of course work fine.
177 |
178 | **SQLite CLI**
179 |
180 | Either load it in the REPL:
181 |
182 | ```sh
183 | $ sqlite3 file.db
184 | SQLite version 3.34.0 2020-12-01 16:14:00
185 | sqlite> .load .../libsqlite_zstd.so
186 | [2020-12-23T21:30:02Z INFO sqlite_zstd::create_extension] [sqlite-zstd] initialized
187 | sqlite>
188 | ```
189 |
190 | Or alternatively:
191 |
192 | `sqlite3 -cmd '.load libsqlite_zstd.so' 'select * from foo'`
193 |
194 | **C API**
195 |
196 | ```c
197 | int success = sqlite3_load_extension(db, "libsqlite_zstd.so", NULL, NULL);
198 | ```
199 |
200 | See [here](https://www.sqlite.org/loadext.html) for more information.
201 |
202 | **Rust**
203 |
204 | The recommended method is to add `sqlite_zstd` as a dependency to your project, then load it using
205 |
206 | ```rust
207 | let conn: rusqlite::Connection;
208 | sqlite_zstd::load(&conn)?;
209 | ```
210 |
211 | Alternatively, you can load the extension like any other extension:
212 |
213 | ```rust
214 | let conn: rusqlite::Connection;
215 | conn.load_extension("libsqlite_zstd.so", None)?;
216 | ```
217 |
218 | See [here](https://docs.rs/rusqlite/0.24.2/rusqlite/struct.Connection.html#method.load_extension) for more information.
219 |
220 | **Python**
221 |
222 | If you have installed this as a Python module as described above, you can load
223 | the extension into an existing SQLite connection like this:
224 |
225 | ```python
226 | import sqlite3
227 | import sqlite_zstd
228 |
229 | conn = sqlite3.connect(':memory:')
230 | sqlite_zstd.load(conn)
231 | ```
232 |
233 | When using Datasette, this extension is loaded automatically into every
234 | connection.
235 |
236 | # Verbosity / Debugging
237 |
238 | You can change the log level by setting the environment variable `SQLITE_ZSTD_LOG=error` for less logging and `SQLITE_ZSTD_LOG=debug` for more logging.
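When loading from Rust, the default log level can also be set programmatically; `load_with_loglevel` is exported alongside `load`, with `LogLevel` re-exporting `log::LevelFilter` (same sketch style as the Rust snippets above):

```rust
let conn: rusqlite::Connection;
sqlite_zstd::load_with_loglevel(&conn, sqlite_zstd::LogLevel::Debug)?;
```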
239 |
240 | # Future Work / Ideas / Todo
241 |
242 | - investigate startup cost without dictionary
243 | - correctly handle indices over compressed columns (try generated columns instead of views, maybe vtables, ask the sqlite devs)
244 | - do compression in different thread(s) for performance (e.g. using .multithread(1) in zstd?)
245 | - type affinity interferes with int pass through - `insert into compressed (col) values (1)` will result in typeof(col) = text instead of integer if the type of the column was declared as text - which in turn causes decompression to fail with "got string, but zstd compressed data is always blob"
246 | - either change the type of the compressed column to blob or similar or disallow integer passthrough
247 |
-------------------------------------------------------------------------------- /clippy.toml: --------------------------------------------------------------------------------
1 | allow-print-in-tests = true
2 |
-------------------------------------------------------------------------------- /doc/2022-07-31-19-27-57.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/sqlite-zstd/ebc5d418dd2057099c9d0d36df155e8637a04748/doc/2022-07-31-19-27-57.png -------------------------------------------------------------------------------- /doc/sqlitebrowser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/sqlite-zstd/ebc5d418dd2057099c9d0d36df155e8637a04748/doc/sqlitebrowser.png -------------------------------------------------------------------------------- /python/README.md: --------------------------------------------------------------------------------
1 | # sqlite-zstd
2 | An SQLite extension that provides transparent dictionary-based row-level compression. This basically allows you to compress entries in an SQLite database almost as well as if you were compressing the whole DB file, while retaining random access.
3 |
4 | See also the announcement blog post for some motivation, benchmarks and ramblings: https://phiresky.github.io/blog/2022/sqlite-zstd
5 |
6 | Depending on the data, this can reduce the size of the database by 80% while keeping performance mostly the same (or even improving it, since the data to be read from disk is smaller).
7 |
8 | Note that a compression VFS such as https://github.com/mlin/sqlite_zstd_vfs might be better suited depending on the use case. That has very different tradeoffs and capabilities, but the end result is similar.
9 |
10 | ## Install
11 | ```bash
12 | pip install sqlite-zstd
13 | ```
14 |
15 | ## Usage
16 | ```python
17 | import sqlite3
18 | import sqlite_zstd
19 |
20 | conn = sqlite3.connect(':memory:')
21 | sqlite_zstd.load(conn)
22 | ```
23 |
-------------------------------------------------------------------------------- /python/lib/__init__.py: --------------------------------------------------------------------------------
1 | import sqlite3
2 | from importlib.resources import files, as_file
3 |
4 | def load(conn: sqlite3.Connection) -> None:
5 | lib = next(x for x in files(__name__).iterdir() if x.name.startswith('lib'))
6 | with as_file(lib) as ext:
7 | conn.load_extension(str(ext))
8 |
-------------------------------------------------------------------------------- /python/lib/datasette.py: --------------------------------------------------------------------------------
1 | from datasette import hookimpl
2 | from .
import load 3 | 4 | 5 | @hookimpl 6 | def prepare_connection(conn): 7 | conn.enable_load_extension(True) 8 | load(conn) 9 | conn.enable_load_extension(False) 10 | -------------------------------------------------------------------------------- /python/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.2", "setuptools_scm>=6.2", "setuptools_rust"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "sqlite-zstd" 7 | requires-python = ">=3.9" 8 | dynamic = ["version"] 9 | 10 | [project.readme] 11 | file = "README.md" 12 | content-type = "text/markdown" 13 | 14 | [project.entry-points.datasette] 15 | sqlite_zstd = "sqlite_zstd.datasette" 16 | 17 | [tool.setuptools.package-dir] 18 | sqlite_zstd = "lib" 19 | 20 | [tool.setuptools_scm] 21 | root = ".." 22 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from setuptools_rust import Binding, RustExtension 3 | 4 | setup( 5 | rust_extensions=[RustExtension('sqlite_zstd.libsqlite_zstd', 6 | path='../Cargo.toml', 7 | binding=Binding.NoBinding, 8 | features=['build_extension'], 9 | py_limited_api=True, 10 | )], 11 | ) 12 | -------------------------------------------------------------------------------- /src/add_functions.rs: -------------------------------------------------------------------------------- 1 | use crate::dict_training::ZstdTrainDictAggregate; 2 | use crate::util::*; 3 | use crate::{basic::zstd_decompress_fn, transparent::*}; 4 | 5 | use crate::basic::zstd_compress_fn; 6 | use rusqlite::functions::{Context, FunctionFlags}; 7 | 8 | pub fn add_functions(db: &rusqlite::Connection) -> anyhow::Result<()> { 9 | let nondeterministic = FunctionFlags::SQLITE_UTF8 | FunctionFlags::SQLITE_DIRECTONLY; 10 | let deterministic = FunctionFlags::SQLITE_UTF8 | FunctionFlags::SQLITE_DETERMINISTIC; 11 | 12 | let zstd_compress = |ctx: &Context| zstd_compress_fn(ctx, false).map_err(ah); 13 | let zstd_compress_col = |ctx: &Context| zstd_compress_fn(ctx, true).map_err(ah); 14 | 15 | let zstd_decompress = |ctx: &Context| zstd_decompress_fn(ctx, false).map_err(ah); 16 | let zstd_decompress_col = |ctx: &Context| zstd_decompress_fn(ctx, true).map_err(ah); 17 | // 18 | db.create_scalar_function("zstd_compress", 1, deterministic, zstd_compress)?; 19 | db.create_scalar_function("zstd_compress", 2, deterministic, zstd_compress)?; 20 | db.create_scalar_function("zstd_compress", 3, deterministic, zstd_compress)?; 21 | db.create_scalar_function("zstd_compress", 4, deterministic, zstd_compress)?; 22 | db.create_scalar_function("zstd_compress_col", 4, deterministic, zstd_compress_col)?; 23 | db.create_scalar_function("zstd_decompress", 2, deterministic, zstd_decompress)?; 24 | db.create_scalar_function("zstd_decompress", 3, deterministic, zstd_decompress)?; 25 | db.create_scalar_function("zstd_decompress", 4, deterministic, zstd_decompress)?; 26 | db.create_scalar_function("zstd_decompress_col", 4, deterministic, zstd_decompress_col)?; 27 | 28 | db.create_aggregate_function( 29 | "zstd_train_dict", 30 | 3, 31 | nondeterministic, 32 | ZstdTrainDictAggregate { 33 | return_save_id: false, 34 | }, 35 | )?; 36 | db.create_aggregate_function( 37 | "zstd_train_dict_and_save", 38 | 4, 39 | nondeterministic, 40 | ZstdTrainDictAggregate { 41 | return_save_id: true, 42 | }, 43 | )?; 44 | 45 | 
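// The transparent-compression entry points below are registered with the
// `nondeterministic` flags (SQLITE_UTF8 | SQLITE_DIRECTONLY defined above),
// so they can only be called from direct top-level SQL, not from views,
// triggers or schema structures.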
db.create_scalar_function("zstd_enable_transparent", 1, nondeterministic, |ctx| { 46 | zstd_enable_transparent(ctx).map_err(ah) 47 | })?; 48 | 49 | db.create_scalar_function("zstd_incremental_maintenance", 2, nondeterministic, |ctx| { 50 | zstd_incremental_maintenance(ctx).map_err(ah) 51 | })?; 52 | 53 | Ok(()) 54 | } 55 | 56 | #[cfg(test)] 57 | pub mod tests { 58 | use super::*; 59 | use anyhow::Context; 60 | use chrono::TimeZone; 61 | pub use pretty_assertions::assert_eq; 62 | 63 | use rusqlite::{Connection, params}; 64 | use serde::{Deserialize, Serialize}; 65 | use std::collections::BTreeMap; 66 | 67 | // the point of this is that it's something you might store in a DB that has lots of redundant data 68 | #[derive(Serialize, Deserialize, Debug)] 69 | #[serde(tag = "type")] 70 | enum EventData { 71 | OpenApplication { 72 | id: i32, 73 | app_name: String, 74 | app_type: String, 75 | properties: BTreeMap, 76 | }, 77 | CloseApplication { 78 | id: i32, 79 | }, 80 | Shutdown, 81 | } 82 | 83 | pub fn create_example_db(seed: Option, eles: i32) -> anyhow::Result { 84 | let seed = seed.unwrap_or_else(|| thread_rng().r#gen()); 85 | lazy_static::lazy_static! { 86 | // people use maybe 100 different apps 87 | static ref APP_NAMES: Vec = names::Generator::with_naming(names::Name::Plain) 88 | .take(100) 89 | .collect(); 90 | // of maybe 10 different categories 91 | static ref APP_TYPES: Vec = names::Generator::with_naming(names::Name::Plain) 92 | .take(10) 93 | .collect(); 94 | }; 95 | let mut db = if std::env::var("TEST_TO_FILE").is_ok() { 96 | let db_fname = format!( 97 | "/tmp/foo.{}.sqlite3", 98 | rand::distributions::Uniform::from(0..10000).sample(&mut rand::thread_rng()) 99 | ); 100 | log::debug!("writing temp db to {}", db_fname); 101 | Connection::open(db_fname)? 102 | } else { 103 | Connection::open_in_memory().context("opening memory db")? 
56 | #[cfg(test)]
57 | pub mod tests {
58 | use super::*;
59 | use anyhow::Context;
60 | use chrono::TimeZone;
61 | pub use pretty_assertions::assert_eq;
62 |
63 | use rusqlite::{Connection, params};
64 | use serde::{Deserialize, Serialize};
65 | use std::collections::BTreeMap;
66 |
67 | // the point of this is that it's something you might store in a DB that has lots of redundant data
68 | #[derive(Serialize, Deserialize, Debug)]
69 | #[serde(tag = "type")]
70 | enum EventData {
71 | OpenApplication {
72 | id: i32,
73 | app_name: String,
74 | app_type: String,
75 | properties: BTreeMap<String, String>,
76 | },
77 | CloseApplication {
78 | id: i32,
79 | },
80 | Shutdown,
81 | }
82 |
83 | pub fn create_example_db(seed: Option<u64>, eles: i32) -> anyhow::Result<Connection> {
84 | let seed = seed.unwrap_or_else(|| thread_rng().r#gen());
85 | lazy_static::lazy_static! {
86 | // people use maybe 100 different apps
87 | static ref APP_NAMES: Vec<String> = names::Generator::with_naming(names::Name::Plain)
88 | .take(100)
89 | .collect();
90 | // of maybe 10 different categories
91 | static ref APP_TYPES: Vec<String> = names::Generator::with_naming(names::Name::Plain)
92 | .take(10)
93 | .collect();
94 | };
95 | let mut db = if std::env::var("TEST_TO_FILE").is_ok() {
96 | let db_fname = format!(
97 | "/tmp/foo.{}.sqlite3",
98 | rand::distributions::Uniform::from(0..10000).sample(&mut rand::thread_rng())
99 | );
100 | log::debug!("writing temp db to {}", db_fname);
101 | Connection::open(db_fname)?
102 | } else {
103 | Connection::open_in_memory().context("opening memory db")?
104 | };
105 | add_functions(&db).context("adding functions")?;
106 | db.execute_batch(
107 | "
108 | create table events (
109 | id integer primary key not null,
110 | timestamp text not null,
111 | data text not null,
112 | another_col text
113 | );
114 | ",
115 | )?;
116 |
117 | use rand::distributions::WeightedIndex;
118 | use rand::prelude::*;
119 |
120 | let window_properties = &[
121 | (30, "_GTK_APPLICATION_ID"),
122 | (30, "_GTK_APPLICATION_OBJECT_PATH"),
123 | (30, "_GTK_UNIQUE_BUS_NAME"),
124 | (30, "_GTK_WINDOW_OBJECT_PATH"),
125 | (40, "_NET_WM_USER_TIME_WINDOW"),
126 | (41, "WM_CLIENT_LEADER"),
127 | (50, "_NET_WM_BYPASS_COMPOSITOR"),
128 | (60, "WM_WINDOW_ROLE"),
129 | (61, "_MOTIF_WM_HINTS"),
130 | (90, "_GTK_THEME_VARIANT"),
131 | (91, "_NET_WM_SYNC_REQUEST_COUNTER"),
132 | (91, "_NET_WM_USER_TIME"),
133 | (139, "_NET_STARTUP_ID"),
134 | (170, "_NET_WM_ICON_NAME"),
135 | (180, "WM_HINTS"),
136 | (220, "_NET_WM_WINDOW_TYPE"),
137 | (220, "XdndAware"),
138 | (229, "WM_LOCALE_NAME"),
139 | (230, "_NET_WM_NAME"),
140 | (230, "_NET_WM_PID"),
141 | (230, "WM_CLIENT_MACHINE"),
142 | (240, "_NET_WM_DESKTOP"),
143 | (240, "_NET_WM_STATE"),
144 | (240, "WM_CLASS"),
145 | (240, "WM_NORMAL_HINTS"),
146 | (240, "WM_PROTOCOLS"),
147 | (240, "WM_STATE"),
148 | ];
149 |
150 | let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
151 | let event_type_dist = WeightedIndex::new([10, 10, 1])?;
152 | let window_properties_dist = WeightedIndex::new(window_properties.iter().map(|e| e.0))?;
153 | let app_id_dist = rand::distributions::Uniform::from(0..100);
154 | let data = (0..eles).map(|_| match event_type_dist.sample(&mut rng) {
155 | 0 => {
156 | let mut properties = BTreeMap::new();
157 | for _i in 1..rand::distributions::Uniform::from(100..1000).sample(&mut rng) {
158 | let p = window_properties[window_properties_dist.sample(&mut rng)].1;
159 | properties.insert(p.to_string(), "1".to_string());
160 | }
161 | EventData::OpenApplication {
162 | id: app_id_dist.sample(&mut rng),
163 | app_name: APP_NAMES.choose(&mut rng).unwrap().clone(),
164 | app_type: APP_TYPES.choose(&mut rng).unwrap().clone(),
165 | properties,
166 | }
167 | }
168 | 1 => EventData::CloseApplication {
169 | id: app_id_dist.sample(&mut rng),
170 | },
171 | 2 => EventData::Shutdown,
172 | _ => panic!("impossible"),
173 | });
174 | {
175 | let tx = db.transaction()?;
176 | {
177 | let mut insert = tx.prepare(
178 | "insert into events (timestamp, data, another_col) values (?, ?, ?)",
179 | )?;
180 | let date = chrono::Utc.ymd(2021, 1, 1).and_hms(0, 0, 0);
181 | for (i, d) in data.enumerate() {
182 | insert.execute(params![
183 | (date + chrono::Duration::seconds(30) * (i as i32)).to_rfc3339(),
184 | serde_json::to_string_pretty(&d)?,
185 | "rustacean"
186 | ])?;
187 | }
188 | }
189 | tx.commit()?;
190 | }
191 | Ok(db)
192 | }
193 |
194 | #[test]
195 | fn sanity() -> anyhow::Result<()> {
196 | let _db = create_example_db(None, 10).context("create eg db")?;
197 | Ok(())
198 | }
199 |
200 | fn test_strings() -> anyhow::Result<Vec<String>> {
201 | let data = [
202 | "hello this is a test",
203 | "foobar",
204 | "looooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooong",
205 | "nope",
206 | ];
207 | Ok(data.iter().map(|e| e.to_string()).collect())
208 | }
209 |
210 | #[test]
211 | fn compress_is_deterministic() -> anyhow::Result<()> {
212 | let db = create_example_db(None, 0)?;
213 |
214 | for eg in test_strings()?
{
215 | let compressed1: Vec<u8> =
216 | db.query_row("select zstd_compress(?)", params![eg], |r| r.get(0))?;
217 | let compressed2: Vec<u8> =
218 | db.query_row("select zstd_compress(?)", params![eg], |r| r.get(0))?;
219 | assert_eq!(compressed1, compressed2)
220 | }
221 |
222 | Ok(())
223 | }
224 |
225 | #[test]
226 | fn compress_decompress_roundtrip() -> anyhow::Result<()> {
227 | let db = create_example_db(None, 0)?;
228 |
229 | for eg in test_strings()? {
230 | let compressed: Vec<u8> = db
231 | .query_row("select zstd_compress(?)", params![eg], |r| r.get(0))
232 | .context("compressing")?;
233 | let decompressed: String = db
234 | .query_row(
235 | "select zstd_decompress(?, true)",
236 | params![compressed],
237 | |r| r.get(0),
238 | )
239 | .context("decompressing")?;
240 | assert_eq!(eg, decompressed)
241 | }
242 |
243 | Ok(())
244 | }
245 |
246 | #[test]
247 | fn decompress_type() -> anyhow::Result<()> {
248 | let db = create_example_db(None, 0)?;
249 |
250 | for eg in test_strings()? {
251 | let compressed: Vec<u8> =
252 | db.query_row("select zstd_compress(?)", params![eg], |r| r.get(0))?;
253 | let decompressed_text: String = db.query_row(
254 | "select zstd_decompress(?, true)",
255 | params![compressed],
256 | |r| r.get(0),
257 | )?;
258 |
259 | let decompressed_blob: Vec<u8> = db.query_row(
260 | "select zstd_decompress(?, false)",
261 | params![compressed],
262 | |r| r.get(0),
263 | )?;
264 | assert_eq!(decompressed_text.as_bytes(), decompressed_blob)
265 | }
266 |
267 | Ok(())
268 | }
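// A dictionary trained on similar rows should let a single row compress
// noticeably better than dictionary-less compression at the same level.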
269 | #[test]
270 | fn compress_with_dict_smaller() -> anyhow::Result<()> {
271 | let db = create_example_db(None, 100)?;
272 |
273 | let compressed1: Vec<u8> = db.query_row(
274 | "select zstd_compress((select data from events where id = 1), 5)",
275 | params![],
276 | |r| r.get(0),
277 | )?;
278 |
279 | let dict: Vec<u8> = db
280 | .query_row(
281 | "select zstd_train_dict(data, 1000, 100) from events",
282 | params![],
283 | |r| r.get(0),
284 | )
285 | .context("train dict")?;
286 |
287 | let compressed2: Vec<u8> = db
288 | .query_row(
289 | "select zstd_compress((select data from events where id = 1), 5, ?)",
290 | params![dict],
291 | |r| r.get(0),
292 | )
293 | .context("compress with dict")?;
294 |
295 | assert!(compressed1.len() > compressed2.len());
296 |
297 | let decompressed1: String = db
298 | .query_row("select zstd_decompress(?, 1)", params![compressed1], |r| {
299 | r.get(0)
300 | })
301 | .context("decompress 1")?;
302 |
303 | let decompressed2: String = db
304 | .query_row(
305 | "select zstd_decompress(?, 1, ?)",
306 | params![compressed2, dict],
307 | |r| r.get(0),
308 | )
309 | .context("decompress 2")?;
310 |
311 | assert_eq!(decompressed1, decompressed2);
312 |
313 | Ok(())
314 | }
315 |
316 | #[test]
317 | fn dict_saving_works() -> anyhow::Result<()> {
318 | let db = create_example_db(None, 100)?;
319 |
320 | let dict: i32 = db
321 | .query_row(
322 | "select zstd_train_dict_and_save(data, 1000, 100, null) from events",
323 | params![],
324 | |r| r.get(0),
325 | )
326 | .context("train dict")?;
327 |
328 | let uncompressed: String = db
329 | .query_row("select data from events where id = 1", params![], |r| {
330 | r.get(0)
331 | })
332 | .context("get data")?;
333 |
334 | let compressed2: Vec<u8> = db
335 | .query_row(
336 | "select zstd_compress((select data from events where id = 1), 5, ?)",
337 | params![dict],
338 | |r| r.get(0),
339 | )
340 | .context("compress with dict")?;
341 |
342 | let decompressed2: String = db
343 | .query_row(
344 | "select zstd_decompress(?, 1, ?)",
345 | params![compressed2, dict],
346 | |r| r.get(0),
347 | )
348 | .context("decompress 2")?;
349 |
350 | assert_eq!(uncompressed, decompressed2);
351 |
352 | Ok(())
353 | }
354 |
355 | #[test]
356 | fn levels() -> anyhow::Result<()> {
357 | let db = create_example_db(None, 5)?;
358 | /*db.prepare("select * from events")?
359 | .query_map(params![], |r| Ok(debug_row(r)))?
360 | .count();*/
361 |
362 | let mut st = db.prepare("select data from events")?;
363 | let eles: Vec<String> = st
364 | .query_map(params![], |r| r.get(0))
365 | .context("get sample")?
366 | .collect::<Result<_, _>>()?;
367 |
368 | for ele in eles {
369 | // let mut last_size = usize::MAX;
370 | for level in 1..24 {
371 | let compressed1: Vec<u8> = db
372 | .query_row("select zstd_compress(?, ?)", params![ele, level], |r| {
373 | r.get(0)
374 | })
375 | .context("compress")?;
376 | let decompressed1: String = db
377 | .query_row(
378 | "select zstd_decompress(?, ?)",
379 | params![compressed1, 1],
380 | |r| r.get(0),
381 | )
382 | .context("decompress")?;
383 |
384 | assert_eq!(ele, decompressed1);
385 | println!("l={}, size={}", level, compressed1.len());
386 | // assert!(compressed1.len() <= last_size);
387 | // last_size = compressed1.len();
388 | }
389 | }
390 |
391 | Ok(())
392 | }
393 | }
-------------------------------------------------------------------------------- /src/basic.rs: --------------------------------------------------------------------------------
1 | use crate::dict_management::*;
2 | use anyhow::Context as AContext;
3 |
4 | use rusqlite::functions::Context;
5 |
6 | use rusqlite::types::ToSqlOutput;
7 | use rusqlite::types::{Value, ValueRef};
8 | use std::{io::Write, sync::Arc};
9 | use zstd::bulk::Compressor;
10 | use zstd::dict::DecoderDictionary;
11 |
12 | /// null_dict_is_passthrough is only true when called through the `zstd_compress_col` function (for transparent compression)
13 | /// with null_dict_is_passthrough, the behaviour is slightly changed: When dict is null, the data is passed through without compression.
14 | pub(crate) fn zstd_compress_fn<'a>(
15 | ctx: &Context,
16 | null_dict_is_passthrough: bool,
17 | ) -> anyhow::Result<ToSqlOutput<'a>> {
18 | let arg_data = 0;
19 | let arg_level = 1;
20 | let arg_dict = 2;
21 | let arg_is_compact = 3;
22 |
23 | let input_value = match ctx.get_raw(arg_data) {
24 | ValueRef::Blob(b) => b,
25 | ValueRef::Text(b) => b,
26 | ValueRef::Null => return Ok(ToSqlOutput::Owned(Value::Null)), // pass through null
27 | e => {
28 | anyhow::bail!(
29 | "zstd_compress expects blob or text as input, got {}",
30 | e.data_type()
31 | )
32 | }
33 | };
34 |
35 | if null_dict_is_passthrough && ctx.len() >= arg_dict {
36 | // if the dict id is null, pass through data
37 | if let ValueRef::Null = ctx.get_raw(arg_dict) {
38 | // TODO: figure out if sqlite3_result_blob can be passed a pointer into sqlite3_context to avoid copying??
39 | // return Ok(ToSqlOutput::Borrowed(ctx.get_raw(arg_data)));
40 | return Ok(ToSqlOutput::Owned(Value::Blob(input_value.to_vec())));
41 | }
42 | }
43 |
44 | let level: i32 = if ctx.len() <= arg_level {
45 | // no level given, use default (currently 3)
46 | 0
47 | } else {
48 | ctx.get(arg_level).context("level argument")?
49 | };
50 | let compact: bool = if ctx.len() <= arg_is_compact {
51 | false
52 | } else {
53 | ctx.get(arg_is_compact).context("is_compact argument")?
54 | };
55 |
56 | if ctx.len() <= arg_dict {
57 | zstd_compress_fn_tail(compact, input_value, Compressor::new(level))
58 | } else {
59 | match ctx.get_raw(arg_dict) {
60 | ValueRef::Integer(-1) | ValueRef::Null => {
61 | zstd_compress_fn_tail(compact, input_value, Compressor::new(level))
62 | }
63 | ValueRef::Blob(d) => {
64 | zstd_compress_fn_tail(compact, input_value, Compressor::with_dictionary(level, d))
65 | }
66 | //Some(Arc::new(wrap_encoder_dict(d.to_vec(), level))),
67 | ValueRef::Integer(_) => {
68 | let dict = encoder_dict_from_ctx(ctx, arg_dict, level)
69 | .context("loading dictionary from int")?;
70 |
71 | let enc = Compressor::with_prepared_dictionary(&dict);
72 | zstd_compress_fn_tail(compact, input_value, enc)
73 | }
74 | other => anyhow::bail!(
75 | "dict argument must be int or blob, got {}",
76 | other.data_type()
77 | ),
78 | }
79 | }
80 | }
81 |
82 | // separate fn purely for borrowship simplicity
83 | fn zstd_compress_fn_tail<'a>(
84 | compact: bool,
85 | input_value: &[u8],
86 | encoder: Result<Compressor<'a>, std::io::Error>,
87 | ) -> anyhow::Result<ToSqlOutput<'a>> {
88 | let mut encoder = encoder.context("creating zstd encoder")?;
89 | {
90 | // pledge source size (benchmarking shows this doesn't help any tho)
91 | let cctx = encoder.context_mut();
92 | cctx.set_pledged_src_size(input_value.len() as u64)
93 | .map_err(|c| anyhow::anyhow!("setting pledged source size (code {c})"))?;
94 | // cctx.set_parameter(zstd::zstd_safe::CParameter::BlockDelimiters(false))
95 | // .map_err(|_| anyhow::anyhow!("no"))?;
96 | }
97 | if compact {
98 | encoder
99 | .include_checksum(false)
100 | .context("disable checksums")?;
101 | encoder.include_contentsize(false).context("disable contentsize")?;
102 | encoder.include_dictid(false).context("disable dictid")?;
103 | encoder.include_magicbytes(false).context("disable magicbytes")?;
104 | }
105 | let res = encoder
106 | .compress(input_value)
107 | .context("writing data to zstd encoder")?;
108 |
109 | Ok(ToSqlOutput::Owned(Value::Blob(res)))
110 | }
111 |
112 | pub(crate) fn zstd_decompress_fn<'a>(
113 | ctx: &Context,
114 | null_dict_is_passthrough: bool,
115 | ) -> anyhow::Result<ToSqlOutput<'a>> {
116 | let arg_data = 0;
117 | let arg_output_text = 1;
118 | let arg_dict = 2;
119 | let arg_is_compact = 3;
120 |
121 | if null_dict_is_passthrough && ctx.len() >= arg_dict {
122 | // if the dict id is null, pass through data
123 |
124 | if let ValueRef::Null = ctx.get_raw(arg_dict) {
125 | // TODO: figure out if sqlite3_result_blob can be passed a pointer into sqlite3_context to avoid copying??
126 | // return Ok(ToSqlOutput::Borrowed(ctx.get_raw(arg_data)));
127 | return Ok(ToSqlOutput::Owned(ctx.get_raw(arg_data).into()));
128 | }
129 | }
130 |
131 | let output_text: bool = ctx
132 | .get(arg_output_text)
133 | .context("output_text arg invalid")?;
134 |
135 | let input_value = match ctx.get_raw(arg_data) {
136 | ValueRef::Blob(b) => b,
137 | ValueRef::Null => return Ok(ToSqlOutput::Owned(Value::Null)), // pass through null
138 | e => {
139 | anyhow::bail!(
140 | "zstd_decompress expects blob as input, got {}",
141 | e.data_type()
142 | )
143 | }
144 | };
145 |
146 | let dict = if ctx.len() <= arg_dict {
147 | None
148 | } else {
149 | match ctx.get_raw(arg_dict) {
150 | ValueRef::Integer(-1) | ValueRef::Null => None,
151 | ValueRef::Blob(d) => Some(Arc::new(DecoderDictionary::copy(d))),
152 | ValueRef::Integer(_) => {
153 | Some(decoder_dict_from_ctx(ctx, arg_dict).context("load dict")?)
154 | }
155 | other => anyhow::bail!(
156 | "dict argument must be int or blob, got {}",
157 | other.data_type()
158 | ),
159 | }
160 | };
161 |
162 | let compact = if ctx.len() <= arg_is_compact {
163 | false
164 | } else {
165 | ctx.get(arg_is_compact).context("argument 'compact'")?
166 | };
167 | let dict_ref = dict.as_ref().map(|e| -> &DecoderDictionary { e });
168 |
169 | zstd_decompress_inner(input_value, dict_ref, output_text, compact)
170 | }
171 |
172 | fn zstd_decompress_inner<'a>(
173 | input_value: &[u8],
174 | dict: Option<&DecoderDictionary>,
175 | output_text: bool,
176 | compact: bool,
177 | ) -> anyhow::Result<ToSqlOutput<'a>> {
178 | let vec = {
179 | // todo: use zstd::bulk api maybe (but we don't know the output size)
180 | let out = Vec::new();
181 | let mut decoder = match &dict {
182 | Some(dict) => zstd::stream::write::Decoder::with_prepared_dictionary(out, dict),
183 | None => zstd::stream::write::Decoder::new(out),
184 | }
185 | .context("dict load doesn't work")?;
186 | if compact {
187 | decoder.include_magicbytes(false)?;
188 | }
189 | decoder.write_all(input_value).context("decoding")?;
190 | decoder.flush().context("decoder flushing")?;
191 | decoder.into_inner()
192 | };
193 |
194 | // dict; // to make sure the dict is still in scope because of https://github.com/gyscos/zstd-rs/issues/55
195 | if output_text {
196 | Ok(ToSqlOutput::Owned(Value::Text(
197 | // converted right back to &u8 in https://docs.rs/rusqlite/0.21.0/src/rusqlite/types/value_ref.rs.html#107
198 | // so we don't want the overhead of checking utf8. also db encoding might not be utf8 so ??
199 | unsafe { String::from_utf8_unchecked(vec) },
200 | )))
201 | } else {
202 | Ok(ToSqlOutput::Owned(Value::Blob(vec)))
203 | }
204 | }
205 |
-------------------------------------------------------------------------------- /src/bin/benchmark.rs: --------------------------------------------------------------------------------
1 | #![cfg(feature = "benchmark")]
2 |
3 | use anyhow::Context;
4 | use anyhow::Result;
5 | use rand::seq::SliceRandom;
6 | use rusqlite::{Connection, OpenFlags, params};
7 | use std::path::{Path, PathBuf};
8 | use std::{io::Write, time::Instant};
9 | use structopt::StructOpt;
10 | #[derive(Debug, StructOpt)]
11 | struct Config {
12 | #[structopt(short, long)]
13 | input_db: Vec<String>,
14 | #[structopt(short, long)]
15 | location: Vec<String>,
16 | #[structopt(short, long)]
17 | zstd_lib: String,
18 | #[structopt(short, long)]
19 | hot_cache: bool,
20 | #[structopt(short, long)]
21 | iterations: i32,
22 | }
23 |
24 | fn pragmas(db: &Connection) -> Result<()> {
25 | //let want_page_size = 32768;
26 | //db.execute(&format!("pragma page_size = {};", want_page_size))
27 | // .context("setup pragma 1")?;
28 | db.execute_batch(
29 | "
30 | pragma journal_mode = WAL;
31 | pragma foreign_keys = on;
32 | pragma temp_store = memory;
33 | pragma wal_autocheckpoint = 20;
34 | pragma synchronous = normal;
35 | pragma mmap_size = 30000000000;
36 | ",
37 | )?;
38 | let jm: String = db.pragma_query_value(None, "journal_mode", |r| r.get(0))?;
39 | if &jm != "wal" {
40 | anyhow::bail!("journal mode is not wal");
41 | }
42 | Ok(())
43 | }
44 | trait Bench {
45 | fn name(&self) -> &str;
46 | fn execute(&self, conn: &Connection) -> Result<i64>;
47 | }
48 |
49 | type DbId = i64;
50 | struct SelectBench {
51 | name: &'static str,
52 | ids: Vec<DbId>,
53 | }
54 |
55 | impl SelectBench {
56 | fn prepare_sequential(conn: &Connection) -> Result<Box<dyn Bench>> {
57 | Ok(Box::new(SelectBench {
58 | name: "Select 1000 sequential (compressed) values",
59 | ids:
conn.prepare("select id from title_basics where id >= (select id from title_basics order by random() limit 1) order by id asc limit 1000")?.query_map(params![], |r| r.get(0))?.collect::>()? 60 | })) 61 | } 62 | fn prepare_random(conn: &Connection) -> Result> { 63 | Ok(Box::new(SelectBench { 64 | name: "Select 1000 random (compressed) values", 65 | ids: conn 66 | .prepare("select id from title_basics order by random() limit 1000")? 67 | .query_map(params![], |r| r.get(0))? 68 | .collect::>()?, 69 | })) 70 | } 71 | } 72 | 73 | impl Bench for SelectBench { 74 | fn name(&self) -> &str { 75 | self.name 76 | } 77 | fn execute(&self, conn: &Connection) -> Result { 78 | let mut stmt = conn.prepare("select data from title_basics where id = ?")?; 79 | let mut _total_len = 0; 80 | for id in &self.ids { 81 | let data: String = stmt.query_row(params![id], |r| r.get(0))?; 82 | _total_len += data.len(); 83 | } 84 | 85 | // eprintln!("total bytes got: {}", _total_len); 86 | Ok(self.ids.len() as i64) 87 | } 88 | } 89 | 90 | struct UpdateBench { 91 | name: &'static str, 92 | values: Vec<(DbId, String)>, 93 | } 94 | impl UpdateBench { 95 | fn prepare_random(conn: &Connection) -> Result> { 96 | let ids: Vec = conn 97 | .prepare("select id from title_basics order by random() limit 1000")? 98 | .query_map(params![], |r| r.get(0))? 99 | .collect::>()?; 100 | 101 | let values: Vec = conn 102 | .prepare("select data from title_basics order by random() limit 1000")? 103 | .query_map(params![], |r| r.get(0))? 104 | .collect::>()?; 105 | Ok(Box::new(UpdateBench { 106 | name: "Update 1000 random (compressed) values", 107 | values: ids.into_iter().zip(values).collect(), 108 | })) 109 | } 110 | fn prepare_sequential(conn: &Connection) -> Result> { 111 | let ids: Vec = conn 112 | .prepare("select id from title_basics where id >= (select id from title_basics order by random() limit 1) order by id asc limit 1000")? 113 | .query_map(params![], |r| r.get(0))? 114 | .collect::>()?; 115 | 116 | let values: Vec = conn 117 | .prepare("select data from title_basics order by random() limit ?")? 118 | .query_map(params![ids.len()], |r| r.get(0))? 119 | .collect::>()?; 120 | Ok(Box::new(UpdateBench { 121 | name: "Update 1000 sequential (compressed) values", 122 | values: ids.into_iter().zip(values).collect(), 123 | })) 124 | } 125 | } 126 | impl Bench for UpdateBench { 127 | fn name(&self) -> &str { 128 | self.name 129 | } 130 | fn execute(&self, conn: &Connection) -> Result { 131 | conn.execute("begin", params![])?; 132 | let mut stmt = conn.prepare("update title_basics set data = ? where id = ?")?; 133 | for (id, value) in &self.values { 134 | stmt.execute(params![value, id])?; 135 | } 136 | conn.execute("commit", params![])?; 137 | Ok(self.values.len() as i64) 138 | } 139 | } 140 | struct InsertBench { 141 | name: &'static str, 142 | values: Vec, 143 | } 144 | impl InsertBench { 145 | fn prepare_random(conn: &Connection) -> Result> { 146 | let values: Vec = conn 147 | .prepare("select data from title_basics order by random() limit 1000")? 148 | .query_map(params![], |r| r.get(0))? 
140 | struct InsertBench {
141 | name: &'static str,
142 | values: Vec<String>,
143 | }
144 | impl InsertBench {
145 | fn prepare_random(conn: &Connection) -> Result<Box<dyn Bench>> {
146 | let values: Vec<String> = conn
147 | .prepare("select data from title_basics order by random() limit 1000")?
148 | .query_map(params![], |r| r.get(0))?
149 | .collect::<Result<_, _>>()?;
150 | Ok(Box::new(InsertBench {
151 | name: "Insert 1000 new values",
152 | values,
153 | }))
154 | }
155 | }
156 | impl Bench for InsertBench {
157 | fn name(&self) -> &str {
158 | self.name
159 | }
160 | fn execute(&self, conn: &Connection) -> Result<i64> {
161 | conn.execute("begin", params![])?;
162 | let mut stmt = conn.prepare("insert into title_basics (data) values (?)")?;
163 | for value in &self.values {
164 | stmt.execute(params![value])?;
165 | }
166 | conn.execute("commit", params![])?;
167 | Ok(self.values.len() as i64)
168 | }
169 | }
170 |
171 | fn drop_caches() -> Result<()> {
172 | eprintln!("dropping caches");
173 | assert!(std::process::Command::new("sync").status()?.success());
174 | std::fs::OpenOptions::new()
175 | .read(false)
176 | .write(true)
177 | .open("/proc/sys/vm/drop_caches")
178 | .context("Could not open drop caches")?
179 | .write_all(b"3")
180 | .context("Could not drop caches")?;
181 | Ok(())
182 | }
183 |
184 | struct BenchTarget {
185 | total_count: i64,
186 | total_duration_s: f64,
187 | path: PathBuf,
188 | }
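// Benchmark driver: copy each input database to every storage location once,
// then run each prepared bench `iterations` times per database, optionally
// dropping the OS page cache first and shuffling the target order to avoid
// crosstalk between the databases under test.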
189 | fn main() -> Result<()> {
190 | if cfg!(debug_assertions) {
191 | panic!("benching must be done in prod mode, otherwise the results are useless");
192 | }
193 | let config = Config::from_args();
194 | //let input_db = Connection::open_with_flags(config.input_db)?;
195 |
196 | let its_per_bench = config.iterations;
197 |
198 | println!("location,db filename,test name,iterations/s,number of samples");
199 |
200 | let benches: Vec<Vec<Box<dyn Bench>>> = {
201 | let db1 =
202 | Connection::open_with_flags(&config.input_db[0], OpenFlags::SQLITE_OPEN_READ_ONLY)?;
203 | type Preparer = Box<dyn Fn(&Connection) -> Result<Box<dyn Bench>>>;
204 | let preparers: Vec<Preparer> = vec![
205 | Box::new(SelectBench::prepare_random),
206 | Box::new(SelectBench::prepare_sequential),
207 | Box::new(UpdateBench::prepare_random),
208 | Box::new(UpdateBench::prepare_sequential),
209 | Box::new(InsertBench::prepare_random),
210 | ];
211 | preparers
212 | .iter()
213 | .map(|preparer| {
214 | eprintln!("running preparer {its_per_bench} times");
215 | (0..its_per_bench)
216 | .map(|_i| preparer(&db1))
217 | .collect::<Result<Vec<_>>>()
218 | .context("preparing benches")
219 | })
220 | .collect::<Result<Vec<_>>>()?
221 | };
222 |
223 | for locjoi in config.location {
224 | let (location_name, location) = {
225 | let vec: Vec<_> = locjoi.splitn(2, ':').collect();
226 | (vec[0], vec[1])
227 | };
228 | eprintln!("{} at {}", location_name, location);
229 |
230 | let db_paths = config
231 | .input_db
232 | .iter()
233 | .map(|input_db| {
234 | let pb = PathBuf::from(input_db);
235 | let file_name = pb.file_name().unwrap();
236 |
237 | let db_path = Path::new(&location).join(file_name);
238 | if !db_path.exists() {
239 | eprintln!("copying {} -> {}", input_db, db_path.to_string_lossy());
240 | std::fs::copy(input_db, &db_path)?;
241 | } else {
242 | eprintln!(
243 | "{} already exists, assuming it's the same",
244 | file_name.to_string_lossy()
245 | );
246 | }
247 | Ok(db_path)
248 | })
249 | .collect::<Result<Vec<_>>>()?;
250 | for bench_its in &benches {
251 | // eprintln!("{locjoi} benchmark {}", bench_its[0].name());
252 | let mut targets: Vec<_> = db_paths
253 | .iter()
254 | .map(|path| BenchTarget {
255 | total_count: 0,
256 | total_duration_s: 0.0,
257 | path: path.clone(),
258 | })
259 | .collect();
260 | for (i, bench) in bench_its.iter().enumerate() {
261 | eprintln!(
262 | "{locjoi} benchmark {} iteration {i} / {its_per_bench}",
263 | bench.name()
264 | );
265 | if !config.hot_cache {
266 | drop_caches()?;
267 | }
268 | // shuffle to make sure there is no crosstalk
269 | targets.shuffle(&mut rand::thread_rng());
270 |
271 | for target in targets.iter_mut() {
272 | let db = Connection::open(&target.path)?;
273 | pragmas(&db).context("Could not set pragmas")?;
274 | db.load_extension(&config.zstd_lib, None)?;
275 | let before = Instant::now();
276 | target.total_count += bench.execute(&db).context("executing bench")?;
277 | target.total_duration_s += before.elapsed().as_secs_f64();
278 | }
279 | }
280 | targets.sort_by_key(|e| e.path.clone());
281 | for target in &targets {
282 | println!(
283 | "{},{},{},{:.0},{}",
284 | location_name,
285 | target.path.file_name().unwrap().to_string_lossy(),
286 | bench_its[0].name(),
287 | target.total_count as f64 / target.total_duration_s,
288 | target.total_count
289 | );
290 | }
291 | }
292 | }
293 |
294 | Ok(())
295 | }
296 |
-------------------------------------------------------------------------------- /src/bin/create_test_db.rs: --------------------------------------------------------------------------------
1 | use std::fs::File;
2 |
3 | use anyhow::Context;
4 | use anyhow::Result;
5 | use rusqlite::Connection;
6 | use rusqlite::params;
7 | use serde::Deserialize;
8 | use serde::Serialize;
9 | use serde_json::json;
10 | use structopt::StructOpt;
11 |
12 | #[derive(Serialize, Deserialize)]
13 | #[allow(non_snake_case)]
14 | struct Title {
15 | tconst: String,
16 | titleType: String,
17 | primaryTitle: String,
18 | originalTitle: String,
19 | isAdult: i32,
20 | startYear: String,
21 | endYear: String,
22 | runtimeMinutes: String,
23 | genres: String,
24 | }
25 |
26 | fn pragmas(db: &Connection) -> Result<()> {
27 | //let want_page_size = 32768;
28 | //db.execute(&format!("pragma page_size = {};", want_page_size))
29 | // .context("setup pragma 1")?;
30 | db.execute_batch(
31 | "
32 | pragma journal_mode = WAL;
33 | pragma foreign_keys = on;
34 | pragma temp_store = memory;
35 | pragma wal_autocheckpoint = 20;
36 | pragma synchronous = normal;
37 | pragma mmap_size = 30000000000;
38 | ",
39 | )?;
40 | let jm: String = db.pragma_query_value(None, "journal_mode", |r| r.get(0))?;
41 | if &jm != "wal" {
42 | anyhow::bail!("journal mode is not wal");
43 | }
44 | Ok(())
45 | }
46 |
47 | #[derive(Debug, StructOpt)]
48 | struct Config {
49 | #[structopt(short, long)]
50 | zstd_lib: String,
51 | }
52 |
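// Builds the benchmark databases from the IMDb title.basics dataset: a
// columnar variant, a JSON variant, and vacuum-copies of the JSON variant
// that are then compressed transparently (with and without a dictionary).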
53 | fn main() -> Result<()> {
54 | let config = Config::from_args();
55 | env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
56 | std::env::set_var("SQLITE_ZSTD_LOG", "debug");
57 | // before running, download https://datasets.imdbws.com/title.basics.tsv.gz
58 | // loads title_basics.tsv.gz, creates a json database and a database in normal form
59 |
60 | log::info!("loading csv");
61 | let data: Vec<Title> = csv::ReaderBuilder::new()
62 | .delimiter(b'\t')
63 | .quoting(false)
64 | .from_reader(File::open("benchmark/title.basics.tsv")?)
65 | .deserialize()
66 | .collect::<std::result::Result<Vec<Title>, csv::Error>>()
67 | .context("reading title.basics.tsv")?;
68 | {
69 | log::info!("creating columnar db");
70 | let mut columnar = Connection::open("benchmark/imdb-columnar.sqlite3")?;
71 | pragmas(&columnar)?;
72 |
73 | columnar.execute("create table title_basics(
74 | id integer primary key, tconst text, titleType text, primaryTitle text, originalTitle text, isAdult int, startYear text, endYear text, runtimeMinutes text, genres text)", params![])?;
75 | let db = columnar.transaction()?;
76 | let mut stmt = db.prepare("insert into title_basics values (?,?,?,?,?,?,?,?,?,?)")?;
77 | for ele in &data {
78 | stmt.execute(params![
79 | &Option::<String>::None,
80 | &ele.tconst,
81 | &ele.titleType,
82 | &ele.primaryTitle,
83 | &ele.originalTitle,
84 | &ele.isAdult,
85 | &ele.startYear,
86 | &ele.endYear,
87 | &ele.runtimeMinutes,
88 | &ele.genres
89 | ])?;
90 | }
91 | drop(stmt);
92 | db.commit()?;
93 | }
94 | {
95 | log::info!("creating json db");
96 | let mut jsondb = Connection::open("benchmark/imdb-json.sqlite3").unwrap();
97 | pragmas(&jsondb)?;
98 | jsondb.execute(
99 | "create table title_basics(
100 | id integer primary key, data text)",
101 | params![],
102 | )?;
103 | let tx = jsondb.transaction()?;
104 | let mut stmt = tx.prepare("insert into title_basics values (?, ?)")?;
105 | for ele in &data {
106 | stmt.execute(params![
107 | &Option::<String>::None,
108 | &serde_json::to_string(ele)?
109 | ])?; 110 | } 111 | drop(stmt); 112 | tx.commit()?; 113 | log::info!("vacuum-copying dbs"); 114 | jsondb.execute( 115 | "vacuum into 'benchmark/imdb-json-nocompress.sqlite3'", 116 | params![], 117 | )?; 118 | jsondb.execute( 119 | "vacuum into 'benchmark/imdb-json-zstd-transparent.sqlite3'", 120 | params![], 121 | )?; 122 | jsondb.execute( 123 | "vacuum into 'benchmark/imdb-json-zstd-nodict.sqlite3'", 124 | params![], 125 | )?; 126 | } 127 | { 128 | log::info!("doing transparent compression"); 129 | let db = Connection::open("benchmark/imdb-json-zstd-transparent.sqlite3").unwrap(); 130 | pragmas(&db)?; 131 | db.load_extension(&config.zstd_lib, None)?; 132 | let config = json!({ 133 | "table": "title_basics", 134 | "column": "data", 135 | "compression_level": 19, 136 | "dict_chooser": "'i' || (id/3000000)" 137 | }); 138 | db.query_row( 139 | "select zstd_enable_transparent(?)", 140 | params![&serde_json::to_string(&config)?], 141 | |_| Ok(()), 142 | )?; 143 | db.query_row( 144 | "select zstd_incremental_maintenance(null, 1)", 145 | params![], 146 | |_| Ok(()), 147 | )?; 148 | db.execute("vacuum", params![])?; 149 | } 150 | { 151 | log::info!("doing nodict compression"); 152 | let db = Connection::open("benchmark/imdb-json-zstd-nodict.sqlite3").unwrap(); 153 | pragmas(&db)?; 154 | db.load_extension(&config.zstd_lib, None)?; 155 | let config = json!({ 156 | "table": "title_basics", 157 | "column": "data", 158 | "compression_level": 19, 159 | "dict_chooser": "'[nodict]'" 160 | }); 161 | db.query_row( 162 | "select zstd_enable_transparent(?)", 163 | params![&serde_json::to_string(&config)?], 164 | |_| Ok(()), 165 | )?; 166 | db.query_row( 167 | "select zstd_incremental_maintenance(null, 1)", 168 | params![], 169 | |_| Ok(()), 170 | )?; 171 | db.execute("vacuum", params![])?; 172 | } 173 | Ok(()) 174 | } 175 | -------------------------------------------------------------------------------- /src/create_extension.rs: -------------------------------------------------------------------------------- 1 | // https://www.sqlite.org/loadext.html 2 | // https://github.com/jgallagher/rusqlite/issues/524#issuecomment-507787350 3 | 4 | use rusqlite::Connection; 5 | use rusqlite::ffi; 6 | use std::os::raw::c_int; 7 | 8 | #[expect(clippy::not_unsafe_ptr_arg_deref)] 9 | #[unsafe(no_mangle)] 10 | pub extern "C" fn sqlite3_sqlitezstd_init( 11 | db: *mut ffi::sqlite3, 12 | pz_err_msg: *mut *mut std::os::raw::c_char, 13 | p_api: *mut ffi::sqlite3_api_routines, 14 | ) -> c_int { 15 | /* Insert here calls to 16 | ** sqlite3_create_function_v2(), 17 | ** sqlite3_create_collation_v2(), 18 | ** sqlite3_create_module_v2(), and/or 19 | ** sqlite3_vfs_register() 20 | ** to register the new features that your extension adds. 
21 | */ 22 | unsafe { Connection::extension_init2(db, pz_err_msg, p_api, init) } 23 | } 24 | 25 | fn init(db: Connection) -> rusqlite::Result<bool> { 26 | match crate::load(&db) { 27 | Ok(()) => { 28 | log::info!("[sqlite-zstd] initialized"); 29 | Ok(false) 30 | } 31 | Err(e) => { 32 | log::error!("[sqlite-zstd] init error: {:?}", e); 33 | Err(rusqlite::Error::ModuleError(format!("{:?}", e))) 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/dict_management.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Context as AContext; 2 | use lru_time_cache::LruCache; 3 | use rusqlite::Connection; 4 | use rusqlite::{functions::Context, params}; 5 | use std::sync::LazyLock; 6 | use std::sync::{Arc, Mutex}; 7 | use std::time::Duration; 8 | 9 | use zstd::dict::{DecoderDictionary, EncoderDictionary}; 10 | 11 | type EncoderCache = LruCache<(usize, i32, i32), Arc<EncoderDictionary<'static>>>; 12 | // we cache the instantiated encoder dictionaries keyed by (DbConnection, dict_id, compression_level) 13 | // DbConnection would ideally be db.path() because it's the same for multiple connections to the same db, but that would be less robust (e.g. in-memory databases) 14 | // we use a Mutex and not a RwLock because even the .get() methods on LruCache need to write (to update expiry and least recently used time) 15 | static ENCODER_DICTS: LazyLock<Mutex<EncoderCache>> = 16 | LazyLock::new(|| Mutex::new(LruCache::with_expiry_duration(Duration::from_secs(10)))); 17 | 18 | type DecoderCache = LruCache<(usize, i32), Arc<DecoderDictionary<'static>>>; 19 | static DECODER_DICTS: LazyLock<Mutex<DecoderCache>> = 20 | LazyLock::new(|| Mutex::new(LruCache::with_expiry_duration(Duration::from_secs(10)))); 21 | 22 | /// when we open a new connection, it may reuse the same pointer location as an old connection, so we need to invalidate parts of the dict cache 23 | pub(crate) fn invalidate_caches(_db: &Connection) { 24 | // (theoretically we only need to clear caches with key db_handle_pointer but it likely doesn't matter much, 25 | // how often are you going to open a new connection?) 26 | // let db_handle_pointer = unsafe { db.handle() } as usize; 27 | log::debug!("Invalidating dict caches"); 28 | { 29 | let mut cache = ENCODER_DICTS.lock().unwrap(); 30 | cache.clear(); 31 | } 32 | { 33 | let mut cache = DECODER_DICTS.lock().unwrap(); 34 | cache.clear(); 35 | } 36 | } 37 | // TODO: the rust interface currently requires a level when preparing a dictionary, but the zstd interface (ZSTD_CCtx_loadDictionary) does not. 38 | // TODO: Using LruCache here isn't very smart 39 | pub fn encoder_dict_from_ctx( 40 | ctx: &Context, 41 | arg_index: usize, 42 | level: i32, 43 | ) -> anyhow::Result<Arc<EncoderDictionary<'static>>> { 44 | let id: i32 = ctx.get(arg_index)?; 45 | let db = unsafe { ctx.get_connection()? }; // SAFETY: This might be unsafe depending on how the connection is used. 
See https://github.com/rusqlite/rusqlite/issues/643#issuecomment-640181213 46 | let db_handle_pointer = unsafe { db.handle() } as usize; // SAFETY: We're only getting the pointer as an int, not using the raw connection 47 | 48 | let mut dicts_write = ENCODER_DICTS.lock().unwrap(); 49 | let entry = dicts_write.entry((db_handle_pointer, id, level)); 50 | let res = match entry { 51 | lru_time_cache::Entry::Vacant(e) => e.insert({ 52 | log::debug!( 53 | "loading encoder dictionary {} level {} (should only happen once per 10s)", 54 | id, 55 | level 56 | ); 57 | 58 | let dict_raw: Vec<u8> = db 59 | .query_row( 60 | "select dict from _zstd_dicts where id = ?", 61 | params![id], 62 | |r| r.get(0), 63 | ) 64 | .with_context(|| format!("getting dict with id={id} from _zstd_dicts"))?; 65 | let dict = EncoderDictionary::copy(&dict_raw, level); 66 | Arc::new(dict) 67 | }), 68 | lru_time_cache::Entry::Occupied(o) => o.into_mut(), 69 | } 70 | .clone(); 71 | Ok(res) 72 | } 73 | 74 | pub fn decoder_dict_from_ctx( 75 | ctx: &Context, 76 | arg_index: usize, 77 | ) -> anyhow::Result<Arc<DecoderDictionary<'static>>> { 78 | // we cache the instantiated decoder dictionaries keyed by (DbConnection, dict_id) 79 | // DbConnection would ideally be db.path() because it's the same for multiple connections to the same db, but that would be less robust (e.g. in-memory databases) 80 | let id: i32 = ctx.get(arg_index)?; 81 | let db = unsafe { ctx.get_connection()? }; // SAFETY: This might be unsafe depending on how the connection is used. See https://github.com/rusqlite/rusqlite/issues/643#issuecomment-640181213 82 | let db_handle_pointer = unsafe { db.handle() } as usize; // SAFETY: We're only getting the pointer as an int, not using the raw connection 83 | log::trace!("Using DB Handle pointer {db_handle_pointer} as cache key"); 84 | let cache_key = (db_handle_pointer, id); 85 | // since the get() function on lru cache also writes (updates last used time and expiry), 86 | // we can not use DICTS.read() (RwLock) for perf 87 | let mut dicts_write = DECODER_DICTS.lock().unwrap(); 88 | let entry = dicts_write.entry(cache_key); 89 | let res = match entry { 90 | lru_time_cache::Entry::Vacant(e) => e.insert({ 91 | log::debug!( 92 | "loading decoder dictionary {} (should only happen once per 10s)", 93 | id 94 | ); 95 | let db = unsafe { ctx.get_connection()? 
}; 96 | let dict_raw: Vec<u8> = db 97 | .query_row( 98 | "select dict from _zstd_dicts where id = ?", 99 | params![id], 100 | |r| r.get(0), 101 | ) 102 | .with_context(|| format!("getting dict with id={id} from _zstd_dicts"))?; 103 | let dict = DecoderDictionary::copy(&dict_raw); 104 | Arc::new(dict) 105 | }), 106 | lru_time_cache::Entry::Occupied(o) => o.into_mut(), 107 | } 108 | .clone(); 109 | Ok(res) 110 | } 111 | 112 | /* 113 | 114 | 115 | use rusqlite::{functions::Context, params, types::ValueRef}; 116 | 117 | /// load a dict from sqlite function parameters 118 | /// 119 | /// sqlite sadly does not do auxdata caching for subqueries like `zstd_compress(data, 3, (select dict from _zstd_dicts where id = 4))` 120 | /// so instead we support the syntax `zstd_compress(data, 3, 4)` as an alias to the above 121 | /// if the dict parameter is a number, the dict will be queried from the _zstd_dicts table and cached in sqlite auxdata 122 | /// so it is only constructed once per query 123 | /// 124 | /// this function is not 100% correct because the level is passed separately from the dictionary but the dictionary is cached in the aux data of the dictionary parameter 125 | /// e.g. `select zstd_compress(tbl.data, tbl.row_compression_level, 123) from tbl` will probably compress all the data with the same compression ratio instead of a random one 126 | /// as a workaround `select zstd_compress(tbl.data, tbl.row_compression_level, (select 123)) from tbl` probably works 127 | /// to fix this the level parameter would need to be checked against the constructed dictionary and the dict discarded on mismatch 128 | pub fn encoder_dict_from_ctx<'a>( 129 | ctx: &'a Context, 130 | arg_index: usize, 131 | level: i32, 132 | ) -> rusqlite::Result<Arc<OwnedEncoderDict<'a>>> { 133 | Ok(match ctx.get_aux::<OwnedEncoderDict>(arg_index as i32)? { 134 | Some(d) => d, 135 | None => { 136 | log::debug!("loading dictionary (should only happen once per query)"); 137 | let dict_raw = match ctx.get_raw(arg_index) { 138 | ValueRef::Blob(b) => b.to_vec(), 139 | ValueRef::Integer(i) => { 140 | let db = unsafe { ctx.get_connection()? }; 141 | let res: Vec<u8> = db.query_row( 142 | "select dict from _zstd_dicts where id = ?", 143 | params![i], 144 | |r| r.get(0), 145 | )?; 146 | res 147 | } 148 | e => { 149 | return Err(rusqlite::Error::InvalidFunctionParameterType( 150 | arg_index, 151 | e.data_type(), 152 | )) 153 | } 154 | }; 155 | let dict = wrap_encoder_dict(dict_raw, level); 156 | ctx.set_aux(arg_index as i32, dict)?; 157 | ctx.get_aux::<OwnedEncoderDict>(arg_index as i32)?.unwrap() 158 | } 159 | }) 160 | } 161 | 162 | 163 | /// same as above 164 | pub fn decoder_dict_from_ctx<'a>( 165 | ctx: &'a Context, 166 | arg_index: usize, 167 | ) -> rusqlite::Result<Arc<OwnedDecoderDict<'a>>> { 168 | Ok(match ctx.get_aux::<OwnedDecoderDict>(arg_index as i32)? { 169 | Some(d) => d, 170 | None => { 171 | log::debug!("loading dictionary (should only happen once per query)"); 172 | let dict_raw = /*ctx.get::<Vec<u8>>(arg_index)?;*/ 173 | match ctx.get_raw(arg_index) { 174 | ValueRef::Blob(b) => b.to_vec(), 175 | ValueRef::Integer(i) => { 176 | let db = unsafe { ctx.get_connection()? 
}; 177 | let res: Vec<u8> = db.query_row( 178 | "select dict from _zstd_dicts where id = ?", 179 | params![i], 180 | |r| r.get(0), 181 | )?; 182 | res 183 | } 184 | e => return Err(rusqlite::Error::InvalidFunctionParameterType( 185 | arg_index, 186 | e.data_type(), 187 | )), 188 | }; 189 | let dict = wrap_decoder_dict(dict_raw); 190 | ctx.set_aux(arg_index as i32, dict)?; 191 | ctx.get_aux::<OwnedDecoderDict>(arg_index as i32)?.unwrap() 192 | } 193 | }) 194 | } 195 | */ 196 | -------------------------------------------------------------------------------- /src/dict_training.rs: -------------------------------------------------------------------------------- 1 | use crate::transparent::pretty_bytes; 2 | use crate::util::*; 3 | use anyhow::Context as AContext; 4 | use rand::Rng; 5 | use rusqlite::functions::Context; 6 | 7 | use rusqlite::params; 8 | use rusqlite::types::{Value, ValueRef}; 9 | 10 | pub struct ZstdTrainDictAggregate { 11 | /// if return_save_id is false, return the trained dict blob directly; if true, insert it into the _zstd_dicts table with the chooser_key given as fourth arg and return its id 12 | /// if false expects 3 args, if true expects 4 args 13 | pub return_save_id: bool, 14 | } 15 | pub struct ZstdTrainDictState { 16 | reservoir: Vec<Vec<u8>>, 17 | wanted_item_count: usize, 18 | total_count: usize, 19 | wanted_dict_size: usize, 20 | chooser_key: Option<Option<String>>, 21 | } 22 | 23 | impl rusqlite::functions::Aggregate<ZstdTrainDictState, Value> for ZstdTrainDictAggregate { 24 | fn init(&self, ctx: &mut Context) -> rusqlite::Result<ZstdTrainDictState> { 25 | let arg_dict_size_bytes = 1; 26 | let arg_sample_count = 2; 27 | let arg_chooser_key = 3; 28 | let wanted_item_count = ctx.get::<f64>(arg_sample_count)? as usize; 29 | log::debug!("sampling {} values", wanted_item_count); 30 | Ok(ZstdTrainDictState { 31 | reservoir: vec![], 32 | wanted_item_count, 33 | wanted_dict_size: ctx.get::<i64>(arg_dict_size_bytes)? as usize, 34 | total_count: 0, 35 | chooser_key: if self.return_save_id { 36 | Some(ctx.get(arg_chooser_key)?)
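// Note: chooser_key is doubly optional; the outer Option is None when
// return_save_id is false (nothing to save), the inner Option is None when the
// SQL argument itself is null.
// For illustration, the two variants would be invoked from SQL roughly like:
//   select zstd_train_dict_and_save(data, 100000, 10000, 'mykey') from tbl; -- returns the _zstd_dicts id
//   select zstd_train_dict(data, 100000, 10000) from tbl;                   -- returns the dict blob
// (zstd_train_dict_and_save is the name used by transparent.rs below; the
// 3-arg name is an assumption in this sketch.)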
37 | } else { 38 | None 39 | }, 40 | }) 41 | } 42 | fn step(&self, ctx: &mut Context, state: &mut ZstdTrainDictState) -> rusqlite::Result<()> { 43 | let arg_sample = 0; 44 | 45 | let cur = match ctx.get_raw(arg_sample) { 46 | ValueRef::Blob(b) => b, 47 | ValueRef::Text(b) => b, 48 | ValueRef::Real(_f) => return Ok(()), 49 | ValueRef::Integer(_i) => return Ok(()), 50 | ValueRef::Null => return Ok(()), 51 | }; 52 | let i = state.total_count; 53 | let k = state.wanted_item_count; 54 | // https://en.wikipedia.org/wiki/Reservoir_sampling#Simple_algorithm 55 | 56 | if i < k { 57 | state.reservoir.push(Vec::from(cur)); 58 | state.total_count += 1; 59 | return Ok(()); 60 | } 61 | state.total_count += 1; 62 | let j = rand::thread_rng().gen_range(0..=i); // inclusive upper bound: the current item is the (i+1)-th seen, so it must replace a reservoir slot with probability k/(i+1) 63 | if j < k { 64 | state.reservoir[j] = Vec::from(cur); 65 | } 66 | Ok(()) 67 | } 68 | 69 | fn finalize( 70 | &self, 71 | ctx: &mut Context, 72 | state: Option<ZstdTrainDictState>, 73 | ) -> rusqlite::Result<Value> { 74 | let state = 75 | state.ok_or_else(|| ah(anyhow::anyhow!("tried to train zstd dict on zero rows")))?; 76 | log::debug!( 77 | "training dict of max size {}kB with {} samples of total size {}kB (of {} samples seen)", 78 | state.wanted_dict_size / 1000, 79 | state.reservoir.len(), 80 | state.reservoir.iter().map(|x| x.len()).sum::<usize>() / 1000, 81 | state.total_count 82 | ); 83 | let dict = zstd::dict::from_samples(&state.reservoir, state.wanted_dict_size) 84 | .context("Training dictionary failed") 85 | .map_err(ah)?; 86 | log::debug!( 87 | "resulting dict has size {}", 88 | pretty_bytes(dict.len() as i64) 89 | ); 90 | if let Some(key) = state.chooser_key { 91 | let db = unsafe { ctx.get_connection()? }; 92 | ensure_dicts_table_exists(&db)?; 93 | db.execute( 94 | "insert into _zstd_dicts (chooser_key,dict) values (?, ?);", 95 | params![key, dict], 96 | )?; 97 | let id = db.last_insert_rowid(); 98 | log::debug!( 99 | "inserted dict into _zstd_dicts with key {}, id {}", 100 | key.as_deref().unwrap_or("null"), 101 | id 102 | ); 103 | Ok(Value::Integer(id)) 104 | } else { 105 | Ok(Value::Blob(dict)) 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::print_stdout)] 2 | 3 | use rusqlite::Connection; 4 | use util::init_logging; 5 | 6 | #[cfg(feature = "build_extension")] 7 | mod create_extension; 8 | 9 | mod add_functions; 10 | mod basic; 11 | mod dict_management; 12 | mod dict_training; 13 | mod transparent; 14 | mod util; 15 | 16 | pub use log::LevelFilter as LogLevel; 17 | 18 | /// Loads the sqlite extension with the default log level (INFO) 19 | pub fn load(connection: &Connection) -> anyhow::Result<()> { 20 | load_with_loglevel(connection, LogLevel::Info) 21 | } 22 | 23 | /// Loads the sqlite extension with the given log level 24 | pub fn load_with_loglevel( 25 | connection: &Connection, 26 | default_log_level: LogLevel, 27 | ) -> anyhow::Result<()> { 28 | init_logging(default_log_level); 29 | crate::dict_management::invalidate_caches(connection); 30 | crate::add_functions::add_functions(connection) 31 | } 32 | -------------------------------------------------------------------------------- /src/transparent.rs: -------------------------------------------------------------------------------- 1 | use crate::{util::*, *}; 2 | use anyhow::Context as AContext; 3 | use rusqlite::OptionalExtension; 4 | use rusqlite::functions::Context; 5 | use
rusqlite::types::ToSqlOutput; 6 | use rusqlite::types::Value; 7 | use rusqlite::{named_params, params}; 8 | use std::collections::HashMap; 9 | use std::time::{Duration, Instant}; 10 | 11 | // the output will be without magic header, without checksums, and without dictids. This will save 4 bytes when not using dictionaries and 8 bytes when using dictionaries. 12 | // this also means the data will not be decodable as a normal zstd archive with the standard tools 13 | static COMPACT: bool = true; 14 | #[derive(Debug)] 15 | struct ColumnInfo { 16 | name: String, 17 | coltype: String, 18 | is_primary_key: bool, 19 | to_compress: bool, 20 | is_dict_id: bool, 21 | } 22 | 23 | fn def_min_dict_size() -> i64 { 24 | 5000 25 | } 26 | fn def_dict_size_ratio() -> f32 { 27 | 0.01 28 | } 29 | fn def_train_dict_samples_ratio() -> f32 { 30 | 100.0 31 | } 32 | fn def_incremental_compression_step_bytes() -> i64 { 33 | // https://github.com/facebook/zstd/blob/dev/doc/images/CSpeed2.png 34 | // about 5MB/s at level 19 35 | 5_000_000 / 3 36 | } 37 | 38 | /// This is the configuration of the transparent compression for one column of one table. 39 | /// It is safe to change every property of this configuration at any time except for table and column, but data that is already compressed will not be recompressed with the new settings. 40 | /// You can update the config e.g. using SQL: `update _zstd_configs set config = json_patch(config, '{"target_db_load": 1}');` 41 | /// 42 | /// Note that the configuration is assumed to be trusted. For example, dict_chooser can probably be used for SQL injection. 43 | #[derive(serde::Serialize, serde::Deserialize)] 44 | pub struct TransparentCompressConfig { 45 | /// the name of the table to which the transparent compression will be applied. It will be renamed to _tblname_zstd and replaced with an editable view. 46 | pub table: String, 47 | /// the name of the column 48 | pub column: String, 49 | /// The compression level. Valid levels are 1-19. 50 | /// Compression will be significantly slower when the level is increased, but decompression speed should stay about the same regardless of compression level. 51 | /// That means the cost of a higher level is paid during zstd_incremental_maintenance, not during SELECT queries. 52 | pub compression_level: i8, 53 | /// An SQL expression that chooses which dict to use, or returns null if the data should stay uncompressed for now. 54 | /// Examples: 55 | /// 56 | /// * `'a'` 57 | /// This will cause a single dictionary to be trained for everything. 58 | /// 59 | /// * `strftime(created, '%Y-%m')` 60 | /// This will cause every month of data to be compressed with its own dictionary. 61 | /// 62 | /// * `nullif(strftime(created, '%Y-%m'), strftime('now', '%Y-%m'))` 63 | /// 64 | /// The same as above, but if the entry is from the current month it will stay uncompressed. 65 | /// This is handy because it means that the dictionary for the month will only be created when the month is over 66 | /// and can thus be optimized the most for the given data. 67 | /// * `case when date(timestamp, ''weekday 0'') < date(''now'', ''weekday 0'') then data_type || ''.'' || date(timestamp, ''weekday 0'') else null end` 68 | /// 69 | /// This one uses keys like `data_type.2020-11-01` where the date is the first day of the week, except for the current week which stays uncompressed. 70 | /// This means that every distinct data_type will be compressed with its own dictionary, separately for each week.
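///
/// For reference, a complete configuration as passed to zstd_enable_transparent could look
/// roughly like this (a minimal sketch reusing the table/column names from the tests in this
/// file; note the doubled single quotes needed to embed an SQL string in JSON inside SQL):
///
/// `select zstd_enable_transparent('{"table": "events", "column": "data", "compression_level": 3, "dict_chooser": "''a''"}');`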
71 | /// 72 | /// You can return the special string `[nodict]` to compress the given data without a dictionary. 73 | /// Note that the compression key is global for all tables. So if you want your dict to only apply to this table, return 74 | /// something like `"tablename." || strftime(...)`. 75 | pub dict_chooser: String, 76 | #[serde(default = "def_min_dict_size")] 77 | /// if the dictionary would be smaller than this, then no dict will be trained, and if no dict exists yet the data will stay uncompressed 78 | pub min_dict_size_bytes_for_training: i64, 79 | #[serde(default = "def_dict_size_ratio")] 80 | /// The target size of the dictionary relative to the data seen so far. 81 | /// For example, if we see 10MB of data for a specific group, the dict will target a size of ratio * 10MB (default 0.01) 82 | pub dict_size_ratio: f32, 83 | /// for training, we sample data totalling roughly this factor times the target dictionary size (default 100) 84 | /// the defaults of 100 and 0.01 together mean that by default the dict will be trained on all of the data 85 | #[serde(default = "def_train_dict_samples_ratio")] 86 | pub train_dict_samples_ratio: f32, 87 | /// how many bytes (approximately) to compress at once. By default tuned so that at compression level 19 it locks the database for about 0.3s per step. 88 | #[serde(default = "def_incremental_compression_step_bytes")] 89 | pub incremental_compression_step_bytes: i64, 90 | } 91 | 92 | pub fn pretty_bytes(bytes: i64) -> String { 93 | if bytes >= 1_000_000_000 { 94 | format!("{:.2}GB", bytes as f64 / 1e9) 95 | } else if bytes >= 1_000_000 { 96 | format!("{:.2}MB", bytes as f64 / 1e6) 97 | } else if bytes >= 1_000 { 98 | format!("{:.2}kB", bytes as f64 / 1e3) 99 | } else { 100 | format!("{bytes}B") 101 | } 102 | } 103 | 104 | #[derive(Debug)] 105 | enum SqliteAffinity { 106 | Integer, 107 | Text, 108 | Blob, 109 | Real, 110 | Numeric, 111 | } 112 | /// determine affinity, algorithm described at https://www.sqlite.org/draft/datatype3.html#determination_of_column_affinity 113 | fn get_column_affinity(declared_type: &str) -> SqliteAffinity { 114 | use SqliteAffinity::*; 115 | let typ = declared_type.to_ascii_lowercase(); 116 | if typ.contains("int") { 117 | Integer 118 | } else if typ.contains("char") || typ.contains("clob") || typ.contains("text") { 119 | Text 120 | } else if typ.contains("blob") || typ.is_empty() { 121 | Blob 122 | } else if typ.contains("real") || typ.contains("floa") || typ.contains("doub") { 123 | Real 124 | } else { 125 | Numeric 126 | } 127 | } 128 | 129 | fn show_warnings(db: &Connection) -> anyhow::Result<()> { 130 | // warnings 131 | let journal_mode: String = db 132 | .query_row("pragma journal_mode;", params![], |r| r.get(0)) 133 | .context("querying journal mode")?; 134 | if journal_mode != "wal" { 135 | log::warn!("Warning: It is recommended to set `pragma journal_mode=WAL;`"); 136 | } 137 | let vacuum_mode: i32 = db 138 | .query_row("pragma auto_vacuum;", params![], |r| r.get(0)) 139 | .context("querying vacuum mode")?; 140 | if vacuum_mode != 1 { 141 | log::warn!("Warning: It is recommended to set `pragma auto_vacuum=full;`"); 142 | } 143 | let busy_timeout: i32 = db 144 | .query_row("pragma busy_timeout;", params![], |r| r.get(0)) 145 | .context("querying busy timeout")?; 146 | if busy_timeout == 0 { 147 | log::warn!("Warning: It is recommended to set `pragma busy_timeout=2000;` or higher"); 148 | } 149 | Ok(()) 150 | } 151 | /// 152 | /// enables transparent row-level compression for a table with the following steps: 153 | /// 154 | /// 1.
renames tablename to _tablename_zstd if table is not already enabled 155 | /// 2. creates a view called tablename that mirrors _tablename_zstd except it decompresses the compressed column on the fly 156 | /// 3. creates INSERT, UPDATE and DELETE triggers on the view so they affect the backing table instead 157 | /// 158 | /// Warning: this function assumes trusted input, it is not sql injection safe! 159 | pub fn zstd_enable_transparent<'a>(ctx: &Context) -> anyhow::Result<ToSqlOutput<'a>> { 160 | let arg_config = 0; 161 | 162 | let config_str: String = ctx.get(arg_config)?; 163 | let config: TransparentCompressConfig = serde_json::from_str(&config_str) 164 | .with_context(|| format!("parsing json config '{config_str}'"))?; 165 | let db = &mut unsafe { ctx.get_connection()? }; 166 | let db = db 167 | .unchecked_transaction() 168 | .context("Could not start transaction")?; 169 | let table_name = &config.table; 170 | let new_table_name = format!("_{table_name}_zstd"); 171 | 172 | let configs = get_configs(&db)?; 173 | let already_compressed_columns = configs 174 | .iter() 175 | .filter(|c| &c.table == table_name) 176 | .map(|c| &c.column[..]) 177 | .collect::<Vec<&str>>(); 178 | 179 | log::debug!( 180 | "already compressed columns={:?}", 181 | already_compressed_columns 182 | ); 183 | 184 | if already_compressed_columns.contains(&&config.column[..]) { 185 | anyhow::bail!( 186 | "Column {} is already enabled for compression.", 187 | &config.column 188 | ); 189 | } 190 | 191 | let table_already_enabled = !already_compressed_columns.is_empty(); 192 | 193 | let dict_id_columns: Vec<String> = if table_already_enabled { 194 | let query = format!( 195 | r#"select "from" 196 | from pragma_foreign_key_list('{}') 197 | where "table" = '_zstd_dicts'"#, 198 | &new_table_name 199 | ); 200 | log::debug!("dict_id_columns query {:?}", query); 201 | db.prepare(&query)? 202 | .query_map(params![], |row| row.get("from")) 203 | .context("Could not get dicts ids info")? 204 | .collect::<Result<Vec<String>, _>>()? 205 | } else { 206 | vec![] 207 | }; 208 | 209 | log::debug!("dict_id columns={:?}", dict_id_columns); 210 | 211 | if !check_table_exists( 212 | &db, 213 | if table_already_enabled { 214 | &new_table_name 215 | } else { 216 | table_name 217 | }, 218 | ) { 219 | anyhow::bail!("Table {} doesn't exist", table_name); 220 | } 221 | 222 | let columns_info: Vec<ColumnInfo> = db 223 | .prepare(&format_sqlite!( 224 | r#"pragma table_info({})"#, 225 | if table_already_enabled { 226 | &new_table_name 227 | } else { 228 | table_name 229 | } 230 | ))? 231 | .query_map(params![], |row| { 232 | let col_name: String = row.get("name")?; 233 | let to_compress = (col_name == config.column) 234 | || (already_compressed_columns.contains(&&col_name[..])); 235 | let is_dict_id = dict_id_columns.contains(&col_name); 236 | Ok(ColumnInfo { 237 | name: col_name, 238 | is_primary_key: row.get("pk")?, 239 | coltype: row.get("type")?, 240 | to_compress, 241 | is_dict_id, 242 | }) 243 | }) 244 | .context("Could not query table_info")? 245 | .collect::<Result<_, rusqlite::Error>>()?; 246 | 247 | show_warnings(&db)?; 248 | 249 | // primary key columns. 
these will be used to index the table in the modifying triggers 250 | let primary_key_columns: Vec<&ColumnInfo> = 251 | columns_info.iter().filter(|e| e.is_primary_key).collect(); 252 | 253 | if columns_info.is_empty() { 254 | anyhow::bail!("Table {} does not exist", table_name); 255 | } 256 | if primary_key_columns.is_empty() { 257 | anyhow::bail!( 258 | "Table {} does not have a primary key, sqlite-zstd only works on tables with primary keys, since rowids can change on VACUUM.", 259 | table_name 260 | ); 261 | } 262 | 263 | let column_name = &config.column; 264 | 265 | let to_compress_column = columns_info 266 | .iter() 267 | .find(|e| &e.name == column_name) 268 | .with_context(|| format!("Column {column_name} does not exist in {table_name}"))?; 269 | if to_compress_column.is_primary_key { 270 | anyhow::bail!( 271 | "Can't compress column {} since it is part of primary key (this could probably be supported, but currently isn't)", 272 | column_name 273 | ); 274 | } 275 | 276 | check_columns_to_compress_are_not_indexed(&db, &columns_info, table_name)?; 277 | 278 | let dict_id_column_name = get_dict_id(&to_compress_column.name); 279 | log::debug!("cols={:?}", columns_info); 280 | 281 | { 282 | let query = format!( 283 | "select ({}) as dict_chooser from {} limit 1", 284 | config.dict_chooser, 285 | escape_sqlite_identifier(table_name) 286 | ); 287 | // small sanity check of chooser statement 288 | db.query_row(&query, params![], |row| row.get::<_, String>(0)) 289 | .optional() 290 | .with_context(|| format!("Tried to execute:\n{query}")) 291 | .context(r#"Dict chooser expression does not seem to be valid. Make sure you return a string and get your escaping right: If you want an sqlite string inside a json string inside a sqlite string you need to do '{"foo": "''bar''"}'"#)?; 292 | } 293 | { 294 | // can't use prepared statement at these positions 295 | if !table_already_enabled { 296 | let rename_query = 297 | format_sqlite!("alter table {} rename to {}", table_name, &new_table_name); 298 | log::debug!("[run] {}", &rename_query); 299 | db.execute(&rename_query, params![]) 300 | .context("Could not rename table")?; 301 | } 302 | 303 | util::ensure_dicts_table_exists(&db)?; 304 | 305 | db.execute( 306 | " 307 | create table if not exists _zstd_configs ( 308 | id integer primary key autoincrement, 309 | config json not null 310 | );", 311 | params![], 312 | ) 313 | .context("Could not create _zstd_configs")?; 314 | 315 | db.execute( 316 | "insert into _zstd_configs (config) values (?)", 317 | params![config_str], 318 | ) 319 | .context("Could not insert config")?; 320 | 321 | db.execute( 322 | &format_sqlite!( 323 | "alter table {} add column {} integer default null references _zstd_dicts(id)", 324 | &new_table_name, 325 | &dict_id_column_name 326 | ), 327 | params![], 328 | ) 329 | .context("Could not add dictid column")?; 330 | 331 | // this index is needed since the maintenance function queries by the dictionary id to find rows that are not compressed 332 | db.execute( 333 | &format_sqlite!( 334 | "create index {} on {} ({})", 335 | &format!("{}_idx", &dict_id_column_name), 336 | &new_table_name, 337 | &dict_id_column_name 338 | ), 339 | params![], 340 | ) 341 | .context("Could not create index on dictid")?; 342 | } 343 | 344 | create_or_replace_view( 345 | &db, 346 | &columns_info, 347 | table_name, 348 | &new_table_name, 349 | table_already_enabled, 350 | )?; 351 | 352 | create_insert_trigger(&db, &columns_info, table_name, &new_table_name, &config)?; 353 | 354 | // a WHERE statement 
that selects a row based on the primary key 355 | let primary_key_condition = primary_key_columns 356 | .iter() 357 | .map(|c| format_sqlite!("old.{0} = {0}", &c.name)) 358 | .collect::<Vec<String>>() 359 | .join(" and "); 360 | 361 | // add delete trigger 362 | create_delete_trigger(&db, table_name, &new_table_name, &primary_key_condition)?; 363 | 364 | // update trigger 365 | create_update_triggers( 366 | &db, 367 | &columns_info, 368 | table_name, 369 | &new_table_name, 370 | &primary_key_condition, 371 | &config, 372 | )?; 373 | 374 | db.commit().context("Could not commit transaction")?; 375 | Ok(ToSqlOutput::Owned(Value::Text("Done!".to_string()))) 376 | } 377 | 378 | fn get_dict_id(column_name: &str) -> String { 379 | format!("_{column_name}_dict") 380 | } 381 | 382 | fn check_table_exists(db: &rusqlite::Connection, table_name: &str) -> bool { 383 | let table_count: u32 = db 384 | .query_row( 385 | "select count(`type`) from sqlite_master where name = ? and type = 'table'", 386 | params![table_name], 387 | |r| r.get(0), 388 | ) 389 | .unwrap_or(0); 390 | table_count != 0 391 | } 392 | 393 | fn check_columns_to_compress_are_not_indexed( 394 | db: &rusqlite::Connection, 395 | columns_info: &[ColumnInfo], 396 | table_name: &str, 397 | ) -> anyhow::Result<()> { 398 | let indexed_columns: HashMap<String, String> = db 399 | .prepare( 400 | " 401 | select distinct ii.name as column_name, il.name as index_name 402 | from sqlite_master as m, 403 | pragma_index_list(m.name) as il, 404 | pragma_index_info(il.name) as ii 405 | where m.type='table' AND m.name=?", 406 | )? 407 | .query_map(params![table_name], |row| { 408 | Ok((row.get("column_name")?, row.get("index_name")?)) 409 | }) 410 | .context("could not get indices info")? 411 | .collect::<Result<_, rusqlite::Error>>()?; 412 | 413 | let indexed_columns_to_compress = columns_info 414 | .iter() 415 | .filter(|c| match indexed_columns.get(&c.name) { 416 | Some(_) => c.to_compress, 417 | None => false, 418 | }) 419 | .collect::<Vec<&ColumnInfo>>(); 420 | 421 | if !indexed_columns_to_compress.is_empty() { 422 | let columns_indices = indexed_columns_to_compress 423 | .iter() 424 | .map(|c| format!("{} ({})", c.name, indexed_columns.get(&c.name).unwrap())) 425 | .collect::<Vec<String>>() 426 | .join(", "); 427 | anyhow::bail!( 428 | "Can't compress column(s): {} - used as part of index (this could probably be supported, but currently isn't)", 429 | columns_indices 430 | ); 431 | }; 432 | Ok(()) 433 | } 434 | 435 | fn create_or_replace_view( 436 | db: &rusqlite::Connection, 437 | columns_info: &[ColumnInfo], 438 | table_name: &str, 439 | internal_table_name: &str, 440 | table_already_enabled: bool, 441 | ) -> anyhow::Result<()> { 442 | if table_already_enabled { 443 | // this drops the existing triggers as well 444 | let dropview_query = format!(r#"drop view {}"#, escape_sqlite_identifier(table_name)); 445 | log::debug!("[run] {}", &dropview_query); 446 | db.execute(&dropview_query, params![]) 447 | .context("Could not drop view")?; 448 | } 449 | 450 | // create view 451 | let select_columns_escaped = columns_info 452 | .iter() 453 | .filter(|c| !c.is_dict_id ) 454 | .map(|c| { 455 | if c.to_compress { 456 | let affinity_is_text = match get_column_affinity(&c.coltype) { 457 | SqliteAffinity::Blob => false, 458 | SqliteAffinity::Text => true, 459 | other => anyhow::bail!("the to-compress column has type {} which has affinity {:?}, but affinity must be text or blob. 
See https://www.sqlite.org/draft/datatype3.html#determination_of_column_affinity", c.coltype, other) 460 | }; 461 | Ok(format!( 462 | // prepared statement parameters not allowed in view 463 | "zstd_decompress_col({}, {}, {}, {}) as {0}", 464 | &escape_sqlite_identifier(&c.name), 465 | if affinity_is_text { 1 } else { 0 }, 466 | &escape_sqlite_identifier(&get_dict_id(&c.name)), 467 | COMPACT 468 | )) 469 | } else { 470 | Ok(format_sqlite!("{}", &c.name)) 471 | } 472 | }) 473 | .collect::<Result<Vec<String>, _>>() 474 | .context("could not construct select in view")? 475 | .join(", "); 476 | let createview_query = format!( 477 | r#" 478 | create view {} as 479 | select {} 480 | from {} 481 | "#, 482 | escape_sqlite_identifier(table_name), 483 | select_columns_escaped, 484 | escape_sqlite_identifier(internal_table_name) 485 | ); 486 | log::debug!("[run] {}", &createview_query); 487 | db.execute(&createview_query, params![]) 488 | .context("Could not create view")?; 489 | Ok(()) 490 | } 491 | 492 | fn create_insert_trigger( 493 | db: &rusqlite::Connection, 494 | columns_info: &[ColumnInfo], 495 | table_name: &str, 496 | internal_table_name: &str, 497 | _config: &TransparentCompressConfig, 498 | ) -> anyhow::Result<()> { 499 | let trigger_name = format!("{table_name}_insert_trigger"); 500 | 501 | // expressions that map backing table columns to view columns 502 | let mut insert_selection = vec![]; 503 | // names of the columns to be inserted 504 | let mut columns_selection = vec![]; 505 | 506 | for c in columns_info { 507 | if c.is_dict_id { 508 | continue; 509 | } 510 | columns_selection.push(String::from(&c.name)); 511 | if c.to_compress { 512 | let dict_id = get_dict_id(&c.name); 513 | insert_selection.push(format!( 514 | // prepared statement parameters not allowed in view 515 | "new.{col} as {col}, null as {dictcol}", 516 | col = escape_sqlite_identifier(&c.name), 517 | dictcol = escape_sqlite_identifier(&dict_id) 518 | )); 519 | columns_selection.push(String::from(&dict_id)); 520 | } else { 521 | insert_selection.push(format_sqlite!("new.{}", &c.name)); 522 | } 523 | } 524 | 525 | let createtrigger_query = format!( 526 | " 527 | create trigger {} 528 | instead of insert on {} 529 | for each row 530 | begin 531 | insert into {}({}) select {}; 532 | end; 533 | ", 534 | escape_sqlite_identifier(&trigger_name), 535 | escape_sqlite_identifier(table_name), 536 | escape_sqlite_identifier(internal_table_name), 537 | columns_selection.join(", "), 538 | insert_selection.join(",\n"), 539 | ); 540 | log::debug!("[run] {}", &createtrigger_query); 541 | db.execute(&createtrigger_query, params![]) 542 | .context("Could not create insert trigger")?; 543 | Ok(()) 544 | } 545 | 546 | fn create_delete_trigger( 547 | db: &rusqlite::Connection, 548 | table_name: &str, 549 | internal_table_name: &str, 550 | primary_key_condition: &str, 551 | ) -> anyhow::Result<()> { 552 | let trigger_name = format!("{table_name}_delete_trigger"); 553 | 554 | let deletetrigger_query = format!( 555 | " 556 | create trigger {trg_name} 557 | instead of delete on {view} 558 | for each row 559 | begin 560 | delete from {backing_table} where {primary_key_condition}; 561 | end; 562 | ", 563 | trg_name = escape_sqlite_identifier(&trigger_name), 564 | view = escape_sqlite_identifier(table_name), 565 | backing_table = escape_sqlite_identifier(internal_table_name), 566 | primary_key_condition = primary_key_condition 567 | ); 568 | log::debug!("[run] {}", &deletetrigger_query); 569 | db.execute(&deletetrigger_query, params![]) 570 | 
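// For a hypothetical view `events` backed by `_events_zstd` with primary key `id`
// (the names used in the tests below), the generated statement renders roughly as:
//   create trigger `events_delete_trigger`
//   instead of delete on `events`
//   for each row
//   begin
//     delete from `_events_zstd` where old.`id` = `id`;
//   end;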
.context("could not create delete trigger")?; 571 | Ok(()) 572 | } 573 | 574 | fn create_update_triggers( 575 | db: &rusqlite::Connection, 576 | columns_info: &[ColumnInfo], 577 | table_name: &str, 578 | internal_table_name: &str, 579 | primary_key_condition: &str, 580 | _config: &TransparentCompressConfig, 581 | ) -> anyhow::Result<()> { 582 | for col in columns_info { 583 | if col.is_dict_id { 584 | continue; 585 | } 586 | 587 | let trigger_name = format!("{}_update_{}_trigger", table_name, col.name); 588 | 589 | let update = if col.to_compress { 590 | format!( 591 | "{col} = new.{col}, {dictcol} = null", 592 | col = escape_sqlite_identifier(&col.name), 593 | dictcol = escape_sqlite_identifier(&get_dict_id(&col.name)), 594 | ) 595 | } else { 596 | format_sqlite!("{} = new.{}", &col.name, &col.name) 597 | }; 598 | // update triggers 599 | let updatetrigger_query = format!( 600 | " 601 | create trigger {trg_name} 602 | instead of update of {upd_col} on {view_name} 603 | for each row 604 | begin 605 | update {backing_table} set {update} where {primary_key_condition}; 606 | end; 607 | ", 608 | trg_name = escape_sqlite_identifier(&trigger_name), 609 | view_name = escape_sqlite_identifier(table_name), 610 | backing_table = escape_sqlite_identifier(internal_table_name), 611 | upd_col = escape_sqlite_identifier(&col.name), 612 | update = update, 613 | primary_key_condition = primary_key_condition 614 | ); 615 | log::debug!("[run] {}", &updatetrigger_query); 616 | db.execute(&updatetrigger_query, params![]) 617 | .with_context(|| format!("Could not create update of {} trigger", col.name))?; 618 | } 619 | Ok(()) 620 | } 621 | 622 | fn get_configs(db: &rusqlite::Connection) -> Result<Vec<TransparentCompressConfig>, anyhow::Error> { 623 | // if the table `_zstd_configs` does not exist yet, transparent compression hasn't been used yet, so return an empty array 624 | if !check_table_exists(db, "_zstd_configs") { 625 | return Ok(vec![]); 626 | } 627 | 628 | let configs = db 629 | .prepare("select config from _zstd_configs")? 630 | .query_map(params![], |row| { 631 | serde_json::from_str(row.get_ref_unwrap("config").as_str()?) 632 | .context("parsing config") 633 | .map_err(ah) 634 | }) 635 | .context("Couldn't fetch configs")? 636 | .collect::<Result<Vec<TransparentCompressConfig>, rusqlite::Error>>()?; 637 | Ok(configs) 638 | } 639 | 640 | #[derive(Debug)] 641 | struct TodoInfo { 642 | dict_choice: Option<String>, 643 | count: i64, 644 | total_bytes: i64, 645 | } 646 | 647 | struct IncrementalMaintenanceArgs { 648 | end_limit: Instant, 649 | target_db_load: f32, 650 | time_limit: f64, 651 | } 652 | pub fn zstd_incremental_maintenance<'a>(ctx: &Context) -> Result<ToSqlOutput<'a>, anyhow::Error> { 653 | let args = { 654 | let arg_time_limit_seconds = 0; 655 | let arg_target_db_load = 1; 656 | let time_limit: Option<f64> = ctx 657 | .get(arg_time_limit_seconds) 658 | .context("could not get time limit argument")?; 659 | let time_limit = time_limit.unwrap_or(100000000.0); 660 | let target_db_load: f32 = ctx 661 | .get(arg_target_db_load) 662 | .context("could not get target db load argument")?; 663 | if !(0.0..=1e100).contains(&time_limit) { 664 | anyhow::bail!("time too large"); 665 | } 666 | let end_limit = Instant::now() + Duration::from_secs_f64(time_limit); 667 | IncrementalMaintenanceArgs { 668 | end_limit, 669 | target_db_load, 670 | time_limit, 671 | } 672 | }; 673 | let db = unsafe { ctx.get_connection()? 
}; 674 | show_warnings(&db)?; 675 | let configs = get_configs(&db)?; 676 | 677 | for config in configs { 678 | match maintenance_for_config(&db, config, &args)? { 679 | MaintRet::TimeLimitReached => { 680 | log::info!( 681 | "time limit of {:.1}s reached, stopping with more maintenance work pending", 682 | args.time_limit 683 | ); 684 | return Ok(1.into()); 685 | } 686 | MaintRet::Completed => {} 687 | } 688 | } 689 | log::info!("All maintenance work completed!"); 690 | Ok(0.into()) 691 | } 692 | 693 | enum MaintRet { 694 | TimeLimitReached, 695 | Completed, 696 | } 697 | 698 | struct EscapedNames { 699 | compressed_tablename: String, 700 | data_colname: String, 701 | dict_colname: String, 702 | } 703 | impl From<&TransparentCompressConfig> for EscapedNames { 704 | fn from(config: &TransparentCompressConfig) -> EscapedNames { 705 | EscapedNames { 706 | compressed_tablename: escape_sqlite_identifier(&format!("_{}_zstd", config.table)), 707 | data_colname: escape_sqlite_identifier(&config.column), 708 | dict_colname: escape_sqlite_identifier(&format!("_{}_dict", config.column)), 709 | } 710 | } 711 | } 712 | 713 | fn maintenance_for_config( 714 | db: &Connection, 715 | config: TransparentCompressConfig, 716 | args: &IncrementalMaintenanceArgs, 717 | ) -> anyhow::Result<MaintRet> { 718 | let esc_names = EscapedNames::from(&config); 719 | 720 | let todos = db 721 | .prepare(&format!( 722 | "select 723 | ({chooser}) as dict_choice, 724 | count(*) as count, 725 | sum(length({datacol})) as total_bytes 726 | from {tbl} where {dictcol} is null group by dict_choice", 727 | tbl = esc_names.compressed_tablename, 728 | dictcol = esc_names.dict_colname, 729 | datacol = esc_names.data_colname, 730 | chooser = config.dict_chooser 731 | ))? 732 | .query_map(params![], |row| { 733 | Ok(TodoInfo { 734 | dict_choice: row.get("dict_choice")?, 735 | count: row.get("count")?, 736 | total_bytes: row.get("total_bytes")?, 737 | }) 738 | })? 
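// Rendered with the example config from the tests (table `events`, column `data`,
// dict_chooser `'1'`), this query would be roughly:
//   select ('1') as dict_choice, count(*) as count, sum(length(`data`)) as total_bytes
//   from `_events_zstd` where `_data_dict` is null group by dict_choice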
739 | .collect::<Result<Vec<_>, _>>()?; 740 | 741 | let total_bytes_to_compress: i64 = todos 742 | .iter() 743 | .filter(|e| e.dict_choice.is_some()) 744 | .map(|e| e.total_bytes) 745 | .sum(); 746 | let mut rows_compressed_so_far: i64 = 0; 747 | let mut bytes_compressed_so_far: i64 = 0; 748 | let total_rows_to_compress: i64 = todos 749 | .iter() 750 | .filter(|e| e.dict_choice.is_some()) 751 | .map(|e| e.count) 752 | .sum(); 753 | log::info!( 754 | "{}.{}: Total {} rows ({}) to potentially compress (split in {} groups).", 755 | config.table, 756 | config.column, 757 | total_rows_to_compress, 758 | pretty_bytes(total_bytes_to_compress), 759 | todos.len() 760 | ); 761 | for todo in todos.into_iter() { 762 | let rows_handled = maintenance_for_todo(db, &config, &todo, &esc_names, args)?; 763 | rows_compressed_so_far += rows_handled; 764 | // estimate bytes compressed 765 | bytes_compressed_so_far += 766 | ((rows_handled as f64 / todo.count as f64) * todo.total_bytes as f64) as i64; 767 | if rows_handled > 0 { 768 | log::info!( 769 | "Handled {} / {} rows ({} / {})", 770 | rows_compressed_so_far, 771 | total_rows_to_compress, 772 | pretty_bytes(bytes_compressed_so_far), 773 | pretty_bytes(total_bytes_to_compress) 774 | ); 775 | } 776 | if Instant::now() > args.end_limit { 777 | return Ok(MaintRet::TimeLimitReached); 778 | } 779 | } 780 | Ok(MaintRet::Completed) 781 | } 782 | 783 | fn maintenance_for_todo( 784 | db: &Connection, 785 | config: &TransparentCompressConfig, 786 | todo: &TodoInfo, 787 | esc_names: &EscapedNames, 788 | args: &IncrementalMaintenanceArgs, 789 | ) -> anyhow::Result<i64> { 790 | let avg_sample_bytes = todo.total_bytes / todo.count; 791 | let dict_choice = todo.dict_choice.as_deref().unwrap_or("[null]"); 792 | log::debug!( 793 | "looking at group={}, has {} rows with {} average size ({} total)", 794 | dict_choice, 795 | todo.count, 796 | pretty_bytes(avg_sample_bytes), 797 | pretty_bytes(todo.total_bytes) 798 | ); 799 | 800 | let (dict_id, dict_is_new) = 801 | match get_or_train_dict(db, config, todo, esc_names).context("getting dict")? { 802 | TrainDictReturn::Skip => return Ok(0), 803 | TrainDictReturn::Done { 804 | dict_id, 805 | dict_is_new, 806 | } => (dict_id, dict_is_new), 807 | }; 808 | 809 | let mut total_updated: i64 = 0; 810 | let mut chunk_size = config.incremental_compression_step_bytes / avg_sample_bytes; 811 | if chunk_size < 1 { 812 | chunk_size = 1; 813 | } 814 | log::debug!( 815 | "Compressing {} samples with key {} and level {}, chunksize {}", 816 | todo.count, 817 | dict_choice, 818 | config.compression_level, 819 | chunk_size 820 | ); 821 | loop { 822 | let update_start = Instant::now(); 823 | let q = &format!( 824 | "update {tbl} set {datacol} = zstd_compress_col({datacol}, :lvl, :dict, :compact), {dictcol} = :dict where rowid in (select rowid from {tbl} where {dictcol} is null and :dictchoice = ({chooser}) limit :chunksize)", 825 | tbl = esc_names.compressed_tablename, 826 | datacol = esc_names.data_colname, 827 | dictcol = esc_names.dict_colname, 828 | chooser = config.dict_chooser 829 | ); 830 | log::trace!("executing {}", q); 831 | let updated = db 832 | .execute( 833 | q, 834 | named_params! 
{ 835 | ":lvl": config.compression_level, 836 | ":dict": dict_id, 837 | ":dictchoice": &dict_choice, 838 | ":chunksize": chunk_size, 839 | ":compact": COMPACT 840 | }, 841 | ) 842 | .with_context(|| format!("while compressing chunk for key {dict_choice}"))?; 843 | 844 | total_updated += updated as i64; 845 | log::debug!("Compressed {} / {}", total_updated, todo.count); 846 | if Instant::now() > args.end_limit { 847 | break; 848 | } 849 | let elapsed = update_start.elapsed(); 850 | if elapsed.div_f32(args.target_db_load) > elapsed { 851 | let sleep_duration = elapsed.div_f32(args.target_db_load) - elapsed; 852 | if sleep_duration > Duration::from_millis(1) { 853 | log::debug!( 854 | "Sleeping {}s to keep write load at {}", 855 | sleep_duration.as_secs_f32(), 856 | args.target_db_load 857 | ); 858 | std::thread::sleep(sleep_duration); 859 | } 860 | } 861 | 862 | if updated == 0 { 863 | break; 864 | } 865 | } 866 | 867 | let (total_size_after, total_count_after): (i64, i64) = db.query_row( 868 | &format!( 869 | "select sum(length({datacol})), count(*) from {tbl} where {dictcol} = ?", 870 | tbl = esc_names.compressed_tablename, 871 | datacol = esc_names.data_colname, 872 | dictcol = esc_names.dict_colname 873 | ), 874 | params![dict_id], 875 | |row| Ok((row.get(0)?, row.get(1)?)), 876 | )?; 877 | if dict_is_new { 878 | log::info!( 879 | "Compressed {} rows with dict_choice={} (dict_id={}). Total size of entries before: {}, afterwards: {}, (average: before={}, after={})", 880 | total_updated, 881 | dict_choice, 882 | dict_id, 883 | pretty_bytes(todo.total_bytes), 884 | pretty_bytes(total_size_after), 885 | pretty_bytes(avg_sample_bytes), 886 | pretty_bytes(total_size_after / total_count_after), 887 | ); 888 | } 889 | Ok(total_updated) 890 | } 891 | 892 | enum TrainDictReturn { 893 | Skip, 894 | Done { dict_id: i32, dict_is_new: bool }, 895 | } 896 | fn get_or_train_dict( 897 | db: &Connection, 898 | config: &TransparentCompressConfig, 899 | todo: &TodoInfo, 900 | esc_names: &EscapedNames, 901 | ) -> anyhow::Result<TrainDictReturn> { 902 | let dict_choice = match &todo.dict_choice { 903 | None => { 904 | log::debug!("Skipping group, no dict chosen"); 905 | return Ok(TrainDictReturn::Skip); 906 | } 907 | Some(e) => e, 908 | }; 909 | if dict_choice == "[nodict]" { 910 | return Ok(TrainDictReturn::Done { 911 | dict_id: -1, 912 | dict_is_new: false, 913 | }); 914 | } 915 | 916 | let avg_sample_bytes = todo.total_bytes / todo.count; 917 | let dict_id: Option<i32> = db 918 | .query_row( 919 | "select id from _zstd_dicts where chooser_key = ?", 920 | params![dict_choice], 921 | |row| row.get("id"), 922 | ) 923 | .optional()?; 924 | Ok(match dict_id { 925 | Some(dict_id) => { 926 | log::debug!( 927 | "Found existing dictionary id={} for key={}", 928 | dict_id, 929 | dict_choice 930 | ); 931 | TrainDictReturn::Done { 932 | dict_id, 933 | dict_is_new: false, 934 | } 935 | } 936 | None => { 937 | let dict_target_size = (todo.total_bytes as f32 * config.dict_size_ratio) as i64; 938 | 939 | if dict_target_size < config.min_dict_size_bytes_for_training { 940 | log::debug!( 941 | "Dictionary for group '{}' would be smaller than minimum ({} * {:.3} = {} < {}), ignoring", 942 | dict_choice, 943 | pretty_bytes(todo.total_bytes), 944 | config.dict_size_ratio, 945 | pretty_bytes(dict_target_size), 946 | pretty_bytes(config.min_dict_size_bytes_for_training) 947 | ); 948 | return Ok(TrainDictReturn::Skip); 949 | } 950 | let target_samples = (dict_target_size as f32 * config.train_dict_samples_ratio 951 | / 
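// worked example with the defaults: if a group has 10MB of data, then
// dict_target_size = 10MB * 0.01 = 100kB, and with 1kB average samples
// target_samples = 100_000 * 100 / 1000 = 10_000 samples, i.e. all 10MB of data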
avg_sample_bytes as f32) as i64; // use roughly 100x the size of the dictionary as data 952 | 953 | log::debug!( 954 | "Training dict for key {} of max size {}", 955 | dict_choice, 956 | pretty_bytes(dict_target_size) 957 | ); 958 | let dict_id = db.query_row(&format!( 959 | "select zstd_train_dict_and_save({datacol}, ?, ?, ?) as dictid from {tbl} where {dictcol} is null and ? = ({chooser})", 960 | datacol=esc_names.data_colname, 961 | tbl=esc_names.compressed_tablename, 962 | dictcol=esc_names.dict_colname, 963 | chooser=config.dict_chooser 964 | ), params![dict_target_size, target_samples, dict_choice, dict_choice], |row| row.get("dictid"))?; 965 | TrainDictReturn::Done { 966 | dict_id, 967 | dict_is_new: true, 968 | } 969 | } 970 | }) 971 | } 972 | #[cfg(test)] 973 | mod tests { 974 | use super::add_functions::tests::create_example_db; 975 | use super::*; 976 | use pretty_assertions::assert_eq; 977 | use rand::prelude::SliceRandom; 978 | use rusqlite::params; 979 | use rusqlite::{Connection, Row}; 980 | 981 | fn row_to_thong(r: &Row) -> anyhow::Result<Vec<Value>> { 982 | Ok((0..r.as_ref().column_count()) 983 | .map(|i| r.get_ref(i).map(|e| e.into())) 984 | .collect::<Result<_, _>>()?) 985 | } 986 | 987 | fn get_whole_table(db: &Connection, tbl_name: &str) -> anyhow::Result<Vec<Vec<Value>>> { 988 | let mut stmt = db.prepare(&format!("select * from {tbl_name} ORDER BY id"))?; 989 | let q1: Vec<Vec<Value>> = stmt 990 | .query_map(params![], |e| row_to_thong(e).map_err(ah))? 991 | .collect::<Result<_, rusqlite::Error>>()?; 992 | Ok(q1) 993 | } 994 | 995 | fn check_table_rows_same(db1: &Connection, db2: &Connection) -> anyhow::Result<()> { 996 | let tbl1 = get_whole_table(db1, "events").context("Could not get whole table db 1")?; 997 | let tbl2 = get_whole_table(db2, "events").context("Could not get whole table db 2")?; 998 | assert_eq!(tbl1, tbl2); 999 | 1000 | Ok(()) 1001 | } 1002 | 1003 | #[test] 1004 | fn sanity() -> anyhow::Result<()> { 1005 | let db1 = create_example_db(Some(123), 100)?; 1006 | let db2 = create_example_db(Some(123), 100)?; 1007 | 1008 | check_table_rows_same(&db1, &db2)?; 1009 | 1010 | Ok(()) 1011 | } 1012 | 1013 | #[test] 1014 | fn no_configs() -> anyhow::Result<()> { 1015 | let db = create_example_db(Some(123), 100)?; 1016 | 1017 | assert_eq!(get_configs(&db)?.len(), 0); 1018 | Ok(()) 1019 | } 1020 | 1021 | fn get_two_dbs(seed: Option<u64>) -> anyhow::Result<(Connection, Connection)> { 1022 | if std::env::var("RUST_LOG").is_err() { 1023 | // TODO: Audit that the environment access only happens in single-threaded code. 
1024 | unsafe { std::env::set_var("RUST_LOG", "info") }; 1025 | } 1026 | env_logger::try_init().ok(); 1027 | 1028 | let db1 = create_example_db(seed, 2000)?; 1029 | let db2 = create_example_db(seed, 2000)?; 1030 | 1031 | db2.query_row( 1032 | r#"select zstd_enable_transparent(?)"#, 1033 | params![r#"{"table": "events", "column": "data", "compression_level": 3, "dict_chooser": "'1'"}"#], 1034 | |_| Ok(()) 1035 | ).context("enable transparent")?; 1036 | 1037 | Ok((db1, db2)) 1038 | } 1039 | #[test] 1040 | fn enable_transparent() -> anyhow::Result<()> { 1041 | let (db1, db2) = get_two_dbs(Some(123))?; 1042 | check_table_rows_same(&db1, &db2)?; 1043 | 1044 | Ok(()) 1045 | } 1046 | 1047 | fn get_rand_id(db: &Connection) -> anyhow::Result<i64> { 1048 | db.query_row( 1049 | "select id from events order by random() limit 1", 1050 | params![], 1051 | |r| r.get(0), 1052 | ) 1053 | .context("Could not get random id") 1054 | } 1055 | 1056 | fn insert(db: &Connection, _id: i64, _id2: i64) -> anyhow::Result<()> { 1057 | let query = r#"insert into events (timestamp, data) values ('2020-12-20T00:00:00Z', '{"foo": "bar"}')"#; 1058 | 1059 | db.execute(query, params![])?; 1060 | 1061 | Ok(()) 1062 | } 1063 | 1064 | fn insert_both_columns(db: &Connection, _id: i64, _id2: i64) -> anyhow::Result<()> { 1065 | let query = r#"insert into events (timestamp, data, another_col) values ('2020-12-20T00:00:00Z', '{"foo": "bar"}', 'rustacean')"#; 1066 | 1067 | db.execute(query, params![])?; 1068 | 1069 | Ok(()) 1070 | } 1071 | 1072 | fn update_comp_col(db: &Connection, id: i64, _id2: i64) -> anyhow::Result<()> { 1073 | let _updc = db.execute("update events set data='fooooooooooooooooooooooooooooooooooooooooooooobar' where id = ?", params![id]).context("updating compressed column")?; 1074 | 1075 | //assert_eq!(updc, 1); 1076 | Ok(()) 1077 | } 1078 | 1079 | fn update_other_col(db: &Connection, id: i64, _id2: i64) -> anyhow::Result<()> { 1080 | let _updc = db 1081 | .execute( 1082 | "update events set timestamp = '2020-02-01' where id = ?", 1083 | params![id], 1084 | ) 1085 | .context("updating other column")?; 1086 | //assert_eq!(updc, 1); 1087 | Ok(()) 1088 | } 1089 | 1090 | fn update_other_two_col(db: &Connection, id: i64, id2: i64) -> anyhow::Result<()> { 1091 | //thread::rand 1092 | delete_one(db, id2, id)?; 1093 | let _updc = db 1094 | .execute( 1095 | "update events set timestamp = '2020-02-01', id=? where id = ?", 1096 | params![id2, id], 1097 | ) 1098 | .context("updating other two column")?; 1099 | //assert_eq!(updc, 1); 1100 | Ok(()) 1101 | } 1102 | 1103 | fn update_comp_col_and_other_two_col(db: &Connection, id: i64, id2: i64) -> anyhow::Result<()> { 1104 | //thread::rand 1105 | delete_one(db, id2, id)?; 1106 | let _updc = db.execute("update events set timestamp = '2020-02-01', id=?, data='fooooooooooooooooooooooooooooooooooooooooooooobar' where id = ?", params![id2,id]).context("updating three column")?; 1107 | //assert_eq!(updc, 1); 1108 | Ok(()) 1109 | } 1110 | 1111 | fn update_two_rows(db: &Connection, id: i64, id2: i64) -> anyhow::Result<()> { 1112 | //thread::rand 1113 | let _updc = db.execute("update events set timestamp = '2020-02-01', data='fooooooooooooooooooooooooooooooooooooooooooooobar' where id in (:a, :b)", 1114 | named_params! 
{":a": id, ":b": id2}).context("updating two rows")?; 1115 | //assert_eq!(updc, 2); 1116 | Ok(()) 1117 | } 1118 | 1119 | fn update_two_rows_by_compressed(db: &Connection, id: i64, id2: i64) -> anyhow::Result<()> { 1120 | let _updc = db 1121 | .execute( 1122 | "update events set data = 'testingxy' where id in (?, ?)", 1123 | params![id, id2], 1124 | ) 1125 | .context("updating two rows replace compressed")?; 1126 | //assert_eq!(updc, 2); 1127 | //thread::rand 1128 | let _updc = db 1129 | .execute( 1130 | "update events set timestamp='1234' where data = 'testingxy'", 1131 | params![], 1132 | ) 1133 | .context("updating where compressed=...")?; 1134 | //assert_eq!(updc, 2); 1135 | Ok(()) 1136 | } 1137 | 1138 | fn delete_one(db: &Connection, id: i64, _id2: i64) -> anyhow::Result<()> { 1139 | let _updc = db 1140 | .execute("delete from events where id = ?", params![id]) 1141 | .context("deleting from events by id")?; 1142 | //assert_eq!(updc, 1); 1143 | Ok(()) 1144 | } 1145 | 1146 | fn delete_where_other(db: &Connection, id: i64, _id2: i64) -> anyhow::Result<()> { 1147 | let ts: String = db.query_row( 1148 | "select timestamp from events where id = ?", 1149 | params![id], 1150 | |r| r.get(0), 1151 | )?; 1152 | let _updc = db 1153 | .execute("delete from events where timestamp = ?", params![ts]) 1154 | .context("deleting by timestamp")?; 1155 | //assert_eq!(updc, 1); 1156 | Ok(()) 1157 | } 1158 | 1159 | #[test] 1160 | fn test_many() -> anyhow::Result<()> { 1161 | type Executor = dyn Fn(&Connection, i64, i64) -> anyhow::Result<()>; 1162 | let posses: Vec<&Executor> = vec![ 1163 | &insert, 1164 | &update_comp_col, 1165 | &update_other_col, 1166 | &update_other_two_col, 1167 | &update_comp_col_and_other_two_col, 1168 | &update_two_rows, 1169 | &update_two_rows_by_compressed, 1170 | &delete_one, 1171 | &delete_where_other, 1172 | ]; 1173 | 1174 | let mut posses2 = vec![]; 1175 | for _ in 0..100 { 1176 | posses2.push(*posses.choose(&mut rand::thread_rng()).unwrap()); 1177 | } 1178 | for compress_first in [false, true] { 1179 | for operations in &[&posses2] { 1180 | if compress_first { 1181 | let (db1, db2) = 1182 | get_two_dbs(Some(123)).context("Could not create databases")?; 1183 | if compress_first { 1184 | let done: i64 = db2.query_row( 1185 | "select zstd_incremental_maintenance(9999999, 1)", 1186 | params![], 1187 | |r| r.get(0), 1188 | )?; 1189 | 1190 | assert_eq!(done, 0); 1191 | 1192 | let uncompressed_count: i64 = db2 1193 | .query_row( 1194 | "select count(*) from _events_zstd where _data_dict is null", 1195 | params![], 1196 | |r| r.get(0), 1197 | ) 1198 | .context("Could not query uncompressed count")?; 1199 | assert_eq!(uncompressed_count, 0); 1200 | } 1201 | 1202 | for operation in *operations { 1203 | let id = get_rand_id(&db1)?; 1204 | let id2 = get_rand_id(&db2)?; 1205 | operation(&db1, id, id2) 1206 | .context("Could not run operation on uncompressed db")?; 1207 | operation(&db2, id, id2) 1208 | .context("Could not run operation on compressed db")?; 1209 | } 1210 | 1211 | check_table_rows_same(&db1, &db2)?; 1212 | } 1213 | } 1214 | } 1215 | 1216 | Ok(()) 1217 | } 1218 | 1219 | #[test] 1220 | fn columns_of_the_same_table_are_enabled() -> anyhow::Result<()> { 1221 | let (db1, db2) = get_two_dbs(Some(456)).context("Could not create databases")?; 1222 | db2.query_row( 1223 | r#"select zstd_enable_transparent(?)"#, 1224 | params![r#"{"table": "events", "column": "another_col", "compression_level": 3, "dict_chooser": "'1'"}"#], 1225 | |_| Ok(()) 1226 | ).context("enable transparent")?; 
1227 | 1228 | let done: i64 = db2.query_row( 1229 | "select zstd_incremental_maintenance(9999999, 1)", 1230 | params![], 1231 | |r| r.get(0), 1232 | )?; 1233 | 1234 | assert_eq!(done, 0); 1235 | 1236 | let uncompressed_count: i64 = db2 1237 | .query_row( 1238 | "select count(*) from _events_zstd where _data_dict is null", 1239 | params![], 1240 | |r| r.get(0), 1241 | ) 1242 | .context("Could not query uncompressed count")?; 1243 | assert_eq!(uncompressed_count, 0); 1244 | 1245 | let id = get_rand_id(&db1)?; 1246 | let id2 = get_rand_id(&db2)?; 1247 | insert_both_columns(&db1, id, id2).context("Could not run operation on uncompressed db")?; 1248 | insert_both_columns(&db2, id, id2).context("Could not run operation on compressed db")?; 1249 | 1250 | check_table_rows_same(&db1, &db2)?; 1251 | 1252 | Ok(()) 1253 | } 1254 | 1255 | #[test] 1256 | #[should_panic(expected = "another_col (another_col_idx) - used as part of index")] 1257 | fn indexed_column_cannot_be_enabled() { 1258 | let db = create_example_db(None, 1100).unwrap(); 1259 | 1260 | // When column of original table is indexed 1261 | db.execute( 1262 | "create index another_col_idx on events (another_col)", 1263 | params![], 1264 | ) 1265 | .unwrap(); 1266 | 1267 | db.query_row( 1268 | r#"select zstd_enable_transparent(?)"#, 1269 | params![r#"{"table": "events", "column": "another_col", "compression_level": 3, "dict_chooser": "'1'"}"#], 1270 | |_| Ok(()) 1271 | ).unwrap(); 1272 | } 1273 | 1274 | #[test] 1275 | #[should_panic(expected = "another_col is already enabled for compression")] 1276 | fn same_column_is_not_allowed_to_be_enabled_multiple_times() { 1277 | let db = create_example_db(None, 1100).unwrap(); 1278 | 1279 | db.query_row( 1280 | r#"select zstd_enable_transparent(?)"#, 1281 | params![r#"{"table": "events", "column": "another_col", "compression_level": 3, "dict_chooser": "'1'"}"#], 1282 | |_| Ok(()) 1283 | ).unwrap(); 1284 | 1285 | db.query_row( 1286 | r#"select zstd_enable_transparent(?)"#, 1287 | params![r#"{"table": "events", "column": "another_col", "compression_level": 3, "dict_chooser": "'1'"}"#], 1288 | |_| Ok(()) 1289 | ).unwrap(); 1290 | } 1291 | } 1292 | -------------------------------------------------------------------------------- /src/util.rs: -------------------------------------------------------------------------------- 1 | use log::LevelFilter; 2 | 3 | pub fn ensure_dicts_table_exists(db: &rusqlite::Connection) -> rusqlite::Result<()> { 4 | db.execute_batch( 5 | " 6 | create table if not exists _zstd_dicts ( 7 | id integer primary key autoincrement, 8 | chooser_key text unique, 9 | dict blob not null 10 | ); 11 | insert or ignore into _zstd_dicts values (-1, '[nodict]', ''); -- only added so foreign key is fulfilled 12 | ", 13 | )?; 14 | Ok(()) 15 | } 16 | 17 | /// format an expression while escaping given values as sqlite identifiers 18 | /// needed since prepared query parameters can't be used in identifier position 19 | #[doc(hidden)] 20 | #[macro_export] 21 | macro_rules! 
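// for illustration: format_sqlite!("alter table {} rename to {}", "events", "_events_zstd")
// escapes both arguments as identifiers and yields the string:
//   alter table `events` rename to `_events_zstd`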
format_sqlite { 22 | ($x:expr_2021, $($y:expr_2021),*) => { 23 | format!($x, $(escape_sqlite_identifier($y),)*) 24 | }; 25 | } 26 | 27 | pub fn ah(e: anyhow::Error) -> rusqlite::Error { 28 | rusqlite::Error::UserFunctionError(format!("{e:?}").into()) 29 | } 30 | 31 | /*pub fn debug_row(r: &rusqlite::Row) { 32 | let cols = r.column_names(); 33 | for (i, name) in cols.iter().enumerate() { 34 | print!("{}={} ", name, format_blob(r.get_ref_unwrap(i))) 35 | } 36 | println!(); 37 | } 38 | 39 | fn format_blob(b: ValueRef) -> String { 40 | use ValueRef::*; 41 | match b { 42 | Null => "NULL".to_owned(), 43 | Integer(i) => format!("{}", i), 44 | Real(i) => format!("{}", i), 45 | Text(i) => format!("'{}'", String::from_utf8_lossy(i).replace("'", "''")), 46 | Blob(b) => format!("[blob {}B]", b.len()), 47 | } 48 | }*/ 49 | 50 | /// 51 | /// adapted from https://github.com/jgallagher/rusqlite/blob/022266239233857faa7f0b415c1a3d5095d96a53/src/vtab/mod.rs#L629 52 | /// sql injection safe? investigate 53 | /// hello -> `hello` 54 | /// he`lo -> `he``lo` 55 | /// 56 | /// we intentionally use the `e` syntax instead of "e" because of 57 | /// "a misspelled double-quoted identifier will be interpreted as a string literal, rather than generating an error" 58 | /// see https://www.sqlite.org/quirks.html#double_quoted_string_literals_are_accepted 59 | /// 60 | pub fn escape_sqlite_identifier(identifier: &str) -> String { 61 | format!("`{}`", identifier.replace('`', "``")) 62 | } 63 | 64 | pub fn init_logging(default_level: LevelFilter) { 65 | if std::env::var("SQLITE_ZSTD_LOG").is_err() { 66 | // TODO: Audit that the environment access only happens in single-threaded code. 67 | unsafe { std::env::set_var("SQLITE_ZSTD_LOG", format!("{default_level}")) }; 68 | } 69 | env_logger::try_init_from_env(env_logger::Env::new().filter("SQLITE_ZSTD_LOG")).ok(); 70 | } 71 | --------------------------------------------------------------------------------