├── .coveragerc ├── .github └── workflows │ ├── deploy-package-to-pypi.yml │ └── test.yml ├── .gitignore ├── LICENSE ├── README.md ├── codecov.yml ├── pyproject.toml ├── sqlite_s3_query.py ├── start-services.sh ├── stop-services.sh └── test.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | -------------------------------------------------------------------------------- /.github/workflows/deploy-package-to-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Deploy package to PyPI 2 | 3 | ############################################################################### 4 | 5 | on: 6 | 7 | release: 8 | types: [published] 9 | 10 | ############################################################################### 11 | 12 | jobs: 13 | 14 | build: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v3 18 | - uses: actions/setup-python@v3 19 | with: 20 | python-version: 3.11 21 | - name: Update version in pyproject.toml from current git tag 22 | run: >- 23 | sed -i "s/0\\.0\\.0\\.dev0/${GITHUB_REF/refs\/tags\/v/}/g" pyproject.toml 24 | - run: | 25 | pip install build 26 | python -m build 27 | - uses: actions/upload-artifact@v3 28 | with: 29 | path: ./dist 30 | 31 | deploy: 32 | needs: [build] 33 | environment: pypi 34 | runs-on: ubuntu-latest 35 | permissions: 36 | id-token: write 37 | steps: 38 | - uses: actions/download-artifact@v3 39 | - uses: pypa/gh-action-pypi-publish@release/v1 40 | with: 41 | packages_dir: artifact/ 42 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | on: 3 | push: 4 | branches: [ "main" ] 5 | pull_request: 6 | branches: [ "main" ] 7 | 8 | jobs: 9 | download-minio: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: "Download MinIO" 13 | run: | 14 | mkdir -p ./minio 15 | curl -sS -L \ 16 | https://dl.min.io/server/minio/release/linux-amd64/archive/minio.RELEASE.2023-07-21T21-12-44Z -o ./minio/minio-linux \ 17 | https://dl.min.io/server/minio/release/darwin-amd64/archive/minio.RELEASE.2023-07-21T21-12-44Z -o ./minio/minio-darwin \ 18 | https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2023-07-21T21-12-44Z -o ./minio/minio-windows.exe 19 | - name: "Save MinIO" 20 | uses: actions/upload-artifact@v3 21 | with: 22 | name: minio 23 | path: ./minio/* 24 | 25 | # We want older SQLite amalgamation files, but they are not available to download, 26 | # so must be built from source. And they cannot be build on Windows, even for tests 27 | # that then compile the amalgamation on Windows 28 | create-sqlite-amalgamation: 29 | name: "Create SQLite amalgamation" 30 | runs-on: ubuntu-latest 31 | strategy: 32 | matrix: 33 | sqlite-url-version: 34 | - {version: "3042000", url: "https://www.sqlite.org/src/tarball/831d0fb2/SQLite-831d0fb2.tar.gz"} 35 | - {version: "3036000", url: "https://www.sqlite.org/src/tarball/5c9a6c06/SQLite-5c9a6c06.tar.gz"} 36 | - {version: "3007015", url: "https://www.sqlite.org/src/tarball/cd0b37c5/SQLite-cd0b37c5.tar.gz"} 37 | steps: 38 | - name: "Download SQLite source and build amalgamation" 39 | run: | 40 | curl -sS -L '${{ matrix.sqlite-url-version.url }}' -o sqlite3.tar.gz 41 | mkdir sqlite3 42 | tar -zxvf sqlite3.tar.gz --strip-components=1 -C sqlite3 43 | cd sqlite3 44 | ./configure 45 | make sqlite3.c 46 | - name: "Save SQLite amalgamation" 47 | uses: actions/upload-artifact@v3 48 | with: 49 | name: sqlite-${{ matrix.sqlite-url-version.version }} 50 | path: ./sqlite3/sqlite3.c 51 | 52 | test: 53 | name: Test 54 | needs: [download-minio, create-sqlite-amalgamation] 55 | strategy: 56 | matrix: 57 | # If changing how many times tests are run, must also change in codecov.yml 58 | # to ensure test coverage is reported only after all tests have finished 59 | os: 60 | - "macos-12" 61 | - "ubuntu-20.04" 62 | - "windows-2019" 63 | sqlite-version: 64 | - "3042000" 65 | - "3036000" 66 | - "3007015" 67 | - "default" 68 | python-version: 69 | - "3.6.7" 70 | - "3.7.1" 71 | - "3.8.0" 72 | - "3.9.0" 73 | - "3.10.0" 74 | - "3.11.0" 75 | package-extras: 76 | - "ci-earliest" 77 | - "ci-latest" 78 | exclude: 79 | - python-version: "3.6.7" 80 | package-extras: "ci-latest" 81 | runs-on: '${{ matrix.os }}' 82 | env: 83 | SQLITE3_VERSION: ${{ matrix.sqlite-version }} 84 | MINIO_ROOT_USER: AKIAIOSFODNN7EXAMPLE 85 | MINIO_ROOT_PASSWORD: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY 86 | MINIO_REGION: us-east-1 87 | steps: 88 | - name: "Checkout" 89 | uses: "actions/checkout@v3" 90 | - uses: "actions/setup-python@v4" 91 | with: 92 | python-version: '${{ matrix.python-version }}' 93 | - name: "Load MinIO" 94 | uses: actions/download-artifact@v3 95 | with: 96 | name: minio 97 | path: ./minio 98 | - name: "Load SQLite amalgamation" 99 | if: matrix.sqlite-version != 'default' 100 | uses: actions/download-artifact@v3 101 | with: 102 | name: sqlite-${{ matrix.sqlite-version }} 103 | path: . 104 | - name: "Compile SQLite from amalgamation (Windows)" 105 | if: matrix.os == 'windows-2019' && matrix.sqlite-version != 'default' 106 | run: | 107 | gcc -shared sqlite3.c -o sqlite3.dll 108 | echo "LIBSQLITE3_PATH=${PWD}/sqlite3.dll" >> $env:GITHUB_ENV 109 | - name: "Compile SQLite from amalgamation (Ubuntu or macOS)" 110 | if: (matrix.os == 'ubuntu-20.04' || matrix.os == 'macos-12') && matrix.sqlite-version != 'default' 111 | run: | 112 | gcc -shared -fPIC -o libsqlite3.so.0 sqlite3.c 113 | echo "LIBSQLITE3_PATH=${PWD}/libsqlite3.so.0" >> "$GITHUB_ENV" 114 | - name: "Install sqlite-s3-query and any dependencies" 115 | run: | 116 | pip install ".[dev,${{ matrix.package-extras }}]" 117 | - name: "Test (Windows)" 118 | if: matrix.os == 'windows-2019' 119 | run: | 120 | mkdir -p ./data 121 | ./minio/minio-windows.exe server ./data & 122 | do { 123 | Write-Host "Waiting for MinIO" 124 | sleep 3 125 | } until(Test-NetConnection 127.0.0.1 -Port 9000 | ? { $_.TcpTestSucceeded } ) 126 | coverage run -m unittest 127 | - name: "Test (Ubuntu)" 128 | if: matrix.os == 'ubuntu-20.04' 129 | run: | 130 | mkdir -p ./data 131 | chmod +x ./minio/minio-linux 132 | ./minio/minio-linux server ./data & 133 | until nc -w 10 127.0.0.1 9000; do sleep 1; done 134 | coverage run -m unittest 135 | - name: "Test (macOS)" 136 | if: matrix.os == 'macos-12' 137 | run: | 138 | mkdir -p ./data 139 | chmod +x ./minio/minio-darwin 140 | ./minio/minio-darwin server ./data & 141 | until nc -w 10 127.0.0.1 9000; do sleep 1; done 142 | coverage run -m unittest 143 | - uses: codecov/codecov-action@v4 144 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Michal Charemza 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sqlite-s3-query 2 | 3 | [![PyPI package](https://img.shields.io/pypi/v/sqlite-s3-query?label=PyPI%20package&color=%234c1)](https://pypi.org/project/sqlite-s3-query/) [![Test suite](https://img.shields.io/github/actions/workflow/status/michalc/sqlite-s3-query/test.yml?label=Test%20suite)](https://github.com/michalc/sqlite-s3-query/actions/workflows/test.yml) [![Code coverage](https://img.shields.io/codecov/c/github/michalc/sqlite-s3-query?label=Code%20coverage)](https://app.codecov.io/gh/michalc/sqlite-s3-query) 4 | 5 | Python context managers to query a SQLite file stored on S3. It uses multiple HTTP range requests per query to avoid downloading the entire file, and so is suitable for large databases. 6 | 7 | All queries using the same instance of the context will query the same version of the database object in S3. This means that a context is roughly equivalent to a REPEATABLE READ transaction, and queries should complete succesfully even if the database is replaced concurrently by another S3 client. [Versioning _must_ be enabled on the S3 bucket](#versioning). 8 | 9 | SQL statements that write to the database are not supported. If you're looking for a way to write to a SQLite database in S3, try [sqlite-s3vfs](https://github.com/uktrade/sqlite-s3vfs). 10 | 11 | Inspired by [phiresky's sql.js-httpvfs](https://github.com/phiresky/sql.js-httpvfs), and [dacort's Stack Overflow answer](https://stackoverflow.com/a/59434097/1319998). 12 | 13 | 14 | ## Installation 15 | 16 | You can install sqlite-s3-query from [PyPI](https://pypi.org/project/sqlite-s3-query/) using pip. 17 | 18 | ```bash 19 | pip install sqlite_s3_query 20 | ``` 21 | 22 | This will automatically install [HTTPX](https://www.python-httpx.org/), which is used to communicate with S3. A package often used to communciate with S3 from Python is [boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html), but sqlite-s3-query does not use boto3. 23 | 24 | The libsqlite3 binary library is also required, but this is typically already installed on most systems. The earliest version of libsqlite3 known to work is 3.7.15 (2012-12-12). 25 | 26 | 27 | ## Usage 28 | 29 | For single-statement queries, the `sqlite_s3_query` function can be used. 30 | 31 | ```python 32 | from sqlite_s3_query import sqlite_s3_query 33 | 34 | with sqlite_s3_query(url='https://my-bucket.s3.eu-west-2.amazonaws.com/my-db.sqlite') as query: 35 | 36 | with query('SELECT * FROM my_table WHERE my_column = ?', params=('my-value',)) as (columns, rows): 37 | for row in rows: 38 | print(row) 39 | 40 | # Exactly the same results, even if the object in S3 was replaced 41 | with query('SELECT * FROM my_table WHERE my_column = ?', params=('my-value',)) as (columns, rows): 42 | for row in rows: 43 | print(row) 44 | 45 | # Or can use named parameters 46 | with query('SELECT * FROM my_table WHERE my_column = :my_param', named_params=((':my_param', 'my-value'),)) as (columns, rows): 47 | for row in rows: 48 | print(row) 49 | ``` 50 | 51 | For multi-statement queries, the `sqlite_s3_query_multi` function can be used. 52 | 53 | ```python 54 | from sqlite_s3_query import sqlite_s3_query_multi 55 | 56 | with sqlite_s3_query_multi(url='https://my-bucket.s3.eu-west-2.amazonaws.com/my-db.sqlite') as query_multi: 57 | for (columns, rows) in query_multi(''' 58 | SELECT * FROM my_table_a WHERE my_column_a = ?; 59 | SELECT * FROM my_table_b WHERE my_column_b = ?; 60 | ''', params=(('my-value-a',), ('my-value-b',)): 61 | for row in rows: 62 | print(row) 63 | ``` 64 | 65 | If in your project you query the same object from multiple places, `functools.partial` can be used to make an interface with less duplication. 66 | 67 | ```python 68 | from functools import partial 69 | from sqlite_s3_query import sqlite_s3_query 70 | 71 | query_my_db = partial(sqlite_s3_query, 72 | url='https://my-bucket.s3.eu-west-2.amazonaws.com/my-db.sqlite', 73 | ) 74 | 75 | with \ 76 | query_my_db() as query, \ 77 | query('SELECT * FROM my_table WHERE my_col = ?', params=('my-value',)) as (columns, rows): 78 | 79 | for row in rows: 80 | print(row) 81 | 82 | with \ 83 | query_my_db() as query, \ 84 | query('SELECT * FROM my_table_2 WHERE my_col = ?', params=('my-value',)) as (columns, rows): 85 | 86 | for row in rows: 87 | print(row) 88 | ``` 89 | 90 | ### Pandas DataFrame 91 | 92 | You can create a Pandas DataFrame from query results by passing the `rows` iterable and `columns` tuple to the `DataFrame` constructor as below. 93 | 94 | ```python 95 | import pandas as pd 96 | from sqlite_s3_query import sqlite_s3_query 97 | 98 | with \ 99 | sqlite_s3_query(url='https://my-bucket.s3.eu-west-2.amazonaws.com/my-db.sqlite') as query, \ 100 | query('SELECT * FROM my_table WHERE my_column = ?', params=('my-value',)) as (columns, rows): 101 | 102 | df = pd.DataFrame(rows, columns=columns) 103 | 104 | print(df) 105 | ``` 106 | 107 | ### Permissions 108 | 109 | The AWS credentials must have both the `s3:GetObject` and `s3:GetObjectVersion` permissions on the database object. For example if the database is at the key `my-db.sqlite` in bucket `my-bucket`, then the minimal set of permissions are shown below. 110 | 111 | ```json 112 | { 113 | "Version": "2012-10-17", 114 | "Statement": [{ 115 | "Effect": "Allow", 116 | "Action": ["s3:GetObject", "s3:GetObjectVersion"], 117 | "Resource": "arn:aws:s3:::my-bucket/my-db.sqlite" 118 | }] 119 | } 120 | ``` 121 | 122 | ### Credentials 123 | 124 | The AWS region and the credentials are taken from environment variables, but this can be changed using the `get_credentials` parameter. Below shows the default implementation of this that can be overriden. 125 | 126 | ```python 127 | from sqlite_s3_query import sqlite_s3_query 128 | import os 129 | 130 | def get_credentials(_): 131 | return ( 132 | os.environ['AWS_REGION'], 133 | os.environ['AWS_ACCESS_KEY_ID'], 134 | os.environ['AWS_SECRET_ACCESS_KEY'], 135 | os.environ.get('AWS_SESSION_TOKEN'), # Only needed for temporary credentials 136 | ) 137 | 138 | query_my_db = partial(sqlite_s3_query, 139 | url='https://my-bucket.s3.eu-west-2.amazonaws.com/my-db.sqlite', 140 | get_credentials=get_credentials, 141 | ) 142 | 143 | with \ 144 | query_my_db() as query, \ 145 | query('SELECT * FROM my_table_2 WHERE my_col = ?', params=('my-value',)) as (columns, rows): 146 | 147 | for row in rows: 148 | print(row) 149 | ``` 150 | 151 | sqlite-s3-query does not install or use boto3, but if you install it separately, you can use it to fetch credentials as in the below example. This can be useful when you want to use temporary credentials associated with an ECS or EC2 role, which boto3 fetches automatically. 152 | 153 | ```python 154 | import boto3 155 | from sqlite_s3_query import sqlite_s3_query 156 | 157 | def GetBoto3Credentials(): 158 | session = boto3.Session() 159 | credentials = session.get_credentials() 160 | def get_credentials(_): 161 | return (session.region_name,) + credentials.get_frozen_credentials() 162 | 163 | return get_credentials 164 | 165 | query_my_db = partial(sqlite_s3_query, 166 | url='https://my-bucket.s3.eu-west-2.amazonaws.com/my-db.sqlite', 167 | get_credentials=GetBoto3Credentials(), 168 | ) 169 | 170 | with \ 171 | query_my_db() as query, \ 172 | query('SELECT * FROM my_table_2 WHERE my_col = ?', params=('my-value',)) as (columns, rows): 173 | 174 | for row in rows: 175 | print(row) 176 | ``` 177 | 178 | 179 | ### Public Buckets 180 | 181 | For public buckets where credentials should not be passed, pass `None` as the `get_credentials` parameter. 182 | 183 | ```python 184 | query_my_db = partial(sqlite_s3_query, 185 | url='https://my-public-bucket.s3.eu-west-2.amazonaws.com/my-db.sqlite', 186 | get_credentials=None, 187 | ) 188 | 189 | with \ 190 | query_my_db() as query, \ 191 | query('SELECT * FROM my_table_2 WHERE my_col = ?', params=('my-value',)) as (columns, rows): 192 | 193 | for row in rows: 194 | print(row) 195 | ``` 196 | 197 | 198 | ### HTTP Client 199 | 200 | The HTTP client can be changed by overriding the the default `get_http_client` parameter, which is shown below. 201 | 202 | ```python 203 | from functools import partial 204 | import httpx 205 | from sqlite_s3_query import sqlite_s3_query 206 | 207 | query_my_db = partial(sqlite_s3_query, 208 | url='https://my-bucket.s3.eu-west-2.amazonaws.com/my-db.sqlite', 209 | get_http_client=lambda: httpx.Client(transport=httpx.HTTPTransport(retries=3)), 210 | ) 211 | 212 | with \ 213 | query_my_db() as query, \ 214 | query('SELECT * FROM my_table WHERE my_col = ?', params=('my-value',)) as (columns, rows): 215 | 216 | for row in rows: 217 | print(row) 218 | ``` 219 | 220 | ### Location of libsqlite3 221 | 222 | The location of the libsqlite3 library can be changed by overriding the `get_libsqlite3` parameter. 223 | 224 | ```python 225 | from ctypes import cdll 226 | from ctypes.util import find_library 227 | from functools import partial 228 | from sqlite_s3_query import sqlite_s3_query 229 | 230 | query_my_db = partial(sqlite_s3_query, 231 | url='https://my-bucket.s3.eu-west-2.amazonaws.com/my-db.sqlite', 232 | get_libsqlite3=lambda: cdll.LoadLibrary(find_library('sqlite3')) 233 | ) 234 | 235 | with \ 236 | query_my_db() as query, \ 237 | query('SELECT * FROM my_table WHERE my_col = ?', params=('my-value',)) as (columns, rows): 238 | 239 | for row in rows: 240 | print(row) 241 | ``` 242 | 243 | 244 | ## Multithreading 245 | 246 | It is safe for multiple threads to call the same `query` function. Under the hood, each use of `query` uses a separate SQLite "connection" to the database combined with the `SQLITE_OPEN_NOMUTEX` flag, which makes this safe while not locking unnecessarily. 247 | 248 | 249 | ## Versioning 250 | 251 | sqlite-s3-query is only for versioned buckets, to the point that it's a feature that it will error if run on an unversioned bucket. This is to keep the scope of this project small while giving the highest chance possible that a bucket is configured to allow queries running successfully during the replacement of the underlying database object. 252 | 253 | This means that sqlite-s3-query is not for all use cases of querying SQLite databases on S3: specifically it won't work when versioning cannot be enabled. In these cases you will have to do something else. For example: 254 | 255 | - Use https://github.com/litements/s3sqlite - at the time of writing it does not require versioning 256 | - Use a fork of sqlite-s3-query that allows unversioned buckets, for example as in https://github.com/michalc/sqlite-s3-query/pull/84 257 | 258 | This is not necessarily a permanent decision - it is possible that in future sqlite-s3-query will support unversioned buckets. 259 | 260 | 261 | ## Exceptions 262 | 263 | Under the hood [HTTPX](https://www.python-httpx.org/) is used to communicate with S3, but any [exceptions raised by HTTPX](https://www.python-httpx.org/exceptions/) are passed through to client code unchanged. This includes `httpx.HTTPStatusError` when S3 returns a non-200 response. Most commonly this will be when S3 returns a 403 in the case of insufficient permissions on the database object being queried. 264 | 265 | All other exceptions raised inherit from `sqlite_s3_query.SQLiteS3QueryError` as described in the following hierarchy. 266 | 267 | ### Exception hierarchy 268 | 269 | - `SQLiteS3QueryError` 270 | 271 | The base class for explicitly raised exceptions. 272 | 273 | - `VersioningNotEnabledError` 274 | 275 | Versioning is not enabled on the bucket. 276 | 277 | - `QueryContextClosedError` 278 | 279 | A results iterable has been attempted to be used after the close of its surrounding query context. 280 | 281 | - `SQLiteError` 282 | 283 | SQLite has detected an error. The first element of the `args` member of the raised exception is the description of the error as provided by SQLite. 284 | 285 | 286 | ## Compatibility 287 | 288 | - Linux (tested on Ubuntu 20.04), Windows (tested on Windows Server 2019), or macOS (tested on macOS 12) 289 | - SQLite >= 3.7.15, (tested on 3.7.15, 3.36.0, 3.42.0, and the default version available on each OS tested) 290 | - Python >= 3.6.7 (tested on 3.6.7, 3.7.1, 3.8.0, 3.9.0, 3.10.0, and 3.11.0) 291 | - HTTPX >= 0.18.2 (tested on 0.18.2 with Python >= 3.6.7, and 0.24.1 with Python >= 3.7.1) 292 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | codecov: 3 | notify: 4 | after_n_builds: 132 5 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "sqlite-s3-query" 7 | version = "0.0.0.dev0" 8 | authors = [ 9 | { name="Michal Charemza", email="michal@charemza.name" }, 10 | ] 11 | description = "Python context manager to query a SQLite file stored on S3" 12 | readme = "README.md" 13 | requires-python = ">=3.6.7" 14 | classifiers = [ 15 | "Programming Language :: Python :: 3", 16 | "License :: OSI Approved :: MIT License", 17 | "Operating System :: OS Independent", 18 | ] 19 | dependencies = [ 20 | "httpx>=0.18.2", 21 | ] 22 | 23 | [project.optional-dependencies] 24 | dev = [ 25 | "coverage", 26 | ] 27 | ci-earliest = [ 28 | "httpx==0.18.2", 29 | ] 30 | ci-latest = [ 31 | "httpx==0.24.1", 32 | ] 33 | 34 | [project.urls] 35 | "Home-page" = "https://github.com/michalc/sqlite-s3-query" 36 | 37 | [tool.hatch.build] 38 | include = [ 39 | "sqlite_s3_query.py" 40 | ] 41 | -------------------------------------------------------------------------------- /sqlite_s3_query.py: -------------------------------------------------------------------------------- 1 | import hmac 2 | import os 3 | import threading 4 | from contextlib import contextmanager 5 | from ctypes import CFUNCTYPE, POINTER, Structure, create_string_buffer, pointer, cast, memmove, memset, sizeof, addressof, cdll, byref, string_at, c_char_p, c_int, c_double, c_int64, c_void_p, c_char 6 | from ctypes.util import find_library 7 | from datetime import datetime 8 | from functools import partial 9 | from hashlib import sha256 10 | from re import sub 11 | from time import time 12 | from urllib.parse import urlencode, urlsplit, quote 13 | from uuid import uuid4 14 | 15 | import httpx 16 | 17 | 18 | @contextmanager 19 | def sqlite_s3_query_multi(url, get_credentials=lambda now: ( 20 | os.environ['AWS_REGION'], 21 | os.environ['AWS_ACCESS_KEY_ID'], 22 | os.environ['AWS_SECRET_ACCESS_KEY'], 23 | os.environ.get('AWS_SESSION_TOKEN'), # Only needed for temporary credentials 24 | ), get_http_client=lambda: httpx.Client(transport=httpx.HTTPTransport(retries=3)), 25 | get_libsqlite3=lambda: cdll.LoadLibrary(find_library('sqlite3'))): 26 | libsqlite3 = get_libsqlite3() 27 | libsqlite3.sqlite3_errstr.restype = c_char_p 28 | libsqlite3.sqlite3_errmsg.restype = c_char_p 29 | libsqlite3.sqlite3_column_name.restype = c_char_p 30 | libsqlite3.sqlite3_column_double.restype = c_double 31 | libsqlite3.sqlite3_column_int64.restype = c_int64 32 | libsqlite3.sqlite3_column_blob.restype = c_void_p 33 | libsqlite3.sqlite3_column_bytes.restype = c_int64 34 | SQLITE_OK = 0 35 | SQLITE_IOERR = 10 36 | SQLITE_NOTFOUND = 12 37 | SQLITE_ROW = 100 38 | SQLITE_DONE = 101 39 | SQLITE_IOERR_SHORT_READ = 522 40 | SQLITE_TRANSIENT = c_void_p(-1) 41 | SQLITE_OPEN_READONLY = 0x00000001 42 | SQLITE_OPEN_NOMUTEX = 0x00008000 43 | SQLITE_IOCAP_IMMUTABLE = 0x00002000 44 | 45 | bind = { 46 | type(0): libsqlite3.sqlite3_bind_int64, 47 | type(0.0): libsqlite3.sqlite3_bind_double, 48 | type(''): lambda pp_stmt, i, value: libsqlite3.sqlite3_bind_text(pp_stmt, i, value.encode('utf-8'), len(value.encode('utf-8')), SQLITE_TRANSIENT), 49 | type(b''): lambda pp_stmt, i, value: libsqlite3.sqlite3_bind_blob(pp_stmt, i, value, len(value), SQLITE_TRANSIENT), 50 | type(None): lambda pp_stmt, i, _: libsqlite3.sqlite3_bind_null(pp_stmt, i), 51 | } 52 | 53 | extract = { 54 | 1: libsqlite3.sqlite3_column_int64, 55 | 2: libsqlite3.sqlite3_column_double, 56 | 3: lambda pp_stmt, i: string_at( 57 | libsqlite3.sqlite3_column_blob(pp_stmt, i), 58 | libsqlite3.sqlite3_column_bytes(pp_stmt, i), 59 | ).decode(), 60 | 4: lambda pp_stmt, i: string_at( 61 | libsqlite3.sqlite3_column_blob(pp_stmt, i), 62 | libsqlite3.sqlite3_column_bytes(pp_stmt, i), 63 | ), 64 | 5: lambda pp_stmt, i: None, 65 | } 66 | 67 | libsqlite3.sqlite3_initialize() 68 | 69 | vfs_name = b's3-' + str(uuid4()).encode() 70 | file_name = b's3-' + str(uuid4()).encode() 71 | body_hash = sha256(b'').hexdigest() 72 | scheme, netloc, path, _, _ = urlsplit(url) 73 | 74 | local = threading.local() 75 | local.pending_exception = None 76 | 77 | def get_request_headers_for_private_buckets(method, params, headers, now): 78 | region, access_key_id, secret_access_key, session_token = get_credentials(now) 79 | to_auth_headers = headers + ( 80 | (('x-amz-security-token', session_token),) if session_token is not None else \ 81 | () 82 | ) 83 | return aws_sigv4_headers( 84 | now, access_key_id, secret_access_key, region, method, to_auth_headers, params, 85 | ) 86 | 87 | def get_request_headers_for_public_buckets(_, __, headers, ___): 88 | return headers 89 | 90 | get_request_headers = \ 91 | get_request_headers_for_private_buckets if get_credentials is not None else \ 92 | get_request_headers_for_public_buckets 93 | 94 | def set_pending_exception(exception): 95 | local.pending_exception = exception 96 | 97 | def raise_any_pending_exception(): 98 | to_raise = local.pending_exception 99 | if to_raise is not None: 100 | local.pending_exception = None 101 | raise to_raise 102 | 103 | def run(func, *args): 104 | res = func(*args) 105 | raise_any_pending_exception() 106 | if res != 0: 107 | raise SQLiteError(libsqlite3.sqlite3_errstr(res).decode()) 108 | 109 | def run_with_db(db, func, *args): 110 | res = func(*args) 111 | raise_any_pending_exception() 112 | if res != 0: 113 | raise SQLiteError(libsqlite3.sqlite3_errmsg(db).decode()) 114 | 115 | @contextmanager 116 | def make_auth_request(http_client, method, params, headers): 117 | now = datetime.utcnow() 118 | request_headers = get_request_headers(method, params, headers, now) 119 | url = f'{scheme}://{netloc}{path}' 120 | with http_client.stream(method, url, params=params, headers=request_headers) as response: 121 | response.raise_for_status() 122 | yield response 123 | 124 | def aws_sigv4_headers( 125 | now, access_key_id, secret_access_key, region, method, headers_to_sign, params, 126 | ): 127 | def sign(key, msg): 128 | return hmac.new(key, msg.encode('ascii'), sha256).digest() 129 | 130 | algorithm = 'AWS4-HMAC-SHA256' 131 | 132 | amzdate = now.strftime('%Y%m%dT%H%M%SZ') 133 | datestamp = amzdate[:8] 134 | credential_scope = f'{datestamp}/{region}/s3/aws4_request' 135 | 136 | headers = tuple(sorted(headers_to_sign + ( 137 | ('host', netloc), 138 | ('x-amz-content-sha256', body_hash), 139 | ('x-amz-date', amzdate), 140 | ))) 141 | signed_headers = ';'.join(key for key, _ in headers) 142 | 143 | canonical_uri = quote(path, safe='/~') 144 | quoted_params = sorted( 145 | (quote(key, safe='~'), quote(value, safe='~')) 146 | for key, value in params 147 | ) 148 | canonical_querystring = '&'.join(f'{key}={value}' for key, value in quoted_params) 149 | canonical_headers = ''.join(f'{key}:{value}\n' for key, value in headers) 150 | canonical_request = f'{method}\n{canonical_uri}\n{canonical_querystring}\n' + \ 151 | f'{canonical_headers}\n{signed_headers}\n{body_hash}' 152 | 153 | string_to_sign = f'{algorithm}\n{amzdate}\n{credential_scope}\n' + \ 154 | sha256(canonical_request.encode('ascii')).hexdigest() 155 | 156 | date_key = sign(('AWS4' + secret_access_key).encode('ascii'), datestamp) 157 | region_key = sign(date_key, region) 158 | service_key = sign(region_key, 's3') 159 | request_key = sign(service_key, 'aws4_request') 160 | signature = sign(request_key, string_to_sign).hex() 161 | 162 | return ( 163 | ('authorization', ( 164 | f'{algorithm} Credential={access_key_id}/{credential_scope}, ' 165 | f'SignedHeaders={signed_headers}, Signature={signature}') 166 | ), 167 | ) + headers 168 | 169 | @contextmanager 170 | def get_vfs(http_client): 171 | with make_auth_request(http_client, 'HEAD', (), ()) as response: 172 | head_headers = response.headers 173 | next(response.iter_bytes(), b'') 174 | 175 | try: 176 | version_id = head_headers['x-amz-version-id'] 177 | except KeyError: 178 | raise VersioningNotEnabledError('The bucket must have versioning enabled') 179 | 180 | size = int(head_headers['content-length']) 181 | 182 | def make_struct(fields): 183 | class Struct(Structure): 184 | _fields_ = [(field_name, field_type) for (field_name, field_type, _) in fields] 185 | return Struct(*tuple(value for (_, _, value) in fields)) 186 | 187 | x_open_type = CFUNCTYPE(c_int, c_void_p, c_char_p, c_void_p, c_int, POINTER(c_int)) 188 | def x_open(p_vfs, z_name, p_file, flags, p_out_flags): 189 | memmove(p_file, addressof(file), sizeof(file)) 190 | p_out_flags[0] = flags 191 | return SQLITE_OK 192 | 193 | x_close_type = CFUNCTYPE(c_int, c_void_p) 194 | def x_close(p_file): 195 | return SQLITE_OK 196 | 197 | x_read_type = CFUNCTYPE(c_int, c_void_p, c_void_p, c_int, c_int64) 198 | def x_read(p_file, p_out, i_amt, i_ofst): 199 | offset = 0 200 | 201 | try: 202 | with make_auth_request(http_client, 'GET', 203 | (('versionId', version_id),), 204 | (('range', f'bytes={i_ofst}-{i_ofst + i_amt - 1}'),) 205 | ) as response: 206 | # Handle the case of the server being broken or slightly evil, 207 | # returning more than the number of bytes that's asked for 208 | for chunk in response.iter_bytes(): 209 | memmove(p_out + offset, chunk, min(i_amt - offset, len(chunk))) 210 | offset += len(chunk) 211 | if offset > i_amt: 212 | break 213 | except Exception as exception: 214 | set_pending_exception(exception) 215 | return SQLITE_IOERR 216 | 217 | if offset < i_amt: 218 | # The SQLite docs strongly suggest to fill unused with zeroes 219 | remainder = i_amt - offset 220 | memmove(p_out + offset, b'\0' * remainder, remainder) 221 | return SQLITE_IOERR_SHORT_READ 222 | 223 | if offset > i_amt: 224 | return SQLITE_IOERR 225 | 226 | return SQLITE_OK 227 | 228 | x_file_size_type = CFUNCTYPE(c_int, c_void_p, POINTER(c_int64)) 229 | def x_file_size(p_file, p_size): 230 | p_size[0] = size 231 | return SQLITE_OK 232 | 233 | x_lock_type = CFUNCTYPE(c_int, c_void_p, c_int) 234 | def x_lock(p_file, e_lock): 235 | return SQLITE_OK 236 | 237 | x_unlock_type = CFUNCTYPE(c_int, c_void_p, c_int) 238 | def x_unlock(p_file, e_lock): 239 | return SQLITE_OK 240 | 241 | x_file_control_type = CFUNCTYPE(c_int, c_void_p, c_int, c_void_p) 242 | def x_file_control(p_file, op, p_arg): 243 | return SQLITE_NOTFOUND 244 | 245 | x_device_characteristics_type = CFUNCTYPE(c_int, c_void_p) 246 | def x_device_characteristics(p_file): 247 | return SQLITE_IOCAP_IMMUTABLE 248 | 249 | x_access_type = CFUNCTYPE(c_int, c_void_p, c_char_p, c_int, POINTER(c_int)) 250 | def x_access(p_vfs, z_name, flags, z_out): 251 | z_out[0] = 0 252 | return SQLITE_OK 253 | 254 | x_full_pathname_type = CFUNCTYPE(c_int, c_void_p, c_char_p, c_int, POINTER(c_char)) 255 | def x_full_pathname(p_vfs, z_name, n_out, z_out): 256 | memmove(z_out, z_name, len(z_name) + 1) 257 | return SQLITE_OK 258 | 259 | x_current_time_type = CFUNCTYPE(c_int, c_void_p, POINTER(c_double)) 260 | def x_current_time(p_vfs, c_double_p): 261 | c_double_p[0] = time()/86400.0 + 2440587.5; 262 | return SQLITE_OK 263 | 264 | io_methods = make_struct(( 265 | ('i_version', c_int, 1), 266 | ('x_close', x_close_type, x_close_type(x_close)), 267 | ('x_read', x_read_type, x_read_type(x_read)), 268 | ('x_write', c_void_p, None), 269 | ('x_truncate', c_void_p, None), 270 | ('x_sync', c_void_p, None), 271 | ('x_file_size', x_file_size_type, x_file_size_type(x_file_size)), 272 | ('x_lock', x_lock_type, x_lock_type(x_lock)), 273 | ('x_unlock', x_unlock_type, x_unlock_type(x_unlock)), 274 | ('x_check_reserved_lock', c_void_p, None), 275 | ('x_file_control', x_file_control_type, x_file_control_type(x_file_control)), 276 | ('x_sector_size', c_void_p, None), 277 | ('x_device_characteristics', x_device_characteristics_type, x_device_characteristics_type(x_device_characteristics)), 278 | )) 279 | file = make_struct(( 280 | ('p_methods', POINTER(type(io_methods)), pointer(io_methods)), 281 | )) 282 | vfs = make_struct(( 283 | ('i_version', c_int, 1), 284 | ('sz_os_file', c_int, sizeof(file)), 285 | ('mx_pathname', c_int, 1024), 286 | ('p_next', c_void_p, None), 287 | ('z_name', c_char_p, vfs_name), 288 | ('p_app_data', c_char_p, None), 289 | ('x_open', x_open_type, x_open_type(x_open)), 290 | ('x_delete', c_void_p, None), 291 | ('x_access', x_access_type, x_access_type(x_access)), 292 | ('x_full_pathname', x_full_pathname_type, x_full_pathname_type(x_full_pathname)), 293 | ('x_dl_open', c_void_p, None), 294 | ('x_dl_error', c_void_p, None), 295 | ('x_dl_sym', c_void_p, None), 296 | ('x_dl_close', c_void_p, None), 297 | ('x_randomness', c_void_p, None), 298 | ('x_sleep', c_void_p, None), 299 | ('x_current_time', x_current_time_type, x_current_time_type(x_current_time)), 300 | ('x_get_last_error', c_void_p, None), 301 | )) 302 | 303 | run(libsqlite3.sqlite3_vfs_register, byref(vfs), 0) 304 | try: 305 | yield vfs 306 | finally: 307 | run(libsqlite3.sqlite3_vfs_unregister, byref(vfs)) 308 | 309 | @contextmanager 310 | def get_db(vfs): 311 | db = c_void_p() 312 | run(libsqlite3.sqlite3_open_v2, file_name, byref(db), SQLITE_OPEN_READONLY | SQLITE_OPEN_NOMUTEX, vfs_name) 313 | try: 314 | yield db 315 | finally: 316 | run_with_db(db, libsqlite3.sqlite3_close, db) 317 | 318 | @contextmanager 319 | def get_pp_stmt_getter(db): 320 | # The purpose of this context manager is to make sure we finalize statements before 321 | # attempting to close the database, including in the case of unfinished interation 322 | 323 | statements = {} 324 | 325 | def get_pp_stmt(statement): 326 | try: 327 | return statements[statement] 328 | except KeyError: 329 | raise QueryContextClosedError('Attempting to use finalized statement') from None 330 | 331 | def finalize(statement): 332 | pp_stmt = statements.pop(statement) 333 | 334 | try: 335 | run_with_db(db, libsqlite3.sqlite3_finalize, pp_stmt) 336 | except: 337 | # The only case found where this errored is when we've already had an error due to 338 | # a malformed disk image, which will already bubble up to client code 339 | pass 340 | 341 | def get_pp_stmts(sql): 342 | p_encoded = POINTER(c_char)(create_string_buffer(sql.encode())) 343 | 344 | while True: 345 | pp_stmt = c_void_p() 346 | run_with_db(db, libsqlite3.sqlite3_prepare_v2, db, p_encoded, -1, byref(pp_stmt), byref(p_encoded)) 347 | if not pp_stmt: 348 | break 349 | 350 | # c_void_p is not hashable, and there is a theoretical possibility that multiple 351 | # exist at the same time pointing to the same memory, so use a plain object instead 352 | statement = object() 353 | statements[statement] = pp_stmt 354 | yield partial(get_pp_stmt, statement), partial(finalize, statement) 355 | 356 | yield get_pp_stmts 357 | 358 | def rows(get_pp_stmt, columns): 359 | while True: 360 | pp_stmt = get_pp_stmt() 361 | res = libsqlite3.sqlite3_step(pp_stmt) 362 | if res == SQLITE_DONE: 363 | break 364 | if res != SQLITE_ROW: 365 | raise_any_pending_exception() 366 | raise SQLiteError(libsqlite3.sqlite3_errstr(res).decode()) 367 | 368 | yield tuple( 369 | extract[libsqlite3.sqlite3_column_type(pp_stmt, i)](pp_stmt, i) 370 | for i in range(0, len(columns)) 371 | ) 372 | 373 | def query(vfs, sql, params=(), named_params=()): 374 | 375 | def zip_first(first_iterable, *iterables, default=()): 376 | iters = tuple(iter(iterable) for iterable in iterables) 377 | for value in first_iterable: 378 | yield (value,) + tuple(next(it, default) for it in iters) 379 | 380 | with \ 381 | get_db(vfs) as db, \ 382 | get_pp_stmt_getter(db) as get_pp_stmts: 383 | 384 | for (get_pp_stmt, finalize_stmt), statment_params, statement_named_params in zip_first(get_pp_stmts(sql), params, named_params): 385 | try: 386 | pp_stmt = get_pp_stmt() 387 | for i, param in enumerate(statment_params): 388 | run_with_db(db, bind[type(param)], pp_stmt, i + 1, param) 389 | 390 | for param_name, param_value in statement_named_params: 391 | index = libsqlite3.sqlite3_bind_parameter_index(pp_stmt, param_name.encode('utf-8')) 392 | run_with_db(db, bind[type(param_value)], pp_stmt, index, param_value) 393 | 394 | columns = tuple( 395 | libsqlite3.sqlite3_column_name(pp_stmt, i).decode() 396 | for i in range(0, libsqlite3.sqlite3_column_count(pp_stmt)) 397 | ) 398 | 399 | yield columns, rows(get_pp_stmt, columns) 400 | finally: 401 | finalize_stmt() 402 | 403 | with \ 404 | get_http_client() as http_client, \ 405 | get_vfs(http_client) as vfs: 406 | 407 | yield partial(query, vfs) 408 | 409 | 410 | @contextmanager 411 | def sqlite_s3_query(url, get_credentials=lambda now: ( 412 | os.environ['AWS_REGION'], 413 | os.environ['AWS_ACCESS_KEY_ID'], 414 | os.environ['AWS_SECRET_ACCESS_KEY'], 415 | os.environ.get('AWS_SESSION_TOKEN'), # Only needed for temporary credentials 416 | ), get_http_client=lambda: httpx.Client(transport=httpx.HTTPTransport(retries=3)), 417 | get_libsqlite3=lambda: cdll.LoadLibrary(find_library('sqlite3'))): 418 | 419 | @contextmanager 420 | def query(query_base, sql, params=(), named_params=()): 421 | for columns, rows in query_base(sql, (params,), (named_params,)): 422 | yield columns, rows 423 | break 424 | 425 | with sqlite_s3_query_multi(url, 426 | get_credentials=get_credentials, 427 | get_http_client=get_http_client, 428 | get_libsqlite3=get_libsqlite3, 429 | ) as query_base: 430 | 431 | yield partial(query, query_base) 432 | 433 | 434 | class SQLiteS3QueryError(Exception): 435 | pass 436 | 437 | 438 | class VersioningNotEnabledError(SQLiteS3QueryError): 439 | pass 440 | 441 | 442 | class SQLiteError(SQLiteS3QueryError): 443 | pass 444 | 445 | 446 | class QueryContextClosedError(SQLiteS3QueryError): 447 | pass 448 | -------------------------------------------------------------------------------- /start-services.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | docker run --rm -p 9000:9000 --name sqlite-s3-query-minio -d \ 6 | -e 'MINIO_ROOT_USER=AKIAIOSFODNN7EXAMPLE' \ 7 | -e 'MINIO_ROOT_PASSWORD=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' \ 8 | -e 'MINIO_REGION=us-east-1' \ 9 | --entrypoint sh \ 10 | minio/minio:RELEASE.2023-07-21T21-12-44Z \ 11 | -c ' 12 | mkdir -p /data 13 | minio server /data 14 | ' 15 | -------------------------------------------------------------------------------- /stop-services.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | docker stop sqlite-s3-query-minio 6 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from ctypes import cdll 3 | from ctypes.util import find_library 4 | import datetime 5 | import functools 6 | import hashlib 7 | import hmac 8 | import os 9 | import socket 10 | import sqlite3 11 | import tempfile 12 | import threading 13 | import unittest 14 | import urllib.parse 15 | import uuid 16 | 17 | import httpx 18 | from httpx import HTTPStatusError 19 | 20 | from sqlite_s3_query import ( 21 | VersioningNotEnabledError, 22 | SQLiteError, 23 | QueryContextClosedError, 24 | sqlite_s3_query, 25 | sqlite_s3_query_multi, 26 | ) 27 | 28 | 29 | class TestSqliteS3Query(unittest.TestCase): 30 | 31 | def test_sqlite3_installed_on_ci(self): 32 | ci = os.environ.get('CI', '') 33 | sqlite3_version = os.environ.get('SQLITE3_VERSION', 'default') 34 | if ci and sqlite3_version != 'default': 35 | libsqlite3 = get_libsqlite3() 36 | self.assertEqual(libsqlite3.sqlite3_libversion_number(), int(sqlite3_version)) 37 | 38 | def test_without_versioning(self): 39 | with get_db([ 40 | ("CREATE TABLE my_table (my_col_a text, my_col_b text);",()), 41 | ] + [ 42 | ("INSERT INTO my_table VALUES " + ','.join(["('some-text-a', 'some-text-b')"] * 500),()), 43 | ]) as db: 44 | put_object_without_versioning('bucket-without-versioning', 'my.db', db) 45 | 46 | with self.assertRaisesRegex(VersioningNotEnabledError, 'The bucket must have versioning enabled'): 47 | sqlite_s3_query('http://localhost:9000/bucket-without-versioning/my.db', get_credentials=lambda now: ( 48 | 'us-east-1', 49 | 'AKIAIOSFODNN7EXAMPLE', 50 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 51 | None, 52 | ), get_libsqlite3=get_libsqlite3).__enter__() 53 | 54 | def test_select(self): 55 | with get_db([ 56 | ("CREATE TABLE my_table (my_col_a text, my_col_b text);",()), 57 | ] + [ 58 | ("INSERT INTO my_table VALUES " + ','.join(["('some-text-a', 'some-text-b')"] * 500),()) 59 | ]) as db: 60 | put_object_with_versioning('my-bucket', 'my.db', db) 61 | 62 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 63 | 'us-east-1', 64 | 'AKIAIOSFODNN7EXAMPLE', 65 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 66 | None, 67 | ), get_libsqlite3=get_libsqlite3) as query: 68 | with query('SELECT my_col_a FROM my_table') as (columns, rows): 69 | rows = list(rows) 70 | 71 | self.assertEqual(rows, [('some-text-a',)] * 500) 72 | 73 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 74 | 'us-east-1', 75 | 'AKIAIOSFODNN7EXAMPLE', 76 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 77 | None, 78 | ), get_libsqlite3=get_libsqlite3) as query: 79 | with \ 80 | query('SELECT my_col_a FROM my_table') as (columns_a, rows_a), \ 81 | query('SELECT my_col_b FROM my_table') as (columns_b, rows_b): 82 | 83 | rows = [ 84 | (next(rows_a)[0], next(rows_b)[0]) 85 | for i in range(0, 500) 86 | ] 87 | 88 | self.assertEqual(rows, [('some-text-a','some-text-b')] * 500) 89 | 90 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 91 | 'us-east-1', 92 | 'AKIAIOSFODNN7EXAMPLE', 93 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 94 | None, 95 | ), get_libsqlite3=get_libsqlite3) as query: 96 | with get_db([ 97 | ("CREATE TABLE my_table (my_col_a text, my_col_b text);", ()), 98 | ] + [ 99 | ("INSERT INTO my_table VALUES " + ','.join(["('some-new-a', 'some-new-b')"] * 500), ()), 100 | ]) as db: 101 | put_object_with_versioning('my-bucket', 'my.db', db) 102 | 103 | with query('SELECT my_col_a FROM my_table') as (columns, rows): 104 | rows = list(rows) 105 | 106 | self.assertEqual(rows, [('some-text-a',)] * 500) 107 | 108 | with self.assertRaisesRegex(QueryContextClosedError, 'Attempting to use finalized statement'): 109 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 110 | 'us-east-1', 111 | 'AKIAIOSFODNN7EXAMPLE', 112 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 113 | None, 114 | ), get_libsqlite3=get_libsqlite3) as query: 115 | with query('SELECT my_col_a FROM my_table') as (columns, rows): 116 | for row in rows: 117 | break 118 | next(rows) 119 | 120 | with self.assertRaisesRegex(QueryContextClosedError, 'Attempting to use finalized statement'): 121 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 122 | 'us-east-1', 123 | 'AKIAIOSFODNN7EXAMPLE', 124 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 125 | None, 126 | ), get_libsqlite3=get_libsqlite3) as query: 127 | with query('SELECT my_col_a FROM my_table') as (columns, rows): 128 | pass 129 | next(rows) 130 | 131 | def test_select_with_named_params(self): 132 | with get_db([ 133 | ("CREATE TABLE my_table (my_col_a text, my_col_b text);", ()) 134 | ] + [ 135 | ("INSERT INTO my_table VALUES " + ','.join(["('some-text-a', 'some-text-b')"] * 500), ()), 136 | ("INSERT INTO my_table VALUES " + ','.join(["('some-text-c', 'some-text-d')"] * 100), ()), 137 | ]) as db: 138 | put_object_with_versioning('my-bucket', 'my.db', db) 139 | 140 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 141 | 'us-east-1', 142 | 'AKIAIOSFODNN7EXAMPLE', 143 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 144 | None, 145 | ), get_libsqlite3=get_libsqlite3) as query: 146 | with query('SELECT COUNT(*) FROM my_table WHERE my_col_a = :first', named_params=((':first', 'some-text-a'),)) as (columns, rows): 147 | rows = list(rows) 148 | 149 | self.assertEqual(rows, [(500,)]) 150 | 151 | def test_select_with_named_params_public_bucket(self): 152 | create_bucket('my-public-bucket') 153 | disable_auth('my-public-bucket') 154 | with get_db([ 155 | ("CREATE TABLE my_table (my_col_a text, my_col_b text);", ()) 156 | ] + [ 157 | ("INSERT INTO my_table VALUES " + ','.join(["('some-text-a', 'some-text-b')"] * 500), ()), 158 | ("INSERT INTO my_table VALUES " + ','.join(["('some-text-c', 'some-text-d')"] * 100), ()), 159 | ]) as db: 160 | put_object_with_versioning('my-public-bucket', 'my.db', db) 161 | 162 | with sqlite_s3_query( 163 | 'http://localhost:9000/my-public-bucket/my.db', 164 | get_credentials=None, 165 | get_libsqlite3=get_libsqlite3 166 | ) as query: 167 | with query('SELECT COUNT(*) FROM my_table WHERE my_col_a = :first', named_params=((':first', 'some-text-a'),)) as (columns, rows): 168 | rows = list(rows) 169 | 170 | self.assertEqual(rows, [(500,)]) 171 | 172 | def test_select_large(self): 173 | empty = (bytes(4050),) 174 | 175 | def sqls(): 176 | yield ("CREATE TABLE foo(content BLOB);",()) 177 | for _ in range(0, 1200000): 178 | yield ("INSERT INTO foo VALUES (?);", empty) 179 | 180 | with get_db(sqls()) as db: 181 | length = 0 182 | for chunk in db(): 183 | length += len(chunk) 184 | self.assertGreater(length, 4294967296) 185 | put_object_with_versioning('my-bucket', 'my.db', db) 186 | 187 | count = 0 188 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 189 | 'us-east-1', 190 | 'AKIAIOSFODNN7EXAMPLE', 191 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 192 | None, 193 | ), get_libsqlite3=get_libsqlite3) as query: 194 | with query('SELECT content FROM foo ORDER BY rowid LIMIT 1') as (columns, rows): 195 | for _ in rows: 196 | count += 1 197 | 198 | self.assertEqual(count, 1) 199 | 200 | count = 0 201 | with query('SELECT content FROM foo ORDER BY rowid DESC LIMIT 1') as (columns, rows): 202 | for _ in rows: 203 | count += 1 204 | 205 | self.assertEqual(count, 1) 206 | 207 | def test_select_multi(self): 208 | with get_db([ 209 | ("CREATE TABLE my_table (my_col_a text, my_col_b text);", ()) 210 | ] + [ 211 | ("INSERT INTO my_table VALUES " + ','.join(["('some-text-a', 'some-text-b')"] * 500), ()), 212 | ]) as db: 213 | put_object_with_versioning('my-bucket', 'my.db', db) 214 | 215 | with sqlite_s3_query_multi('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 216 | 'us-east-1', 217 | 'AKIAIOSFODNN7EXAMPLE', 218 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 219 | None, 220 | ), get_libsqlite3=get_libsqlite3) as query: 221 | rows_list = [ 222 | list(rows) 223 | for (columns, rows) in query(''' 224 | SELECT my_col_a FROM my_table; 225 | SELECT my_col_a FROM my_table LIMIT 10; 226 | ''') 227 | ] 228 | 229 | self.assertEqual(rows_list, [[('some-text-a',)] * 500, [('some-text-a',)] * 10]) 230 | 231 | with self.assertRaisesRegex(Exception, 'Just after creating context'): 232 | with sqlite_s3_query_multi('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 233 | 'us-east-1', 234 | 'AKIAIOSFODNN7EXAMPLE', 235 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 236 | None, 237 | ), get_libsqlite3=get_libsqlite3) as query: 238 | raise Exception('Just after creating context') 239 | 240 | with self.assertRaisesRegex(Exception, 'Just after iterating statements'): 241 | with sqlite_s3_query_multi('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 242 | 'us-east-1', 243 | 'AKIAIOSFODNN7EXAMPLE', 244 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 245 | None, 246 | ), get_libsqlite3=get_libsqlite3) as query: 247 | for (columns, rows) in query(''' 248 | SELECT my_col_a FROM my_table; 249 | SELECT my_col_a FROM my_table LIMIT 10; 250 | '''): 251 | raise Exception('Just after iterating statements') 252 | 253 | with self.assertRaisesRegex(Exception, 'Just after iterating first row'): 254 | with sqlite_s3_query_multi('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 255 | 'us-east-1', 256 | 'AKIAIOSFODNN7EXAMPLE', 257 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 258 | None, 259 | ), get_libsqlite3=get_libsqlite3) as query: 260 | for (columns, rows) in query(''' 261 | SELECT my_col_a FROM my_table; 262 | SELECT my_col_a FROM my_table LIMIT 10; 263 | '''): 264 | rows_it = iter(rows) 265 | next(rows_it) 266 | raise Exception('Just after iterating first row') 267 | 268 | with self.assertRaisesRegex(Exception, 'Multiple open statements'): 269 | with sqlite_s3_query_multi('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 270 | 'us-east-1', 271 | 'AKIAIOSFODNN7EXAMPLE', 272 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 273 | None, 274 | ), get_libsqlite3=get_libsqlite3) as query: 275 | it = iter(query(''' 276 | SELECT my_col_a FROM my_table; 277 | SELECT my_col_a FROM my_table LIMIT 10; 278 | ''')) 279 | columns_1, rows_1 = next(it) 280 | rows_1_it = iter(rows_1) 281 | next(rows_1_it) 282 | 283 | columns_2, rows_2 = next(it) 284 | rows_2_it = iter(rows_2) 285 | next(rows_2_it) 286 | raise Exception('Multiple open statements') 287 | 288 | with self.assertRaisesRegex(QueryContextClosedError, 'Attempting to use finalized statement'): 289 | with sqlite_s3_query_multi('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 290 | 'us-east-1', 291 | 'AKIAIOSFODNN7EXAMPLE', 292 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 293 | None, 294 | ), get_libsqlite3=get_libsqlite3) as query: 295 | for columns, rows in query(''' 296 | SELECT my_col_a FROM my_table; 297 | SELECT my_col_a FROM my_table LIMIT 10; 298 | '''): 299 | pass 300 | 301 | rows_list = list(rows) 302 | 303 | def test_select_multi_with_named_params(self): 304 | with get_db([ 305 | ("CREATE TABLE my_table (my_col_a text, my_col_b text);", ()) 306 | ] + [ 307 | ("INSERT INTO my_table VALUES " + ','.join(["('some-text-a', 'some-text-b')"] * 500), ()), 308 | ("INSERT INTO my_table VALUES " + ','.join(["('some-text-c', 'some-text-d')"] * 100), ()), 309 | ]) as db: 310 | put_object_with_versioning('my-bucket', 'my.db', db) 311 | 312 | with sqlite_s3_query_multi('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 313 | 'us-east-1', 314 | 'AKIAIOSFODNN7EXAMPLE', 315 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 316 | None, 317 | ), get_libsqlite3=get_libsqlite3) as query: 318 | rows_list = [ 319 | list(rows) 320 | for (columns, rows) in query(''' 321 | SELECT COUNT(*) FROM my_table WHERE my_col_a = :first; 322 | SELECT COUNT(*) FROM my_table WHERE my_col_a = :second; 323 | ''', named_params=(((':first', 'some-text-a'),),((':second', 'some-text-c'),))) 324 | ] 325 | 326 | self.assertEqual(rows_list, [[(500,)], [(100,)]]) 327 | 328 | def test_select_multi_with_positional_params(self): 329 | with get_db([ 330 | ("CREATE TABLE my_table (my_col_a text, my_col_b text);", ()) 331 | ] + [ 332 | ("INSERT INTO my_table VALUES " + ','.join(["('some-text-a', 'some-text-b')"] * 500), ()), 333 | ("INSERT INTO my_table VALUES " + ','.join(["('some-text-c', 'some-text-d')"] * 100), ()), 334 | ]) as db: 335 | put_object_with_versioning('my-bucket', 'my.db', db) 336 | 337 | with sqlite_s3_query_multi('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 338 | 'us-east-1', 339 | 'AKIAIOSFODNN7EXAMPLE', 340 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 341 | None, 342 | ), get_libsqlite3=get_libsqlite3) as query: 343 | rows_list = [ 344 | list(rows) 345 | for (columns, rows) in query(''' 346 | SELECT COUNT(*) FROM my_table WHERE my_col_a = ?; 347 | SELECT COUNT(*) FROM my_table WHERE my_col_a = ?; 348 | ''', params=(('some-text-a',), ('some-text-c',),)) 349 | ] 350 | 351 | self.assertEqual(rows_list, [[(500,)], [(100,)]]) 352 | 353 | def test_placeholder(self): 354 | with get_db([ 355 | ("CREATE TABLE my_table (my_col_a text, my_col_b text);",()), 356 | ] + [ 357 | ("INSERT INTO my_table VALUES ('a','b'),('c','d')",()), 358 | ]) as db: 359 | put_object_with_versioning('my-bucket', 'my.db', db) 360 | 361 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 362 | 'us-east-1', 363 | 'AKIAIOSFODNN7EXAMPLE', 364 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 365 | None, 366 | ), get_libsqlite3=get_libsqlite3) as query: 367 | with query("SELECT my_col_a FROM my_table WHERE my_col_b = ?", params=(('d',))) as (columns, rows): 368 | rows = list(rows) 369 | 370 | self.assertEqual(rows, [('c',)]) 371 | 372 | def test_partial(self): 373 | with get_db([ 374 | ("CREATE TABLE my_table (my_col_a text, my_col_b text);",()), 375 | ] + [ 376 | ("INSERT INTO my_table VALUES ('a','b'),('c','d')",()), 377 | ]) as db: 378 | put_object_with_versioning('my-bucket', 'my.db', db) 379 | 380 | query_my_db = functools.partial(sqlite_s3_query, 381 | url='http://localhost:9000/my-bucket/my.db', 382 | get_credentials=lambda now: ( 383 | 'us-east-1', 384 | 'AKIAIOSFODNN7EXAMPLE', 385 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 386 | None, 387 | ), 388 | get_libsqlite3=get_libsqlite3, 389 | ) 390 | 391 | with query_my_db() as query: 392 | with query("SELECT my_col_a FROM my_table WHERE my_col_b = ?", params=(('d',))) as (columns, rows): 393 | rows = list(rows) 394 | 395 | self.assertEqual(rows, [('c',)]) 396 | 397 | def test_time_and_non_python_identifier(self): 398 | with get_db([("CREATE TABLE my_table (my_col_a text, my_col_b text);",())]) as db: 399 | put_object_with_versioning('my-bucket', 'my.db', db) 400 | 401 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 402 | 'us-east-1', 403 | 'AKIAIOSFODNN7EXAMPLE', 404 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 405 | None, 406 | ), get_libsqlite3=get_libsqlite3) as query: 407 | now = datetime.datetime.utcnow() 408 | with query("SELECT date('now'), time('now')") as (columns, rows): 409 | rows = list((datetime.datetime.strptime(row[0] + ' ' + row[1], '%Y-%m-%d %H:%M:%S'),) for row in rows) 410 | 411 | self.assertTrue(all((row[0] - now) < datetime.timedelta(seconds=3) for row in rows)) 412 | self.assertEqual(columns, ("date('now')", "time('now')")) 413 | 414 | def test_non_existant_table(self): 415 | with get_db([("CREATE TABLE my_table (my_col_a text, my_col_b text);",())]) as db: 416 | put_object_with_versioning('my-bucket', 'my.db', db) 417 | 418 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 419 | 'us-east-1', 420 | 'AKIAIOSFODNN7EXAMPLE', 421 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 422 | None, 423 | ), get_libsqlite3=get_libsqlite3) as query: 424 | with self.assertRaisesRegex(SQLiteError, 'no such table: non_table'): 425 | query("SELECT * FROM non_table").__enter__() 426 | 427 | def test_empty_object(self): 428 | put_object_with_versioning('my-bucket', 'my.db', lambda: (b'',)) 429 | 430 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 431 | 'us-east-1', 432 | 'AKIAIOSFODNN7EXAMPLE', 433 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 434 | None, 435 | ), get_libsqlite3=get_libsqlite3) as query: 436 | with self.assertRaisesRegex(HTTPStatusError, r"\b416\b"): 437 | query('SELECT 1').__enter__() 438 | 439 | def test_incorrect_permission_on_context_enter(self): 440 | with get_db([("CREATE TABLE my_table (my_col_a text, my_col_b text);",())]) as db: 441 | put_object_with_versioning('my-bucket', 'my.db', db) 442 | 443 | with self.assertRaisesRegex(HTTPStatusError, r"\b403\b"): 444 | sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 445 | 'us-east-1', 446 | 'AKIAIOSFODNN7EXAMPLE', 447 | 'not-the-right-key', 448 | None, 449 | ), get_libsqlite3=get_libsqlite3).__enter__() 450 | 451 | def test_incorrect_permission_on_run_query(self): 452 | with get_db([("CREATE TABLE my_table (my_col_a text, my_col_b text);",())]) as db: 453 | put_object_with_versioning('my-bucket', 'my.db', db) 454 | 455 | creds = ( 456 | ( 457 | 'us-east-1', 458 | 'AKIAIOSFODNN7EXAMPLE', 459 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 460 | None, 461 | ), ( 462 | 'us-east-1', 463 | 'AKIAIOSFODNN7EXAMPLE', 464 | 'not-the-right-key', 465 | None, 466 | ) 467 | ) 468 | creds_it = iter(creds) 469 | 470 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: next(creds_it), get_libsqlite3=get_libsqlite3) as query: 471 | with self.assertRaisesRegex(HTTPStatusError, r"\b403\b"): 472 | query('SELECT 1').__enter__() 473 | 474 | def test_short_db_header(self): 475 | put_object_with_versioning('my-bucket', 'my.db', lambda: (b'*' * 99,)) 476 | 477 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 478 | 'us-east-1', 479 | 'AKIAIOSFODNN7EXAMPLE', 480 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 481 | None, 482 | ), get_libsqlite3=get_libsqlite3) as query: 483 | with self.assertRaisesRegex(SQLiteError, 'not a database'): 484 | query("SELECT * FROM non_table").__enter__() 485 | 486 | def test_bad_db_header(self): 487 | put_object_with_versioning('my-bucket', 'my.db', lambda: (b'*' * 100,)) 488 | 489 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 490 | 'us-east-1', 491 | 'AKIAIOSFODNN7EXAMPLE', 492 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 493 | None, 494 | ), get_libsqlite3=get_libsqlite3) as query: 495 | with self.assertRaisesRegex(SQLiteError, 'not a database'): 496 | query("SELECT * FROM non_table").__enter__() 497 | 498 | def test_bad_db_first_page(self): 499 | put_object_with_versioning('my-bucket', 'my.db', lambda: (b'*' * 4096,)) 500 | 501 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 502 | 'us-east-1', 503 | 'AKIAIOSFODNN7EXAMPLE', 504 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 505 | None, 506 | ), get_libsqlite3=get_libsqlite3) as query: 507 | with self.assertRaisesRegex(SQLiteError, 'not a database'): 508 | query("SELECT * FROM non_table").__enter__() 509 | 510 | def test_bad_db_second_half(self): 511 | with get_db([("CREATE TABLE my_table (my_col_a text, my_col_b text);",())] + [ 512 | ("INSERT INTO my_table VALUES " + ','.join(["('some-text-a', 'some-text-b')"] * 500),()), 513 | ] * 10) as db_full: 514 | db = b''.join(db_full()) 515 | half_len = int(len(db) / 2) 516 | db = db[:half_len] + len(db[half_len:]) * b'-' 517 | put_object_with_versioning('my-bucket', 'my.db', lambda: (db,)) 518 | 519 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 520 | 'us-east-1', 521 | 'AKIAIOSFODNN7EXAMPLE', 522 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 523 | None, 524 | ), get_libsqlite3=get_libsqlite3) as query: 525 | with self.assertRaisesRegex(SQLiteError, 'database disk image is malformed'): 526 | with query("SELECT * FROM my_table") as (columns, rows): 527 | list(rows) 528 | 529 | def test_num_connections(self): 530 | num_connections = 0 531 | 532 | @contextmanager 533 | def server(): 534 | nonlocal num_connections 535 | def _run(server_sock): 536 | nonlocal num_connections 537 | 538 | while True: 539 | try: 540 | downstream_sock, _ = server_sock.accept() 541 | except Exception: 542 | break 543 | num_connections += 1 544 | connection_t = threading.Thread(target=handle_downstream, args=(downstream_sock,)) 545 | connection_t.start() 546 | 547 | with shutdown(get_new_socket()) as server_sock: 548 | server_sock.bind(('127.0.0.1', 9001)) 549 | server_sock.listen(socket.IPPROTO_TCP) 550 | threading.Thread(target=_run, args=(server_sock,)).start() 551 | yield server_sock 552 | 553 | def get_http_client(): 554 | @contextmanager 555 | def client(): 556 | with httpx.Client() as original_client: 557 | class Client(): 558 | def stream(self, method, url, params, headers): 559 | parsed_url = urllib.parse.urlparse(url) 560 | url = urllib.parse.urlunparse(parsed_url._replace(netloc='localhost:9001')) 561 | headers_proxy_host = tuple((key, value) for key, value in headers if key != 'host') + (('host', 'localhost:9000'),) 562 | return original_client.stream(method, url, params=params, headers=headers_proxy_host) 563 | yield Client() 564 | return client() 565 | 566 | with server() as server_sock: 567 | with get_db([ 568 | ("CREATE TABLE my_table (my_col_a text, my_col_b text);",()), 569 | ] + [ 570 | ("INSERT INTO my_table VALUES " + ','.join(["('some-text-a', 'some-text-b')"] * 500),()), 571 | ]) as db: 572 | put_object_with_versioning('my-bucket', 'my.db', db) 573 | 574 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 575 | 'us-east-1', 576 | 'AKIAIOSFODNN7EXAMPLE', 577 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 578 | None, 579 | ), get_http_client=get_http_client, get_libsqlite3=get_libsqlite3) as query: 580 | with query('SELECT my_col_a FROM my_table') as (columns, rows): 581 | rows = list(rows) 582 | 583 | self.assertEqual(rows, [('some-text-a',)] * 500) 584 | self.assertEqual(num_connections, 1) 585 | 586 | def test_streaming(self): 587 | rows_count = 0 588 | rows_yielded_at_request = [] 589 | 590 | def get_http_client(): 591 | @contextmanager 592 | def client(): 593 | with httpx.Client() as original_client: 594 | class Client(): 595 | @contextmanager 596 | def stream(self, method, url, params, headers): 597 | rows_yielded_at_request.append( 598 | (rows_count, dict(headers).get('range')) 599 | ) 600 | with original_client.stream(method, url, 601 | params=params, headers=headers 602 | ) as response: 603 | yield response 604 | yield Client() 605 | return client() 606 | 607 | with get_db([ 608 | ("PRAGMA page_size = 4096;",()), 609 | ("CREATE TABLE my_table (my_col_a text, my_col_b text);",()), 610 | ] + [ 611 | ("INSERT INTO my_table VALUES " + ','.join(["('some-text-a', 'some-text-b')"] * 500),()), 612 | ]) as db: 613 | put_object_with_versioning('my-bucket', 'my.db', db) 614 | 615 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 616 | 'us-east-1', 617 | 'AKIAIOSFODNN7EXAMPLE', 618 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 619 | None, 620 | ), get_http_client=get_http_client, get_libsqlite3=get_libsqlite3) as query: 621 | with query('SELECT my_col_a FROM my_table') as (cols, rows): 622 | for row in rows: 623 | rows_count += 1 624 | 625 | self.assertIn(rows_yielded_at_request, ([ 626 | (0, None), 627 | (0, 'bytes=0-99'), 628 | (0, 'bytes=0-4095'), 629 | (0, 'bytes=24-39'), # For older SQLite that doesn't support immutable files 630 | (0, 'bytes=4096-8191'), 631 | (0, 'bytes=8192-12287'), 632 | (140, 'bytes=12288-16383'), 633 | (276, 'bytes=16384-20479'), 634 | (412, 'bytes=20480-24575'), 635 | ], [ 636 | (0, None), 637 | (0, 'bytes=0-99'), 638 | (0, 'bytes=0-4095'), 639 | (0, 'bytes=4096-8191'), 640 | (0, 'bytes=8192-12287'), 641 | (140, 'bytes=12288-16383'), 642 | (276, 'bytes=16384-20479'), 643 | (412, 'bytes=20480-24575'), 644 | ])) 645 | 646 | # Documenting the difference with the above and a query that is not streaming. In this 647 | # case, a query with an ORDER BY on a column that does not have an index requires SQLite to 648 | # fetch all the pages before yielding any rows to client code 649 | rows_count = 0 650 | rows_yielded_at_request.clear() 651 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 652 | 'us-east-1', 653 | 'AKIAIOSFODNN7EXAMPLE', 654 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 655 | None, 656 | ), get_http_client=get_http_client, get_libsqlite3=get_libsqlite3) as query: 657 | with query('SELECT my_col_a FROM my_table ORDER BY my_col_a') as (cols, rows): 658 | for row in rows: 659 | rows_count += 1 660 | 661 | self.assertIn(rows_yielded_at_request, ([ 662 | (0, None), 663 | (0, 'bytes=0-99'), 664 | (0, 'bytes=0-4095'), 665 | (0, 'bytes=24-39'), # For older SQLite that doesn't support immutable files 666 | (0, 'bytes=4096-8191'), 667 | (0, 'bytes=8192-12287'), 668 | (0, 'bytes=12288-16383'), 669 | (0, 'bytes=16384-20479'), 670 | (0, 'bytes=20480-24575'), 671 | ], [ 672 | (0, None), 673 | (0, 'bytes=0-99'), 674 | (0, 'bytes=0-4095'), 675 | (0, 'bytes=4096-8191'), 676 | (0, 'bytes=8192-12287'), 677 | (0, 'bytes=12288-16383'), 678 | (0, 'bytes=16384-20479'), 679 | (0, 'bytes=20480-24575'), 680 | ])) 681 | 682 | def test_too_many_bytes(self): 683 | @contextmanager 684 | def server(port): 685 | def _run(server_sock): 686 | while True: 687 | try: 688 | downstream_sock, _ = server_sock.accept() 689 | except Exception: 690 | break 691 | connection_t = threading.Thread(target=handle_downstream, args=(downstream_sock,)) 692 | connection_t.start() 693 | 694 | with shutdown(get_new_socket()) as server_sock: 695 | server_sock.bind(('127.0.0.1', port)) 696 | server_sock.listen(socket.IPPROTO_TCP) 697 | threading.Thread(target=_run, args=(server_sock,)).start() 698 | yield server_sock 699 | 700 | def get_http_client(port): 701 | @contextmanager 702 | def client(): 703 | with httpx.Client() as original_client: 704 | class Client(): 705 | @contextmanager 706 | def stream(self, method, url, params, headers): 707 | parsed_url = urllib.parse.urlparse(url) 708 | url = urllib.parse.urlunparse(parsed_url._replace(netloc=f'localhost:{port}')) 709 | range_query = dict(headers).get('range') 710 | yield_extra = not only_after_header or (range_query and range_query != 'bytes=0-99') 711 | headers_proxy_host = tuple((key, value) for key, value in headers if key != 'host') + (('host', 'localhost:9000'),) 712 | with original_client.stream(method, url, 713 | params=params, headers=headers_proxy_host 714 | ) as response: 715 | chunks = response.iter_bytes() 716 | def iter_bytes(chunk_size=None): 717 | yield from chunks 718 | if yield_extra: 719 | yield b'e' 720 | response.iter_bytes = iter_bytes 721 | yield response 722 | yield Client() 723 | return client() 724 | 725 | for only_after_header, port in [(False, 9001), (True, 9002)]: 726 | with self.subTest((only_after_header, port)): 727 | with server(port) as server_sock: 728 | with get_db([ 729 | ("CREATE TABLE my_table (my_col_a text, my_col_b text);",()), 730 | ] + [ 731 | ("INSERT INTO my_table VALUES " + ','.join(["('some-text-a', 'some-text-b')"] * 500),()), 732 | ]) as db: 733 | put_object_with_versioning('my-bucket', 'my.db', db) 734 | 735 | with sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 736 | 'us-east-1', 737 | 'AKIAIOSFODNN7EXAMPLE', 738 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 739 | None, 740 | ), get_http_client=functools.partial(get_http_client, port), get_libsqlite3=get_libsqlite3) as query: 741 | with self.assertRaisesRegex(SQLiteError, 'disk I/O error'): 742 | query('SELECT my_col_a FROM my_table').__enter__() 743 | 744 | def test_disconnection(self): 745 | @contextmanager 746 | def server(): 747 | def _run(server_sock): 748 | while True: 749 | try: 750 | downstream_sock, _ = server_sock.accept() 751 | except Exception: 752 | break 753 | downstream_sock.close() 754 | connection_t = threading.Thread(target=handle_downstream, args=(downstream_sock,)) 755 | connection_t.start() 756 | 757 | with shutdown(get_new_socket()) as server_sock: 758 | server_sock.bind(('127.0.0.1', 9003)) 759 | server_sock.listen(socket.IPPROTO_TCP) 760 | threading.Thread(target=_run, args=(server_sock,)).start() 761 | yield server_sock 762 | 763 | def get_http_client(): 764 | @contextmanager 765 | def client(): 766 | with httpx.Client() as original_client: 767 | class Client(): 768 | def stream(self, method, url, headers, params): 769 | parsed_url = urllib.parse.urlparse(url) 770 | url = urllib.parse.urlunparse(parsed_url._replace(netloc='localhost:9003')) 771 | headers_proxy_host = tuple((key, value) for key, value in headers if key != 'host') + (('host', 'localhost:9000'),) 772 | return original_client.stream(method, url, headers=headers_proxy_host) 773 | yield Client() 774 | return client() 775 | 776 | with get_db([ 777 | ("CREATE TABLE my_table (my_col_a text, my_col_b text);",()), 778 | ] + [ 779 | ("INSERT INTO my_table VALUES " + ','.join(["('some-text-a', 'some-text-b')"] * 500),()), 780 | ]) as db: 781 | put_object_with_versioning('my-bucket', 'my.db', db) 782 | 783 | with server() as server_sock: 784 | with self.assertRaisesRegex(Exception, 'Server disconnected|Connection reset|WinError 10053|WinError 10054'): 785 | sqlite_s3_query('http://localhost:9000/my-bucket/my.db', get_credentials=lambda now: ( 786 | 'us-east-1', 787 | 'AKIAIOSFODNN7EXAMPLE', 788 | 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 789 | None, 790 | ), get_http_client=get_http_client, get_libsqlite3=get_libsqlite3).__enter__() 791 | 792 | def get_libsqlite3(): 793 | return cdll.LoadLibrary(os.environ.get('LIBSQLITE3_PATH', find_library('sqlite3'))) 794 | 795 | def put_object_without_versioning(bucket, key, content): 796 | create_bucket(bucket) 797 | 798 | url = f'http://127.0.0.1:9000/{bucket}/{key}' 799 | sha = hashlib.sha256() 800 | length = 0 801 | for chunk in content(): 802 | length += len(chunk) 803 | sha.update(chunk) 804 | body_hash = sha.hexdigest() 805 | parsed_url = urllib.parse.urlsplit(url) 806 | 807 | headers = aws_sigv4_headers( 808 | 'AKIAIOSFODNN7EXAMPLE', 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 809 | (), 's3', 'us-east-1', parsed_url.netloc, 'PUT', parsed_url.path, (), body_hash, 810 | ) + ((b'content-length', str(length).encode()),) 811 | response = httpx.put(url, content=content(), headers=headers) 812 | response.raise_for_status() 813 | 814 | def put_object_with_versioning(bucket, key, content): 815 | create_bucket(bucket) 816 | enable_versioning(bucket) 817 | 818 | url = f'http://127.0.0.1:9000/{bucket}/{key}' 819 | sha = hashlib.sha256() 820 | length = 0 821 | for chunk in content(): 822 | length += len(chunk) 823 | sha.update(chunk) 824 | body_hash = sha.hexdigest() 825 | parsed_url = urllib.parse.urlsplit(url) 826 | 827 | headers = aws_sigv4_headers( 828 | 'AKIAIOSFODNN7EXAMPLE', 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 829 | (), 's3', 'us-east-1', parsed_url.netloc, 'PUT', parsed_url.path, (), body_hash, 830 | ) + ((b'content-length', str(length).encode()),) 831 | 832 | response = httpx.put(url, content=content(), headers=headers) 833 | response.raise_for_status() 834 | 835 | def create_bucket(bucket): 836 | url = f'http://127.0.0.1:9000/{bucket}/' 837 | content = b'' 838 | body_hash = hashlib.sha256(content).hexdigest() 839 | parsed_url = urllib.parse.urlsplit(url) 840 | 841 | headers = aws_sigv4_headers( 842 | 'AKIAIOSFODNN7EXAMPLE', 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 843 | (), 's3', 'us-east-1', parsed_url.netloc, 'PUT', parsed_url.path, (), body_hash, 844 | ) 845 | response = httpx.put(url, content=content, headers=headers) 846 | 847 | def enable_versioning(bucket): 848 | content = ''' 849 | 850 | Enabled 851 | 852 | '''.encode() 853 | url = f'http://127.0.0.1:9000/{bucket}/?versioning' 854 | body_hash = hashlib.sha256(content).hexdigest() 855 | parsed_url = urllib.parse.urlsplit(url) 856 | 857 | headers = aws_sigv4_headers( 858 | 'AKIAIOSFODNN7EXAMPLE', 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 859 | (), 's3', 'us-east-1', parsed_url.netloc, 'PUT', parsed_url.path, (('versioning', ''),), body_hash, 860 | ) 861 | response = httpx.put(url, content=content, headers=headers) 862 | response.raise_for_status() 863 | 864 | def disable_auth(bucket): 865 | content = f''' 866 | {{ 867 | "Version": "2012-10-17", 868 | "Statement": [ 869 | {{ 870 | "Sid": "Stmt1405592139000", 871 | "Effect": "Allow", 872 | "Principal": "*", 873 | "Action": [ 874 | "s3:GetObject", 875 | "s3:GetObjectVersion" 876 | ], 877 | "Resource": [ 878 | "arn:aws:s3:::{bucket}/*" 879 | ] 880 | }} 881 | ] 882 | }} 883 | '''.encode() 884 | url = f'http://127.0.0.1:9000/{bucket}/?policy' 885 | body_hash = hashlib.sha256(content).hexdigest() 886 | parsed_url = urllib.parse.urlsplit(url) 887 | 888 | headers = aws_sigv4_headers( 889 | 'AKIAIOSFODNN7EXAMPLE', 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 890 | (), 's3', 'us-east-1', parsed_url.netloc, 'PUT', parsed_url.path, (('policy', ''),), body_hash, 891 | ) 892 | response = httpx.put(url, content=content, headers=headers) 893 | response.raise_for_status() 894 | 895 | def aws_sigv4_headers(access_key_id, secret_access_key, pre_auth_headers, 896 | service, region, host, method, path, params, body_hash): 897 | algorithm = 'AWS4-HMAC-SHA256' 898 | 899 | now = datetime.datetime.utcnow() 900 | amzdate = now.strftime('%Y%m%dT%H%M%SZ') 901 | datestamp = now.strftime('%Y%m%d') 902 | credential_scope = f'{datestamp}/{region}/{service}/aws4_request' 903 | 904 | pre_auth_headers_lower = tuple(( 905 | (header_key.lower(), ' '.join(header_value.split())) 906 | for header_key, header_value in pre_auth_headers 907 | )) 908 | required_headers = ( 909 | ('host', host), 910 | ('x-amz-content-sha256', body_hash), 911 | ('x-amz-date', amzdate), 912 | ) 913 | headers = sorted(pre_auth_headers_lower + required_headers) 914 | signed_headers = ';'.join(key for key, _ in headers) 915 | 916 | def signature(): 917 | def canonical_request(): 918 | canonical_uri = urllib.parse.quote(path, safe='/~') 919 | quoted_params = sorted( 920 | (urllib.parse.quote(key, safe='~'), urllib.parse.quote(value, safe='~')) 921 | for key, value in params 922 | ) 923 | canonical_querystring = '&'.join(f'{key}={value}' for key, value in quoted_params) 924 | canonical_headers = ''.join(f'{key}:{value}\n' for key, value in headers) 925 | 926 | return f'{method}\n{canonical_uri}\n{canonical_querystring}\n' + \ 927 | f'{canonical_headers}\n{signed_headers}\n{body_hash}' 928 | 929 | def sign(key, msg): 930 | return hmac.new(key, msg.encode('ascii'), hashlib.sha256).digest() 931 | 932 | string_to_sign = f'{algorithm}\n{amzdate}\n{credential_scope}\n' + \ 933 | hashlib.sha256(canonical_request().encode('ascii')).hexdigest() 934 | 935 | date_key = sign(('AWS4' + secret_access_key).encode('ascii'), datestamp) 936 | region_key = sign(date_key, region) 937 | service_key = sign(region_key, service) 938 | request_key = sign(service_key, 'aws4_request') 939 | return sign(request_key, string_to_sign).hex() 940 | 941 | return ( 942 | (b'authorization', ( 943 | f'{algorithm} Credential={access_key_id}/{credential_scope}, ' 944 | f'SignedHeaders={signed_headers}, Signature=' + signature()).encode('ascii') 945 | ), 946 | (b'x-amz-date', amzdate.encode('ascii')), 947 | (b'x-amz-content-sha256', body_hash.encode('ascii')), 948 | ) + pre_auth_headers 949 | 950 | 951 | @contextmanager 952 | def get_db(sqls): 953 | 954 | @contextmanager 955 | def temporary_directory_ignore_cleanup_errors(): 956 | # A backport of the ignore_cleanup_errors=True parameter in 957 | # TemporaryDirectory added in Python 3.10 958 | try: 959 | with tempfile.TemporaryDirectory() as directory_name: 960 | yield directory_name 961 | except (PermissionError, NotADirectoryError): 962 | pass 963 | 964 | with temporary_directory_ignore_cleanup_errors() as directory_name: 965 | db_path = os.path.join(directory_name, 'sqlite-s3-query-test.db') 966 | with sqlite3.connect(db_path, isolation_level=None) as con: 967 | cur = con.cursor() 968 | cur.execute('BEGIN') 969 | for sql, params in sqls: 970 | cur.execute(sql, params) 971 | cur.execute('COMMIT') 972 | 973 | # Really close the file, especially on Windows 974 | del cur, con 975 | 976 | def db(): 977 | with open(db_path, 'rb') as f: 978 | while True: 979 | chunk = f.read(65536) 980 | if not chunk: 981 | break 982 | yield chunk 983 | 984 | yield db 985 | 986 | 987 | def get_new_socket(): 988 | sock = socket.socket(family=socket.AF_INET, type=socket.SOCK_STREAM, 989 | proto=socket.IPPROTO_TCP) 990 | sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) 991 | return sock 992 | 993 | def upstream_connect(): 994 | upstream_sock = socket.create_connection(('127.0.0.1', 9000)) 995 | upstream_sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) 996 | return upstream_sock 997 | 998 | @contextmanager 999 | def shutdown(sock): 1000 | try: 1001 | yield sock 1002 | finally: 1003 | try: 1004 | sock.shutdown(socket.SHUT_RDWR) 1005 | except OSError: 1006 | pass 1007 | finally: 1008 | sock.close() 1009 | 1010 | def proxy(done, source, target): 1011 | try: 1012 | chunk = source.recv(1) 1013 | while chunk: 1014 | target.sendall(chunk) 1015 | chunk = source.recv(1) 1016 | except OSError: 1017 | pass 1018 | finally: 1019 | done.set() 1020 | 1021 | def handle_downstream(downstream_sock): 1022 | with \ 1023 | shutdown(upstream_connect()) as upstream_sock, \ 1024 | shutdown(downstream_sock) as downstream_sock: 1025 | 1026 | done = threading.Event() 1027 | threading.Thread(target=proxy, args=(done, upstream_sock, downstream_sock)).start() 1028 | threading.Thread(target=proxy, args=(done, downstream_sock, upstream_sock)).start() 1029 | done.wait() 1030 | --------------------------------------------------------------------------------