├── .github └── workflows │ ├── pypi-publish.yaml │ └── tests.yaml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── requirements.txt ├── s3sqlite.py ├── setup.py ├── start-minio.sh ├── stop-minio.sh └── test.py /.github/workflows/pypi-publish.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Python distribution to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - v* 7 | 8 | jobs: 9 | build-n-publish: 10 | name: Build and publish Python distributions to PyPI 11 | runs-on: ubuntu-18.04 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 3.7 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: 3.7 18 | - name: Install pypa/build 19 | run: >- 20 | python -m 21 | pip install 22 | build 23 | --user 24 | - name: Build a binary wheel and a source tarball 25 | run: >- 26 | python -m 27 | build 28 | --sdist 29 | --wheel 30 | --outdir dist/ 31 | . 32 | - name: Publish distribution to PyPI 33 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') 34 | uses: pypa/gh-action-pypi-publish@master 35 | with: 36 | password: ${{ secrets.GH_PYPI_TOKEN }} 37 | -------------------------------------------------------------------------------- /.github/workflows/tests.yaml: -------------------------------------------------------------------------------- 1 | # name: Tests 2 | # on: [push] 3 | # jobs: 4 | # tests: 5 | # name: ${{ matrix.python-version }} 6 | # runs-on: ubuntu-latest 7 | # strategy: 8 | # fail-fast: false 9 | # matrix: 10 | # python-version: [3.7, 3.8, 3.9, "3.10"] 11 | # steps: 12 | # - uses: actions/checkout@v2 13 | # - uses: actions/setup-python@v2 14 | # with: 15 | # python-version: ${{ matrix.python-version }} 16 | # - name: pip deps 17 | # run: | 18 | # pip install -U -r requirements.txt 19 | # - name: runtests 20 | # run: python3 -m pytest -x test.py 21 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | # Ignore everything 2 | * 3 | 4 | # But not these files... 5 | !.gitignore 6 | !s3sqlite.py 7 | !README.md 8 | !CHANGELOG.md 9 | !/.github/ 10 | !/.github/** 11 | !test.py 12 | !requirements.txt 13 | !setup.py 14 | !stop-minio.sh 15 | !start-minio.sh 16 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.2.1 2 | 3 | - Use the `block_size` when opening the file. 4 | - Enable passing `kwargs` when opening the file (useful to set up caches). 5 | 6 | ## 0.2 7 | 8 | - Upload without reading the whole file in memory. 9 | 10 | ## 0.1 11 | 12 | - Initial release. 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # s3sqlite 2 | 3 | > Query SQLite databases in S3 using s3fs 4 | 5 | [APSW](https://rogerbinns.github.io/apsw/) SQLite VFS. This VFS enables reading 6 | databases from S3 using 7 | [s3fs](https://s3fs.readthedocs.io/en/latest/index.html). This only supports 8 | reading operations, any operation that tries to modify the DB file is ignored. 9 | 10 | Inspired by [sqlite-s3vfs](https://github.com/uktrade/sqlite-s3vfs) and 11 | [sqlite-s3-query](https://github.com/michalc/sqlite-s3-query). 12 | 13 | ## Notes about journal mode 14 | 15 | This VFS will only work when the DB file is in any journal mode that is **not** 16 | [WAL](https://sqlite.org/wal.html). However, it will work if you set the journal 17 | mode to something else just before uploading the file to S3. You can (and 18 | probably should) use WAL mode to generate the DB. 
Then you can change the 19 | journal mode (and the page size if you need) before uploading it to S3. 20 | 21 | The test suite 22 | [includes](https://github.com/litements/s3sqlite/blob/3719f1ce50a7b5cfae754776bc9b2c17292f8d72/test.py#L198) 23 | tests for that use case. Take into account that the page size can't be changed 24 | when the database is in WAL mode. You need to change it before setting the WAL 25 | mode or by setting the database to rollback journal mode. [You need to execute 26 | `VACUUM;` after changing the page 27 | size](https://www.sqlite.org/pragma.html#pragma_page_size) in a SQLite database. 28 | 29 | ## Example usage 30 | 31 | ```py 32 | import s3fs 33 | import s3sqlite 34 | import apsw 35 | 36 | # Create an S3 filesystem. Check the s3fs docs for more examples: 37 | # https://s3fs.readthedocs.io/en/latest/ 38 | s3 = s3fs.S3FileSystem( 39 | key="somekey", 40 | secret="secret", 41 | client_kwargs={"endpoint_url": "http://..."}, 42 | ) 43 | 44 | s3vfs = s3sqlite.S3VFS(name="s3-vfs", fs=s3) 45 | 46 | # Define the S3 location 47 | key_prefix = "mybucket/awesome.sqlite3" 48 | 49 | # Upload the file to S3 50 | s3vfs.upload_file("awesome.sqlite3", dest=key_prefix) 51 | 52 | # Create a database and query it 53 | with apsw.Connection( 54 | key_prefix, vfs=s3vfs.name, flags=apsw.SQLITE_OPEN_READONLY 55 | ) as conn: 56 | 57 | cursor = conn.execute("...") 58 | print(cursor.fetchall()) 59 | 60 | ``` 61 | 62 | ## Installation 63 | 64 | ``` 65 | python3 -m pip install s3sqlite 66 | ``` 67 | 68 | ## Run tests 69 | 70 | The testing script will use the [Chinook 71 | database](https://github.com/lerocha/chinook-database/). It will modify (and 72 | `VACUUM;`) the file to use all the possible combinations of journal modes and 73 | page sizes. 74 | 75 | 1. 
Download the chinook database: 76 | 77 | ```sh 78 | curl https://raw.githubusercontent.com/lerocha/chinook-database/master/ChinookDatabase/DataSources/Chinook_Sqlite_AutoIncrementPKs.sqlite -o chinook.sqlite3 79 | ``` 80 | 81 | 2. Make sure you have Docker installed. 82 | 83 | The testing script will start a [MinIO](https://min.io/) container to run the 84 | tests locally. After the tests finish, the container will be stopped 85 | automatically. 86 | 87 | 3. Run the tests: 88 | 89 | ```sh 90 | python3 -m pytest test.py 91 | ``` 92 | 93 | ## Alternatives 94 | 95 | - [sqlite-s3vfs](https://github.com/uktrade/sqlite-s3vfs): This VFS stores the 96 | SQLite file as separate DB pages. This enables having a single writer without 97 | having to overwrite the whole file. `s3sqlite`'s main difference is that this 98 | just needs uploading a single file to S3. `sqlite-s3vfs` will split the 99 | database in pages and upload the pages separately to a bucket prefix. Having 100 | just a single file has some advantages, like making use of object [versioning 101 | in the 102 | bucket](https://s3fs.readthedocs.io/en/latest/index.html?highlight=version#bucket-version-awareness). 103 | I also think that relying on 104 | [s3fs](https://s3fs.readthedocs.io/en/latest/index.html) makes the VFS more 105 | [flexible](https://s3fs.readthedocs.io/en/latest/index.html#s3-compatible-storage) 106 | than calling `boto3` as `sqlite-s3vfs` does. `s3fs` should also handle 107 | retries automatically. 108 | - [sqlite-s3-query](https://github.com/michalc/sqlite-s3-query): This VFS is very 109 | similar to `s3sqlite`, but it uses `ctypes` directly to create the VFS and uses 110 | `httpx` to make requests to S3. 111 | 112 | I decided to create a new VFS that didn't require using `ctypes` so that it's 113 | easier to understand and maintain, but I still want to have a single file in S3 114 | (vs. separate DB pages). 
"""APSW virtual file system (VFS) for reading SQLite databases through s3fs.

The VFS is read-only: calls that would modify the database file (write,
truncate, sync, delete) are accepted, logged at debug level and ignored.
"""

import logging
import sys
import uuid
from typing import Optional

import apsw
import s3fs

logger = logging.getLogger("s3sqlite")
handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(
    logging.Formatter(
        fmt="%(levelname)s [%(asctime)s] %(name)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
)
logger.addHandler(handler)


# fmt: off
# SQLite open flags
SQLITE_OPEN_READONLY      = 0x00000001  # /* Ok for sqlite3_open_v2() */
SQLITE_OPEN_READWRITE     = 0x00000002  # /* Ok for sqlite3_open_v2() */
SQLITE_OPEN_CREATE        = 0x00000004  # /* Ok for sqlite3_open_v2() */
SQLITE_OPEN_DELETEONCLOSE = 0x00000008  # /* VFS only */
SQLITE_OPEN_EXCLUSIVE     = 0x00000010  # /* VFS only */
SQLITE_OPEN_AUTOPROXY     = 0x00000020  # /* VFS only */
SQLITE_OPEN_URI           = 0x00000040  # /* Ok for sqlite3_open_v2() */
SQLITE_OPEN_MEMORY        = 0x00000080  # /* Ok for sqlite3_open_v2() */
SQLITE_OPEN_MAIN_DB       = 0x00000100  # /* VFS only */
SQLITE_OPEN_TEMP_DB       = 0x00000200  # /* VFS only */
SQLITE_OPEN_TRANSIENT_DB  = 0x00000400  # /* VFS only */
SQLITE_OPEN_MAIN_JOURNAL  = 0x00000800  # /* VFS only */
SQLITE_OPEN_TEMP_JOURNAL  = 0x00001000  # /* VFS only */
SQLITE_OPEN_SUBJOURNAL    = 0x00002000  # /* VFS only */
SQLITE_OPEN_SUPER_JOURNAL = 0x00004000  # /* VFS only */
SQLITE_OPEN_NOMUTEX       = 0x00008000  # /* Ok for sqlite3_open_v2() */
SQLITE_OPEN_FULLMUTEX     = 0x00010000  # /* Ok for sqlite3_open_v2() */
SQLITE_OPEN_SHAREDCACHE   = 0x00020000  # /* Ok for sqlite3_open_v2() */
SQLITE_OPEN_PRIVATECACHE  = 0x00040000  # /* Ok for sqlite3_open_v2() */
SQLITE_OPEN_WAL           = 0x00080000  # /* VFS only */
SQLITE_OPEN_NOFOLLOW      = 0x01000000  # /* Ok for sqlite3_open_v2() */

# SQLite device characteristic reported by S3VFSFile.xDeviceCharacteristics.
SQLITE_IOCAP_POWERSAFE_OVERWRITE = 0x00001000
# fmt: on

# Sector size reported when the wrapped file object does not expose one.
_DEFAULT_SECTOR_SIZE = 4096


def hexify(n):
    """Return *n* as a ``0x``-prefixed hex string zero-padded to 8 characters.

    The width includes the ``0x`` prefix, e.g. ``hexify(1) == "0x000001"``.
    """
    padding = 8
    return f"{n:#0{padding}x}"


def convert_flags(flags):
    """Render SQLite open flags as hex strings for log messages.

    Args:
        * flags: a single ``int``, or a list/tuple of ``int`` flags.

    Returns:
        A hex string for an ``int``; a list of hex strings for a list/tuple.

    Raises:
        ValueError: if *flags* is neither an ``int`` nor a list/tuple.
    """
    if isinstance(flags, (list, tuple)):
        return [hexify(f) for f in flags]
    if isinstance(flags, int):
        return hexify(flags)
    raise ValueError(flags)


class S3VFS(apsw.VFS):
    def __init__(
        self,
        name: str,
        fs: s3fs.S3FileSystem,
        block_size: int = 4096,
        file_kwargs: Optional[dict] = None,
    ):
        """
        APSW VFS to read by ranges from S3.

        Args:
            * name: Prefix of the name the VFS is registered under. A random
              UUID is appended, so always pass ``self.name`` (not this value)
              as the ``vfs`` argument of ``apsw.Connection``.
            * fs: Instance of s3fs.S3FileSystem
            * block_size: Block size passed to ``fs.open()`` for every file.
            * file_kwargs: Extra arguments to pass when calling the open() method of fs (s3fs)
              This may be useful to configure the cache strategy used by the S3FileSystem
        """
        self.name = f"{name}-{str(uuid.uuid4())}"
        self.fs = fs
        self.block_size = block_size
        # Copy so that later mutation of the caller's dict does not leak in.
        self.file_kwargs = dict(file_kwargs) if file_kwargs else {}
        super().__init__(name=self.name, base="")

    def xAccess(self, pathname, flags):
        """Return True when *pathname* exists on the filesystem, else False.

        *flags* is ignored. Any error raised by the existence check is
        reported as "not accessible" and logged at debug level.
        """
        try:
            # Ask the filesystem instead of opening (and leaking a handle to)
            # the remote object just to learn whether it is there.
            return bool(self.fs.exists(pathname))
        except Exception:
            logger.debug("xAccess failed for %s", pathname, exc_info=True)
            return False

    def xFullPathname(self, filename):
        """Return *filename* unchanged; it is used as-is as the path for fs."""
        logger.debug("Calling VFS xFullPathname")
        logger.debug("Name: %s fs: %s", self.name, self.fs)
        logger.debug(filename)
        return filename

    def xDelete(self, filename, syncdir):
        """Ignore the delete request (read-only VFS)."""
        logger.debug("Calling VFS xDelete")
        logger.debug(
            "Name: %s fs: %s, filename: %s, syncdir: %s",
            self.name,
            self.fs,
            filename,
            syncdir,
        )

    def xOpen(self, name, flags):
        """Open *name* through fs in binary read mode and wrap it in S3VFSFile.

        Args:
            * name: path string or ``apsw.URIFilename``.
            * flags: SQLite open flags as received from APSW; only logged and
              stored, never enforced.
        """
        # TODO: check flags to make sure the DB is opened in read-only mode.
        logger.debug("Calling VFS xOpen")
        fname = name.filename() if isinstance(name, apsw.URIFilename) else name
        # convert_flags is only evaluated when debug logging is enabled.
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(
                "Name: %s fs: %s, open_name: %s, flags: %s",
                self.name,
                self.fs,
                fname,
                convert_flags(flags),
            )

        ofile = self.fs.open(
            fname, mode="rb", block_size=self.block_size, **self.file_kwargs
        )

        return S3VFSFile(
            f=ofile,
            name=fname,
            flags=flags,
        )

    def upload_file(self, dbfile, dest):
        """Upload the local file *dbfile* to *dest* using ``fs.upload``."""
        self.fs.upload(dbfile, dest)


class S3VFSFile(apsw.VFSFile):
    def __init__(self, f: s3fs.S3File, name, flags):
        """
        VFS File object

        Args:
            * f: S3File object returned by s3fs.S3FileSystem().open()
            * name: name of the file
            * flags: SQLite open flags
        """
        # NOTE(review): apsw.VFSFile.__init__ is deliberately not called here;
        # presumably it is only needed when inheriting from a base VFS. Confirm.
        self.f = f
        self.flags = flags
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(
                "Openned AVFSFile with flags: %s", convert_flags(self.flags)
            )
        self.name = name
        self.mode = "rb"

    def xRead(self, amount, offset) -> bytes:
        """Return up to *amount* bytes read from the file starting at *offset*."""
        logger.debug("Calling file xRead")
        logger.debug(
            "Name: %s file: %s, amount: %s offset: %s",
            self.name,
            self.f.path,
            amount,
            offset,
        )
        self.f.seek(offset)
        data = self.f.read(amount)
        # Lazy %-formatting: the payload is only rendered when debug is enabled.
        logger.debug("Read data: %s", data)
        return data

    def xFileControl(self, *args):
        """Report every file-control opcode as handled."""
        return True

    def xDeviceCharacteristics(self):
        """Return the SQLITE_IOCAP_POWERSAFE_OVERWRITE device flag (4096)."""
        logger.debug("Calling xDeviceCharacteristics")
        return SQLITE_IOCAP_POWERSAFE_OVERWRITE

    def xCheckReservedLock(self):
        """Always report that no reserved lock is held."""
        return False

    def xLock(self, level):
        """No-op: locking is not implemented."""

    def xUnlock(self, level):
        """No-op: locking is not implemented."""

    def xSectorSize(self):
        """Return the block size of the wrapped file object.

        Falls back to ``blocksize`` and then to 4096 when the file object
        does not expose a usable ``block_size`` attribute.
        """
        for attr in ("block_size", "blocksize"):
            size = getattr(self.f, attr, None)
            if isinstance(size, int) and size > 0:
                return size
        return _DEFAULT_SECTOR_SIZE

    def xClose(self):
        """Close the wrapped file object."""
        logger.debug("Calling file xClose")
        logger.debug("Name: %s file: %s", self.name, self.f.path)
        self.f.close()

    def xFileSize(self):
        """Return the file size in bytes, leaving the read position unchanged."""
        logger.debug("Calling file xFileSize")
        logger.debug("Name: %s file: %s", self.name, self.f.path)
        pos = self.f.tell()
        self.f.seek(0, 2)  # whence=2: seek relative to the end of the file
        size = self.f.tell()
        self.f.seek(pos)
        logger.debug("Size: %s", size)
        return size

    def xSync(self, flags):
        """Ignore the sync request (nothing is ever written)."""
        logger.debug("Calling file xSync")
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(
                "Name: %s file: %s, flags: %s",
                self.name,
                self.f.path,
                convert_flags(flags),
            )

    def xTruncate(self, newsize):
        """Ignore the truncate request (read-only VFS)."""
        logger.debug("Calling file xTruncate")
        logger.debug(
            "Name: %s file: %s, newsize: %s", self.name, self.f.path, newsize
        )

    def xWrite(self, data, offset):
        """Ignore the write request (read-only VFS); *data* is discarded."""
        logger.debug("Calling file xWrite")
        logger.debug(
            "Name: %s file: %s, data_size: %s, offset: %s, data: %s",
            self.name,
            self.f.path,
            len(data),
            offset,
            data,
        )
"""Integration tests for s3sqlite, run against a local MinIO container.

Requires Docker and a ``chinook.sqlite3`` file in the working directory.
"""

import itertools
import socket
import sqlite3
import subprocess
import time
import uuid
from contextlib import contextmanager
from typing import Iterator, Tuple

import apsw
import boto3
import pytest
import s3fs
from fsspec.implementations.local import LocalFileSystem

import s3sqlite

# To debug the VFS, enable its logger:
#   import logging; logging.getLogger("s3sqlite").setLevel(logging.DEBUG)

PAGE_SIZES = [512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]
JOURNAL_MODES = ["DELETE", "TRUNCATE", "PERSIST", "MEMORY", "OFF"]

# Materialized as a list so it can be consumed by more than one fixture.
PAGE_JOURNAL_MIX = list(itertools.product(PAGE_SIZES, JOURNAL_MODES))

QUERIES = [
    """
    -- Get the total sales per country.

    SELECT BillingCountry, sum(total) AS Total FROM Invoice
    GROUP BY BillingCountry
    ORDER BY Total desc
    """,
    """
    -- Get the Invoice Total, Customer name, Country and Sale Agent name for all invoices and customers.

    SELECT i.`Total`, c.FirstName || " " || c.LastName AS CustomerName, c.Country, e.FirstName || " " || e.LastName AS SalesAgent FROM Invoice i
    JOIN Customer c ON c.CustomerId = i.CustomerId
    JOIN Employee e ON e.EmployeeId = c.SupportRepId
    ORDER BY SalesAgent;
    """,
    """
    -- Get the invoices associated with each sales agent. The resultant table should include the Sales Agent's full name.

    SELECT i.InvoiceId, i.InvoiceDate, i.`Total`, i.CustomerId, e.FirstName || " " || e.LastName AS SalesAgent FROM Invoice i, Customer c, Employee e
    WHERE c.CustomerId = i.CustomerId
    AND e.EmployeeId = c.SupportRepId
    ORDER BY SalesAgent == "Sales Support Agent";
    """,
    """
    -- Which sales agent made the most in sales over all?

    SELECT "Sales Winner", max("Total") AS "Total" FROM (
    SELECT e.firstName || " " || e.lastName AS "Sales Winner", sum(i.total) AS "Total" FROM Invoice AS i
    JOIN customer AS c ON c.customerid = i.customerid
    JOIN employee AS e ON e.employeeid = c.supportrepid
    GROUP BY e.Employeeid
    )
    """,
]

dbname = "chinook.sqlite3"

MINIO_ACCESS_KEY = "AKIAIDIDIDIDIDIDIDID"
MINIO_SECRET_KEY = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
MINIO_HOST = "localhost"
MINIO_PORT = 9000
MINIO_ENDPOINT = f"http://{MINIO_HOST}:{MINIO_PORT}/"


def _wait_for_port(host, port, timeout, proc=None):
    """Block until a TCP connection to host:port succeeds.

    Raises RuntimeError when *timeout* seconds elapse first, or when *proc*
    (if given) exits while the port is still closed.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=1):
                return
        except OSError:
            if proc is not None and proc.poll() is not None:
                raise RuntimeError(
                    f"MinIO process exited with code {proc.returncode}"
                )
            time.sleep(0.25)
    raise RuntimeError(f"{host}:{port} not reachable after {timeout}s")


@pytest.fixture(autouse=True, scope="session")
def minio():
    """Run a MinIO container for the whole test session."""
    proc = subprocess.Popen(
        [
            "docker",
            "run",
            "--rm",
            "--name",
            "s3sqlite-minio",
            "-p",
            "9000:9000",
            "-p",
            "9001:9001",
            "-e",
            f"MINIO_ROOT_USER={MINIO_ACCESS_KEY}",
            "-e",
            f"MINIO_ROOT_PASSWORD={MINIO_SECRET_KEY}",
            "quay.io/minio/minio",
            "server",
            "/data",
            "--console-address",
            ":9001",
        ],
        text=True,
    )
    try:
        # Popen returns immediately; do not let the first test race the
        # container start-up.
        _wait_for_port(MINIO_HOST, MINIO_PORT, timeout=60, proc=proc)
        yield proc
    finally:
        proc.terminate()
        try:
            # Reap the child so the container is gone before the session ends.
            proc.wait(timeout=30)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()


@pytest.fixture
def bucket():
    """Create a uniquely named bucket; empty and delete it after the test."""
    session = boto3.Session(
        aws_access_key_id=MINIO_ACCESS_KEY,
        aws_secret_access_key=MINIO_SECRET_KEY,
        region_name="us-east-1",
    )
    s3 = session.resource("s3", endpoint_url=MINIO_ENDPOINT)

    name = f"s3vfs-{str(uuid.uuid4())}"

    bucket = s3.create_bucket(Bucket=name)
    yield name
    bucket.objects.all().delete()
    bucket.delete()


@pytest.fixture
def s3_data():
    """Credentials and endpoint of the MinIO server."""
    return dict(
        key=MINIO_ACCESS_KEY,
        secret=MINIO_SECRET_KEY,
        endpoint_url=MINIO_ENDPOINT,
    )


@pytest.fixture
def s3vfs(s3_data):
    """S3VFS backed by MinIO, using a 100 MB "bytes" cache per file."""
    s3 = s3fs.S3FileSystem(
        key=s3_data["key"],
        secret=s3_data["secret"],
        client_kwargs={"endpoint_url": s3_data["endpoint_url"]},
    )

    yield s3sqlite.S3VFS(
        name="s3-vfs",
        fs=s3,
        file_kwargs={"cache_type": "bytes", "cache_size": 100_000_000},
    )


@pytest.fixture
def local_fs():
    fs = LocalFileSystem()
    yield fs


@pytest.fixture
def localvfs(local_fs):
    return s3sqlite.S3VFS(name="local-vfs", fs=local_fs)


@contextmanager
def transaction(conn):
    """Run the ``with`` body inside BEGIN/COMMIT, rolling back on any error."""
    conn.execute("BEGIN;")
    try:
        yield conn
    except BaseException:
        # Roll back on everything (KeyboardInterrupt included), then re-raise.
        conn.execute("ROLLBACK;")
        raise
    else:
        conn.execute("COMMIT;")


def _run_statements(conn, sqls):
    """Execute each SQL statement in order, echoing it first."""
    for sql in sqls:
        print(f"Running: {sql}")
        conn.execute(sql)


def set_pragmas(conn, page_size, journal_mode):
    """Set the journal mode and page size, then VACUUM to apply the page size."""
    _run_statements(
        conn,
        [
            f"PRAGMA journal_mode = {journal_mode};",
            f"PRAGMA page_size = {page_size};",
            "VACUUM;",
        ],
    )


def create_db(conn):
    """Load the schema and data from ``chinook.sql`` into *conn*."""
    with open("chinook.sql") as f:
        sql = f.read()

    conn.executescript(sql)


def _open_configured_db(configure, page_size, journal_mode):
    """Open the local DB, apply *configure* and verify the resulting pragmas.

    The connection is closed before re-raising if configuration or
    verification fails.
    """
    conn = sqlite3.connect(dbname, isolation_level=None)
    try:
        configure(conn, page_size=page_size, journal_mode=journal_mode)

        assert conn.execute("PRAGMA page_size;").fetchone()[0] == page_size
        assert (
            conn.execute("PRAGMA journal_mode;").fetchone()[0].lower()
            == journal_mode.lower()
        )
    except BaseException:
        conn.close()
        raise
    return conn


@pytest.fixture(params=PAGE_JOURNAL_MIX)
def get_db(request) -> Iterator[Tuple[str, sqlite3.Connection]]:
    """Local DB set to each (page size, journal mode) combination."""
    page_size, journal_mode = request.param
    conn = _open_configured_db(set_pragmas, page_size, journal_mode)
    yield dbname, conn
    conn.close()


# I haven't been able to make this work as I want for DBs in WAL mode. For now
# I'll just document that the DB needs to be set to a different journal mode
# before uploading it to S3. These functions test a database that is in WAL
# mode and then changed to a different mode. This is because I expect the
# typical workflow to start with a DB in WAL mode to load data in it, then the
# journal_mode gets changed to something else for uploading:
#
# set page size -> vacuum -> set WAL -> truncate WAL -> change journal mode before uploading


def set_wal_pragmas(conn, page_size, journal_mode):
    """Set the page size, go through WAL mode, then switch to *journal_mode*."""
    # Page size can't be changed after setting WAL mode, so we need to do
    # it before.
    _run_statements(
        conn,
        [
            f"PRAGMA page_size = {page_size};",
            "VACUUM;",
            "PRAGMA journal_mode = WAL;",
            "PRAGMA wal_checkpoint(truncate);",
            f"PRAGMA journal_mode = {journal_mode};",
        ],
    )


@pytest.fixture(params=PAGE_JOURNAL_MIX)
def get_db_wal(request) -> Iterator[Tuple[str, sqlite3.Connection]]:
    """Local DB that went through WAL mode before each final configuration."""
    page_size, journal_mode = request.param
    conn = _open_configured_db(set_wal_pragmas, page_size, journal_mode)
    yield dbname, conn
    conn.close()


def _assert_same_results(bucket, s3vfs, db, query):
    """Upload the local DB and check *query* returns the same rows via the VFS."""
    local_path, local_conn = db
    key_prefix = f"{bucket}/{dbname}"
    s3vfs.upload_file(local_path, dest=key_prefix)

    # Open the uploaded database through the VFS and query it
    conn = apsw.Connection(
        key_prefix, vfs=s3vfs.name, flags=apsw.SQLITE_OPEN_READONLY
    )
    try:
        with conn:
            local_c = local_conn.execute(query)
            c = conn.execute(query)
            assert c.fetchall() == local_c.fetchall()
    finally:
        conn.close()


@pytest.mark.parametrize("query", QUERIES)
def test_s3vfs_query_wal(bucket, s3vfs, get_db_wal, query):
    _assert_same_results(bucket, s3vfs, get_db_wal, query)


@pytest.mark.parametrize("query", QUERIES)
def test_s3vfs_query(bucket, s3vfs, get_db, query):
    _assert_same_results(bucket, s3vfs, get_db, query)