├── .github └── workflows │ └── test.yml ├── .gitignore ├── DictDataBase.code-workspace ├── LICENSE ├── README.md ├── assets ├── coverage.svg ├── logo.afdesign └── logo.png ├── dictdatabase ├── __init__.py ├── byte_codes.py ├── configuration.py ├── indexing.py ├── io_bytes.py ├── io_safe.py ├── io_unsafe.py ├── locking.py ├── models.py ├── sessions.py └── utils.py ├── justfile ├── profiler.py ├── pyproject.toml ├── scenario_comparison.py ├── scene_random_writes.py ├── test_key_finder.py ├── tests ├── __init__.py ├── benchmark │ ├── locking.py │ ├── parallel_appends.py │ ├── run_async.py │ ├── run_big_file.py │ ├── run_parallel.py │ ├── run_parallel_multi.py │ ├── run_threaded.py │ ├── sequential_appends.py │ ├── sqlite │ │ ├── run.sh │ │ ├── test.py │ │ └── test_parallel_runner.py │ └── utils.py ├── conftest.py ├── system_checks │ ├── test_clocks.py │ ├── test_monotonic_over_threads.py │ └── test_tick_rate.py ├── test_at.py ├── test_create.py ├── test_delete.py ├── test_excepts.py ├── test_exists.py ├── test_indentation.py ├── test_indexer.py ├── test_io_bytes.py ├── test_io_safe.py ├── test_locking.py ├── test_parallel_crud.py ├── test_parallel_sessions.py ├── test_partial.py ├── test_read.py ├── test_threaded_sessions.py ├── test_utils.py ├── test_where.py ├── test_write.py └── utils.py └── uv.lock /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | 6 | # JOB: Tests 7 | tests-job: 8 | runs-on: ubuntu-latest 9 | 10 | strategy: 11 | matrix: 12 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 13 | 14 | steps: 15 | #---------------------------------------------- 16 | #---- Checkout and install uv and python 17 | #---------------------------------------------- 18 | 19 | - uses: actions/checkout@v4 20 | - name: Install uv 21 | uses: astral-sh/setup-uv@v4 22 | with: 23 | enable-cache: true 24 | - name: Set up Python ${{ matrix.python-version }} 25 | run: uv python install ${{ matrix.python-version }} 26 | 27 | #---------------------------------------------- 28 | #---- Install dependencies 29 | #---------------------------------------------- 30 | 31 | - name: uv install 32 | run: uv sync --all-extras --dev 33 | 34 | #---------------------------------------------- 35 | #---- Show installation details 36 | #---------------------------------------------- 37 | 38 | - name: uv --version 39 | run: uv --version 40 | - name: uv run python --version 41 | run: uv run python --version 42 | - name: ls -lah 43 | run: ls -lah 44 | - name: uv tree 45 | run: uv tree 46 | 47 | #---------------------------------------------- 48 | #---- Pre-Checks 49 | #---------------------------------------------- 50 | 51 | - name: Show clock resolution 52 | run: uv run python tests/system_checks/test_tick_rate.py 53 | - name: Test clocks 54 | run: uv run python tests/system_checks/test_clocks.py 55 | - name: Test monotonicity 56 | run: uv run python tests/system_checks/test_monotonic_over_threads.py 57 | 58 | #---------------------------------------------- 59 | #---- Run tests with coverage report 60 | #---------------------------------------------- 61 | 62 | - name: 🚀 Run tests with code coverage report 63 | run: uv run pytest --cov=dictdatabase --cov-report term-missing 64 | 65 | #---------------------------------------------- 66 | #---- Save coverage artifact 67 | #---------------------------------------------- 68 | 69 | - name: Debug coverage file 70 | run: ls -lah 71 | - uses: 
actions/upload-artifact@v4 72 | with: 73 | name: coverage-${{ matrix.python-version }} 74 | include-hidden-files: true 75 | if-no-files-found: error 76 | path: ".coverage" 77 | 78 | # JOB: Coverage Badge 79 | cov-badge-job: 80 | # Only run this job on push events to the main branch, after tests succeed 81 | if: github.event_name == 'push' && github.ref == 'refs/heads/main' && needs.tests-job.result == 'success' 82 | needs: tests-job 83 | runs-on: ubuntu-latest 84 | steps: 85 | - uses: actions/checkout@v4 86 | 87 | #---------------------------------------------- 88 | #---- Download and debug artifact 89 | #---------------------------------------------- 90 | 91 | - name: Debug workspace 92 | run: ls -lah 93 | 94 | - uses: actions/download-artifact@v4 95 | with: 96 | name: coverage-3.12 97 | path: . 98 | 99 | - name: Debug downloaded artifact 100 | run: ls -lah 101 | 102 | #---------------------------------------------- 103 | #---- Generate coverage badge 104 | #---------------------------------------------- 105 | 106 | - name: Generate Coverage Badge 107 | uses: tj-actions/coverage-badge-py@v2 108 | with: 109 | output: assets/coverage.svg 110 | 111 | #---------------------------------------------- 112 | #---- Verify and commit changes 113 | #---------------------------------------------- 114 | 115 | - name: Verify Changed Files 116 | uses: tj-actions/verify-changed-files@v17 117 | id: changed_files 118 | with: 119 | files: assets/coverage.svg 120 | 121 | - name: Commit Files 122 | if: steps.changed_files.outputs.files_changed == 'true' 123 | run: | 124 | git config --local user.email "github-actions[bot]@users.noreply.github.com" 125 | git config --local user.name "github-actions[bot]" 126 | git add assets/coverage.svg 127 | git commit -m "Updated assets/coverage.svg" 128 | 129 | - name: Push Changes 130 | if: steps.changed_files.outputs.files_changed == 'true' 131 | uses: ad-m/github-push-action@master 132 | with: 133 | github_token: ${{ secrets.github_token }} 134 | branch: ${{ github.ref }} 135 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .venv/ 2 | .ddb_storage_testing/ 3 | .ddb_pytest_storage 4 | .ddb* 5 | .coverage* 6 | ddb_storage 7 | test_db/ 8 | *.prof 9 | dist/ 10 | __pycache__ 11 | -------------------------------------------------------------------------------- /DictDataBase.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "." 
5 | } 6 | ], 7 | "settings": { 8 | "[python]": { 9 | "editor.formatOnSave": true, 10 | "editor.defaultFormatter": "charliermarsh.ruff" 11 | }, 12 | "editor.codeActionsOnSave": { 13 | "source.organizeImports": "explicit" 14 | }, 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 mkrd 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Logo](https://github.com/mkrd/DictDataBase/blob/main/assets/logo.png?raw=true) 2 | 3 | [![Downloads](https://static.pepy.tech/badge/dictdatabase)](https://pepy.tech/project/dictdatabase) 4 | ![Tests](https://github.com/mkrd/DictDataBase/actions/workflows/test.yml/badge.svg) 5 | ![Coverage](https://github.com/mkrd/DictDataBase/blob/main/assets/coverage.svg?raw=1) 6 | 7 | DictDataBase is a fast document-based database that uses json files or compressed json files for storage. 8 | - **Multi threading and multi processing safe**. Multiple processes on the same machine 9 | can simultaneously read and write to dicts without losing data. 10 | - **ACID** compliant. Unlike TinyDB, it is suited for concurrent environments. 11 | - **No Conflict resolution** required. Unlike with ZODB, lock-based access control is used, such that conflicts never occur. 12 | - **No database server** required. Simply import DictDataBase in your project and use 13 | it. 14 | - **Compression**. Configure if the files should be stored as raw json or as json 15 | compressed with zlib. 16 | - **Fast**. Key-value pairs inside a json file can be accessed quickly and efficiently because the keys are indexed. 17 | - **Tested** with 98%+ coverage on Python 3.8 to 3.13. 18 | 19 | ### Why use DictDataBase 20 | - Your application concurrently reads and writes data from multiple processes or threads. 21 | - Using database server is a bit too much for your application. 22 | - But you need [ACID](https://en.wikipedia.org/wiki/ACID) guarantees. 23 | - Your use case requires reading key-value pairs from very large json files repeatedly. (For example, DictDataBase can handle about 2000 reads per second when reading single key-value pairs from a 2.5GB json file with 20000 key-value pairs.) 
- You need to repeatedly read and write many smaller json files.
- Your use case is suited for working with json data, or you have to work with a lot of
  json data.

### Why not DictDataBase
- If your storage is slow.
- If your use case requires repeatedly modifying or writing data in a single very large json file.
- If a relational database is better suited for your use case.
- If you need to read files that are larger than your system's RAM.

Install
========================================================================================

```sh
pip install dictdatabase
```

Configuration
========================================================================================
The following configuration parameters can be modified using `DDB.config`:

### Storage directory
Set `storage_directory` to the path of the directory that will contain your json files:
```python
DDB.config.storage_directory = "./ddb_storage"  # Default value
```

### Compression
If you want to use compressed files, set `use_compression` to `True`.
This will make the db files significantly smaller and might improve performance if your
disk is slow. However, the files will not be human readable.
```python
DDB.config.use_compression = False  # Default value
```

### Indentation
Set how written json files should be indented. Behaves exactly like
`json.dumps(indent=...)`: the value can be an `int` for the number of spaces, the tab
character, or `None` if you don't want the files to be indented.
```python
DDB.config.indent = "\t"  # Default value
```
Notice: If `DDB.config.use_orjson = True`, then the value can only be 2 (spaces) or
0/None for no indentation.

### Use orjson
You can use the orjson encoder and decoder if you need to.
The standard library json module is sufficient most of the time.
However, orjson is a lot more performant in virtually all cases.
```python
DDB.config.use_orjson = True  # Default value
```

Usage
========================================================================================

Import
----------------------------------------------------------------------------------------

```python
import dictdatabase as DDB
```

Create a file
----------------------------------------------------------------------------------------
This library is called DictDataBase, but you can actually use any json serializable object.
```python
users_dict = {
    "u1": {"name": "Ben", "age": 30, "job": "Software Engineer"},
    "u2": {"name": "Sue", "age": 21, "job": "Architect"},
    "u3": {"name": "Joe", "age": 50, "job": "Manager"},
}
DDB.at("users").create(users_dict)
```
There is now a file called `users.json` or `users.ddb` in your specified storage
directory, depending on whether you use compression.
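
As a quick illustration of the compression setting, the following sketch shows which file appears for each value. This is a minimal sketch: the `os.path.exists` checks and the `./ddb_storage` path are assumptions for this example, and `users_dict` is reused from above.

```python
import os

import dictdatabase as DDB

DDB.config.storage_directory = "./ddb_storage"

DDB.config.use_compression = False
DDB.at("users").create(users_dict, force_overwrite=True)
print(os.path.exists("./ddb_storage/users.json"))  # True: plain json file

DDB.config.use_compression = True
DDB.at("users").create(users_dict, force_overwrite=True)
print(os.path.exists("./ddb_storage/users.ddb"))   # True: compressed file replaces users.json
```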

Check if file or sub-key exists
----------------------------------------------------------------------------------------
```python
DDB.at("users").exists()
>>> True  # File exists
DDB.at("users", key="u10").exists()
>>> False  # Key "u10" not in users
DDB.at("users", key="u2").exists()
>>> True
```

Read dicts
----------------------------------------------------------------------------------------

```python
d = DDB.at("users").read()
d == users_dict  # True

# Only partially read Joe
joe = DDB.at("users", key="u3").read()
joe == users_dict["u3"]  # True
```

> Note: A partial read like `DDB.at("users", key="u3").read()` will only
> return the value of the key if the key is at the root indentation level.
> Example: You can get "a" from {"a": 3}, but not from {"b": {"a": 3}}.

It is also possible to only read a subset of keys based on a filter callback:

```python
DDB.at("numbers").create({"a": 1, "b": 2, "c": 3})

above_1 = DDB.at("numbers", where=lambda k, v: v > 1).read()
>>> above_1 == {"b": 2, "c": 3}
```
> The `where` callback is a function that takes two parameters, the key and the value.


Write dicts
----------------------------------------------------------------------------------------

```python
with DDB.at("users").session() as (session, users):
    users["u3"]["age"] = 99
    session.write()
print(DDB.at("users", key="u3").read()["age"])
>>> 99
```
> If you do not call session.write(), changes will not be written to disk!

Partial writing
----------------------------------------------------------------------------------------
Imagine you have a huge json file with many purchases.
The json file looks like this: `{<id>: <purchase>, <id>: <purchase>, ...}`.
Normally, you would have to read and parse the entire file to get a specific key.
After modifying the purchase, you would also have to serialize and write the
entire file again. With DDB, you can do it more efficiently:
```python
with DDB.at("purchases", key="3244").session() as (session, purchase):
    purchase["status"] = "cancelled"
    session.write()
```
Afterwards, the status is updated in the json file.
However, DDB only gathered the one purchase with id 3244, parsed its value, and
serialized that value alone before writing again. This is several orders of
magnitude faster than the naive approach when working with big files.


Folders
----------------------------------------------------------------------------------------

You can also read and write to folders of files. Consider the same example as
before, but now we have a folder called `purchases` that contains many files
`<id>.json`. If you want to open a session or read a specific one, you can do:

```python
DDB.at("purchases/<id>").read()
# Or equivalently:
DDB.at("purchases", "<id>").read()
```

To open a session or read all files, do the following:
```python
DDB.at("purchases/*").read()
# Or equivalently:
DDB.at("purchases", "*").read()
```

### Select from folder

If you have a folder containing many json files, you can read them selectively
based on a filter function.
The file is included if the provided function returns `True` when it gets the
file dict as input:

```python
for i in range(10):
    DDB.at("folder", i).create({"a": i})
# Now in the directory "folder", 10 files exist
res = DDB.at("folder/*", where=lambda k, v: v["a"] > 7).read()  # .session() also possible
assert res == {"8": {"a": 8}, "9": {"a": 9}}
```



Performance
========================================================================================

In preliminary testing, DictDataBase showed promising performance.

### SQLite vs DictDataBase
In each case, `16` parallel processes were spawned to perform `128` increments
of a counter in `4` tables/files. SQLite achieved `2435 operations/s`, while
DictDataBase achieved `3143 operations/s`.

### More tests
It remains to be tested how DictDataBase performs in different scenarios, for
example when multiple processes want to perform full writes to one big file.


Advanced
========================================================================================

Sleep Timeout
----------------------------------------------------------------------------------------
DictDataBase uses a file locking protocol to coordinate concurrent file accesses.
While waiting for a file where another thread or process currently has exclusive
access rights, the status of the file lock is periodically checked. You can set
the timeout between the checks:

```python
DDB.locking.SLEEP_TIMEOUT = 0.001  # 1ms, default value
```

The default of 1 millisecond works well and generally does not need to be changed,
but you can tune it to optimize performance for your use case.


Lock acquisition timeout
----------------------------------------------------------------------------------------
`AQUIRE_LOCK_TIMEOUT` specifies the maximum duration to wait when acquiring a lock
before giving up and raising a timeout error.

```python
DDB.locking.AQUIRE_LOCK_TIMEOUT = 60.0  # 60s, default value
```


API Reference
========================================================================================

### `at(path) -> DDBMethodChooser:`
Select a file or folder to perform an operation on.
If you want to select a specific key in a file, use the `key` parameter,
e.g. `DDB.at("file", key="subkey")`. The key value is only returned if the key
is at the root level of the json object.

If you want to select an entire folder, use the `*` wildcard,
e.g. `DDB.at("folder", "*")`, or `DDB.at("folder/*")`. You can also use
the `where` callback to select a subset of the file or folder.

If the callback returns `True`, the item will be selected. The callback
needs to accept a key and value as arguments.

Args:
- `path`: The path to the file or folder. Can be a string, a
  comma-separated list of strings, or a list.
- `key`: The key to select from the file.
- `where`: A function that takes a key and value and returns `True` if the
  key should be selected.

Beware: If you select a folder with the `*` wildcard, you can't use the `key`
parameter, and you cannot use the `key` and `where` parameters at the same time.
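
For illustration, here is a minimal sketch of legal and illegal selections; the file and folder names are placeholders taken from the examples above:

```python
import dictdatabase as DDB

# Legal selections
DDB.at("users")                                    # a whole file
DDB.at("users", key="u2")                          # a single key in a file
DDB.at("users", where=lambda k, v: v["age"] > 30)  # a filtered subset of a file
DDB.at("purchases", "*")                           # a whole folder
DDB.at("purchases/*", where=lambda k, v: v["status"] == "cancelled")  # a filtered folder

# Illegal combinations raise a TypeError
# DDB.at("users", key="u2", where=lambda k, v: True)  # key and where together
# DDB.at("purchases", "*", key="u2")                  # key together with a folder wildcard
```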

DDBMethodChooser
----------------------------------------------------------------------------------------

### `exists() -> bool:`
Efficiently checks if a database file exists. If the selected path contains
the `*` wildcard, it returns `True` if at least one file exists in the folder.

If a key was specified, check if it exists in the database. The key can be
anywhere in the database, even deeply nested. As long as it exists as a key
in any dict, it will be found.

### `create(data=None, force_overwrite: bool = False):`
Create a new file with the given data as the content. If the file
already exists, a FileExistsError will be raised unless
`force_overwrite` is set to True.

Args:
- `data`: The data to write to the file. If not specified, an empty dict
  will be written.
- `force_overwrite`: If `True`, will overwrite the file if it already
  exists, defaults to False (optional).

### `delete()`
Delete the file at the selected path.

### `read(self, as_type: T = None) -> dict | T | None:`
Reads a file or folder depending on the previous `.at(...)` selection.

Args:
- `as_type`: If provided, return the value as the given type.
  E.g. `as_type=str` will return `str(value)`.

### `session(self, as_type: T = None) -> DDBSession[T]:`
Opens a session to the selected file(s) or folder, depending on the previous
`.at(...)` selection. Inside the with block, you have exclusive access
to the file(s) or folder.
Call `session.write()` to write the data to the file(s) or folder.

Args:
- `as_type`: If provided, cast the value to the given type.
  E.g. `as_type=str` will return `str(value)`.
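
To illustrate the `as_type` parameter, here is a small sketch; `UserCollection` is a hypothetical wrapper class written for this example, not part of DictDataBase:

```python
import dictdatabase as DDB

class UserCollection(dict):
    # Hypothetical convenience wrapper, used only for this example
    def names(self) -> list:
        return [user["name"] for user in self.values()]

users = DDB.at("users").read(as_type=UserCollection)
print(users.names())  # ['Ben', 'Sue', 'Joe']
```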
318 | -------------------------------------------------------------------------------- /assets/coverage.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | coverage 17 | coverage 18 | 98% 19 | 98% 20 | 21 | 22 | -------------------------------------------------------------------------------- /assets/logo.afdesign: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mkrd/DictDataBase/12e650460c9284f8cd1249d26b16c18c04445691/assets/logo.afdesign -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mkrd/DictDataBase/12e650460c9284f8cd1249d26b16c18c04445691/assets/logo.png -------------------------------------------------------------------------------- /dictdatabase/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration import config # noqa: F401 2 | from .models import at # noqa: F401 3 | -------------------------------------------------------------------------------- /dictdatabase/byte_codes.py: -------------------------------------------------------------------------------- 1 | # See: https://www.charset.org/utf-8 2 | BACKSLASH = 92 3 | QUOTE = 34 4 | OPEN_SQUARE = 91 5 | CLOSE_SQUARE = 93 6 | OPEN_CURLY = 123 7 | CLOSE_CURLY = 125 8 | SPACE = 32 9 | TAB = 9 10 | NEWLINE = 10 11 | COMMA = 44 12 | -------------------------------------------------------------------------------- /dictdatabase/configuration.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | class Confuguration: 5 | __slots__ = ("storage_directory", "indent", "use_compression", "use_orjson") 6 | 7 | storage_directory: str 8 | indent: int | str | None # eg. "\t" or 4 or None 9 | use_compression: bool 10 | use_orjson: bool 11 | 12 | def __init__( 13 | self, 14 | storage_directory: str = "ddb_storage", 15 | indent: str | int | None = "\t", 16 | use_compression: bool = False, 17 | use_orjson: bool = True, 18 | ) -> None: 19 | self.storage_directory = storage_directory 20 | self.indent = indent 21 | self.use_compression = use_compression 22 | self.use_orjson = use_orjson 23 | 24 | 25 | config = Confuguration() 26 | -------------------------------------------------------------------------------- /dictdatabase/indexing.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Union 3 | 4 | import orjson 5 | 6 | from . import config 7 | 8 | # Problem: Multiple read processes will concurrently read and write the same file 9 | # In some cases this will result in a empty read error, thats why the try-except exists 10 | 11 | 12 | # Idea 1: 13 | # - Never write to the index when reading 14 | # - When writing, the lock is exclusive on the index aswell, so no other process can read or write 15 | # Problem: If a file is only ever reed, it will never be indexed 16 | 17 | # Idea 2: 18 | # - Write a new index_record to a new unique file 19 | # - Reading index happens from all related files 20 | # - When writing, the new index_record is collected and written into the main file 21 | # Problem: If a file is only ever reed, lots of index record files will accumulate 22 | 23 | # Idea 3: 24 | # - Leave everything as is. 
While not ideal, it works. When empty read error occurs, don't use the index for that read 25 | 26 | 27 | class Indexer: 28 | """ 29 | The Indexer takes the name of a database file, and tries to load the .index file 30 | of the corresponding database file. 31 | 32 | The name of the index file is the name of the database file, with the extension 33 | .index and all "/" replaced with "___" 34 | 35 | The content of the index file is a json object, where the keys are keys inside 36 | the database json file, and the values are lists of 5 elements: 37 | - start_index: The index of the first byte of the value of the key in the database file 38 | - end_index: The index of the last byte of the value of the key in the database file 39 | - indent_level: The indent level of the key in the database file 40 | - indent_with: The indent string used. 41 | - value_hash: The hash of the value bytes 42 | """ 43 | 44 | __slots__ = ("data", "path") 45 | 46 | def __init__(self, db_name: str) -> None: 47 | # Make path of index file 48 | db_name = db_name.replace("/", "___") 49 | self.path = os.path.join(config.storage_directory, ".ddb", f"{db_name}.index") 50 | 51 | os.makedirs(os.path.dirname(self.path), exist_ok=True) 52 | if not os.path.exists(self.path): 53 | self.data = {} 54 | return 55 | 56 | try: 57 | with open(self.path, "rb") as f: 58 | self.data = orjson.loads(f.read()) 59 | except orjson.JSONDecodeError: 60 | self.data = {} 61 | 62 | def get(self, key: str) -> Union[list, None]: 63 | """ 64 | Returns a list of 5 elements for a key if it exists, otherwise None 65 | Elements:[start_index, end_index, indent_level, indent_with, value_hash] 66 | """ 67 | return self.data.get(key, None) 68 | 69 | def write( 70 | self, 71 | key: str, 72 | start_index: int, 73 | end_index: int, 74 | indent_level: int, 75 | indent_with: str, 76 | value_hash: int, 77 | old_value_end: int, 78 | ) -> None: 79 | """ 80 | Write index information for a key to the index file 81 | """ 82 | 83 | if self.data.get(key, None) is not None: 84 | delta = end_index - old_value_end 85 | for entry in self.data.values(): 86 | if entry[0] > old_value_end: 87 | entry[0] += delta 88 | entry[1] += delta 89 | 90 | self.data[key] = [start_index, end_index, indent_level, indent_with, value_hash] 91 | with open(self.path, "wb") as f: 92 | f.write(orjson.dumps(self.data)) 93 | -------------------------------------------------------------------------------- /dictdatabase/io_bytes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zlib 3 | 4 | from . import config, utils 5 | 6 | 7 | def read(db_name: str, *, start: int = None, end: int = None) -> bytes: 8 | """ 9 | Read the content of a file as bytes. Reading works even when the config 10 | changes, so a compressed ddb file can also be read if compression is 11 | disabled, and vice versa. 12 | 13 | If no compression is used, efficient reading can be done by specifying a start 14 | and end byte index, such that only the bytes in that range are read from the 15 | file. 16 | 17 | If compression is used, specifying a start and end byte index is still possible, 18 | but the entire file has to be read and decompressed first, and then the bytes 19 | in the range are returned. This is because the compressed file is not seekable. 20 | 21 | Args: 22 | - `db_name`: The name of the database file to read from. 23 | - `start`: The start byte index to read from. 24 | - `end`: The end byte index to read up to (not included). 
25 | 26 | Raises: 27 | - `FileNotFoundError`: If the file does not exist as .json nor .ddb. 28 | - `OSError`: If no compression is used and `start` is negative. 29 | - `FileExistsError`: If the file exists as .json and .ddb. 30 | """ 31 | 32 | json_path, json_exists, ddb_path, ddb_exists = utils.file_info(db_name) 33 | 34 | if json_exists: 35 | if ddb_exists: 36 | raise FileExistsError(f'Inconsistent: "{db_name}" exists as .json and .ddb.' "Please remove one of them.") 37 | with open(json_path, "rb") as f: 38 | if start is None and end is None: 39 | return f.read() 40 | start = start or 0 41 | f.seek(start) 42 | if end is None: 43 | return f.read() 44 | return f.read(end - start) 45 | if not ddb_exists: 46 | raise FileNotFoundError(f'No database file exists for "{db_name}"') 47 | with open(ddb_path, "rb") as f: 48 | json_bytes = zlib.decompress(f.read()) 49 | if start is None and end is None: 50 | return json_bytes 51 | start = start or 0 52 | end = end or len(json_bytes) 53 | return json_bytes[start:end] 54 | 55 | 56 | def write(db_name: str, dump: bytes, *, start: int = None) -> None: 57 | """ 58 | Write the bytes to the file of the db_path. If the db was compressed but no 59 | compression is enabled, remove the compressed file, and vice versa. 60 | 61 | Args: 62 | - `db_name`: The name of the database to write to. 63 | - `dump`: The bytes to write to the file, representing correct JSON when 64 | decoded. 65 | - `start`: The start byte index to write to. If None, the whole file is overwritten. 66 | If the original content was longer, the rest truncated. 67 | """ 68 | 69 | json_path, json_exists, ddb_path, ddb_exists = utils.file_info(db_name) 70 | 71 | # Write bytes or string to file 72 | remove_file = None 73 | if config.use_compression: 74 | if start is not None: 75 | raise RuntimeError("Cannot write to compressed file at a specific index") 76 | write_file = ddb_path 77 | if json_exists: 78 | remove_file = json_path 79 | dump = zlib.compress(dump, 1) 80 | else: 81 | write_file = json_path 82 | if ddb_exists: 83 | remove_file = ddb_path 84 | 85 | # Write bytes or string to file 86 | if start is None: 87 | with open(write_file, "wb") as f: 88 | f.write(dump) 89 | else: 90 | with open(write_file, "ab") as f: 91 | f.seek(start) 92 | f.truncate() 93 | f.write(dump) 94 | 95 | # Remove the other file if it exists 96 | # This is done after writing to avoid data loss 97 | if remove_file is not None: 98 | os.remove(remove_file) 99 | -------------------------------------------------------------------------------- /dictdatabase/io_safe.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from . import config, io_unsafe, locking, utils 4 | 5 | 6 | def read(file_name: str) -> dict: 7 | """ 8 | Read the content of a file as a dict. 9 | 10 | Args: 11 | - `file_name`: The name of the file to read from. 12 | """ 13 | 14 | _, json_exists, _, ddb_exists = utils.file_info(file_name) 15 | 16 | if not json_exists and not ddb_exists: 17 | return None 18 | 19 | with locking.ReadLock(file_name): 20 | return io_unsafe.read(file_name) 21 | 22 | 23 | def partial_read(file_name: str, key: str) -> dict: 24 | """ 25 | Read only the value of a key-value pair from a file. 26 | 27 | Args: 28 | - `file_name`: The name of the file to read from. 29 | - `key`: The key to read the value of. 
30 | """ 31 | 32 | _, json_exists, _, ddb_exists = utils.file_info(file_name) 33 | 34 | if not json_exists and not ddb_exists: 35 | return None 36 | 37 | with locking.ReadLock(file_name): 38 | return io_unsafe.partial_read(file_name, key) 39 | 40 | 41 | def write(file_name: str, data: dict) -> None: 42 | """ 43 | Ensures that writing only starts if there is no reading or writing in progress. 44 | 45 | Args: 46 | - `file_name`: The name of the file to write to. 47 | - `data`: The data to write to the file. 48 | """ 49 | 50 | dirname = os.path.dirname(f"{config.storage_directory}/{file_name}.any") 51 | os.makedirs(dirname, exist_ok=True) 52 | 53 | with locking.WriteLock(file_name): 54 | io_unsafe.write(file_name, data) 55 | 56 | 57 | def delete(file_name: str) -> None: 58 | """ 59 | Ensures that deleting only starts if there is no reading or writing in progress. 60 | 61 | Args: 62 | - `file_name`: The name of the file to delete. 63 | """ 64 | 65 | json_path, json_exists, ddb_path, ddb_exists = utils.file_info(file_name) 66 | 67 | if not json_exists and not ddb_exists: 68 | return 69 | 70 | with locking.WriteLock(file_name): 71 | if json_exists: 72 | os.remove(json_path) 73 | if ddb_exists: 74 | os.remove(ddb_path) 75 | -------------------------------------------------------------------------------- /dictdatabase/io_unsafe.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import hashlib 4 | import json 5 | from dataclasses import dataclass 6 | 7 | import orjson 8 | 9 | from . import byte_codes, config, indexing, io_bytes, utils 10 | 11 | 12 | @dataclass(frozen=True) # slots=True not supported by python 3.8 and 3.9 13 | class PartialDict: 14 | prefix: bytes 15 | key: str 16 | value: dict 17 | value_start: int 18 | value_end: int 19 | suffix: bytes 20 | 21 | 22 | @dataclass(frozen=True) # slots=True not supported by python 3.8 and 3.9 23 | class PartialFileHandle: 24 | db_name: str 25 | partial_dict: PartialDict 26 | indent_level: int 27 | indent_with: str 28 | indexer: indexing.Indexer 29 | 30 | 31 | ######################################################################################## 32 | #### Full Reading 33 | ######################################################################################## 34 | 35 | 36 | def read(db_name: str) -> dict: 37 | """ 38 | Read the file at db_path from the configured storage directory. 39 | Make sure the file exists. If it does not a FileNotFoundError is 40 | raised. 41 | """ 42 | # Always use orjson to read the file, because it is faster 43 | return orjson.loads(io_bytes.read(db_name)) 44 | 45 | 46 | ######################################################################################## 47 | #### Partial Reading 48 | ######################################################################################## 49 | 50 | 51 | def try_read_bytes_using_indexer(indexer: indexing.Indexer, db_name: str, key: str) -> bytes | None: 52 | """ 53 | Check if the key info is saved in the file's index file. 54 | If it is and the value has not changed, return the value bytes. 55 | Otherwise return None. 
56 | """ 57 | 58 | if (index := indexer.get(key)) is None: 59 | return None 60 | start, end, _, _, value_hash = index 61 | partial_bytes = io_bytes.read(db_name, start=start, end=end) 62 | if value_hash != hashlib.sha256(partial_bytes).hexdigest(): 63 | return None 64 | return partial_bytes 65 | 66 | 67 | def partial_read(db_name: str, key: str) -> dict | None: 68 | """ 69 | Partially read a key from a db. 70 | The key MUST be unique in the entire db, otherwise the behavior is undefined. 71 | This is a lot faster than reading the entire db, because it does not parse 72 | the entire file, but only the part part of the : pair. 73 | 74 | If the key is not found, a `KeyError` is raised. 75 | """ 76 | 77 | # Search for key in the index file 78 | indexer = indexing.Indexer(db_name) 79 | if (value_bytes := try_read_bytes_using_indexer(indexer, db_name, key)) is not None: 80 | return orjson.loads(value_bytes) 81 | 82 | # Not found in index file, search for key in the entire file 83 | all_file_bytes = io_bytes.read(db_name) 84 | key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key) 85 | 86 | if key_end == -1: 87 | return None 88 | 89 | # Key found, now determine the bounding byte indices of the value 90 | start = key_end + (1 if all_file_bytes[key_end] == byte_codes.SPACE else 0) 91 | end = utils.seek_index_through_value_bytes(all_file_bytes, start) 92 | 93 | indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, key_start) 94 | value_bytes = all_file_bytes[start:end] 95 | value_hash = hashlib.sha256(value_bytes).hexdigest() 96 | 97 | # Write key info to index file 98 | indexer.write(key, start, end, indent_level, indent_with, value_hash, end) 99 | return orjson.loads(value_bytes) 100 | 101 | 102 | ################################################################################ 103 | #### Writing 104 | ################################################################################ 105 | 106 | 107 | def serialize_data_to_json_bytes(data: dict) -> bytes: 108 | """ 109 | Serialize the data as json bytes. Depending on the config, 110 | this can be done with orjson or the standard json module. 111 | Additionally config.indent is respected. 112 | """ 113 | if config.use_orjson: 114 | option = (orjson.OPT_INDENT_2 if config.indent else 0) | orjson.OPT_SORT_KEYS 115 | return orjson.dumps(data, option=option) 116 | else: 117 | db_dump = json.dumps(data, indent=config.indent, sort_keys=True) 118 | return db_dump.encode() 119 | 120 | 121 | def write(db_name: str, data: dict) -> None: 122 | """ 123 | Write the dict db dumped as a json string 124 | to the file of the db_path. 125 | """ 126 | data_bytes = serialize_data_to_json_bytes(data) 127 | io_bytes.write(db_name, data_bytes) 128 | 129 | 130 | ################################################################################ 131 | #### Partial Writing 132 | ################################################################################ 133 | 134 | 135 | def try_get_partial_file_handle_by_index( 136 | indexer: indexing.Indexer, 137 | db_name: str, 138 | key: str, 139 | ) -> tuple[PartialFileHandle | None, bytes | None]: 140 | """ 141 | Try to get a partial file handle by using the key entry in the index file. 142 | 143 | If the data could be read from the index file, a tuple of the partial file 144 | handle and None is returned. 145 | If the data could not be read from the index file, a tuple of None and the file 146 | bytes is returned, so that the file bytes can be searched for the key. 
147 | """ 148 | 149 | if (index := indexer.get(key)) is None: 150 | return None, io_bytes.read(db_name) 151 | start, end, indent_level, indent_with, value_hash = index 152 | 153 | # If compression is enabled, all data has to be read from the file 154 | if config.use_compression: 155 | all_file_bytes = io_bytes.read(db_name) 156 | value_bytes = all_file_bytes[start:end] 157 | if value_hash != hashlib.sha256(value_bytes).hexdigest(): 158 | return None, all_file_bytes 159 | value_data = orjson.loads(value_bytes) 160 | partial_dict = PartialDict(all_file_bytes[:start], key, value_data, start, end, all_file_bytes[end:]) 161 | 162 | # If compression is disabled, only the value and suffix have to be read 163 | else: 164 | value_and_suffix_bytes = io_bytes.read(db_name, start=start) 165 | value_length = end - start 166 | value_bytes = value_and_suffix_bytes[:value_length] 167 | if value_hash != hashlib.sha256(value_bytes).hexdigest(): 168 | # If the hashes don't match, read the prefix to concat the full file bytes 169 | prefix_bytes = io_bytes.read(db_name, end=start) 170 | return None, prefix_bytes + value_and_suffix_bytes 171 | value_data = orjson.loads(value_bytes) 172 | partial_dict = PartialDict(None, key, value_data, start, end, value_and_suffix_bytes[value_length:]) 173 | 174 | return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer), None 175 | 176 | 177 | def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle: 178 | """ 179 | Partially read a key from a db. 180 | The key MUST be unique in the entire db, otherwise the behavior is undefined. 181 | This is a lot faster than reading the entire db, because it does not parse 182 | the entire file, but only the part part of the : pair. 183 | 184 | If the key is not found, a `KeyError` is raised. 185 | """ 186 | 187 | # Search for key in the index file 188 | indexer = indexing.Indexer(db_name) 189 | partial_handle, all_file_bytes = try_get_partial_file_handle_by_index(indexer, db_name, key) 190 | if partial_handle is not None: 191 | return partial_handle 192 | 193 | # Not found in index file, search for key in the entire file 194 | key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key) 195 | 196 | if key_end == -1: 197 | raise KeyError(f'Key "{key}" not found in db "{db_name}"') 198 | 199 | # Key found, now determine the bounding byte indices of the value 200 | start = key_end + (1 if all_file_bytes[key_end] == byte_codes.SPACE else 0) 201 | end = utils.seek_index_through_value_bytes(all_file_bytes, start) 202 | 203 | indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, key_start) 204 | 205 | partial_value = orjson.loads(all_file_bytes[start:end]) 206 | prefix_bytes = all_file_bytes[:start] if config.use_compression else None 207 | partial_dict = PartialDict(prefix_bytes, key, partial_value, start, end, all_file_bytes[end:]) 208 | return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer) 209 | 210 | 211 | def partial_write(pf: PartialFileHandle) -> None: 212 | """ 213 | Write a partial file handle to the db. 
214 | """ 215 | 216 | partial_bytes = serialize_data_to_json_bytes(pf.partial_dict.value) 217 | 218 | # Add indentation 219 | if pf.indent_level > 0 and pf.indent_with: 220 | replace_this = b"\n" 221 | replace_with = ("\n" + (pf.indent_level * pf.indent_with)).encode() 222 | partial_bytes = partial_bytes.replace(replace_this, replace_with) 223 | 224 | # Write key info to index file 225 | pf.indexer.write( 226 | key=pf.partial_dict.key, 227 | start_index=pf.partial_dict.value_start, 228 | end_index=pf.partial_dict.value_start + len(partial_bytes), 229 | indent_level=pf.indent_level, 230 | indent_with=pf.indent_with, 231 | value_hash=hashlib.sha256(partial_bytes).hexdigest(), 232 | old_value_end=pf.partial_dict.value_end, 233 | ) 234 | 235 | if pf.partial_dict.prefix is None: 236 | # Prefix could not be determined due to compression, so write the entire file 237 | io_bytes.write(pf.db_name, partial_bytes + pf.partial_dict.suffix, start=pf.partial_dict.value_start) 238 | else: 239 | # Prefix was determined, so only write the changed part and the suffix 240 | io_bytes.write(pf.db_name, pf.partial_dict.prefix + partial_bytes + pf.partial_dict.suffix) 241 | -------------------------------------------------------------------------------- /dictdatabase/locking.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import contextlib 4 | import os 5 | import threading 6 | import time 7 | 8 | from . import config 9 | 10 | # Design decisions: 11 | # - Do not use pathlib, because it is slower than os 12 | 13 | # Constants 14 | SLEEP_TIMEOUT = 0.001 * 1 # (ms) 15 | LOCK_KEEP_ALIVE_TIMEOUT = 0.001 * 0.08 # (ms) 16 | 17 | # Duration to wait updating the timestamp of the lock file 18 | ALIVE_LOCK_REFRESH_INTERVAL_NS = 1_000_000_000 * 10 # (s) 19 | 20 | # Duration to wait before considering a lock as orphaned 21 | REMOVE_ORPHAN_LOCK_TIMEOUT = 20.0 22 | 23 | # Duration to wait before giving up on acquiring a lock 24 | AQUIRE_LOCK_TIMEOUT = 60.0 25 | 26 | 27 | def os_touch(path: str) -> None: 28 | """ 29 | Create an empty file at the given path. This mimics the UNIX touch command 30 | and is compatible with both Windows and UNIX systems. 31 | """ 32 | mode = 0o666 33 | flags = os.O_CREAT | os.O_WRONLY | os.O_EXCL 34 | fd = os.open(path, flags, mode) 35 | os.close(fd) 36 | 37 | 38 | class LockFileMeta: 39 | """ 40 | Metadata representation for a lock file. 41 | """ 42 | 43 | __slots__ = ("ddb_dir", "name", "id", "time_ns", "stage", "mode", "path") 44 | 45 | ddb_dir: str 46 | name: str 47 | id: str 48 | time_ns: str 49 | stage: str 50 | mode: str 51 | path: str 52 | 53 | def __init__(self, ddb_dir: str, name: str, id: str, time_ns: str, stage: str, mode: str) -> None: 54 | self.ddb_dir = ddb_dir 55 | self.name = name 56 | self.id = id 57 | self.time_ns = time_ns 58 | self.stage = stage 59 | self.mode = mode 60 | lock_file = f"{name}.{id}.{time_ns}.{stage}.{mode}.lock" 61 | self.path = os.path.join(ddb_dir, lock_file) 62 | 63 | def __repr__(self) -> str: 64 | return f"LockFileMeta({self.ddb_dir=}, {self.name=}, {self.id=}, {self.time_ns=}, {self.stage=}, {self.mode=})" 65 | 66 | def new_with_updated_time(self) -> LockFileMeta: 67 | """ 68 | Create a new instance with an updated timestamp. 
69 | """ 70 | time_ns = f"{time.time_ns()}" 71 | return LockFileMeta(self.ddb_dir, self.name, self.id, time_ns, self.stage, self.mode) 72 | 73 | 74 | class FileLocksSnapshot: 75 | """ 76 | Represents a snapshot of the current state of file locks in the directory. 77 | This snapshot assists in deciding which lock should be acquired or released next. 78 | 79 | On init, orphaned locks are removed. 80 | """ 81 | 82 | __slots__ = ("any_has_locks", "any_write_locks", "any_has_write_locks", "locks") 83 | 84 | locks: list[LockFileMeta] 85 | any_has_locks: bool 86 | any_write_locks: bool 87 | any_has_write_locks: bool 88 | 89 | def __init__(self, need_lock: LockFileMeta) -> None: 90 | self.locks = [] 91 | self.any_has_locks = False 92 | self.any_write_locks = False 93 | self.any_has_write_locks = False 94 | 95 | for file_name in os.listdir(need_lock.ddb_dir): 96 | if not file_name.endswith(".lock"): 97 | continue 98 | name, id, time_ns, stage, mode, _ = file_name.split(".") 99 | if name != need_lock.name: 100 | continue 101 | 102 | lock_meta = LockFileMeta(need_lock.ddb_dir, name, id, time_ns, stage, mode) 103 | 104 | # Remove orphaned locks 105 | if lock_meta.path != need_lock.path: 106 | lock_age = time.time_ns() - int(lock_meta.time_ns) 107 | if lock_age > REMOVE_ORPHAN_LOCK_TIMEOUT * 1_000_000_000: 108 | os.unlink(lock_meta.path) 109 | print(f"Removed orphaned lock ({lock_meta.path})") 110 | continue 111 | 112 | self.locks.append(lock_meta) 113 | 114 | # Update lock state flags 115 | if lock_meta.stage == "has": 116 | self.any_has_locks = True 117 | if lock_meta.mode == "write": 118 | self.any_has_write_locks = True 119 | if lock_meta.mode == "write": 120 | self.any_write_locks = True 121 | 122 | def exists(self, l: LockFileMeta) -> bool: 123 | """ 124 | Check if a lock with the same ID, stage, and mode exists in the current snapshot. 125 | """ 126 | return any(x.id == l.id and x.stage == l.stage and x.mode == l.mode for x in self.locks) 127 | 128 | def oldest_need(self, need_lock: LockFileMeta) -> bool: 129 | """ 130 | Determine if the provided 'need_lock' is the oldest among all 'need' locks in the snapshot. 131 | """ 132 | # len(need_locks) is at least 1 since this function is only called if there is a need_lock 133 | need_locks = [l for l in self.locks if l.stage == "need"] 134 | # Sort by time_ns. If multiple, the the one with the smaller id is first 135 | need_locks = sorted(need_locks, key=lambda l: (int(l.time_ns), int(l.id))) 136 | return need_locks[0].id == need_lock.id 137 | 138 | 139 | class AbstractLock: 140 | """ 141 | Abstract base class for file locks. This class doesn't lock/unlock by itself but 142 | provides a blueprint for derived classes to implement. 143 | """ 144 | 145 | __slots__ = ("db_name", "need_lock", "has_lock", "snapshot", "mode", "is_alive" "keep_alive_thread") 146 | 147 | db_name: str 148 | need_lock: LockFileMeta 149 | has_lock: LockFileMeta 150 | snapshot: FileLocksSnapshot 151 | mode: str 152 | is_alive: bool 153 | keep_alive_thread: threading.Thread 154 | 155 | def __init__(self, db_name: str) -> None: 156 | # Normalize db_name to avoid file naming conflicts 157 | self.db_name = db_name.replace("/", "___").replace(".", "____") 158 | time_ns = time.time_ns() 159 | t_id = f"{threading.get_native_id()}" # ID that's unique across processes and threads. 
160 | dir = os.path.join(config.storage_directory, ".ddb") 161 | 162 | self.need_lock = LockFileMeta(dir, self.db_name, t_id, time_ns, "need", self.mode) 163 | self.has_lock = LockFileMeta(dir, self.db_name, t_id, time_ns, "has", self.mode) 164 | 165 | self.is_alive = False 166 | self.keep_alive_thread = None 167 | 168 | # Ensure lock directory exists 169 | if not os.path.isdir(dir): 170 | os.makedirs(dir, exist_ok=True) 171 | 172 | def _keep_alive_thread(self) -> None: 173 | """ 174 | Keep the lock alive by updating the timestamp of the lock file. 175 | """ 176 | 177 | current_has_lock_time_ns: int = int(self.has_lock.time_ns) 178 | 179 | while self.is_alive: 180 | time.sleep(LOCK_KEEP_ALIVE_TIMEOUT) 181 | if time.time_ns() - current_has_lock_time_ns < ALIVE_LOCK_REFRESH_INTERVAL_NS: 182 | continue 183 | 184 | # Assert: The lock is older than ALIVE_LOCK_REFRESH_INTERVAL_NS ns 185 | # This means the has_lock must be refreshed 186 | 187 | new_has_lock = self.has_lock.new_with_updated_time() 188 | os_touch(new_has_lock.path) 189 | with contextlib.suppress(FileNotFoundError): 190 | os.unlink(self.has_lock.path) # Remove old lock file 191 | self.has_lock = new_has_lock 192 | current_has_lock_time_ns = int(new_has_lock.time_ns) 193 | 194 | def _start_keep_alive_thread(self) -> None: 195 | """ 196 | Start a thread that keeps the lock alive by updating the timestamp of the lock file. 197 | """ 198 | 199 | if self.keep_alive_thread is not None: 200 | raise RuntimeError("Keep alive thread already exists.") 201 | 202 | self.is_alive = True 203 | self.keep_alive_thread = threading.Thread(target=self._keep_alive_thread, daemon=False) 204 | self.keep_alive_thread.start() 205 | 206 | def _lock(self) -> None: 207 | """Override this method to implement locking mechanism.""" 208 | raise NotImplementedError 209 | 210 | def _unlock(self) -> None: 211 | """Remove the lock files associated with this lock.""" 212 | 213 | if self.keep_alive_thread is not None: 214 | self.is_alive = False 215 | self.keep_alive_thread.join() 216 | self.keep_alive_thread = None 217 | 218 | for p in ("need_lock", "has_lock"): 219 | try: 220 | if lock := getattr(self, p, None): 221 | os.unlink(lock.path) 222 | except FileNotFoundError: 223 | pass 224 | finally: 225 | setattr(self, p, None) 226 | 227 | def __enter__(self) -> None: 228 | self._lock() 229 | 230 | def __exit__(self, exc_type, exc_val, exc_tb) -> None: # noqa: ANN001 231 | self._unlock() 232 | 233 | 234 | class ReadLock(AbstractLock): 235 | """ 236 | A file-based read lock. 237 | Multiple threads/processes can simultaneously hold a read lock unless there's a write lock. 238 | """ 239 | 240 | mode = "read" 241 | 242 | def _lock(self) -> None: 243 | # Express intention to acquire read lock 244 | os.makedirs(os.path.dirname(self.need_lock.path), exist_ok=True) 245 | os_touch(self.need_lock.path) 246 | self.snapshot = FileLocksSnapshot(self.need_lock) 247 | 248 | # If this thread already holds a read lock, raise an exception. 249 | if self.snapshot.exists(self.has_lock): 250 | os.unlink(self.need_lock.path) 251 | raise RuntimeError("Thread already has a read lock. 
Do not try to obtain a read lock twice.") 252 | 253 | start_time = time.time() 254 | 255 | # Try to acquire lock until conditions are met or a timeout occurs 256 | while True: 257 | if not self.snapshot.any_write_locks or ( 258 | not self.snapshot.any_has_write_locks and self.snapshot.oldest_need(self.need_lock) 259 | ): 260 | self.has_lock = self.has_lock.new_with_updated_time() 261 | os_touch(self.has_lock.path) 262 | os.unlink(self.need_lock.path) 263 | self._start_keep_alive_thread() 264 | return 265 | time.sleep(SLEEP_TIMEOUT) 266 | if time.time() - start_time > AQUIRE_LOCK_TIMEOUT: 267 | raise RuntimeError("Timeout while waiting for read lock.") 268 | self.snapshot = FileLocksSnapshot(self.need_lock) 269 | 270 | 271 | class WriteLock(AbstractLock): 272 | """ 273 | A file-based write lock. 274 | Only one thread/process can hold a write lock, blocking others from acquiring either read or write locks. 275 | """ 276 | 277 | mode = "write" 278 | 279 | def _lock(self) -> None: 280 | # Express intention to acquire write lock 281 | os.makedirs(os.path.dirname(self.need_lock.path), exist_ok=True) 282 | os_touch(self.need_lock.path) 283 | self.snapshot = FileLocksSnapshot(self.need_lock) 284 | 285 | # If this thread already holds a write lock, raise an exception. 286 | if self.snapshot.exists(self.has_lock): 287 | os.unlink(self.need_lock.path) 288 | raise RuntimeError("Thread already has a write lock. Do not try to obtain a write lock twice.") 289 | 290 | start_time = time.time() 291 | 292 | # Try to acquire lock until conditions are met or a timeout occurs 293 | while True: 294 | if not self.snapshot.any_has_locks and self.snapshot.oldest_need(self.need_lock): 295 | self.has_lock = self.has_lock.new_with_updated_time() 296 | os_touch(self.has_lock.path) 297 | os.unlink(self.need_lock.path) 298 | self._start_keep_alive_thread() 299 | return 300 | time.sleep(SLEEP_TIMEOUT) 301 | if time.time() - start_time > AQUIRE_LOCK_TIMEOUT: 302 | raise RuntimeError("Timeout while waiting for write lock.") 303 | self.snapshot = FileLocksSnapshot(self.need_lock) 304 | -------------------------------------------------------------------------------- /dictdatabase/models.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Callable, Type, TypeVar 4 | 5 | from . import config, io_safe, utils 6 | from .sessions import ( 7 | SessionDirFull, 8 | SessionDirWhere, 9 | SessionFileFull, 10 | SessionFileKey, 11 | SessionFileWhere, 12 | ) 13 | 14 | T = TypeVar("T") 15 | 16 | 17 | class OperationType: 18 | """ 19 | Legal: 20 | - DDB.at("file") 21 | - DDB.at("file", key="subkey") 22 | - DDB.at("file", where=lambda k, v: ...) 23 | - DDB.at("dir", "*") 24 | - DDB.at("dir", "*", where=lambda k, v: ...) 25 | 26 | Illegal: 27 | - DDB.at("file", key="subkey", where=lambda k, v: ...) 28 | - DDB.at("dir", key="subkey", where=lambda k, v: ...) 29 | - DDB.at("dir", key="subkey") 30 | """ 31 | 32 | def __init__(self, path: str, key: str, where: Callable) -> None: 33 | self.dir = "*" in path 34 | self.file = not self.dir 35 | self.where = where is not None 36 | self.key = key is not None 37 | 38 | if self.key and self.where: 39 | raise TypeError("Cannot specify both key and where") 40 | if self.key and self.dir: 41 | raise TypeError("Cannot specify sub-key when selecting a folder. 
Specify the key in the path instead.") 42 | 43 | @property 44 | def file_normal(self) -> bool: 45 | return self.file and not self.where and not self.key 46 | 47 | @property 48 | def file_key(self) -> bool: 49 | return self.file and not self.where and self.key 50 | 51 | @property 52 | def file_where(self) -> bool: 53 | return self.file and self.where and not self.key 54 | 55 | @property 56 | def dir_normal(self) -> bool: 57 | return self.dir and not self.where and not self.key 58 | 59 | @property 60 | def dir_where(self) -> bool: 61 | return self.dir and self.where and not self.key 62 | 63 | 64 | def at(*path, key: str = None, where: Callable[[Any, Any], bool] = None) -> DDBMethodChooser: 65 | """ 66 | Select a file or folder to perform an operation on. 67 | If you want to select a specific key in a file, use the `key` parameter, 68 | e.g. `DDB.at("file", key="subkey")`. 69 | 70 | If you want to select an entire folder, use the `*` wildcard, 71 | eg. `DDB.at("folder", "*")`, or `DDB.at("folder/*")`. You can also use 72 | the `where` callback to select a subset of the file or folder. 73 | 74 | If the callback returns `True`, the item will be selected. The callback 75 | needs to accept a key and value as arguments. 76 | 77 | Args: 78 | - `path`: The path to the file or folder. Can be a string, a 79 | comma-separated list of strings, or a list. 80 | - `key`: The key to select from the file. 81 | - `where`: A function that takes a key and value and returns `True` if the 82 | key should be selected. 83 | 84 | Beware: If you select a folder with the `*` wildcard, you can't use the `key` parameter. 85 | Also, you cannot use the `key` and `where` parameters at the same time. 86 | """ 87 | return DDBMethodChooser(path, key, where) 88 | 89 | 90 | class DDBMethodChooser: 91 | __slots__ = ("path", "key", "where", "op_type") 92 | 93 | path: str 94 | key: str 95 | where: Callable[[Any, Any], bool] 96 | op_type: OperationType 97 | 98 | def __init__( 99 | self, 100 | path: tuple, 101 | key: str = None, 102 | where: Callable[[Any, Any], bool] = None, 103 | ) -> None: 104 | # Convert path to a list of strings 105 | pc = [] 106 | for p in path: 107 | pc += p if isinstance(p, list) else [p] 108 | self.path = "/".join([str(p) for p in pc]) 109 | self.key = key 110 | self.where = where 111 | self.op_type = OperationType(self.path, self.key, self.where) 112 | # Invariants: 113 | # - Both key and where cannot be not None at the same time 114 | # - If key is not None, then there is no wildcard in the path. 115 | 116 | def exists(self) -> bool: 117 | """ 118 | Efficiently checks if a database exists. If the selected path contains 119 | a wildcard, it will return True if at least one file exists in the folder. 120 | 121 | 122 | If a key was specified, check if it exists in a database. 123 | The key can be anywhere in the database, even deeply nested. 124 | As long it exists as a key in any dict, it will be found. 125 | """ 126 | if self.where is not None: 127 | raise RuntimeError("DDB.at(where=...).exists() cannot be used with the where parameter") 128 | 129 | if not utils.file_exists(self.path): 130 | return False 131 | if self.key is None: 132 | return True 133 | # Key is passed and occurs is True 134 | return io_safe.partial_read(self.path, key=self.key) is not None 135 | 136 | def create(self, data: dict | None = None, force_overwrite: bool = False) -> None: 137 | """ 138 | Create a new file with the given data as the content. 
If the file 139 | already exists, a FileExistsError will be raised unless 140 | `force_overwrite` is set to True. 141 | 142 | Args: 143 | - `data`: The data to write to the file. If not specified, an empty dict `{}` 144 | will be written. 145 | - `force_overwrite`: If `True`, will overwrite the file if it already 146 | exists, defaults to False (optional). 147 | """ 148 | if self.where is not None or self.key is not None: 149 | raise RuntimeError("DDB.at().create() cannot be used with the where or key parameters") 150 | 151 | # Raise if the db already exists and force_overwrite is False 152 | if not force_overwrite and self.exists(): 153 | raise FileExistsError( 154 | f"Database {self.path} already exists in {config.storage_directory}. Pass force_overwrite=True to overwrite." 155 | ) 156 | # Write db to file 157 | if data is None: 158 | data = {} 159 | io_safe.write(self.path, data) 160 | 161 | def delete(self) -> None: 162 | """ 163 | Delete the file at the selected path. 164 | """ 165 | if self.where is not None or self.key is not None: 166 | raise RuntimeError("DDB.at().delete() cannot be used with the where or key parameters") 167 | io_safe.delete(self.path) 168 | 169 | def read(self, as_type: Type[T] = None) -> dict | T | None: 170 | """ 171 | Reads a file or folder depending on previous `.at(...)` selection. 172 | 173 | Args: 174 | - `as_type`: If provided, return the value as the given type. 175 | E.g. as_type=str will return str(value). 176 | """ 177 | 178 | def type_cast(value): 179 | if as_type is None: 180 | return value 181 | return as_type(value) 182 | 183 | data = {} 184 | 185 | if self.op_type.file_normal: 186 | data = io_safe.read(self.path) 187 | 188 | elif self.op_type.file_key: 189 | data = io_safe.partial_read(self.path, self.key) 190 | 191 | elif self.op_type.file_where: 192 | file_content = io_safe.read(self.path) 193 | if file_content is None: 194 | return None 195 | for k, v in file_content.items(): 196 | if self.where(k, type_cast(v)): 197 | data[k] = v 198 | 199 | elif self.op_type.dir_normal: 200 | pattern_paths = utils.find_all(self.path) 201 | data = {n.split("/")[-1]: io_safe.read(n) for n in pattern_paths} 202 | 203 | elif self.op_type.dir_where: 204 | for db_name in utils.find_all(self.path): 205 | k, v = db_name.split("/")[-1], io_safe.read(db_name) 206 | if self.where(k, type_cast(v)): 207 | data[k] = v 208 | 209 | return type_cast(data) 210 | 211 | def session( 212 | self, as_type: Type[T] = None 213 | ) -> SessionFileFull[T] | SessionFileKey[T] | SessionFileWhere[T] | SessionDirFull[T] | SessionDirWhere[T]: 214 | """ 215 | Opens a session to the selected file(s) or folder, depending on previous 216 | `.at(...)` selection. Inside the with block, you have exclusive access 217 | to the file(s) or folder. 218 | Call `session.write()` to write the data to the file(s) or folder. 219 | 220 | Args: 221 | - `as_type`: If provided, cast the value to the given type. 222 | E.g. as_type=str will return str(value). 223 | 224 | Raises: 225 | - `FileNotFoundError`: If the file does not exist. 226 | - `KeyError`: If a key is specified and it does not exist.
227 | 228 | Returns: 229 | - Tuple of (session_object, data) 230 | """ 231 | if self.op_type.file_normal: 232 | return SessionFileFull(self.path, as_type) 233 | if self.op_type.file_key: 234 | return SessionFileKey(self.path, self.key, as_type) 235 | if self.op_type.file_where: 236 | return SessionFileWhere(self.path, self.where, as_type) 237 | if self.op_type.dir_normal: 238 | return SessionDirFull(self.path, as_type) 239 | if self.op_type.dir_where: 240 | return SessionDirWhere(self.path, self.where, as_type) 241 | -------------------------------------------------------------------------------- /dictdatabase/sessions.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from contextlib import contextmanager 4 | from typing import Any, Callable, Generic, Tuple, TypeVar 5 | 6 | from . import io_unsafe, locking, utils 7 | 8 | T = TypeVar("T") 9 | JSONSerializable = TypeVar("JSONSerializable", str, int, float, bool, None, list, dict) 10 | 11 | 12 | def type_cast(obj, as_type): 13 | return obj if as_type is None else as_type(obj) 14 | 15 | 16 | class SessionBase: 17 | in_session: bool 18 | db_name: str 19 | as_type: T 20 | 21 | def __init__(self, db_name: str, as_type): 22 | self.in_session = False 23 | self.db_name = db_name 24 | self.as_type = as_type 25 | 26 | def __enter__(self): 27 | self.in_session = True 28 | self.data_handle = {} 29 | 30 | def __exit__(self, type, value, tb): 31 | write_lock = getattr(self, "write_lock", None) 32 | if write_lock is not None: 33 | if isinstance(write_lock, list): 34 | for lock in write_lock: 35 | lock._unlock() 36 | else: 37 | write_lock._unlock() 38 | self.write_lock, self.in_session = None, False 39 | 40 | def write(self): 41 | if not self.in_session: 42 | raise PermissionError("Only call write() inside a with statement.") 43 | 44 | 45 | @contextmanager 46 | def safe_context(super, self, *, db_names_to_lock=None): 47 | """ 48 | If an exception happens in the context, the __exit__ method of the passed super 49 | class will be called. 50 | """ 51 | super.__enter__() 52 | try: 53 | if isinstance(db_names_to_lock, str): 54 | self.write_lock = locking.WriteLock(self.db_name) 55 | self.write_lock._lock() 56 | elif isinstance(db_names_to_lock, list): 57 | self.write_lock = [locking.WriteLock(x) for x in self.db_name] 58 | for lock in self.write_lock: 59 | lock._lock() 60 | yield 61 | except BaseException as e: 62 | super.__exit__(type(e), e, e.__traceback__) 63 | raise e 64 | 65 | 66 | ######################################################################################## 67 | #### File sessions 68 | ######################################################################################## 69 | 70 | 71 | class SessionFileFull(SessionBase, Generic[T]): 72 | """ 73 | Context manager for read-write access to a full file. 74 | 75 | Efficiency: 76 | Reads and writes the entire file. 77 | """ 78 | 79 | def __enter__(self) -> Tuple[SessionFileFull, JSONSerializable | T]: 80 | with safe_context(super(), self, db_names_to_lock=self.db_name): 81 | self.data_handle = io_unsafe.read(self.db_name) 82 | return self, type_cast(self.data_handle, self.as_type) 83 | 84 | def write(self): 85 | super().write() 86 | io_unsafe.write(self.db_name, self.data_handle) 87 | 88 | 89 | class SessionFileKey(SessionBase, Generic[T]): 90 | """ 91 | Context manager for read-write access to a single key-value item in a file. 
92 | 93 | Efficiency: 94 | Uses partial reading, which allows only reading the bytes of the key-value item. 95 | When writing, only the bytes of the key-value and the bytes of the file after 96 | the key-value are written. 97 | """ 98 | 99 | def __init__(self, db_name: str, key: str, as_type: T): 100 | super().__init__(db_name, as_type) 101 | self.key = key 102 | 103 | def __enter__(self) -> Tuple[SessionFileKey, JSONSerializable | T]: 104 | with safe_context(super(), self, db_names_to_lock=self.db_name): 105 | self.partial_handle = io_unsafe.get_partial_file_handle(self.db_name, self.key) 106 | self.data_handle = self.partial_handle.partial_dict.value 107 | return self, type_cast(self.data_handle, self.as_type) 108 | 109 | def write(self): 110 | super().write() 111 | io_unsafe.partial_write(self.partial_handle) 112 | 113 | 114 | class SessionFileWhere(SessionBase, Generic[T]): 115 | """ 116 | Context manager for read-write access to a selection of key-value items in a file. 117 | The where callable is called with the key and value of each item in the file. 118 | 119 | Efficiency: 120 | Reads and writes the entire file, so it is not more efficient than 121 | SessionFileFull. 122 | """ 123 | 124 | def __init__(self, db_name: str, where: Callable[[Any, Any], bool], as_type: T): 125 | super().__init__(db_name, as_type) 126 | self.where = where 127 | 128 | def __enter__(self) -> Tuple[SessionFileWhere, JSONSerializable | T]: 129 | with safe_context(super(), self, db_names_to_lock=self.db_name): 130 | self.original_data = io_unsafe.read(self.db_name) 131 | for k, v in self.original_data.items(): 132 | if self.where(k, v): 133 | self.data_handle[k] = v 134 | return self, type_cast(self.data_handle, self.as_type) 135 | 136 | def write(self): 137 | super().write() 138 | self.original_data.update(self.data_handle) 139 | io_unsafe.write(self.db_name, self.original_data) 140 | 141 | 142 | ######################################################################################## 143 | #### Directory sessions 144 | ######################################################################################## 145 | 146 | 147 | class SessionDirFull(SessionBase, Generic[T]): 148 | """ 149 | Context manager for read-write access to all files in a directory. 150 | They are provided as a dict of {str(file_name): dict(file_content)}, where the 151 | file name does not contain the directory name nor the file extension. 152 | 153 | Efficiency: 154 | Fully reads and writes all files. 155 | """ 156 | 157 | def __init__(self, db_name: str, as_type: T): 158 | super().__init__(utils.find_all(db_name), as_type) 159 | 160 | def __enter__(self) -> Tuple[SessionDirFull, JSONSerializable | T]: 161 | with safe_context(super(), self, db_names_to_lock=self.db_name): 162 | self.data_handle = {n.split("/")[-1]: io_unsafe.read(n) for n in self.db_name} 163 | return self, type_cast(self.data_handle, self.as_type) 164 | 165 | def write(self): 166 | super().write() 167 | for name in self.db_name: 168 | io_unsafe.write(name, self.data_handle[name.split("/")[-1]]) 169 | 170 | 171 | class SessionDirWhere(SessionBase, Generic[T]): 172 | """ 173 | Context manager for read-write access to a selection of files in a directory. 174 | The where callable is called with the file name and parsed content of each file. 175 | 176 | Efficiency: 177 | Fully reads all files, but only writes the selected files.
178 | """ 179 | 180 | def __init__(self, db_name: str, where: Callable[[Any, Any], bool], as_type: T): 181 | super().__init__(utils.find_all(db_name), as_type) 182 | self.where = where 183 | 184 | def __enter__(self) -> Tuple[SessionDirWhere, JSONSerializable | T]: 185 | with safe_context(super(), self): 186 | selected_db_names, write_lock = [], [] 187 | for db_name in self.db_name: 188 | lock = locking.WriteLock(db_name) 189 | lock._lock() 190 | k, v = db_name.split("/")[-1], io_unsafe.read(db_name) 191 | if self.where(k, v): 192 | self.data_handle[k] = v 193 | write_lock.append(lock) 194 | selected_db_names.append(db_name) 195 | else: 196 | lock._unlock() 197 | self.write_lock = write_lock 198 | self.db_name = selected_db_names 199 | return self, type_cast(self.data_handle, self.as_type) 200 | 201 | def write(self): 202 | super().write() 203 | for name in self.db_name: 204 | io_unsafe.write(name, self.data_handle[name.split("/")[-1]]) 205 | -------------------------------------------------------------------------------- /dictdatabase/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import glob 4 | import os 5 | from typing import Tuple 6 | 7 | from . import byte_codes, config 8 | 9 | 10 | def file_info(db_name: str) -> Tuple[str, bool, str, bool]: 11 | """ 12 | Returns a tuple of four elements, the first and third being the paths to the 13 | JSON and DDB files, and the second and fourth being booleans indicating whether 14 | those files exist: 15 | 16 | >>> (json_path, json_exists, ddb_path, ddb_exists) 17 | 18 | Args: 19 | - `db_name`: The name of the database 20 | """ 21 | base = f"{config.storage_directory}/{db_name}" 22 | j, d = f"{base}.json", f"{base}.ddb" 23 | return j, os.path.exists(j), d, os.path.exists(d) 24 | 25 | 26 | def file_exists(db_name: str) -> bool: 27 | """ 28 | Returns True if the given database exists, either as a JSON or DDB file. 29 | 30 | Args: 31 | - `db_name`: The name of the database 32 | """ 33 | base = f"{config.storage_directory}/{db_name}" 34 | j, d = f"{base}.json", f"{base}.ddb" 35 | return os.path.exists(j) or os.path.exists(d) 36 | 37 | 38 | def find_all(file_name: str) -> list[str]: 39 | """ 40 | Returns a list of all the database names that match the given glob file_name. 41 | 42 | Args: 43 | - `file_name`: The glob file_name to search for 44 | """ 45 | 46 | files_all = glob.glob(f"{config.storage_directory}/{file_name}.ddb") 47 | files_all += glob.glob(f"{config.storage_directory}/{file_name}.json") 48 | 49 | for trim in [f"{config.storage_directory}/", ".ddb", ".json"]: 50 | files_all = [d.replace(trim, "") for d in files_all] 51 | return files_all 52 | 53 | 54 | def seek_index_through_value_bytes(json_bytes: bytes, index: int) -> int: 55 | """ 56 | Finds the index of the next comma or closing bracket/brace after the value 57 | of a key-value pair in a bytes object containing valid JSON when decoded. 58 | 59 | Args: 60 | - `json_bytes`: A bytes object containing valid JSON when decoded 61 | - `index`: The start index in json_bytes 62 | 63 | Returns: 64 | - The end index of the value.
65 | """ 66 | 67 | # TODO: Try to implement this using bytes.find() instead of a loop 68 | # This make count_nesting a lot faster 69 | 70 | # See https://www.json.org/json-en.html for the JSON syntax 71 | 72 | list_depth, dict_depth, i, len_json_bytes = 0, 0, index, len(json_bytes) 73 | 74 | while i < len_json_bytes: 75 | current = json_bytes[i] 76 | # If backslash, skip the next character 77 | if current == byte_codes.BACKSLASH: 78 | i += 1 79 | 80 | # Assert: the current character is not escaped with a backslash 81 | 82 | elif current == byte_codes.QUOTE: 83 | while True: 84 | i = json_bytes.find(byte_codes.QUOTE, i + 1) 85 | if i == -1: 86 | raise TypeError("Invalid JSON") 87 | 88 | j = i - 1 89 | backslash_count = 0 90 | while j >= 0 and json_bytes[j] == byte_codes.BACKSLASH: 91 | backslash_count += 1 92 | j -= 1 93 | if backslash_count % 2 == 0: 94 | # If the number of backslashes is even, the quote is not escaped 95 | break 96 | # Else, the quote is escaped, and the loop continues 97 | 98 | # Exit point where string ends and nesting is zero 99 | if list_depth == 0 and dict_depth == 0: 100 | return i + 1 101 | 102 | # Invariant: Not in_str, not escaped 103 | 104 | # Handle opening brackets 105 | elif current == byte_codes.OPEN_SQUARE: 106 | list_depth += 1 107 | elif current == byte_codes.OPEN_CURLY: 108 | dict_depth += 1 109 | # Handle closing brackets 110 | elif current == byte_codes.CLOSE_SQUARE: 111 | list_depth -= 1 112 | if list_depth == 0 and dict_depth <= 0: 113 | return i + 1 + dict_depth # dict_depth is -1 in case: {"a": {}} 114 | elif current == byte_codes.CLOSE_CURLY: 115 | dict_depth -= 1 116 | if dict_depth <= 0 and list_depth == 0: 117 | return i + 1 + dict_depth # dict_depth is -1 in case: {"a": {}} 118 | elif list_depth == 0: 119 | if dict_depth == -1: 120 | return i 121 | if dict_depth == 0 and current in [byte_codes.COMMA, byte_codes.NEWLINE]: 122 | # Handle commas and newline as exit points 123 | return i 124 | i += 1 125 | 126 | raise TypeError("Invalid JSON") 127 | 128 | 129 | def count_nesting_in_bytes(json_bytes: bytes, start: int, end: int) -> int: 130 | """ 131 | Returns the number of nesting levels. 132 | Considered bytes are from `start` inclusive to `end` exclusive. 133 | 134 | The nesting is counted by the number of opening and closing brackets/braces 135 | that are not in a string or escaped with a backslash. 136 | 137 | Args: 138 | - `json_bytes`: A bytes object containing valid JSON when decoded 139 | """ 140 | i, nesting = start, 0 141 | # Find the number of opening curly braces 142 | while (i := json_bytes.find(byte_codes.OPEN_CURLY, i, end)) != -1: 143 | if i == 0 or json_bytes[i - 1] != byte_codes.BACKSLASH: 144 | nesting += 1 145 | i += 1 146 | i = start 147 | # Find the number of closing curly braces 148 | while (i := json_bytes.find(byte_codes.CLOSE_CURLY, i, end)) != -1: 149 | if i == 0 or json_bytes[i - 1] != byte_codes.BACKSLASH: 150 | nesting -= 1 151 | i += 1 152 | return nesting 153 | 154 | 155 | def find_outermost_key_in_json_bytes(json_bytes: bytes, key: str) -> Tuple[int, int]: 156 | """ 157 | Returns the index of the key that is at the outermost nesting level. If the 158 | key is not found, return -1. If the key you are looking for is `some_key`, 159 | the function will search for `"some_key":` and return the start and end 160 | index of that string that is at the outermost nesting level, or -1 if the 161 | it is not found. 
162 | 163 | Args: 164 | - `json_bytes`: A bytes object containing valid JSON when decoded 165 | - `key`: The key of a key-value pair in `json_bytes` to search for, 166 | given as a string (it is encoded to bytes internally). 167 | 168 | Returns: 169 | - A tuple of the key start (inclusive) and end (exclusive) index, 170 | or `(-1, -1)` if the key is not found. 171 | """ 172 | 173 | # TODO: Very strict. The key must have a colon directly after it 174 | # For example {"a": 1} will work, but {"a" : 1} will not work! 175 | 176 | key = f'"{key}":'.encode() 177 | 178 | if (curr_i := json_bytes.find(key, 0)) == -1: 179 | return (-1, -1) 180 | 181 | # Assert: Key was found and curr_i is the index of the first character of the key 182 | 183 | # Keep track of all found keys and their nesting level 184 | key_nest = [(curr_i, count_nesting_in_bytes(json_bytes, 0, curr_i))] 185 | 186 | # As long as more keys are found, keep track of them and their nesting level 187 | while (next_i := json_bytes.find(key, curr_i + len(key))) != -1: 188 | nesting = count_nesting_in_bytes(json_bytes, curr_i + len(key), next_i) 189 | key_nest.append((next_i, nesting)) 190 | curr_i = next_i 191 | 192 | # Assert: all keys have been found, and their nesting relative to each other is 193 | # stored in key_nest, whose length is at least 1. 194 | 195 | # Early exit if there is only one key 196 | if len(key_nest) == 1: 197 | index, level = key_nest[0] 198 | return (index, index + len(key)) if level == 1 else (-1, -1) 199 | 200 | # Accumulate so each key's nesting level is relative to the start of json_bytes, not to the previous key 201 | for i in range(1, len(key_nest)): 202 | key_nest[i] = (key_nest[i][0], key_nest[i - 1][1] + key_nest[i][1]) 203 | 204 | # Filter out all keys that are not at the outermost nesting level 205 | indices_at_index_one = [i for i, level in key_nest if level == 1] 206 | if len(indices_at_index_one) != 1: 207 | return (-1, -1) 208 | return (indices_at_index_one[0], indices_at_index_one[0] + len(key)) 209 | 210 | 211 | def detect_indentation_in_json_bytes(json_bytes: bytes, index: int) -> Tuple[int, str]: 212 | """ 213 | Count the amount of whitespace before the index to determine the indentation 214 | level and whitespace used.
215 | 216 | Args: 217 | - `json_bytes`: A bytes object containing valid JSON when decoded 218 | - `index`: The index behind which the indentation is to be determined 219 | 220 | Returns: 221 | - A tuple of the indentation level and the whitespace used 222 | """ 223 | 224 | indentation_bytes, contains_tab = bytes(), False 225 | for i in range(index - 1, -1, -1): 226 | if json_bytes[i] not in [byte_codes.SPACE, byte_codes.TAB]: 227 | break 228 | if json_bytes[i] == byte_codes.TAB: 229 | contains_tab = True 230 | indentation_bytes = indentation_bytes + bytes([json_bytes[i]]) 231 | 232 | if contains_tab: 233 | return len(indentation_bytes), "\t" 234 | if isinstance(config.indent, int) and config.indent > 0: 235 | return len(indentation_bytes) // config.indent, " " * config.indent 236 | if isinstance(config.indent, str): 237 | return len(indentation_bytes) // 2, " " 238 | return 0, "" 239 | -------------------------------------------------------------------------------- /justfile: -------------------------------------------------------------------------------- 1 | default: 2 | @just --list 3 | 4 | alias t := test 5 | test: 6 | poetry run pytest --cov=dictdatabase --cov-report term-missing 7 | rm ./.coverage 8 | 9 | alias p := profiler 10 | profiler: 11 | poetry run python profiler.py 12 | 13 | alias bp := benchmark_parallel 14 | benchmark_parallel: 15 | poetry run python tests/benchmark/run_parallel.py 16 | 17 | alias bt := benchmark_threaded 18 | benchmark_threaded: 19 | poetry run python tests/benchmark/run_threaded.py 20 | 21 | alias ba := benchmark_async 22 | benchmark_async: 23 | poetry run python tests/benchmark/run_async.py 24 | 25 | publish: 26 | uv build 27 | uv publish 28 | rm -rf dist 29 | rm -rf dictdatabase.egg-info 30 | -------------------------------------------------------------------------------- /profiler.py: -------------------------------------------------------------------------------- 1 | from distutils.command.config import config 2 | 3 | from path_dict import PathDict 4 | from pyinstrument import profiler 5 | 6 | import dictdatabase as DDB 7 | from dictdatabase import io_unsafe 8 | 9 | DDB.config.storage_directory = "./test_db/production_database" 10 | DDB.config.use_orjson = True 11 | DDB.config.indent = 2 12 | 13 | 14 | p = profiler.Profiler(interval=0.0001) 15 | with p: 16 | # fM44 is small 17 | # a2lU has many annotations 18 | # DDB.at("tasks", key="fM44").read(key="fM44", as_type=PathDict) 19 | for _ in range(10): 20 | with DDB.at("tasks", key="a2lU").session(as_type=PathDict) as (session, task): 21 | task["jay"] = lambda x: (x or 0) + 1 22 | session.write() 23 | # DDB.at("tasks_as_dir/*").read() 24 | 25 | 26 | p.open_in_browser(timeline=False) 27 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "dictdatabase" 3 | version = "2.5.1" 4 | description = "Easy-to-use database using dicts" 5 | readme = "README.md" 6 | authors = [{ name = "Marcel Kröker", email = "kroeker.marcel@gmail.com" }] 7 | license = { file = "LICENSE" } 8 | classifiers=[ 9 | "Programming Language :: Python :: 3", 10 | "License :: OSI Approved :: MIT License", 11 | "Operating System :: OS Independent", 12 | "Intended Audience :: Developers", 13 | "Programming Language :: Python", 14 | "Topic :: Software Development :: Libraries :: Python Modules" 15 | ] 16 | requires-python = ">=3.8,<3.14" 17 | dependencies = [ 18 | "orjson >= 3.9, <4.0", 19 | ] 
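# The only runtime dependency above is orjson, and whether it is actually used for
# (de)serialization is toggled at runtime. A minimal sketch, assuming only the
# configuration flags and calls already shown in profiler.py and tests/conftest.py;
# the "users" file name and the sample data are illustrative, not taken from the repo:
#
#   import dictdatabase as DDB
#   DDB.config.use_orjson = True   # serialize with orjson instead of the stdlib json
#   DDB.config.indent = 2
#   DDB.at("users").create({"u1": {"name": "Ada"}}, force_overwrite=True)
#   assert DDB.at("users", key="u1").read() == {"name": "Ada"}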
20 | 21 | 22 | [dependency-groups] 23 | dev = [ 24 | "super-py ~= 0.4.2", 25 | "pyinstrument ~= 4.4.0", 26 | "pytest-cov ~= 4.0.0", 27 | "path-dict ~= 3.0.4", 28 | "ruff>=0.11.6", 29 | ] 30 | 31 | 32 | [tool.setuptools] 33 | packages = ["dictdatabase"] 34 | license-files = [] # Workaround for https://github.com/astral-sh/uv/issues/9513 35 | 36 | 37 | [tool.uv] 38 | package = true 39 | 40 | 41 | [tool.ruff] 42 | show-fixes = true 43 | line-length = 120 44 | select = [ 45 | "ANN", # annotations 46 | "B", # bugbear 47 | "C", # comprehensions 48 | "E", # style errors 49 | "F", # flakes 50 | "I", # import sorting 51 | "M", # meta 52 | "N", # naming 53 | "U", # upgrade 54 | "W", # style warnings 55 | "YTT", # sys.version 56 | ] 57 | ignore = [ 58 | "E501", # line length 59 | "UP007", # use X | Y for union (not possible in python 3.8) 60 | "UP006", # Use typing.Tuple for python 3.8 support 61 | "W191", # indentation contains tabs 62 | "E741", # ambiguous variable name 63 | ] 64 | 65 | 66 | [tool.ruff.format] 67 | indent-style = "tab" 68 | quote-style = "double" 69 | -------------------------------------------------------------------------------- /scenario_comparison.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | from pathlib import Path 4 | 5 | from pyinstrument import profiler 6 | 7 | import dictdatabase as DDB 8 | 9 | DDB.config.storage_directory = ".ddb_scenario_comparison" 10 | Path(DDB.config.storage_directory).mkdir(exist_ok=True) 11 | 12 | 13 | # Create a database with 10_000 entries 14 | all_users = {} 15 | for i in range(10_000): 16 | print(i) 17 | user = { 18 | "id": "".join(random.choices("abcdefghijklmnopqrstuvwxyz0123456789", k=8)), 19 | "name": "".join(random.choices("abcdefghijklmnopqrstuvwxyz", k=5)), 20 | "surname": "".join(random.choices("abcdefghijklmnopqrstuvwxyz", k=20)), 21 | "description": "".join(random.choices('abcdefghij"klmnopqrst😁uvwxyz\\ ', k=5000)), 22 | "age": random.randint(0, 100), 23 | } 24 | all_users[user["id"]] = user 25 | DDB.at("users_dir", user["id"]).create(user) 26 | DDB.at("users").create(all_users) 27 | 28 | 29 | ################################################################################ 30 | #### Test read from directory 31 | 32 | 33 | # 06.11.22: 2695ms 34 | t1 = time.monotonic() 35 | with profiler.Profiler() as p: 36 | DDB.at("users_dir/*").read() 37 | p.open_in_browser() 38 | print("Read all users from directory:", time.monotonic() - t1) 39 | 40 | 41 | ################################################################################ 42 | #### Test read from single file 43 | 44 | 45 | # 06.11.22: 181ms 46 | t1 = time.monotonic() 47 | DDB.at("users").read() 48 | print("Read all users from single file:", time.monotonic() - t1) 49 | -------------------------------------------------------------------------------- /scene_random_writes.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from pyinstrument.profiler import Profiler 4 | 5 | import dictdatabase as DDB 6 | 7 | user_count = 100_000 8 | 9 | # all_users = {} 10 | # for i in range(user_count): 11 | 12 | # user = { 13 | # "id": str(i), 14 | # "pref": "".join(random.choices("abcdefghijklmnopqrstuvwxyz0123456789", k=8)), 15 | # "name": "".join(random.choices("abcdefghijklmnopqrstuvwxyz", k=5)), 16 | # "surname": "".join(random.choices("abcdefghijklmnopqrstuvwxyz", k=20)), 17 | # "description": "".join(random.choices("abcdefghij\"klmnopqrst😁uvwxyz\\ ", k=5000)), 18 | 
# "age": random.randint(0, 100), 19 | # } 20 | # all_users[str(i)] = user 21 | # DDB.at("users").create(all_users, force_overwrite=True) 22 | 23 | print("Users created") 24 | 25 | p = Profiler(interval=0.0001) 26 | p.start() 27 | for it in range(500): 28 | print(it) 29 | user_id = str(random.randint(user_count - 100, user_count - 1)) 30 | with DDB.at("users", key=user_id).session() as (session, user): 31 | user["age"] += 1 32 | session.write() 33 | p.stop() 34 | p.open_in_browser(timeline=False) 35 | -------------------------------------------------------------------------------- /test_key_finder.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from dictdatabase import utils 4 | 5 | test_dict = { 6 | "b": 2, 7 | "c": { 8 | "a": 1, 9 | "b": 2, 10 | }, 11 | "d": { 12 | "a": 1, 13 | "b": 2, 14 | }, 15 | "a": 1, 16 | } 17 | 18 | json_str = json.dumps(test_dict, indent=2, sort_keys=False) 19 | json_bytes = json_str.encode() 20 | 21 | index = utils.find_outermost_key_in_json_bytes(json_bytes, "a") 22 | 23 | print("lel") 24 | print(index) 25 | print(json_bytes[index[0] : index[1]]) 26 | 27 | 28 | print(b"00111000".find(b"111", 0, 20)) 29 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mkrd/DictDataBase/12e650460c9284f8cd1249d26b16c18c04445691/tests/__init__.py -------------------------------------------------------------------------------- /tests/benchmark/locking.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from pathlib import Path 3 | 4 | from pyinstrument import profiler 5 | 6 | import dictdatabase as DDB 7 | from dictdatabase import locking 8 | 9 | DDB.config.storage_directory = "./.benchmark_locking" 10 | path = Path(DDB.config.storage_directory) 11 | path.mkdir(exist_ok=True, parents=True) 12 | 13 | 14 | # 05.11.22: 4520ms 15 | # 25.11.22: 4156ms 16 | with profiler.Profiler() as p: 17 | for _ in range(25_000): 18 | l = locking.ReadLock("db") 19 | l._lock() 20 | l._unlock() 21 | p.open_in_browser() 22 | 23 | 24 | # 05.11.22: 4884ms 25 | # 25.11.22: 4159ms 26 | with profiler.Profiler() as p: 27 | for _ in range(25_000): 28 | l = locking.WriteLock("db") 29 | l._lock() 30 | l._unlock() 31 | p.open_in_browser() 32 | 33 | 34 | l = locking.WriteLock("db/test.some") 35 | l._lock() 36 | 37 | 38 | shutil.rmtree(DDB.config.storage_directory) 39 | -------------------------------------------------------------------------------- /tests/benchmark/parallel_appends.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | import time 5 | from calendar import c 6 | from multiprocessing import Pool 7 | 8 | from pyinstrument import Profiler 9 | from utils import db_job, make_table, print_and_assert_results 10 | 11 | import dictdatabase as DDB 12 | 13 | 14 | def proc_job(id, n): 15 | DDB.config.storage_directory = "./.ddb_bench_parallel" 16 | DDB.locking.SLEEP_TIMEOUT = 0.001 17 | for _ in range(n): 18 | t1 = time.monotonic_ns() 19 | with DDB.at("append_here").session() as (session, db): 20 | if len(db) == 0: 21 | db += [ 22 | { 23 | "counter": 0, 24 | "firstname": "John", 25 | "lastname": "Doe", 26 | "age": 42, 27 | "address": "1234 Main St", 28 | "city": "Anytown", 29 | "state": "CA", 30 | "zip": "12345", 31 | "phone": "123-456-7890", 32 | 
"interests": ["Python", "Databases", "DDB", "DDB-CLI", "DDB-Web", "Google"], 33 | } 34 | ] * 50000 35 | else: 36 | db.append({**db[-1], "counter": db[-1]["counter"] + 1}) 37 | session.write() 38 | time.sleep(0.5) 39 | 40 | vis = "🔴" * (id + 1) 41 | print(f"{(time.monotonic_ns() - t1) / 1e6:.2f} ms {vis}") 42 | 43 | 44 | def proc_read_job(id, n): 45 | DDB.config.storage_directory = "./.ddb_bench_parallel" 46 | DDB.locking.SLEEP_TIMEOUT = 0.001 47 | for _ in range(n): 48 | t1 = time.monotonic_ns() 49 | DDB.at("append_here").read() 50 | vis = "🟢" * (id + 1) 51 | print(f"{(time.monotonic_ns() - t1) / 1e6:.2f} ms {vis}") 52 | 53 | 54 | if __name__ == "__main__": 55 | proc_count = 2 56 | per_proc = 100 57 | DDB.config.storage_directory = "./.ddb_bench_parallel" 58 | # Create Tables 59 | DDB.at("append_here").create([], force_overwrite=True) 60 | # Execute process pool running incrementor as the target task 61 | t1 = time.monotonic() 62 | pool = Pool(processes=proc_count * 2) 63 | for i in range(proc_count): 64 | pool.apply_async( 65 | proc_job, 66 | args=( 67 | i, 68 | per_proc, 69 | ), 70 | ) 71 | pool.apply_async( 72 | proc_read_job, 73 | args=( 74 | i, 75 | per_proc, 76 | ), 77 | ) 78 | pool.close() 79 | pool.join() 80 | print(f"⏱️ {time.monotonic() - t1} seconds") 81 | -------------------------------------------------------------------------------- /tests/benchmark/run_async.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import shutil 4 | import time 5 | 6 | from utils import incrementor, print_and_assert_results 7 | 8 | import dictdatabase as DDB 9 | 10 | 11 | async def thread_job(i, n, file_count): 12 | DDB.locking.SLEEP_TIMEOUT = 0.001 13 | incrementor(i, n, file_count) 14 | 15 | 16 | async def threaded_stress(file_count=2, thread_count=10, per_thread=500): 17 | # Create file_count json files 18 | for t in range(file_count): 19 | DDB.at(f"incr{t}").create({"counter": 0}, force_overwrite=True) 20 | 21 | # Create tasks for concurrent execution 22 | tasks = [(incrementor, (i, per_thread, file_count)) for i in range(thread_count)] 23 | 24 | # Execute process pool running incrementor as the target task 25 | t1 = time.monotonic() 26 | await asyncio.gather(*[thread_job(i, per_thread, file_count) for i in range(thread_count)]) 27 | t2 = time.monotonic() 28 | 29 | print_and_assert_results(thread_count, per_thread, file_count, t1, t2) 30 | 31 | 32 | if __name__ == "__main__": 33 | DDB.config.storage_directory = ".ddb_bench_async" 34 | try: 35 | shutil.rmtree(".ddb_bench_async", ignore_errors=True) 36 | os.mkdir(".ddb_bench_async") 37 | loop = asyncio.get_event_loop() 38 | loop.run_until_complete(threaded_stress()) 39 | loop.close() 40 | finally: 41 | shutil.rmtree(".ddb_bench_async", ignore_errors=True) 42 | -------------------------------------------------------------------------------- /tests/benchmark/run_big_file.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | 4 | import dictdatabase as DDB 5 | 6 | 7 | def make_random_posts(count): 8 | posts = {} 9 | for _ in range(count): 10 | id = str(random.randint(0, 999_999_999)) 11 | title_length = random.randint(10, 100) 12 | content_length = random.randint(200, 500) 13 | posts[id] = { 14 | "id": id, 15 | "title": "".join(random.choices(" abcdefghijklmnopqrstuvwxyz,.", k=title_length)), 16 | "content": "".join(random.choices(" abcdefghijklmnopqrstuvwxyz,.", k=content_length)), 17 | } 18 | return posts 19 | 
20 | 21 | def make_users(count): 22 | all_users = {} 23 | for i in range(count): 24 | all_users[str(i)] = { 25 | "id": str(i), 26 | "name": "".join(random.choices("abcdefghijklmnopqrstuvwxyz", k=5)), 27 | "surname": "".join(random.choices("abcdefghijklmnopqrstuvwxyz", k=20)), 28 | "age": random.randint(20, 80), 29 | "posts": make_random_posts(random.randint(200, 300)), 30 | } 31 | return all_users 32 | 33 | 34 | def read_specific_users(): 35 | accessed_users = sorted([str(i * 100) for i in range(100)], key=lambda x: random.random()) 36 | t1 = time.monotonic() 37 | for user_id in accessed_users: 38 | print(f"Accessing user {user_id}") 39 | u = DDB.at("big_users", key=user_id).read() 40 | print(f"User {user_id} has {len(u['posts'])} posts and is {u['age']} years old") 41 | t2 = time.monotonic() 42 | print(f"Time taken: {(t2 - t1) * 1000}ms") 43 | 44 | 45 | def write_specific_users(): 46 | accessed_users = sorted([str(i * 100) for i in range(100)], key=lambda x: random.random()) 47 | t1 = time.monotonic() 48 | for user_id in accessed_users: 49 | print(f"Accessing user {user_id}") 50 | 51 | with DDB.at("big_users", key=user_id).session() as (session, user): 52 | user["surname"] = "".join(random.choices("abcdefghijklmnopqrstuvwxyz", k=random.randint(3, 50))) 53 | session.write() 54 | t2 = time.monotonic() 55 | print(f"Time taken: {(t2 - t1) * 1000}ms") 56 | 57 | 58 | def random_access_users(write_read_ratio=0.1, count=500): 59 | accessed_users = [str(i * 100) for i in [random.randint(0, 99) for _ in range(count)]] 60 | t1 = time.monotonic() 61 | for user_id in accessed_users: 62 | if random.random() < write_read_ratio: 63 | with DDB.at("big_users", key=user_id).session() as (session, user): 64 | user["surname"] = "".join(random.choices("abcdefghijklmnopqrstuvwxyz", k=random.randint(3, 50))) 65 | session.write() 66 | print(f"Accessed user {user_id} for writing") 67 | else: 68 | u = DDB.at("big_users", key=user_id).read() 69 | print(f"User {user_id} has {len(u['posts'])} posts and is {u['age']} years old") 70 | 71 | t2 = time.monotonic() 72 | print(f"Time taken: {t2 - t1}s") 73 | 74 | 75 | # DDB.at("big_users").create(make_users(20_000), force_overwrite=True) # 2500MB 76 | 77 | # random_access_users() 78 | # write_specific_users() 79 | read_specific_users() 80 | -------------------------------------------------------------------------------- /tests/benchmark/run_parallel.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | import shutil 5 | import time 6 | from calendar import c 7 | from dataclasses import dataclass 8 | from multiprocessing import Pool 9 | 10 | from path_dict import PathDict 11 | 12 | import dictdatabase as DDB 13 | 14 | DDB.config.storage_directory = ".ddb_bench_multi" 15 | 16 | 17 | def benchmark(iterations, setup: callable = None): 18 | def decorator(function): 19 | def wrapper(*args, **kwargs): 20 | f_name = function.__name__ 21 | kwargs["name"] = f_name 22 | if setup: 23 | setup(kwargs) 24 | t1 = time.monotonic() 25 | for _ in range(iterations): 26 | function(*args, **kwargs) 27 | t2 = time.monotonic() 28 | print(f"⏱️ {iterations / (t2 - t1):.1f} op/s for {f_name} ({(t2 - t1):.1f} seconds)") 29 | 30 | return wrapper 31 | 32 | return decorator 33 | 34 | 35 | @benchmark(iterations=9000, setup=lambda kw: DDB.at(kw["name"]).create({"data": {"counter": 0}}, force_overwrite=True)) 36 | def sequential_full_read_small_file(name): 37 | DDB.at(name).read() 38 | 39 | 40 | @benchmark(iterations=8000, 
setup=lambda kw: DDB.at(kw["name"]).create({"data": {"counter": 0}}, force_overwrite=True)) 41 | def sequential_partial_read_small_file(name): 42 | DDB.at(name, key="data").read() 43 | 44 | 45 | @benchmark(iterations=8000, setup=lambda kw: DDB.at(kw["name"]).create({"data": {"counter": 0}}, force_overwrite=True)) 46 | def sequential_full_write_small_file(name): 47 | with DDB.at(name).session() as (session, db): 48 | db["data"]["counter"] += 1 49 | session.write() 50 | 51 | 52 | @benchmark(iterations=6000, setup=lambda kw: DDB.at(kw["name"]).create({"data": {"counter": 0}}, force_overwrite=True)) 53 | def sequential_partial_write_small_file(name): 54 | with DDB.at(name, key="data").session() as (session, db): 55 | db["counter"] += 1 56 | session.write() 57 | 58 | 59 | @dataclass 60 | class Scenario: 61 | files: int = 1 62 | readers: int = 0 63 | writers: int = 0 64 | big_file: bool = False 65 | use_compression: bool = False 66 | ops: int = 10 67 | 68 | def print(self): 69 | res = f"✨ Scenario: {'🔹' * self.readers}{'🔻' * self.writers} ({self.readers}r{self.writers}w)" 70 | res += ", 🔸 compression" if self.use_compression else "" 71 | res += ", 💎 big file" if self.big_file else "" 72 | print(res) 73 | 74 | 75 | def print_and_assert_results(scenario: Scenario, t): 76 | ops = (scenario.writers + scenario.readers) * scenario.ops * scenario.files 77 | ops_sec = f"{(ops / t):.0f}" 78 | s = f"⏱️ {ops_sec} op/s ({ops} in {t:.2f}s)" 79 | print(str.ljust(s, 32), end="") 80 | for t in range(scenario.files): 81 | db = DDB.at(f"incr{t}").read() 82 | if db["counter"]["counter"] != scenario.ops * scenario.writers: 83 | print("❌", db["counter"]["counter"], "!=", scenario.ops * scenario.writers) 84 | assert db["counter"]["counter"] == scenario.ops * scenario.writers 85 | 86 | 87 | def process_job(mode, scenario, cfg): 88 | DDB.config = cfg 89 | DDB.locking.SLEEP_TIMEOUT = 0.001 90 | 91 | t1 = time.monotonic() 92 | for _ in range(scenario.ops): 93 | for t in sorted(range(scenario.files), key=lambda _: random.random()): 94 | if mode == "r": 95 | DDB.at(f"incr{t}", key="counter").read() 96 | 97 | elif mode == "w": 98 | with DDB.at(f"incr{t}", key="counter").session(as_type=PathDict) as (session, d): 99 | d.at("counter").set(d.at("counter").get() + 1) 100 | session.write() 101 | t2 = time.monotonic() 102 | return t2 - t1 103 | 104 | 105 | def parallel_stressor(scenario: Scenario): 106 | DDB.config.use_compression = scenario.use_compression 107 | # Create Tables 108 | for t in range(scenario.files): 109 | if scenario.big_file: 110 | with open(os.path.join(os.getcwd(), "test_db/production_database/tasks.json"), "r") as f: 111 | db = json.loads(f.read()) 112 | db["counter"] = {"counter": 0} 113 | else: 114 | db = {"counter": {"counter": 0}} 115 | DDB.at(f"incr{t}").create(db, force_overwrite=True) 116 | 117 | # Execute process pool running incrementor as the target task 118 | res = [] 119 | pool = Pool(processes=scenario.readers + scenario.writers) 120 | for mode in "w" * scenario.writers + "r" * scenario.readers: 121 | res.append(pool.apply_async(process_job, args=(mode, scenario, DDB.config))) 122 | pool.close() 123 | pool.join() 124 | 125 | total_time = sum(r.get() for r in res) / (scenario.readers + scenario.writers) 126 | print_and_assert_results(scenario, total_time) 127 | 128 | 129 | scenarios = [ 130 | Scenario(readers=1, ops=6000), 131 | Scenario(readers=2, ops=6000), 132 | Scenario(readers=4, ops=6000), 133 | Scenario(readers=8, ops=3000), 134 | Scenario(writers=1, ops=6000), 135 | Scenario(writers=2, 
ops=1000), 136 | Scenario(writers=4, ops=800), 137 | Scenario(writers=8, ops=200), 138 | Scenario(readers=20, writers=20, ops=30), 139 | Scenario(readers=8, ops=1500), 140 | Scenario(readers=8, ops=1500, use_compression=True), 141 | Scenario(readers=8, ops=1500, big_file=True), 142 | Scenario(readers=8, writers=1, ops=200), 143 | Scenario(readers=8, writers=1, ops=25, big_file=True), 144 | Scenario(readers=1, writers=8, ops=200), 145 | Scenario(readers=1, writers=8, ops=10, big_file=True), 146 | Scenario(readers=8, writers=8, ops=100), 147 | Scenario(readers=8, writers=8, ops=8, big_file=True), 148 | ] 149 | 150 | if __name__ == "__main__": 151 | # print("✨ Simple sequential benchmarks") 152 | # sequential_full_read_small_file() 153 | # sequential_partial_read_small_file() 154 | # sequential_full_write_small_file() 155 | # sequential_partial_write_small_file() 156 | 157 | # Parallel benchmarks 158 | for scenario in scenarios: 159 | try: 160 | shutil.rmtree(".ddb_bench_multi", ignore_errors=True) 161 | os.mkdir(".ddb_bench_multi") 162 | parallel_stressor(scenario) 163 | scenario.print() 164 | finally: 165 | shutil.rmtree(".ddb_bench_multi", ignore_errors=True) 166 | -------------------------------------------------------------------------------- /tests/benchmark/run_parallel_multi.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | import threading 5 | import time 6 | from calendar import c 7 | from multiprocessing import Pool 8 | 9 | from pyinstrument import Profiler 10 | from utils import print_and_assert_results 11 | 12 | import dictdatabase as DDB 13 | from dictdatabase.configuration import Confuguration 14 | 15 | 16 | def proc_job(n, cfg): 17 | DDB.config = cfg 18 | DDB.locking.SLEEP_TIMEOUT = 0.001 19 | 20 | for _ in range(n): 21 | with DDB.at("incr/*").session() as (session, d): 22 | for k, v in d.items(): 23 | v["counter"] += 1 24 | session.write() 25 | 26 | 27 | def parallel_stressor(file_count): 28 | # Create Tables 29 | for t in range(11): 30 | DDB.at("incr", t).create({"counter": 0}, force_overwrite=True) 31 | 32 | # Execute process pool running incrementor as the target task 33 | t1 = time.monotonic() 34 | res = [] 35 | pool = Pool(processes=file_count) 36 | for _ in range(file_count): 37 | r = pool.apply_async(proc_job, args=(1000, DDB.config)) 38 | res.append(r) 39 | pool.close() 40 | pool.join() 41 | t2 = time.monotonic() 42 | for r in res: 43 | print(r.get()) 44 | 45 | 46 | if __name__ == "__main__": 47 | DDB.config.storage_directory = ".ddb_bench_parallel" 48 | try: 49 | shutil.rmtree(".ddb_bench_parallel", ignore_errors=True) 50 | os.mkdir(".ddb_bench_parallel") 51 | parallel_stressor(4) 52 | finally: 53 | pass 54 | # shutil.rmtree(".ddb_bench_parallel", ignore_errors=True) 55 | -------------------------------------------------------------------------------- /tests/benchmark/run_threaded.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | import time 5 | 6 | import super_py as sp 7 | from utils import db_job, print_and_assert_results 8 | 9 | import dictdatabase as DDB 10 | 11 | 12 | def threaded_stressor(file_count, readers, writers, operations_per_thread, big_file, compression): 13 | # Create Tables 14 | for t in range(file_count): 15 | if big_file: 16 | with open(os.path.join(os.getcwd(), "test_db/production_database/tasks.json"), "r") as f: 17 | db = json.loads(f.read()) 18 | db["counter"] = 
{"counter": 0} 19 | else: 20 | db = {"counter": {"counter": 0}} 21 | DDB.at(f"incr{t}").create(db, force_overwrite=True) 22 | 23 | tasks = [(db_job, (mode, file_count, operations_per_thread)) for mode in "w" * writers + "r" * readers] 24 | 25 | # Execute process pool running incrementor as the target task 26 | t1 = time.monotonic() 27 | sp.concurrency.run_threaded(tasks, max_threads=writers + readers) 28 | t2 = time.monotonic() 29 | print_and_assert_results(readers, writers, operations_per_process, file_count, big_file, compression, t1, t2) 30 | 31 | 32 | if __name__ == "__main__": 33 | DDB.config.storage_directory = ".ddb_bench_threaded" 34 | operations_per_process = 4 35 | for file_count, readers, writers in [(1, 4, 4), (1, 8, 1), (1, 1, 8), (4, 8, 8)]: 36 | print("") 37 | print( 38 | f"✨ Scenario: {file_count} files, {readers} readers, {writers} writers, {operations_per_process} operations per process" 39 | ) 40 | for big_file, compression in [(False, False), (False, True), (True, False), (True, True)]: 41 | try: 42 | shutil.rmtree(".ddb_bench_threaded", ignore_errors=True) 43 | os.mkdir(".ddb_bench_threaded") 44 | threaded_stressor(file_count, readers, writers, operations_per_process, big_file, compression) 45 | finally: 46 | shutil.rmtree(".ddb_bench_threaded", ignore_errors=True) 47 | -------------------------------------------------------------------------------- /tests/benchmark/sequential_appends.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | import time 5 | from calendar import c 6 | from multiprocessing import Pool 7 | 8 | from pyinstrument import Profiler 9 | 10 | import dictdatabase as DDB 11 | 12 | 13 | def seq_job(n): 14 | DDB.at("db").create( 15 | [ 16 | { 17 | "counter": 0, 18 | "firstname": "John", 19 | "lastname": "Doe", 20 | "age": 42, 21 | "address": "1234 Main St", 22 | "city": "Anytown", 23 | "state": "CA", 24 | "zip": "12345", 25 | "phone": "123-456-7890", 26 | "interests": ["Python", "Databases", "DDB", "DDB-CLI", "DDB-Web", "Google"], 27 | } 28 | ] 29 | * 50000, 30 | force_overwrite=True, 31 | ) 32 | for _ in range(n): 33 | t1 = time.monotonic_ns() 34 | with DDB.at("db").session() as (session, db): 35 | db.append({**db[-1], "counter": db[-1]["counter"] + 1}) 36 | session.write() 37 | print(f"{(time.monotonic_ns() - t1) / 1e6:.2f} ms") 38 | 39 | 40 | if __name__ == "__main__": 41 | DDB.config.storage_directory = "./.ddb_bench_sequential" 42 | DDB.locking.SLEEP_TIMEOUT = 0.001 43 | DDB.config.use_orjson = True 44 | DDB.config.indent = 2 45 | 46 | p = Profiler(interval=0.00001) 47 | p.start() 48 | # Execute process pool running incrementor as the target task 49 | seq_job(20) 50 | p.stop() 51 | p.open_in_browser() 52 | -------------------------------------------------------------------------------- /tests/benchmark/sqlite/run.sh: -------------------------------------------------------------------------------- 1 | cd ./benchmarks/sqlite 2 | poetry run python3 test.py 3 | -------------------------------------------------------------------------------- /tests/benchmark/sqlite/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sqlite3 3 | import time 4 | 5 | import super_py as sp 6 | 7 | 8 | def teardown(): 9 | os.remove("test.db") 10 | 11 | 12 | @sp.test(teardown=teardown) 13 | def parallel_stress(tables=4, processes=16, per_process=128): 14 | # Create the database with all tables 15 | con = sqlite3.connect("test.db") 
16 | for t in range(tables): 17 | cur = con.cursor() 18 | cur.execute(f"CREATE TABLE IF NOT EXISTS incr{t} (counter INTEGER)") 19 | cur.execute(f"INSERT INTO incr{t} (counter) VALUES (0)") 20 | con.commit() 21 | con.close() 22 | 23 | # Run the incr_db function in parallel 24 | args = f"{tables} {processes} {per_process}" 25 | t1 = time.time() 26 | os.system(f"python3 test_parallel_runner.py {args}") 27 | t2 = time.time() 28 | 29 | ops = processes * per_process * tables 30 | ops_sec = int(ops / (t2 - t1)) 31 | print(f"{ops = }, {ops_sec = }, {tables = }, {processes = } {per_process = }") 32 | print(f"{t2 - t1 = }") 33 | 34 | for t in range(tables): 35 | con = sqlite3.connect("test.db") 36 | cur = con.cursor() 37 | cur.execute(f"SELECT counter FROM incr{t}") 38 | t_counter = cur.fetchone()[0] 39 | con.close() 40 | print(f"{t_counter = }, should be {processes * per_process}") 41 | assert t_counter == processes * per_process 42 | -------------------------------------------------------------------------------- /tests/benchmark/sqlite/test_parallel_runner.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import sys 3 | from multiprocessing import Pool 4 | 5 | 6 | def incr_db(n, tables): 7 | for _ in range(n): 8 | for t in range(tables): 9 | con = sqlite3.connect("test.db") 10 | cur = con.cursor() 11 | cur.execute(f"UPDATE incr{t} SET counter = counter + 1") 12 | con.commit() 13 | con.close() 14 | return True 15 | 16 | 17 | if __name__ == "__main__": 18 | tables = int(sys.argv[1]) 19 | processes = int(sys.argv[2]) 20 | per_process = int(sys.argv[3]) 21 | 22 | pool = Pool(processes=processes) 23 | for _ in range(processes): 24 | pool.apply_async( 25 | incr_db, 26 | args=( 27 | per_process, 28 | tables, 29 | ), 30 | ) 31 | pool.close() 32 | pool.join() 33 | -------------------------------------------------------------------------------- /tests/benchmark/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | 4 | from path_dict import pd 5 | 6 | import dictdatabase as DDB 7 | 8 | 9 | def make_table(recursion_depth=3, keys_per_level=50): 10 | d = {"key1": "val1", "key2": 2, "key3": [1, "2", [3, 3]]} 11 | for i in range(recursion_depth): 12 | d = {f"key{i}{j}": d for j in range(keys_per_level)} 13 | # print(f"Made table of size {len(json.dumps(d)) // 1e6}mb") 14 | return {"counter": {"counter": 0}, "big": d} 15 | 16 | 17 | def print_stats(i, durations): 18 | avg = f"{sum(durations) / len(durations):.0f}" 19 | median = f"{sorted(durations)[len(durations) // 2]:.0f}" 20 | min_t = f"{min(durations):.0f}" 21 | max_t = f"{max(durations):.0f}" 22 | 23 | # print(f"{i}: total: {len(durations)}, avg: {avg}ms (med: {median}), {min_t}-{max_t}ms") 24 | 25 | 26 | def print_and_assert_results(readers, writers, per_proc, tables, big_file, compression, t1, t2): 27 | ops = (writers + readers) * per_proc * tables 28 | ops_sec = f"{(ops / (t2 - t1)):.0f}" 29 | print(f"⏱️ {ops_sec} op/s ({ops} in {t2 - t1:.2f}s), {big_file = }, {compression = }") 30 | for t in range(tables): 31 | db = DDB.at(f"incr{t}").read() 32 | # print(db["counter"]["counter"], "==", per_proc * writers) 33 | assert db["counter"]["counter"] == per_proc * writers 34 | # print(f"✅ counter={db['counter']}") 35 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 
| import pytest 4 | 5 | import dictdatabase as DDB 6 | 7 | 8 | @pytest.fixture(autouse=True) 9 | def isolate_database_files(tmp_path: Path): 10 | DDB.config.storage_directory = str(tmp_path) 11 | 12 | 13 | @pytest.fixture(scope="function") 14 | def name_of_test(request): 15 | return request.function.__name__ 16 | 17 | 18 | @pytest.fixture(params=[True, False]) 19 | def use_compression(request): 20 | DDB.config.use_compression = request.param 21 | return request.param 22 | 23 | 24 | @pytest.fixture(params=[True, False]) 25 | def use_orjson(request): 26 | DDB.config.use_orjson = request.param 27 | return request.param 28 | 29 | 30 | @pytest.fixture(params=[None, 0, 2, "\t"]) 31 | def indent(request): 32 | DDB.config.indent = request.param 33 | return request.param 34 | -------------------------------------------------------------------------------- /tests/system_checks/test_clocks.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import threading 3 | import time 4 | 5 | 6 | def print_clocks(label: str) -> None: 7 | print(f"--- {label} ---") 8 | print("time_ns() :", time.time_ns()) 9 | print("monotonic_ns() :", time.monotonic_ns()) 10 | print("perf_counter_ns():", time.perf_counter_ns()) 11 | print("\n") 12 | 13 | 14 | def thread_function(thread_name: str) -> None: 15 | print_clocks(f"Thread-{thread_name}") 16 | 17 | 18 | def process_function(process_name: str) -> None: 19 | print_clocks(f"Process-{process_name}") 20 | 21 | 22 | if __name__ == "__main__": 23 | print_clocks("Main Thread") 24 | 25 | threads = [] 26 | for i in range(3): 27 | thread = threading.Thread(target=thread_function, args=(i,)) 28 | thread.start() 29 | threads.append(thread) 30 | 31 | for thread in threads: 32 | thread.join() 33 | 34 | processes = [] 35 | for i in range(3): 36 | process = multiprocessing.Process(target=process_function, args=(i,)) 37 | process.start() 38 | processes.append(process) 39 | 40 | for process in processes: 41 | process.join() 42 | -------------------------------------------------------------------------------- /tests/system_checks/test_monotonic_over_threads.py: -------------------------------------------------------------------------------- 1 | import queue 2 | import threading 3 | import time 4 | 5 | # Number of threads 6 | NUM_THREADS = 64 7 | 8 | # Define the clocks to test 9 | clocks = { 10 | "time ": time.time, 11 | "time_ns ": time.time_ns, 12 | "monotonic ": time.monotonic, 13 | "monotonic_ns ": time.monotonic_ns, 14 | "perf_counter ": time.perf_counter, 15 | "perf_counter_ns": time.perf_counter_ns, 16 | } 17 | 18 | # Queue to store timestamps in order 19 | timestamps = queue.Queue() 20 | 21 | 22 | def capture_time(i, clock_func: callable) -> None: 23 | # Capture time using the given clock function and put it in the queue 24 | for _ in range(1000): 25 | # print(f"Thread {i} capturing time") 26 | timestamps.put(clock_func()) 27 | 28 | 29 | def check_monotonicity_for_clock(clock_name: str, clock_func: callable) -> None: 30 | # Clear the queue for the next clock 31 | while not timestamps.empty(): 32 | timestamps.get() 33 | 34 | # Create and start threads 35 | threads = [] 36 | for i in range(NUM_THREADS): 37 | thread = threading.Thread( 38 | target=capture_time, 39 | args=( 40 | i, 41 | clock_func, 42 | ), 43 | ) 44 | thread.start() 45 | threads.append(thread) 46 | 47 | # Wait for all threads to complete 48 | for thread in threads: 49 | thread.join() 50 | 51 | # Extract timestamps from the queue 52 | captured_times = [] 53 | 
while not timestamps.empty(): 54 | captured_times.append(timestamps.get()) 55 | 56 | # Check if the clock is monotonic 57 | is_monotonic = all(captured_times[i] <= captured_times[i + 1] for i in range(len(captured_times) - 1)) 58 | 59 | if is_monotonic: 60 | print(f"Clock: {clock_name} is monotonic over {NUM_THREADS} threads ✅") 61 | else: 62 | print(f"Clock: {clock_name} is not monotonic over {NUM_THREADS} threads ❌") 63 | print("-" * 40) 64 | 65 | 66 | if __name__ == "__main__": 67 | # Check monotonicity for each clock 68 | for clock_name, clock_func in clocks.items(): 69 | check_monotonicity_for_clock(clock_name, clock_func) 70 | -------------------------------------------------------------------------------- /tests/system_checks/test_tick_rate.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | def get_tick_rate(clock_func: callable) -> float: 5 | start_time = time.time() 6 | measurements = [clock_func() for _ in range(2_000_000)] 7 | end_time = time.time() 8 | 9 | ticks = 0 10 | prev_value = measurements[0] 11 | for current_value in measurements[1:]: 12 | if current_value < prev_value: 13 | raise RuntimeError("Clock function is not monotonic") 14 | if current_value != prev_value: 15 | ticks += 1 16 | prev_value = current_value 17 | 18 | return ticks / (end_time - start_time) # ticks per second 19 | 20 | 21 | if __name__ == "__main__": 22 | clock_funcs = { 23 | "time ": time.time, 24 | "time_ns ": time.time_ns, 25 | "monotonic ": time.monotonic, 26 | "monotonic_ns ": time.monotonic_ns, 27 | "perf_counter ": time.perf_counter, 28 | "perf_counter_ns": time.perf_counter_ns, 29 | } 30 | 31 | for name, func in clock_funcs.items(): 32 | print(f"Tick rate for {name}: {get_tick_rate(func) / 1_000_000.0:.3f}M ticks/second") 33 | -------------------------------------------------------------------------------- /tests/test_at.py: -------------------------------------------------------------------------------- 1 | from dictdatabase.models import at 2 | 3 | 4 | def test_at(): 5 | assert at("x").path == "x" 6 | assert at("x", "y", "z").path == "x/y/z" 7 | assert at(["x", "y", "z"]).path == "x/y/z" 8 | assert at("x", ["y", "z"]).path == "x/y/z" 9 | assert at(["x", "y"], "z").path == "x/y/z" 10 | assert at(["x"], "y", "z").path == "x/y/z" 11 | assert at("x", ["y"], "z").path == "x/y/z" 12 | assert at("x", "y", ["z"]).path == "x/y/z" 13 | assert at("x", ["y"], ["z"]).path == "x/y/z" 14 | assert at(["x"], "y", ["z"]).path == "x/y/z" 15 | assert at(["x"], ["y"], "z").path == "x/y/z" 16 | assert at(["x"], ["y"], ["z"]).path == "x/y/z" 17 | -------------------------------------------------------------------------------- /tests/test_create.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | from path_dict import pd 5 | 6 | import dictdatabase as DDB 7 | from tests.utils import make_complex_nested_random_dict 8 | 9 | 10 | def test_create(use_compression, use_orjson, indent): 11 | DDB.at("create").create(force_overwrite=True) 12 | db = DDB.at("create").read() 13 | assert db == {} 14 | 15 | with DDB.at("create").session(as_type=pd) as (session, d): 16 | d["a", "b", "c"] = "😁" 17 | session.write() 18 | assert DDB.at("create").read() == {"a": {"b": {"c": "😁"}}} 19 | 20 | with pytest.raises(RuntimeError): 21 | DDB.at("create", where=lambda k, v: True).create(force_overwrite=True) 22 | 23 | with pytest.raises(RuntimeError): 24 | DDB.at("create", 
key="any").create(force_overwrite=True) 25 | 26 | 27 | def test_create_edge_cases(use_compression, use_orjson, indent): 28 | cases = [-2, 0.0, "", "x", [], {}, True] 29 | 30 | for i, c in enumerate(cases): 31 | DDB.at(f"tcec{i}").create(c, force_overwrite=True) 32 | assert DDB.at(f"tcec{i}").read() == c 33 | 34 | with pytest.raises(TypeError): 35 | DDB.at("tcec99").create(object(), force_overwrite=True) 36 | 37 | 38 | def test_nested_file_creation(use_compression, use_orjson, indent): 39 | n = DDB.at("nested/file/nonexistent").read() 40 | assert n is None 41 | db = make_complex_nested_random_dict(12, 6) 42 | DDB.at("nested/file/creation/test").create(db, force_overwrite=True) 43 | assert DDB.at("nested/file/creation/test").read() == db 44 | 45 | 46 | def test_create_same_file_twice(use_compression, use_orjson, indent): 47 | name = "test_create_same_file_twice" 48 | # Check that creating the same file twice must raise an error 49 | with pytest.raises(FileExistsError): 50 | DDB.at(name).create(force_overwrite=True) 51 | DDB.at(name).create() 52 | # Check that creating the same file twice with force_overwrite=True works 53 | DDB.at(f"{name}2").create(force_overwrite=True) 54 | DDB.at(f"{name}2").create(force_overwrite=True) 55 | -------------------------------------------------------------------------------- /tests/test_delete.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import dictdatabase as DDB 4 | 5 | 6 | def test_delete(use_compression, use_orjson, indent): 7 | DDB.at("test_delete").create({"a": 1}, force_overwrite=True) 8 | assert DDB.at("test_delete").read() == {"a": 1} 9 | DDB.at("test_delete").delete() 10 | assert DDB.at("test_delete").read() is None 11 | 12 | with pytest.raises(RuntimeError): 13 | DDB.at("test_delete", where=lambda k, v: True).delete() 14 | 15 | with pytest.raises(RuntimeError): 16 | DDB.at("test_delete", key="any").delete() 17 | 18 | 19 | def test_delete_nonexistent(use_compression, use_orjson, indent): 20 | DDB.at("test_delete_nonexistent").delete() 21 | -------------------------------------------------------------------------------- /tests/test_excepts.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from path_dict import pd 3 | 4 | import dictdatabase as DDB 5 | from dictdatabase import io_bytes, utils 6 | 7 | 8 | def test_except_during_open_session(use_compression, use_orjson, indent): 9 | name = "test_except_during_open_session" 10 | d = {"test": "value"} 11 | DDB.at(name).create(d, force_overwrite=True) 12 | with pytest.raises(RuntimeError): 13 | with DDB.at(name).session() as (session, test): 14 | raise RuntimeError("Any Exception") 15 | 16 | 17 | def test_except_on_save_unserializable(use_compression, use_orjson, indent): 18 | name = "test_except_on_save_unserializable" 19 | with pytest.raises(TypeError): 20 | d = {"test": "value"} 21 | DDB.at(name).create(d, force_overwrite=True) 22 | with DDB.at(name).session(as_type=pd) as (session, test): 23 | test["test"] = {"key": {1, 2}} 24 | session.write() 25 | 26 | 27 | def test_except_on_save_unserializable_in_multisession(use_compression, use_orjson, indent): 28 | name = "test_except_on_save_unserializable_in_multisession" 29 | with pytest.raises(TypeError): 30 | d = {"test": "value"} 31 | DDB.at(name, "1").create(d, force_overwrite=True) 32 | DDB.at(name, "2").create(d, force_overwrite=True) 33 | with DDB.at(name, "*").session(as_type=pd) as (session, test): 34 | test["1"]["test"] = 
{"key": {1, 2}} 35 | session.write() 36 | 37 | 38 | def test_except_on_session_in_session(use_compression, use_orjson, indent): 39 | name = "test_except_on_session_in_session" 40 | d = {"test": "value"} 41 | DDB.at(name).create(d, force_overwrite=True) 42 | with pytest.raises(RuntimeError): 43 | with DDB.at(name).session(as_type=pd) as (session, test): 44 | with DDB.at(name).session(as_type=pd) as (session2, test2): 45 | pass 46 | 47 | 48 | def test_except_on_write_outside_session(use_compression, use_orjson, indent): 49 | with pytest.raises(PermissionError): 50 | s = DDB.at("test_except_on_write_outside_session").session() 51 | s.write() 52 | 53 | 54 | def test_wildcard_and_subkey_except(use_compression, use_orjson, indent): 55 | with pytest.raises(TypeError): 56 | DDB.at("test_wildcard_and_subkey_except/*", key="key").read() 57 | 58 | 59 | def test_utils_invalid_json_except(): 60 | with pytest.raises(TypeError): 61 | utils.seek_index_through_value_bytes(b"{This is not { JSON", 0) 62 | 63 | 64 | def test_bytes_write_except(): 65 | # It is not allowed to specify a start index when compression is used. 66 | with pytest.raises(RuntimeError): 67 | DDB.config.use_compression = True 68 | io_bytes.write("any", b"any", start=1) 69 | -------------------------------------------------------------------------------- /tests/test_exists.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import dictdatabase as DDB 4 | 5 | 6 | def test_exists(use_compression, use_orjson, indent): 7 | DDB.at("test_exists").create({"a": 1}, force_overwrite=True) 8 | assert DDB.at("test_exists").exists() 9 | assert not DDB.at("test_exists/nonexistent").exists() 10 | assert DDB.at("test_exists", key="a").exists() 11 | assert not DDB.at("test_exists", key="b").exists() 12 | with pytest.raises(RuntimeError): 13 | DDB.at("test_exists", where=lambda k, v: True).exists() 14 | -------------------------------------------------------------------------------- /tests/test_indentation.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import orjson 4 | import pytest 5 | 6 | import dictdatabase as DDB 7 | from dictdatabase import config, io_bytes, io_unsafe, utils 8 | 9 | data = { 10 | "a": 1, 11 | "b": { 12 | "c": 2, 13 | "cl": [1, "\\"], 14 | "d": { 15 | "e": 3, 16 | "el": [1, "\\"], 17 | }, 18 | }, 19 | "l": [1, "\\"], 20 | } 21 | 22 | 23 | def string_dump(db: dict): 24 | if not config.use_orjson: 25 | return json.dumps(db, indent=config.indent, sort_keys=True).encode() 26 | option = (orjson.OPT_INDENT_2 if config.indent else 0) | orjson.OPT_SORT_KEYS 27 | return orjson.dumps(db, option=option) 28 | 29 | 30 | def test_indentation(use_compression, use_orjson, indent): 31 | DDB.at("test_indentation").create(data, force_overwrite=True) 32 | 33 | with DDB.at("test_indentation", key="b").session() as (session, db_b): 34 | db_b["c"] = 3 35 | session.write() 36 | data["b"]["c"] = 3 37 | 38 | assert io_bytes.read("test_indentation") == string_dump(data) 39 | 40 | # Accessing a key not at root level should raise an error 41 | with pytest.raises(KeyError): 42 | with DDB.at("test_indentation", key="d").session() as (session, db_d): 43 | session.write() 44 | assert io_bytes.read("test_indentation") == string_dump(data) 45 | -------------------------------------------------------------------------------- /tests/test_indexer.py: -------------------------------------------------------------------------------- 1 | import dictdatabase 
as DDB 2 | 3 | 4 | def test_indexer(use_compression, use_orjson, indent): 5 | DDB.at("test_indexer").create(force_overwrite=True, data={"a": {"e": 4}, "b": 2}) 6 | 7 | # Trigger create index entry for key "a" 8 | assert DDB.at("test_indexer", key="a").read() == {"e": 4} 9 | 10 | # Retrieve the index entry for key "a" by using the indexer 11 | with DDB.at("test_indexer", key="a").session() as (session, d): 12 | d["e"] = 5 13 | session.write() 14 | 15 | # Check that the index entry for key "a" has been updated 16 | assert DDB.at("test_indexer").read() == {"a": {"e": 5}, "b": 2} 17 | -------------------------------------------------------------------------------- /tests/test_io_bytes.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dictdatabase import io_bytes 4 | 5 | 6 | def test_write_bytes(name_of_test, use_compression): 7 | # No partial writing to compressed file allowed 8 | if use_compression: 9 | with pytest.raises(RuntimeError): 10 | io_bytes.write(name_of_test, b"test", start=5) 11 | return 12 | # Write shorter content at index 13 | io_bytes.write(name_of_test, b"0123456789") 14 | io_bytes.write(name_of_test, b"abc", start=2) 15 | assert io_bytes.read(name_of_test) == b"01abc" 16 | # Overwrite with shorter content 17 | io_bytes.write(name_of_test, b"xy") 18 | assert io_bytes.read(name_of_test) == b"xy" 19 | # Overwrite with longer content 20 | io_bytes.write(name_of_test, b"0123456789") 21 | io_bytes.write(name_of_test, b"abcdef", start=8) 22 | assert io_bytes.read(name_of_test) == b"01234567abcdef" 23 | # Write at index out of range 24 | io_bytes.write(name_of_test, b"01") 25 | io_bytes.write(name_of_test, b"ab", start=4) 26 | assert io_bytes.read(name_of_test) == b"01\x00\x00ab" 27 | 28 | 29 | def test_read_bytes(name_of_test, use_compression): 30 | io_bytes.write(name_of_test, b"0123456789") 31 | # In range 32 | assert io_bytes.read(name_of_test, start=2, end=5) == b"234" 33 | # Normal ranges 34 | assert io_bytes.read(name_of_test, start=0, end=10) == b"0123456789" 35 | assert io_bytes.read(name_of_test, start=2) == b"23456789" 36 | assert io_bytes.read(name_of_test, end=2) == b"01" 37 | assert io_bytes.read(name_of_test) == b"0123456789" 38 | # End out of range 39 | assert io_bytes.read(name_of_test, start=9, end=20) == b"9" 40 | # Completely out of range 41 | assert io_bytes.read(name_of_test, start=25, end=30) == b"" 42 | # Start negative 43 | if use_compression: 44 | assert io_bytes.read(name_of_test, start=-5, end=3) == b"" 45 | else: 46 | with pytest.raises(OSError): 47 | io_bytes.read(name_of_test, start=-5, end=3) 48 | -------------------------------------------------------------------------------- /tests/test_io_safe.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | import dictdatabase as DDB 6 | from dictdatabase import io_safe 7 | 8 | 9 | def test_read(use_compression, use_orjson, indent): 10 | # Elicit read error 11 | DDB.config.use_orjson = True 12 | with pytest.raises(json.decoder.JSONDecodeError): 13 | with open(f"{DDB.config.storage_directory}/corrupted_json.json", "w") as f: 14 | f.write("This is not JSON") 15 | io_safe.read("corrupted_json") 16 | 17 | 18 | def test_partial_read(use_compression, use_orjson, indent): 19 | assert io_safe.partial_read("nonexistent", key="none") is None 20 | 21 | 22 | def test_write(use_compression, use_orjson, indent): 23 | with pytest.raises(TypeError): 24 | io_safe.write("nonexistent", 
lambda x: x) 25 | 26 | 27 | def test_delete(use_compression, use_orjson, indent): 28 | DDB.at("to_be_deleted").create() 29 | DDB.at("to_be_deleted").delete() 30 | assert DDB.at("to_be_deleted").read() is None 31 | -------------------------------------------------------------------------------- /tests/test_locking.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import time 3 | 4 | import pytest 5 | 6 | from dictdatabase import locking 7 | 8 | 9 | def test_lock_release(): 10 | lock = locking.WriteLock("db_release") 11 | 12 | with lock: 13 | pass # Lock should be released here 14 | 15 | # Now, another lock should be able to be acquired 16 | with locking.WriteLock("db_release"): 17 | pass 18 | 19 | 20 | def test_read_lock_release(): 21 | read_lock = locking.ReadLock("test_db") 22 | write_lock = locking.WriteLock("test_db") 23 | 24 | # Acquire and release a read lock 25 | with read_lock: 26 | pass 27 | 28 | # Now attempt to acquire a write lock 29 | with write_lock: 30 | assert write_lock.has_lock is not None 31 | 32 | read_lock._unlock() 33 | write_lock._unlock() 34 | 35 | 36 | def test_double_lock_exception(use_compression): 37 | name = "test_double_lock_exception" 38 | with pytest.raises(RuntimeError): 39 | with locking.ReadLock(name): 40 | with locking.ReadLock(name): 41 | pass 42 | 43 | ls = locking.FileLocksSnapshot(locking.ReadLock(name).need_lock) 44 | assert len(ls.locks) == 0 45 | 46 | 47 | def test_get_lock_names(use_compression): 48 | lock = locking.ReadLock("db") 49 | lock._lock() 50 | 51 | ls = locking.FileLocksSnapshot(locking.ReadLock("none").need_lock) 52 | assert ls.locks == [] 53 | ls = locking.FileLocksSnapshot(lock.need_lock) 54 | assert len(ls.locks) == 1 55 | 56 | assert ls.locks[0].id == str(threading.get_native_id()) 57 | assert int(ls.locks[0].time_ns) >= int(lock.need_lock.time_ns) 58 | assert ls.locks[0].stage == "has" 59 | assert ls.locks[0].mode == "read" 60 | 61 | assert ls.any_has_locks 62 | assert not ls.any_write_locks 63 | assert not ls.any_has_write_locks 64 | 65 | lock._unlock() 66 | 67 | 68 | def test_lock_must_implement_lock_function(): 69 | class BadLock(locking.AbstractLock): 70 | mode = "read" 71 | 72 | lock = BadLock("db") 73 | with pytest.raises(NotImplementedError): 74 | lock._lock() 75 | 76 | 77 | def test_remove_orphaned_locks(): 78 | # SLEEP_TIMEOUT = 0.001 79 | # LOCK_KEEP_ALIVE_TIMEOUT = 0.001 80 | # REMOVE_ORPHAN_LOCK_TIMEOUT = 20.0 # Duration to wait before considering a lock as orphaned. 
81 | # AQUIRE_LOCK_TIMEOUT = 60.0 82 | 83 | prev = locking.AQUIRE_LOCK_TIMEOUT, locking.LOCK_KEEP_ALIVE_TIMEOUT, locking.REMOVE_ORPHAN_LOCK_TIMEOUT 84 | 85 | locking.AQUIRE_LOCK_TIMEOUT = 10.0 86 | locking.LOCK_KEEP_ALIVE_TIMEOUT = 1.0 87 | locking.REMOVE_ORPHAN_LOCK_TIMEOUT = 0.1 88 | lock = locking.ReadLock("test_remove_orphaned_locks") 89 | lock._lock() 90 | 91 | ls = locking.FileLocksSnapshot(lock.need_lock) 92 | assert len(ls.locks) >= 1 ## The one lock or two if currently in keep alive handover 93 | 94 | time.sleep(0.2) 95 | # Trigger the removal of orphaned locks 96 | ls = locking.FileLocksSnapshot(lock.need_lock) 97 | 98 | assert len(ls.locks) == 0 99 | 100 | lock._unlock() 101 | 102 | locking.AQUIRE_LOCK_TIMEOUT, locking.LOCK_KEEP_ALIVE_TIMEOUT, locking.REMOVE_ORPHAN_LOCK_TIMEOUT = prev 103 | 104 | 105 | def test_lock_keep_alive(): 106 | prev = locking.AQUIRE_LOCK_TIMEOUT, locking.LOCK_KEEP_ALIVE_TIMEOUT, locking.REMOVE_ORPHAN_LOCK_TIMEOUT 107 | 108 | locking.LOCK_KEEP_ALIVE_TIMEOUT = 0.1 109 | locking.ALIVE_LOCK_MAX_AGE = 0.5 110 | 111 | lock = locking.ReadLock("test_lock_keep_alive") 112 | 113 | with lock: 114 | time.sleep(1.0) 115 | 116 | locking.AQUIRE_LOCK_TIMEOUT, locking.LOCK_KEEP_ALIVE_TIMEOUT, locking.REMOVE_ORPHAN_LOCK_TIMEOUT = prev 117 | -------------------------------------------------------------------------------- /tests/test_parallel_crud.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | from multiprocessing import Manager, Process 4 | 5 | import dictdatabase as DDB 6 | 7 | 8 | def do_create(name_of_test: str, return_dict: dict, id_counter: dict, operations: dict) -> None: 9 | with DDB.at(name_of_test).session() as (session, db): 10 | key = f"{id_counter['id']}" 11 | db[key] = {"counter": 0} 12 | id_counter["id"] += 1 13 | operations["create"] += 1 14 | session.write() 15 | return_dict["created_ids"] += [key] 16 | 17 | 18 | def do_update(name_of_test: str, return_dict: dict, operations: dict) -> None: 19 | # increment a random counter 20 | with DDB.at(name_of_test).session() as (session, db): 21 | key = random.choice(return_dict["created_ids"]) 22 | db[key]["counter"] += 1 23 | operations["increment"] += 1 24 | session.write() 25 | 26 | 27 | def do_delete(name_of_test: str, return_dict: dict, operations: dict) -> None: 28 | # Delete a counter 29 | with DDB.at(name_of_test).session() as (session, db): 30 | key = random.choice(return_dict["created_ids"]) 31 | operations["increment"] -= db[key]["counter"] 32 | operations["delete"] += 1 33 | db.pop(key) 34 | return_dict["created_ids"] = [i for i in return_dict["created_ids"] if i != key] 35 | session.write() 36 | 37 | 38 | def do_read(name_of_test: str, return_dict: dict, operations: dict) -> None: 39 | # read a counter 40 | key = random.choice(return_dict["created_ids"]) 41 | DDB.at(name_of_test, key=key).read() 42 | operations["read"] += 1 43 | 44 | 45 | def worker_process(name_of_test: str, i: int, return_dict: dict, id_counter: dict) -> None: 46 | # Random seed to ensure each process gets different random numbers 47 | random.seed(i) 48 | DDB.config.storage_directory = ".ddb_bench_threaded" 49 | operations = { 50 | "create": 0, 51 | "increment": 0, 52 | "read": 0, 53 | "delete": 0, 54 | } 55 | 56 | for _ in range(1000): 57 | choice = random.random() 58 | if choice < 0.05: # 5% chance 59 | do_create(name_of_test, return_dict, id_counter, operations) 60 | elif choice < 0.30: # 25% chance 61 | do_update(name_of_test, return_dict, operations) 62 | 
elif choice < 0.33: # 3% chance 63 | do_delete(name_of_test, return_dict, operations) 64 | else: # 67% chance 65 | do_read(name_of_test, return_dict, operations) 66 | 67 | # Return the operations for this worker 68 | return_dict[i] = operations 69 | 70 | 71 | def test_multiprocessing_crud(name_of_test, use_compression, use_orjson): 72 | pre_fill_count = 500 73 | DDB.at(name_of_test).create({f"{i}": {"counter": 0} for i in range(pre_fill_count)}, force_overwrite=True) 74 | 75 | manager = Manager() 76 | return_dict = manager.dict() 77 | id_counter = manager.dict() 78 | id_counter["id"] = pre_fill_count 79 | return_dict["created_ids"] = [f"{i}" for i in range(pre_fill_count)] 80 | 81 | start_time = time.time() 82 | processes = [] 83 | for i in range(8): # Spawn 8 processes 84 | p = Process(target=worker_process, args=(name_of_test, i, return_dict, id_counter)) 85 | processes.append(p) 86 | p.start() 87 | 88 | for p in processes: 89 | p.join() 90 | 91 | print(return_dict) 92 | print("Duration", time.time() - start_time) 93 | 94 | db_state = DDB.at(name_of_test).read() 95 | 96 | logged_increment_ops = sum(x["increment"] for k, x in return_dict.items() if k != "created_ids") 97 | assert logged_increment_ops == sum(x["counter"] for x in db_state.values()) 98 | 99 | logged_create_ops = sum(x["create"] for k, x in return_dict.items() if k != "created_ids") 100 | logged_delete_ops = sum(x["delete"] for k, x in return_dict.items() if k != "created_ids") 101 | assert pre_fill_count + logged_create_ops - logged_delete_ops == len(db_state.keys()) 102 | -------------------------------------------------------------------------------- /tests/test_parallel_sessions.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.pool import Pool 2 | 3 | from path_dict import pd 4 | 5 | import dictdatabase as DDB 6 | 7 | 8 | def increment_counters(n, tables, cfg): 9 | DDB.config.storage_directory = cfg.storage_directory 10 | DDB.config.use_compression = cfg.use_compression 11 | DDB.config.use_orjson = cfg.use_orjson 12 | 13 | for _ in range(n): 14 | for t in range(tables): 15 | # Perform a counter increment 16 | with DDB.at(f"test_stress_parallel{t}").session(as_type=pd) as (session, d): 17 | d["counter"] = lambda x: (x or 0) + 1 18 | session.write() 19 | return True 20 | 21 | 22 | def read_counters(n, tables, cfg): 23 | DDB.config.storage_directory = cfg.storage_directory 24 | DDB.config.use_compression = cfg.use_compression 25 | DDB.config.use_orjson = cfg.use_orjson 26 | for _ in range(n): 27 | for t in range(tables): 28 | DDB.at(f"test_stress_parallel{t}").read() 29 | return True 30 | 31 | 32 | def test_stress_multiprocessing(use_compression, use_orjson): 33 | per_thread = 15 34 | tables = 1 35 | threads = 3 36 | # Create tables 37 | for t in range(tables): 38 | DDB.at(f"test_stress_parallel{t}").create({}, force_overwrite=True) 39 | 40 | results = [] 41 | pool = Pool(processes=threads) 42 | for _ in range(threads): 43 | r = pool.apply_async(increment_counters, args=(per_thread, tables, DDB.config)) 44 | results.append(r) 45 | r = pool.apply_async(read_counters, args=(per_thread, tables, DDB.config)) 46 | results.append(r) 47 | pool.close() 48 | pool.join() 49 | 50 | # Check correctness of results 51 | assert [r.get() for r in results] == [True] * threads * 2 52 | for t in range(tables): 53 | db = DDB.at(f"test_stress_parallel{t}").read() 54 | assert db["counter"] == threads * per_thread 55 | 56 | 57 | def test_heavy_multiprocessing(): 58 | per_thread = 50 59 
| tables = 1 60 | threads = 20 61 | # Create tables 62 | for t in range(tables): 63 | DDB.at(f"test_stress_parallel{t}").create({}, force_overwrite=True) 64 | 65 | results = [] 66 | pool = Pool(processes=threads) 67 | for _ in range(threads): 68 | r = pool.apply_async(increment_counters, args=(per_thread, tables, DDB.config)) 69 | results.append(r) 70 | r = pool.apply_async(read_counters, args=(per_thread, tables, DDB.config)) 71 | results.append(r) 72 | pool.close() 73 | pool.join() 74 | 75 | # Check correctness of results 76 | assert [r.get() for r in results] == [True] * threads * 2 77 | for t in range(tables): 78 | db = DDB.at(f"test_stress_parallel{t}").read() 79 | assert db["counter"] == threads * per_thread 80 | 81 | 82 | def read_partial(n, cfg): 83 | DDB.locking.SLEEP_TIMEOUT = 0 84 | DDB.config = cfg 85 | for _ in range(n): 86 | DDB.at("test_stress_parallel0", key="key").read() 87 | return True 88 | 89 | 90 | def test_induce_indexer_except(use_compression): 91 | DDB.at("test_stress_parallel0").create({}, force_overwrite=True) 92 | 93 | pool = Pool(processes=2) 94 | for _ in range(2): 95 | pool.apply_async(read_partial, args=(1000, DDB.config)) 96 | pool.close() 97 | pool.join() 98 | -------------------------------------------------------------------------------- /tests/test_partial.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | from path_dict import pd 5 | 6 | import dictdatabase as DDB 7 | 8 | 9 | def test_subread(use_compression, use_orjson, indent): 10 | name = "test_subread" 11 | j = { 12 | "a": "Hello{}", 13 | "b": [0, 1], 14 | "c": {"d": "e"}, 15 | } 16 | 17 | DDB.at(name).create(j, force_overwrite=True) 18 | 19 | assert DDB.at(name, key="a").read() == "Hello{}" 20 | assert DDB.at(name, where=lambda k, v: isinstance(v, list)).read() == {"b": [0, 1]} 21 | 22 | assert DDB.at(name, key="f").read() is None 23 | 24 | assert DDB.at(name, key="b").read() == [0, 1] 25 | assert DDB.at(name, key="c").read() == {"d": "e"} 26 | 27 | j2 = {"a": {"b": "c"}, "b": {"d": "e"}} 28 | DDB.at("test_subread2").create(j2, force_overwrite=True) 29 | assert DDB.at("test_subread2", key="b").read() == {"d": "e"} 30 | 31 | assert DDB.at("none", key="none").read() is None 32 | 33 | j3 = {"a": {"b": {"\\c\\": {"a": "a"}}}} 34 | DDB.at("test_subread3").create(j3, force_overwrite=True) 35 | assert DDB.at("test_subread3", key="a").read() == {"b": {"\\c\\": {"a": "a"}}} 36 | 37 | 38 | def test_subwrite(use_compression, use_orjson, indent): 39 | name = "test_subwrite" 40 | j = { 41 | "b": {"0": 1}, 42 | "c": {"d": "e"}, 43 | } 44 | 45 | DDB.at(name).create(j, force_overwrite=True) 46 | with DDB.at(name, key="c").session(as_type=pd) as (session, task): 47 | task["f"] = lambda x: (x or 0) + 5 48 | session.write() 49 | assert DDB.at(name, key="c").read() == {"d": "e", "f": 5} 50 | 51 | with DDB.at(name, key="b").session(as_type=pd) as (session, task): 52 | task["f"] = lambda x: (x or 0) + 2 53 | session.write() 54 | 55 | assert DDB.at(name, key="f").read() is None 56 | 57 | with pytest.raises(KeyError): 58 | with DDB.at(name, key="none").session() as (session, key): 59 | session.write() 60 | 61 | 62 | def test_write_file_where(use_compression, use_orjson, indent): 63 | name = "test_write_file_where" 64 | j = { 65 | "a": 1, 66 | "b": 20, 67 | "c": 3, 68 | "d": 40, 69 | } 70 | 71 | DDB.at(name).create(j, force_overwrite=True) 72 | 73 | with DDB.at(name, where=lambda k, v: v > 10).session() as (session, vals): 74 | vals.update({"b": 
30, "d": 50, "e": 60}) 75 | session.write() 76 | assert DDB.at(name).read() == { 77 | "a": 1, 78 | "b": 30, 79 | "c": 3, 80 | "d": 50, 81 | "e": 60, 82 | } 83 | 84 | 85 | def test_dir_where(use_compression, use_orjson, indent): 86 | name = "test_dir_where" 87 | for i in range(5): 88 | DDB.at(name, i).create({"k": i}, force_overwrite=True) 89 | 90 | with DDB.at(name, "*", where=lambda k, v: v["k"] > 2).session() as (session, vals): 91 | for k, v in vals.items(): 92 | v["k"] += 1 93 | session.write() 94 | assert DDB.at(name, "*").read() == { 95 | "0": {"k": 0}, 96 | "1": {"k": 1}, 97 | "2": {"k": 2}, 98 | "3": {"k": 4}, 99 | "4": {"k": 5}, 100 | } 101 | -------------------------------------------------------------------------------- /tests/test_read.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | import dictdatabase as DDB 6 | from tests.utils import make_complex_nested_random_dict 7 | 8 | 9 | def test_non_existent(use_compression, use_orjson, indent): 10 | d = DDB.at("nonexistent").read() 11 | assert d is None 12 | 13 | 14 | def test_file_exists_error(use_compression, use_orjson, indent): 15 | with open(f"{DDB.config.storage_directory}/test_file_exists_error.json", "w") as f: 16 | f.write("") 17 | with open(f"{DDB.config.storage_directory}/test_file_exists_error.ddb", "w") as f: 18 | f.write("") 19 | with pytest.raises(FileExistsError): 20 | DDB.at("test_file_exists_error").read() 21 | 22 | 23 | def test_invalid_params(use_compression, use_orjson, indent): 24 | with pytest.raises(TypeError): 25 | DDB.at("test_invalid_params", key="any", where=lambda k, v: True).read() 26 | 27 | 28 | def test_read_integrity(use_compression, use_orjson, indent): 29 | cases = [ 30 | r'{"a": "\\", "b": 0}', 31 | r'{"a": "\\\\", "b": 1234}', 32 | r'{"a": "\\\\\"", "b": 1234}', 33 | r'{"a": "\\\"\\", "b": 1234}', 34 | r'{"a": "\"\\\\", "b": 1234}', 35 | r'{"a": "\"", "b": 1234}', 36 | r'{"a": "\"\"", "b": 1234}', 37 | r'{"a": "\"\"\\", "b": 1234}', 38 | r'{"a": "\"\\\"", "b": 1234}', 39 | r'{"a": "\\\"\"", "b": 1234}', 40 | ] 41 | 42 | for case in cases: 43 | with open(f"{DDB.config.storage_directory}/test_read_integrity.json", "w") as f: 44 | f.write(case) 45 | key_a = DDB.at("test_read_integrity", key="a").read() 46 | key_b = DDB.at("test_read_integrity", key="b").read() 47 | assert key_a == json.loads(case)["a"] 48 | assert key_b == json.loads(case)["b"] 49 | 50 | 51 | def test_create_and_read(use_compression, use_orjson, indent): 52 | name = "test_create_and_read" 53 | d = make_complex_nested_random_dict(12, 6) 54 | DDB.at(name).create(d, force_overwrite=True) 55 | dd = DDB.at(name).read() 56 | assert d == dd 57 | 58 | 59 | def test_read_compression_switching(use_orjson, indent): 60 | name = "test_read_compression_switching" 61 | DDB.config.use_compression = False 62 | d = make_complex_nested_random_dict(12, 6) 63 | DDB.at(name).create(d, force_overwrite=True) 64 | DDB.config.use_compression = True 65 | dd = DDB.at(name).read() 66 | assert d == dd 67 | DDB.at(name).create(d, force_overwrite=True) 68 | DDB.config.use_compression = False 69 | dd = DDB.at(name).read() 70 | assert d == dd 71 | 72 | 73 | def test_multiread(use_compression, use_orjson, indent): 74 | dl = [] 75 | for i in range(3): 76 | dl += [make_complex_nested_random_dict(12, 6)] 77 | DDB.at(f"test_multiread/d{i}").create(dl[-1], force_overwrite=True) 78 | 79 | mr = DDB.at("test_multiread/*").read() 80 | mr2 = DDB.at("test_multiread", "*").read() 81 | assert mr == 
mr2 82 | mr = {k.replace("test_multiread/", ""): v for k, v in mr.items()} 83 | assert mr == {f"d{i}": dl[i] for i in range(3)} 84 | -------------------------------------------------------------------------------- /tests/test_threaded_sessions.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor, wait 2 | 3 | from path_dict import pd 4 | 5 | import dictdatabase as DDB 6 | 7 | 8 | def increment_counters(n, tables): 9 | for _ in range(n): 10 | for t in range(tables): 11 | # Perform a counter increment 12 | with DDB.at(f"test_stress_threaded{t}").session(as_type=pd) as (session, d): 13 | d["counter"] = lambda x: (x or 0) + 1 14 | session.write() 15 | return True 16 | 17 | 18 | def read_counters(n, tables): 19 | for _ in range(n): 20 | for t in range(tables): 21 | DDB.at(f"test_stress_threaded{t}").read() 22 | return True 23 | 24 | 25 | def test_stress_threaded(use_compression, use_orjson): 26 | per_thread = 15 27 | tables = 1 28 | threads = 3 29 | # Create tables 30 | for t in range(tables): 31 | DDB.at(f"test_stress_threaded{t}").create({}, force_overwrite=True) 32 | 33 | results = [] 34 | with ThreadPoolExecutor(max_workers=threads) as pool: 35 | for _ in range(threads): 36 | future = pool.submit(increment_counters, per_thread, tables) 37 | results.append(future) 38 | future = pool.submit(read_counters, per_thread, tables) 39 | results.append(future) 40 | wait(results) 41 | 42 | # Check correctness of results 43 | assert [r.result() for r in results] == [True] * threads * 2 44 | for t in range(tables): 45 | db = DDB.at(f"test_stress_threaded{t}").read() 46 | assert db["counter"] == threads * per_thread 47 | 48 | 49 | def test_heavy_threading(): 50 | per_thread = 50 51 | tables = 1 52 | threads = 20 53 | # Create tables 54 | for t in range(tables): 55 | DDB.at(f"test_stress_threaded{t}").create({}, force_overwrite=True) 56 | 57 | results = [] 58 | with ThreadPoolExecutor(max_workers=threads) as pool: 59 | for _ in range(threads): 60 | future = pool.submit(increment_counters, per_thread, tables) 61 | results.append(future) 62 | future = pool.submit(read_counters, per_thread, tables) 63 | results.append(future) 64 | wait(results) 65 | 66 | # Check correctness of results 67 | assert [r.result() for r in results] == [True] * threads * 2 68 | for t in range(tables): 69 | db = DDB.at(f"test_stress_threaded{t}").read() 70 | assert db["counter"] == threads * per_thread 71 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | import orjson 4 | 5 | from dictdatabase import byte_codes, utils 6 | 7 | 8 | def test_seek_index_through_value_bytes(): 9 | v = b'{"a": 1, "b": {}}' 10 | assert utils.seek_index_through_value_bytes(v, 5) == 7 11 | assert utils.seek_index_through_value_bytes(v, 6) == 7 12 | assert utils.seek_index_through_value_bytes(v, 13) == 16 13 | vc = b'{"a":1,"b":{}}' 14 | assert utils.seek_index_through_value_bytes(vc, 5) == 6 15 | assert utils.seek_index_through_value_bytes(vc, 11) == 13 16 | n = b'{"a": 1234, "b": {"c": 2}}' 17 | assert utils.seek_index_through_value_bytes(n, 5) == 10 18 | assert utils.seek_index_through_value_bytes(n, 6) == 10 19 | 20 | 21 | def test_seek_index_through_value_bytes_2(): 22 | def load_with_orjson(bytes, key): 23 | return orjson.loads(bytes)[key] 24 | 25 | def load_with_seeker(bytes, key): 26 | key_bytes = 
f'"{key}":'.encode() 27 | a_val_start = bytes.find(key_bytes) + len(key_bytes) 28 | if bytes[a_val_start] == byte_codes.SPACE: 29 | a_val_start += 1 30 | a_val_end = utils.seek_index_through_value_bytes(bytes, a_val_start) 31 | return orjson.loads(bytes[a_val_start:a_val_end]) 32 | 33 | values = [ 34 | # Lists 35 | [], 36 | [{}], 37 | [""], 38 | [1], 39 | [1, 2, 3], 40 | ["xs", -123.3, "c"], 41 | [1, "xs", 2, "value", 3, "c"], 42 | [1, "xs", 2, "value", 3, "c", [1, 2, 3], [1, 2, 3], [1, 2, 3]], 43 | [{}, {}, {}], 44 | [{"xs": 1}, {"value": 2}, {"c": 3}], 45 | [{"xs": 1}, {"value": 2}, {"c": 3}, {"xs": 1}, {"value": 2}, {"c": 3}], 46 | [{"xs": 1}, {"value": 2}, {"c": 3}, {"xs": 1}, {"value": 2}, {"c": 3}, [1, 2, 3], [1, 2, 3], [1, 2, 3]], 47 | # Dicts 48 | {}, 49 | {"": ""}, 50 | {"x": []}, 51 | {"xs": 1}, 52 | {"xs": 1, "value": 2}, 53 | {"xs": [], "value": {}}, 54 | {"xs": -3.3, "value": ""}, 55 | # Numbers 56 | 1, 57 | 1234, 58 | 1.3, 59 | 32.3, 60 | 0, 61 | -1.3, 62 | -0, 63 | # Strings 64 | "", 65 | "a", 66 | "hello", 67 | "a\\b", 68 | "\\", 69 | "\\\\", 70 | '\\\\"', 71 | '\\"\\', 72 | '"\\\\', 73 | '"', 74 | '""', 75 | '""\\', 76 | '"\\"', 77 | '\\""', 78 | # Booleans 79 | True, 80 | None, 81 | False, 82 | ] 83 | 84 | for indent, v1, v2 in itertools.product([False, True], values, values): 85 | option = orjson.OPT_SORT_KEYS | (orjson.OPT_INDENT_2 if indent else 0) 86 | json_bytes = orjson.dumps({"a": v1, "b": v2}, option=option) 87 | assert load_with_orjson(json_bytes, "a") == load_with_seeker(json_bytes, "a") 88 | assert load_with_orjson(json_bytes, "b") == load_with_seeker(json_bytes, "b") 89 | -------------------------------------------------------------------------------- /tests/test_where.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from path_dict import PathDict 3 | 4 | import dictdatabase as DDB 5 | 6 | 7 | def test_where(use_compression, use_orjson, indent): 8 | for i in range(10): 9 | DDB.at("test_select", i).create({"a": i}, force_overwrite=True) 10 | 11 | s = DDB.at("test_select/*", where=lambda k, v: v["a"] > 7).read() 12 | 13 | assert s == {"8": {"a": 8}, "9": {"a": 9}} 14 | 15 | with pytest.raises(KeyError): 16 | DDB.at("test_select/*", where=lambda k, v: v["b"] > 5).read() 17 | 18 | assert DDB.at("nonexistent/*", where=lambda k, v: v["a"] > 5).read() == {} 19 | 20 | assert DDB.at("nonexistent", where=lambda k, v: v["a"] > 5).read() is None 21 | 22 | s = DDB.at("test_select/*", where=lambda k, v: v.at("a").get() > 7).read(as_type=PathDict) 23 | assert s.get() == {"8": {"a": 8}, "9": {"a": 9}} 24 | -------------------------------------------------------------------------------- /tests/test_write.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from path_dict import pd 3 | 4 | import dictdatabase as DDB 5 | from tests.utils import make_complex_nested_random_dict 6 | 7 | 8 | def test_non_existent_session(use_compression, use_orjson, indent): 9 | name = "test_non_existent_session" 10 | with pytest.raises(FileNotFoundError): 11 | with DDB.at(name).session() as (session, d): 12 | session.write() 13 | 14 | 15 | def test_write(use_compression, use_orjson, indent): 16 | name = "test_write" 17 | d = make_complex_nested_random_dict(12, 6) 18 | DDB.at(name).create(d, force_overwrite=True) 19 | with DDB.at(name).session() as (session, dd): 20 | assert d == dd 21 | session.write() 22 | 23 | 24 | def test_write_compression_switching(use_orjson, indent): 25 | name = 
"test_write_compression_switching" 26 | DDB.config.use_compression = False 27 | d = make_complex_nested_random_dict(12, 6) 28 | DDB.at(name).create(d, force_overwrite=True) 29 | with DDB.at(name).session() as (session, dd): 30 | assert d == dd 31 | session.write() 32 | assert DDB.at(name).read() == d 33 | DDB.config.use_compression = True 34 | with DDB.at(name).session(as_type=pd) as (session, dd): 35 | assert d == dd.get() 36 | session.write() 37 | assert DDB.at(name).read() == d 38 | DDB.config.use_compression = False 39 | with DDB.at(name).session() as (session, dd): 40 | assert d == dd 41 | session.write() 42 | assert DDB.at(name).read() == d 43 | 44 | 45 | def test_multi_session(use_compression, use_orjson, indent): 46 | a = {"a": 1} 47 | b = {"b": 2} 48 | 49 | DDB.at("test_multi_session/d1").create(a, force_overwrite=True) 50 | DDB.at("test_multi_session/d2").create(b, force_overwrite=True) 51 | 52 | with DDB.at("test_multi_session/*").session() as (session, d): 53 | assert d == {"d1": a, "d2": b} 54 | session.write() 55 | assert DDB.at("test_multi_session/*").read() == {"d1": a, "d2": b} 56 | 57 | 58 | def test_write_wildcard_key_except(use_compression, use_orjson, indent): 59 | with pytest.raises(TypeError): 60 | with DDB.at("test/*", key="any").session() as (session, d): 61 | pass 62 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | import string 5 | 6 | 7 | def get_tasks_json(): 8 | print(os.getcwd()) 9 | with open("test_db/production_database/tasks.json", "rb") as f: 10 | return json.load(f) 11 | 12 | 13 | def make_complex_nested_random_dict(max_width, max_depth): 14 | def random_string(choices, md): 15 | length = random.randint(0, max_width) 16 | letters = string.ascii_letters + "".join(["\\", " ", "🚀", '"']) 17 | return "".join(random.choices(letters, k=length)) 18 | 19 | def random_int(choices, md): 20 | return random.randint(-1000, 1000) 21 | 22 | def random_float(choices, md): 23 | return random.uniform(-1000, 1000) 24 | 25 | def random_bool(choices, md): 26 | return random.choice([True, False]) 27 | 28 | def random_none(choices, md): 29 | return None 30 | 31 | def random_list(choices, md): 32 | if md == 0: 33 | return [] 34 | res = [] 35 | for _ in range(random.randint(0, max_width)): 36 | v = random.choice(choices)(choices, md - 1) 37 | res += [v] 38 | return res 39 | 40 | def random_dict(choices, md): 41 | if md == 0: 42 | return {} 43 | res = {} 44 | for _ in range(random.randint(0, max_width)): 45 | k = random_string(choices, md) 46 | v = random.choice(choices)(choices, md - 1) 47 | res[k] = v 48 | return res 49 | 50 | return random_dict( 51 | [random_string, random_int, random_float, random_bool, random_none, random_list, random_dict], max_depth 52 | ) 53 | --------------------------------------------------------------------------------