├── .ci
│   ├── release
│   ├── release-uv
│   └── run
├── .gitattributes
├── .github
│   └── workflows
│       └── main.yml
├── .gitignore
├── .idea
│   └── dictionaries
│       └── karlicos.xml
├── LICENSE.txt
├── README.ipynb
├── README.md
├── benchmarks
│   ├── 20230912-comparison-with-legacy.org
│   ├── 20230912.org
│   └── 20230917.org
├── conftest.py
├── doc
│   ├── cachew_disable.md
│   ├── serialization.org
│   └── test_serialization.py
├── generate-readme
├── github-issues.org
├── misc
│   ├── profile.py
│   └── test_redis
│       ├── docker-compose.yml
│       └── test.py
├── mypy.ini
├── pyproject.toml
├── pytest.ini
├── readme.tpl
├── ruff.toml
├── src
│   └── cachew
│       ├── __init__.py
│       ├── backend
│       │   ├── common.py
│       │   ├── file.py
│       │   └── sqlite.py
│       ├── common.py
│       ├── compat.py
│       ├── experimental.py
│       ├── extra.py
│       ├── legacy.py
│       ├── logging_helper.py
│       ├── marshall
│       │   ├── cachew.py
│       │   └── common.py
│       ├── py.typed
│       ├── pytest.py
│       ├── tests
│       │   ├── marshall.py
│       │   ├── test_cachew.py
│       │   ├── test_future_annotations.py
│       │   └── utils.py
│       └── utils.py
└── tox.ini
/.ci/release:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | '''
3 | Run [[file:.ci/release][.ci/release]] to deploy the Python package onto [[https://pypi.org][PyPi]] and [[https://test.pypi.org][test PyPi]].
4 |
5 | The script expects the =TWINE_PASSWORD= environment variable to contain the [[https://pypi.org/help/#apitoken][PyPi token]] (not the password!).
6 |
7 | The script can be run manually.
8 | It also runs as the =pypi= job in [[file:.github/workflows/main.yml][Github Actions config]]. Packages are deployed on:
9 | - every master commit, onto test pypi
10 | - every new tag, onto production pypi
11 |
12 | You'll need to set =TWINE_PASSWORD= and =TWINE_PASSWORD_TEST= in [[https://help.github.com/en/actions/configuring-and-managing-workflows/creating-and-storing-encrypted-secrets#creating-encrypted-secrets][secrets]]
13 | for Github Actions deployment to work.
14 | '''
15 |
16 | import os
17 | import sys
18 | from pathlib import Path
19 | from subprocess import check_call
20 | import shutil
21 |
22 | is_ci = os.environ.get('CI') is not None
23 |
24 | def main() -> None:
25 | import argparse
26 | p = argparse.ArgumentParser()
27 | p.add_argument('--test', action='store_true', help='use test pypi')
28 | args = p.parse_args()
29 |
30 | extra = []
31 | if args.test:
32 | extra.extend(['--repository', 'testpypi'])
33 |
34 | root = Path(__file__).absolute().parent.parent
35 | os.chdir(root) # just in case
36 |
37 | if is_ci:
38 | # see https://github.com/actions/checkout/issues/217
39 | check_call('git fetch --prune --unshallow'.split())
40 |
41 | dist = root / 'dist'
42 | if dist.exists():
43 | shutil.rmtree(dist)
44 |
45 | check_call(['python3', '-m', 'build'])
46 |
47 | TP = 'TWINE_PASSWORD'
48 | password = os.environ.get(TP)
49 | if password is None:
50 | print(f"WARNING: no {TP} passed", file=sys.stderr)
51 | import pip_secrets
52 | password = pip_secrets.token_test if args.test else pip_secrets.token # meh
53 |
54 | check_call([
55 | 'python3', '-m', 'twine',
56 | 'upload', *dist.iterdir(),
57 | *extra,
58 | ], env={
59 | 'TWINE_USERNAME': '__token__',
60 | TP: password,
61 | **os.environ,
62 | })
63 |
64 |
65 | if __name__ == '__main__':
66 | main()
67 |
--------------------------------------------------------------------------------
/.ci/release-uv:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | '''
3 | Deploys Python package onto [[https://pypi.org][PyPi]] or [[https://test.pypi.org][test PyPi]].
4 |
5 | - running manually
6 |
7 | You'll need the =UV_PUBLISH_TOKEN= env variable.
8 |
9 | - running on Github Actions
10 |
11 | Instead of an env variable, it relies on configuring github as a Trusted Publisher (https://docs.pypi.org/trusted-publishers/) -- both for test and regular pypi.
12 |
13 | It runs as the =pypi= job in [[file:.github/workflows/main.yml][Github Actions config]].
14 | Packages are deployed on:
15 | - every master commit, onto test pypi
16 | - every new tag, onto production pypi
17 | '''
18 |
19 | UV_PUBLISH_TOKEN = 'UV_PUBLISH_TOKEN'
20 |
21 | import argparse
22 | import os
23 | import shutil
24 | from pathlib import Path
25 | from subprocess import check_call
26 |
27 | is_ci = os.environ.get('CI') is not None
28 |
29 | def main() -> None:
30 | p = argparse.ArgumentParser()
31 | p.add_argument('--use-test-pypi', action='store_true')
32 | args = p.parse_args()
33 |
34 | publish_url = ['--publish-url', 'https://test.pypi.org/legacy/'] if args.use_test_pypi else []
35 |
36 | root = Path(__file__).absolute().parent.parent
37 | os.chdir(root) # just in case
38 |
39 | if is_ci:
40 | # see https://github.com/actions/checkout/issues/217
41 | check_call('git fetch --prune --unshallow'.split())
42 |
43 | # TODO ok, for now uv won't remove dist dir if it already exists
44 | # https://github.com/astral-sh/uv/issues/10293
45 | dist = root / 'dist'
46 | if dist.exists():
47 | shutil.rmtree(dist)
48 |
49 | # todo what is --force-pep517?
50 | check_call(['uv', 'build'])
51 |
52 | if not is_ci:
53 | # CI relies on trusted publishers so doesn't need env variable
54 | assert UV_PUBLISH_TOKEN in os.environ, f'no {UV_PUBLISH_TOKEN} passed'
55 |
56 | check_call(['uv', 'publish', *publish_url])
57 |
58 |
59 | if __name__ == '__main__':
60 | main()
61 |
--------------------------------------------------------------------------------
/.ci/run:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -eu
3 |
4 | cd "$(dirname "$0")"
5 | cd .. # git root
6 |
7 | if ! command -v sudo; then
8 | # CI or Docker sometimes doesn't have it, so it's useful to have a dummy
9 | function sudo {
10 | "$@"
11 | }
12 | fi
13 |
14 | # --parallel-live to show outputs while it's running
15 | tox_cmd='run-parallel --parallel-live'
16 | if [ -n "${CI-}" ]; then
17 | # install OS specific stuff here
18 | case "$OSTYPE" in
19 | darwin*)
20 | # macos
21 | :
22 | ;;
23 | cygwin* | msys* | win*)
24 | # windows
25 | # ugh. parallel stuff seems super flaky under windows, some random failures, "file used by other process" and crap like that
26 | tox_cmd='run'
27 | ;;
28 | *)
29 | # must be linux?
30 | :
31 | ;;
32 | esac
33 | fi
34 |
35 | # NOTE: expects uv installed
36 | uv tool run --with tox-uv tox $tox_cmd "$@"
37 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb filter=nbstripout
2 |
3 | *.ipynb diff=ipynb
4 |
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | # see https://github.com/karlicoss/pymplate for up-to-date reference
2 |
3 | name: CI
4 | on:
5 | push:
6 | branches: '*'
7 | tags: 'v[0-9]+.*' # only trigger on 'release' tags for PyPi
8 | # Ideally I would put this in the pypi job... but github syntax doesn't allow for regexes there :shrug:
9 | pull_request: # needed to trigger on others' PRs
10 | # Note that people who fork it need to go to "Actions" tab on their fork and click "I understand my workflows, go ahead and enable them".
11 | workflow_dispatch: # needed to trigger workflows manually
12 | # todo cron?
13 | inputs:
14 | debug_enabled:
15 | type: boolean
16 | description: 'Run the build with tmate debugging enabled (https://github.com/marketplace/actions/debugging-with-tmate)'
17 | required: false
18 | default: false
19 |
20 |
21 | jobs:
22 | build:
23 | strategy:
24 | fail-fast: false
25 | matrix:
26 | platform: [ubuntu-latest, macos-latest] # todo windows-latest]
27 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
28 | # vvv just an example of excluding stuff from matrix
29 | # exclude: [{platform: macos-latest, python-version: '3.6'}]
30 |
31 | runs-on: ${{ matrix.platform }}
32 |
33 | # useful for 'optional' pipelines
34 | # continue-on-error: ${{ matrix.platform == 'windows-latest' }}
35 |
36 | steps:
37 | # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation
38 | - run: echo "$HOME/.local/bin" >> $GITHUB_PATH
39 |
40 | - uses: actions/checkout@v4
41 | with:
42 | submodules: recursive
43 | fetch-depth: 0 # nicer to have all git history when debugging/for tests
44 |
45 | - uses: actions/setup-python@v5
46 | with:
47 | python-version: ${{ matrix.python-version }}
48 |
49 | - uses: astral-sh/setup-uv@v5
50 | with:
51 | enable-cache: false # we don't have lock files, so can't use them as cache key
52 |
53 | - uses: mxschmitt/action-tmate@v3
54 | if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }}
55 |
56 | # explicit bash command is necessary for Windows CI runner, otherwise it thinks it's cmd...
57 | - run: bash .ci/run
58 |
59 | - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms
60 | uses: actions/upload-artifact@v4
61 | with:
62 | include-hidden-files: true
63 | name: .coverage.mypy_${{ matrix.platform }}_${{ matrix.python-version }}
64 | path: .coverage.mypy/
65 |
66 |
67 | pypi:
68 | runs-on: ubuntu-latest
69 | needs: [build] # add all other jobs here
70 | permissions:
71 | # necessary for Trusted Publishing
72 | id-token: write
73 | steps:
74 | # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation
75 | - run: echo "$HOME/.local/bin" >> $GITHUB_PATH
76 |
77 | - uses: actions/checkout@v4
78 | with:
79 | submodules: recursive
80 |
81 | - uses: actions/setup-python@v5
82 | with:
83 | python-version: '3.10'
84 |
85 | - uses: astral-sh/setup-uv@v5
86 | with:
87 | enable-cache: false # we don't have lock files, so can't use them as cache key
88 |
89 | - name: 'release to test pypi'
90 | # always deploy merged master to test pypi
91 | if: github.event_name != 'pull_request' && github.event.ref == 'refs/heads/master'
92 | run: .ci/release-uv --use-test-pypi
93 |
94 | - name: 'release to pypi'
95 | # always deploy tags to release pypi
96 | # NOTE: release tags are guarded by on: push: tags on the top
97 | if: github.event_name != 'pull_request' && startsWith(github.event.ref, 'refs/tags')
98 | run: .ci/release-uv
99 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Created by https://www.gitignore.io/api/python,emacs
3 | # Edit at https://www.gitignore.io/?templates=python,emacs
4 |
5 | ### Emacs ###
6 | # -*- mode: gitignore; -*-
7 | *~
8 | \#*\#
9 | /.emacs.desktop
10 | /.emacs.desktop.lock
11 | *.elc
12 | auto-save-list
13 | tramp
14 | .\#*
15 |
16 | # Org-mode
17 | .org-id-locations
18 | *_archive
19 |
20 | # flymake-mode
21 | *_flymake.*
22 |
23 | # eshell files
24 | /eshell/history
25 | /eshell/lastdir
26 |
27 | # elpa packages
28 | /elpa/
29 |
30 | # reftex files
31 | *.rel
32 |
33 | # AUCTeX auto folder
34 | /auto/
35 |
36 | # cask packages
37 | .cask/
38 | dist/
39 |
40 | # Flycheck
41 | flycheck_*.el
42 |
43 | # server auth directory
44 | /server/
45 |
46 | # projectiles files
47 | .projectile
48 |
49 | # directory configuration
50 | .dir-locals.el
51 |
52 | # network security
53 | /network-security.data
54 |
55 |
56 | ### Python ###
57 | # Byte-compiled / optimized / DLL files
58 | __pycache__/
59 | *.py[cod]
60 | *$py.class
61 |
62 | # C extensions
63 | *.so
64 |
65 | # Distribution / packaging
66 | .Python
67 | build/
68 | develop-eggs/
69 | downloads/
70 | eggs/
71 | .eggs/
72 | lib/
73 | lib64/
74 | parts/
75 | sdist/
76 | var/
77 | wheels/
78 | pip-wheel-metadata/
79 | share/python-wheels/
80 | *.egg-info/
81 | .installed.cfg
82 | *.egg
83 | MANIFEST
84 |
85 | # PyInstaller
86 | # Usually these files are written by a python script from a template
87 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
88 | *.manifest
89 | *.spec
90 |
91 | # Installer logs
92 | pip-log.txt
93 | pip-delete-this-directory.txt
94 |
95 | # Unit test / coverage reports
96 | htmlcov/
97 | .tox/
98 | .nox/
99 | .coverage
100 | .coverage.*
101 | .cache
102 | nosetests.xml
103 | coverage.xml
104 | *.cover
105 | .hypothesis/
106 | .pytest_cache/
107 |
108 | # Translations
109 | *.mo
110 | *.pot
111 |
112 | # Django stuff:
113 | *.log
114 | local_settings.py
115 | db.sqlite3
116 | db.sqlite3-journal
117 |
118 | # Flask stuff:
119 | instance/
120 | .webassets-cache
121 |
122 | # Scrapy stuff:
123 | .scrapy
124 |
125 | # Sphinx documentation
126 | docs/_build/
127 |
128 | # PyBuilder
129 | target/
130 |
131 | # Jupyter Notebook
132 | .ipynb_checkpoints
133 |
134 | # IPython
135 | profile_default/
136 | ipython_config.py
137 |
138 | # pyenv
139 | .python-version
140 |
141 | # pipenv
142 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
143 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
144 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
145 | # install all needed dependencies.
146 | #Pipfile.lock
147 |
148 | # celery beat schedule file
149 | celerybeat-schedule
150 |
151 | # SageMath parsed files
152 | *.sage.py
153 |
154 | # Environments
155 | .env
156 | .venv
157 | env/
158 | venv/
159 | ENV/
160 | env.bak/
161 | venv.bak/
162 |
163 | # Spyder project settings
164 | .spyderproject
165 | .spyproject
166 |
167 | # Rope project settings
168 | .ropeproject
169 |
170 | # mkdocs documentation
171 | /site
172 |
173 | # mypy
174 | .mypy_cache/
175 | .dmypy.json
176 | dmypy.json
177 |
178 | # Pyre type checker
179 | .pyre/
180 |
181 | # End of https://www.gitignore.io/api/python,emacs
182 |
183 | untracked/
184 |
--------------------------------------------------------------------------------
/.idea/dictionaries/karlicos.xml:
--------------------------------------------------------------------------------
1 | <component name="ProjectDictionaryState">
2 |   <dictionary name="karlicos">
3 |     <words>
4 |       <w>cachew</w>
5 |       <w>dataclassish</w>
6 |       <w>pylint</w>
7 |       <w>typecheck</w>
8 |     </words>
9 |   </dictionary>
10 | </component>
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2019 Dima Gerasimov
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "autoscroll": false,
8 | "ein.hycell": false,
9 | "ein.tags": "worksheet-0",
10 | "slideshow": {
11 | "slide_type": "-"
12 | },
13 | "tags": [
14 | "noexport"
15 | ]
16 | },
17 | "outputs": [],
18 | "source": [
19 | "from __future__ import annotations\n",
20 | "\n",
21 | "import ast\n",
22 | "import inspect\n",
23 | "import sys\n",
24 | "from pathlib import Path\n",
25 | "\n",
26 | "from IPython.display import Markdown as md\n",
27 | "\n",
28 | "##\n",
29 | "sys.path.insert(0, str(Path('src').absolute()))\n",
30 | "import cachew # isort: skip\n",
31 | "import cachew.extra # isort: skip\n",
32 | "import cachew.marshall.cachew # isort: skip\n",
33 | "import cachew.tests.test_cachew as tests # isort: skip\n",
34 | "sys.modules['tests'] = tests # meh\n",
35 | "##\n",
36 | "\n",
37 | "_CWD = Path.cwd()\n",
38 | "\n",
39 | "\n",
40 | "def flink(title: str, name: str | None = None) -> str:\n",
41 | " # name is method name\n",
42 | " if name is None:\n",
43 | " name = title.replace('`', '') # meh\n",
44 | " split = name.rsplit('.', maxsplit=1)\n",
45 | " if len(split) == 1:\n",
46 | " modname = split[0]\n",
47 | " fname = None\n",
48 | " else:\n",
49 | " [modname, fname] = split\n",
50 | " module = sys.modules[modname]\n",
51 | "\n",
52 | " file = Path(module.__file__).relative_to(_CWD)\n",
53 | "\n",
54 | " if fname is not None:\n",
55 | " func = module\n",
56 | " for p in fname.split('.'):\n",
57 | " func = getattr(func, p)\n",
58 | " _, number = inspect.getsourcelines(func)\n",
59 | " numbers = f'#L{number}'\n",
60 | " else:\n",
61 | " numbers = ''\n",
62 | " return f'[{title}]({file}{numbers})'\n",
63 | "\n",
64 | "\n",
65 | "dmd = lambda x: display(md(x.strip()))"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {
72 | "autoscroll": false,
73 | "ein.hycell": false,
74 | "ein.tags": "worksheet-0",
75 | "slideshow": {
76 | "slide_type": "-"
77 | }
78 | },
79 | "outputs": [],
80 | "source": [
81 | "dmd('''\n",
82 | "\n",
87 | "''')"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {
93 | "ein.tags": "worksheet-0",
94 | "slideshow": {
95 | "slide_type": "-"
96 | }
97 | },
98 | "source": [
99 | "# What is Cachew?\n",
100 | "TLDR: cachew lets you **cache function calls** into an sqlite database on your disk in a matter of **single decorator** (similar to [functools.lru_cache](https://docs.python.org/3/library/functools.html#functools.lru_cache)). The difference from `functools.lru_cache` is that cached data is persisted between program runs, so next time you call your function, it will only be a matter of reading from the cache.\n",
101 | "Cache is **invalidated automatically** if your function's arguments change, so you don't have to think about maintaining it.\n",
102 | "\n",
103 | "In order to be cacheable, your function needs to return a simple data type, or an [Iterator](https://docs.python.org/3/library/typing.html#typing.Iterator) over such types.\n",
104 | "\n",
105 | "A simple type is defined as:\n",
106 | "\n",
107 | "- primitive: `str`/`int`/`float`/`bool`\n",
108 | "- JSON-like types (`dict`/`list`/`tuple`)\n",
109 | "- `datetime`\n",
110 | "- `Exception` (useful for [error handling](https://beepb00p.xyz/mypy-error-handling.html#kiss) )\n",
111 | "- [NamedTuples](https://docs.python.org/3/library/typing.html#typing.NamedTuple)\n",
112 | "- [dataclasses](https://docs.python.org/3/library/dataclasses.html)\n",
113 | "\n",
114 | "\n",
115 | "That allows to **automatically infer schema from type hints** ([PEP 526](https://www.python.org/dev/peps/pep-0526)) and not think about serializing/deserializing.\n",
116 | "Thanks to type hints, you don't need to annotate your classes with any special decorators, inherit from some special base classes, etc., as it's often the case for serialization libraries.\n",
117 | "\n",
118 | "## Motivation\n",
119 | "\n",
120 | "I often find myself processing big chunks of data, merging data together, computing some aggregates on it or extracting few bits I'm interested at. While I'm trying to utilize REPL as much as I can, some things are still fragile and often you just have to rerun the whole thing in the process of development. This can be frustrating if data parsing and processing takes seconds, let alone minutes in some cases.\n",
121 | "\n",
122 | "Conventional way of dealing with it is serializing results along with some sort of hash (e.g. md5) of input files,\n",
123 | "comparing on the next run and returning cached data if nothing changed.\n",
124 | "\n",
125 | "Simple as it sounds, it is pretty tedious to do every time you need to memorize some data, contaminates your code with routine and distracts you from your main task.\n",
126 | "\n",
127 | "\n",
128 | "# Examples\n",
129 | "## Processing Wikipedia\n",
130 | "Imagine you're working on a data analysis pipeline for some huge dataset, say, extracting urls and their titles from Wikipedia archive.\n",
131 | "Parsing it (`extract_links` function) takes hours, however, as long as the archive is same you will always get same results. So it would be nice to be able to cache the results somehow.\n",
132 | "\n",
133 | "\n",
134 | "With this library your can achieve it through single `@cachew` decorator."
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {
141 | "autoscroll": false,
142 | "ein.hycell": false,
143 | "ein.tags": "worksheet-0",
144 | "slideshow": {
145 | "slide_type": "-"
146 | }
147 | },
148 | "outputs": [],
149 | "source": [
150 | "doc = inspect.getdoc(cachew.cachew)\n",
151 | "doc = doc.split('Usage example:')[-1].lstrip()\n",
152 | "dmd(f\"\"\"```python\n",
153 | "{doc}\n",
154 | "```\"\"\")"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {},
160 | "source": [
161 | "When you call `extract_links` with the same archive, you start getting results in a matter of milliseconds, as fast as sqlite reads it.\n",
162 | "\n",
163 | "When you use newer archive, `archive_path` changes, which will make cachew invalidate old cache and recompute it, so you don't need to think about maintaining it separately."
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {},
169 | "source": [
170 | "## Incremental data exports\n",
171 | "This is my most common usecase of cachew, which I'll illustrate with example.\n",
172 | "\n",
173 | "I'm using an [environment sensor](https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger) to log stats about temperature and humidity.\n",
174 | "Data is synchronized via bluetooth in the sqlite database, which is easy to access. However sensor has limited memory (e.g. 1000 latest measurements).\n",
175 | "That means that I end up with a new database every few days, each of them containing only a slice of data I need, e.g.:\n",
176 | "\n",
177 | " ...\n",
178 | " 20190715100026.db\n",
179 | " 20190716100138.db\n",
180 | " 20190717101651.db\n",
181 | " 20190718100118.db\n",
182 | " 20190719100701.db\n",
183 | " ...\n",
184 | "\n",
185 | "To access **all** of historic temperature data, I have two options:\n",
186 | "\n",
187 | "- Go through all the data chunks every time I wan to access them and 'merge' into a unified stream of measurements, e.g. something like:\n",
188 | " \n",
189 | " def measurements(chunks: List[Path]) -> Iterator[Measurement]:\n",
190 | " for chunk in chunks:\n",
191 | " # read measurements from 'chunk' and yield unseen ones\n",
192 | "\n",
193 | " This is very **easy, but slow** and you waste CPU for no reason every time you need data.\n",
194 | "\n",
195 | "- Keep a 'master' database and write code to merge chunks in it.\n",
196 | "\n",
197 | " This is very **efficient, but tedious**:\n",
198 | " \n",
199 | " - requires serializing/deserializing data -- boilerplate\n",
200 | " - requires manually managing sqlite database -- error prone, hard to get right every time\n",
201 | " - requires careful scheduling, ideally you want to access new data without having to refresh cache\n",
202 | "\n",
203 | " \n",
204 | "Cachew gives the best of two worlds and makes it both **easy and efficient**. The only thing you have to do is to decorate your function:\n",
205 | "\n",
206 | " @cachew \n",
207 | " def measurements(chunks: List[Path]) -> Iterator[Measurement]:\n",
208 | " # ...\n",
209 | " \n",
210 | "- as long as `chunks` stay same, data stays same so you always read from sqlite cache which is very fast\n",
211 | "- you don't need to maintain the database, cache is automatically refreshed when `chunks` change (i.e. you got new data)\n",
212 | "\n",
213 | " All the complexity of handling database is hidden in `cachew` implementation."
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {
220 | "autoscroll": false,
221 | "ein.hycell": false,
222 | "ein.tags": "worksheet-0",
223 | "slideshow": {
224 | "slide_type": "-"
225 | }
226 | },
227 | "outputs": [],
228 | "source": [
229 | "[composite] = [x\n",
230 | " for x in ast.walk(ast.parse(inspect.getsource(cachew)))\n",
231 | " if isinstance(x, ast.FunctionDef) and x.name == 'composite_hash'\n",
232 | "]\n",
233 | "\n",
234 | "link = f'{Path(cachew.__file__).relative_to(_CWD)}:#L{composite.lineno}'\n",
235 | "\n",
236 | "dmd(f'''\n",
237 | "# How it works\n",
238 | "\n",
239 | "- first your objects get {flink('converted', 'cachew.marshall.cachew.CachewMarshall')} into a simpler JSON-like representation\n",
240 | "- after that, they are mapped into byte blobs via [`orjson`](https://github.com/ijl/orjson).\n",
241 | "\n",
242 | "When the function is called, cachew [computes the hash of your function's arguments ]({link})\n",
243 | "and compares it against the previously stored hash value.\n",
244 | "\n",
245 | "- If they match, it would deserialize and yield whatever is stored in the cache database\n",
246 | "- If the hash mismatches, the original function is called and new data is stored along with the new hash\n",
247 | "''')"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "metadata": {
254 | "autoscroll": false,
255 | "ein.hycell": false,
256 | "ein.tags": "worksheet-0",
257 | "slideshow": {
258 | "slide_type": "-"
259 | }
260 | },
261 | "outputs": [],
262 | "source": [
263 | "dmd('# Features')\n",
264 | "types = [f'`{t}`' for t in ['str', 'int', 'float', 'bool', 'datetime', 'date', 'Exception']]\n",
265 | "dmd(f\"\"\"\n",
266 | "* automatic schema inference: {flink('1', 'tests.test_return_type_inference')}, {flink('2', 'tests.test_return_type_mismatch')}\n",
267 | "* supported types:\n",
268 | "\n",
269 | " * primitive: {', '.join(types)}\n",
270 | "\n",
271 | " See {flink('tests.test_types')}, {flink('tests.test_primitive')}, {flink('tests.test_dates')}, {flink('tests.test_exceptions')}\n",
272 | " * {flink('@dataclass and NamedTuple', 'tests.test_dataclass')}\n",
273 | " * {flink('Optional', 'tests.test_optional')} types\n",
274 | " * {flink('Union', 'tests.test_union')} types\n",
275 | " * {flink('nested datatypes', 'tests.test_nested')}\n",
276 | "\n",
277 | "* detects {flink('datatype schema changes', 'tests.test_schema_change')} and discards old data automatically\n",
278 | "\"\"\")\n",
279 | "# * custom hash function TODO example with mtime?"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {},
285 | "source": [
286 | "# Performance\n",
287 | "Updating cache takes certain overhead, but that would depend on how complicated your datatype in the first place, so I'd suggest measuring if you're not sure.\n",
288 | "\n",
289 | "During reading cache all that happens is reading blobls from sqlite/decoding as JSON, and mapping them onto your target datatype, so the overhead depends on each of these steps.\n",
290 | "\n",
291 | "It would almost certainly make your program faster if your computations take more than several seconds.\n",
292 | "\n",
293 | "You can find some of my performance tests in [benchmarks/](benchmarks) dir, and the tests themselves in [src/cachew/tests/marshall.py](src/cachew/tests/marshall.py)."
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": null,
299 | "metadata": {
300 | "autoscroll": false,
301 | "ein.hycell": false,
302 | "ein.tags": "worksheet-0",
303 | "slideshow": {
304 | "slide_type": "-"
305 | }
306 | },
307 | "outputs": [],
308 | "source": [
309 | "dmd(f\"\"\"\n",
310 | "# Using\n",
311 | "See {flink('docstring', 'cachew.cachew')} for up-to-date documentation on parameters and return types.\n",
312 | "You can also use {flink('extensive unit tests', 'tests')} as a reference.\n",
313 | "\n",
314 | "Some useful (but optional) arguments of `@cachew` decorator:\n",
315 | "\n",
316 | "* `cache_path` can be a directory, or a callable that {flink('returns a path', 'tests.test_callable_cache_path')} and depends on function's arguments.\n",
317 | "\n",
318 | " By default, `settings.DEFAULT_CACHEW_DIR` is used.\n",
319 | "\n",
320 | "* `depends_on` is a function which determines whether your inputs have changed, and the cache needs to be invalidated.\n",
321 | "\n",
322 | " By default it just uses string representation of the arguments, you can also specify a custom callable.\n",
323 | "\n",
324 | " For instance, it can be used to {flink('discard cache', 'tests.test_custom_hash')} if the input file was modified.\n",
325 | "\n",
326 | "* `cls` is the type that would be serialized.\n",
327 | "\n",
328 | " By default, it is inferred from return type annotations, but can be specified explicitly if you don't control the code you want to cache.\n",
329 | "\"\"\")"
330 | ]
331 | },
332 | {
333 | "cell_type": "markdown",
334 | "metadata": {
335 | "ein.tags": "worksheet-0",
336 | "slideshow": {
337 | "slide_type": "-"
338 | }
339 | },
340 | "source": [
341 | "# Installing\n",
342 | "Package is available on [pypi](https://pypi.org/project/cachew/).\n",
343 | "\n",
344 | " pip3 install --user cachew\n",
345 | " \n",
346 | "## Developing\n",
347 | "I'm using [tox](tox.ini) to run tests, and [Github Actions](.github/workflows/main.yml) for CI."
348 | ]
349 | },
350 | {
351 | "cell_type": "markdown",
352 | "metadata": {
353 | "ein.tags": "worksheet-0",
354 | "slideshow": {
355 | "slide_type": "-"
356 | }
357 | },
358 | "source": [
359 | "# Implementation\n",
360 | "\n",
361 | "* why NamedTuples and dataclasses?\n",
362 | " \n",
363 | " `NamedTuple` and `dataclass` provide a very straightforward and self documenting way to represent data in Python.\n",
364 | " Very compact syntax makes it extremely convenient even for one-off means of communicating between couple of functions.\n",
365 | " \n",
366 | " If you want to find out more why you should use more dataclasses in your code I suggest these links:\n",
367 | " \n",
368 | " - [What are data classes?](https://stackoverflow.com/questions/47955263/what-are-data-classes-and-how-are-they-different-from-common-classes)\n",
369 | " - [basic data classes](https://realpython.com/python-data-classes/#basic-data-classes)\n",
370 | " \n",
371 | "* why not `pandas.DataFrame`?\n",
372 | "\n",
373 | " DataFrames are great and can be serialised to csv or pickled.\n",
374 | " They are good to have as one of the ways you can interface with your data, however hardly convenient to think about it abstractly due to their dynamic nature.\n",
375 | " They also can't be nested.\n",
376 | "\n",
377 | "* why not [ORM](https://en.wikipedia.org/wiki/Object-relational_mapping)?\n",
378 | " \n",
379 | " ORMs tend to be pretty invasive, which might complicate your scripts or even ruin performance. It's also somewhat an overkill for such a specific purpose.\n",
380 | "\n",
381 | " * E.g. [SQLAlchemy](https://docs.sqlalchemy.org/en/13/orm/tutorial.html#declare-a-mapping) requires you using custom sqlalchemy specific types and inheriting a base class.\n",
382 | " Also it doesn't support nested types.\n",
383 | " \n",
384 | "* why not [pickle](https://docs.python.org/3/library/pickle.html) or [`marshmallow`](https://marshmallow.readthedocs.io/en/3.0/nesting.html) or `pydantic`?\n",
385 | "\n",
386 | " Pickling is kinda heavyweigh for plain data class, it's slower just using JSON. Lastly, it can only be loaded via Python, whereas JSON + sqlite has numerous bindings and tools to explore and interface.\n",
387 | "\n",
388 | " Marshmallow is a common way to map data into db-friendly format, but it requires explicit schema which is an overhead when you have it already in the form of type annotations. I've looked at existing projects to utilize type annotations, but didn't find them covering all I wanted:\n",
389 | " \n",
390 | " * https://marshmallow-annotations.readthedocs.io/en/latest/ext/namedtuple.html#namedtuple-type-api\n",
391 | " * https://pypi.org/project/marshmallow-dataclass\n",
392 | " \n",
393 | " I wrote up an extensive review of alternatives I considered: see [doc/serialization.org](doc/serialization.org).\n",
394 | " So far looks like only `cattrs` comes somewhere close to the feature set I need, but still not quite.\n",
395 | "\n",
396 | "* why `sqlite` database for storage?\n",
397 | "\n",
398 | " It's pretty efficient and iterables (i.e. sequences) map onto database rows in a very straightforward manner, plus we get some concurrency guarantees.\n",
399 | "\n",
400 | " There is also a somewhat experimental backend which uses a simple file (jsonl-like) for storage, you can use it via `@cache(backend='file')`, or via `settings.DEFAULT_BACKEND`.\n",
401 | " It's slightly faster than sqlite judging by benchmarks, but unless you're caching millions of items this shouldn't really be noticeable.\n",
402 | " \n",
403 | " It would also be interesting to experiment with in-RAM storages.\n",
404 | "\n",
405 | " I had [a go](https://github.com/karlicoss/cachew/issues/9) at Redis as well, but performance for writing to cache was pretty bad. That said it could still be interesting for distributed caching if you don't care too much about performance.\n"
406 | ]
407 | },
408 | {
409 | "cell_type": "markdown",
410 | "metadata": {},
411 | "source": [
412 | "# Tips and tricks\n",
413 | "## Optional dependency\n",
414 | "You can benefit from `cachew` even if you don't want to bloat your app's dependencies. Just use the following snippet:"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": null,
420 | "metadata": {},
421 | "outputs": [],
422 | "source": [
423 | "import cachew.extra\n",
424 | "\n",
425 | "dmd(f\"\"\"```python\n",
426 | "{inspect.getsource(cachew.extra.mcachew)}\n",
427 | "```\"\"\")"
428 | ]
429 | },
430 | {
431 | "cell_type": "markdown",
432 | "metadata": {},
433 | "source": [
434 | "Now you can use `@mcachew` in place of `@cachew`, and be certain things don't break if `cachew` is missing.\n",
435 | "\n",
436 | "## Settings"
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "execution_count": null,
442 | "metadata": {},
443 | "outputs": [],
444 | "source": [
445 | "dmd(f'''\n",
446 | "{flink('cachew.settings')} exposes some parameters that allow you to control `cachew` behaviour:\n",
447 | "- `ENABLE`: set to `False` if you want to disable caching for without removing the decorators (useful for testing and debugging).\n",
448 | " You can also use {flink('cachew.extra.disabled_cachew')} context manager to do it temporarily.\n",
449 | "- `DEFAULT_CACHEW_DIR`: override to set a different base directory. The default is the \"user cache directory\" (see [platformdirs docs](https://github.com/tox-dev/platformdirs?tab=readme-ov-file#example-output)).\n",
450 | "- `THROW_ON_ERROR`: by default, cachew is defensive and simply attemps to cause the original function on caching issues.\n",
451 | " Set to `True` to catch errors earlier.\n",
452 | "- `DEFAULT_BACKEND`: currently supported are `sqlite` and `file` (file is somewhat experimental, although should work too).\n",
453 | "\n",
454 | "''')"
455 | ]
456 | },
457 | {
458 | "cell_type": "markdown",
459 | "metadata": {},
460 | "source": [
461 | "## Updating this readme\n",
462 | "This is a literate readme, implemented as a Jupiter notebook: [README.ipynb](README.ipynb). To update the (autogenerated) [README.md](README.md), use [generate-readme](generate-readme) script."
463 | ]
464 | }
465 | ],
466 | "metadata": {
467 | "celltoolbar": "Tags",
468 | "kernelspec": {
469 | "display_name": "Python 3 (ipykernel)",
470 | "language": "python",
471 | "name": "python3"
472 | },
473 | "language_info": {
474 | "codemirror_mode": {
475 | "name": "ipython",
476 | "version": 3
477 | },
478 | "file_extension": ".py",
479 | "mimetype": "text/x-python",
480 | "name": "python",
481 | "nbconvert_exporter": "python",
482 | "pygments_lexer": "ipython3",
483 | "version": "3.10.12"
484 | },
485 | "name": "README.ipynb"
486 | },
487 | "nbformat": 4,
488 | "nbformat_minor": 4
489 | }
490 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
6 |
7 |
8 | # What is Cachew?
9 | TLDR: cachew lets you **cache function calls** into an sqlite database on your disk by means of a **single decorator** (similar to [functools.lru_cache](https://docs.python.org/3/library/functools.html#functools.lru_cache)). The difference from `functools.lru_cache` is that cached data is persisted between program runs, so next time you call your function, it will only be a matter of reading from the cache.
10 | Cache is **invalidated automatically** if your function's arguments change, so you don't have to think about maintaining it.
11 |
12 | In order to be cacheable, your function needs to return a simple data type, or an [Iterator](https://docs.python.org/3/library/typing.html#typing.Iterator) over such types.
13 |
14 | A simple type is defined as:
15 |
16 | - primitive: `str`/`int`/`float`/`bool`
17 | - JSON-like types (`dict`/`list`/`tuple`)
18 | - `datetime`
19 | - `Exception` (useful for [error handling](https://beepb00p.xyz/mypy-error-handling.html#kiss) )
20 | - [NamedTuples](https://docs.python.org/3/library/typing.html#typing.NamedTuple)
21 | - [dataclasses](https://docs.python.org/3/library/dataclasses.html)
22 |
23 |
24 | That allows cachew to **automatically infer the schema from type hints** ([PEP 526](https://www.python.org/dev/peps/pep-0526)), so you don't have to think about serializing/deserializing.
25 | Thanks to type hints, you don't need to annotate your classes with any special decorators, inherit from some special base classes, etc., as it's often the case for serialization libraries.
26 |
27 | ## Motivation
28 |
29 | I often find myself processing big chunks of data, merging data together, computing some aggregates on it or extracting a few bits I'm interested in. While I try to utilize the REPL as much as I can, some things are still fragile and often you just have to rerun the whole thing in the process of development. This can be frustrating if data parsing and processing takes seconds, let alone minutes in some cases.
30 |
31 | The conventional way of dealing with this is serializing results along with some sort of hash (e.g. md5) of the input files,
32 | comparing on the next run and returning cached data if nothing changed.
33 |
34 | Simple as it sounds, it is pretty tedious to do every time you need to memoize some data; it contaminates your code with routine and distracts you from your main task.
35 |
36 |
37 | # Examples
38 | ## Processing Wikipedia
39 | Imagine you're working on a data analysis pipeline for some huge dataset, say, extracting urls and their titles from a Wikipedia archive.
40 | Parsing it (the `extract_links` function) takes hours; however, as long as the archive is the same, you will always get the same results. So it would be nice to be able to cache the results somehow.
41 |
42 |
43 | With this library, you can achieve it with a single `@cachew` decorator.
44 |
45 |
46 | ```python
47 | >>> from typing import NamedTuple, Iterator
48 | >>> class Link(NamedTuple):
49 | ... url : str
50 | ... text: str
51 | ...
52 | >>> @cachew
53 | ... def extract_links(archive_path: str) -> Iterator[Link]:
54 | ... for i in range(5):
55 | ... # simulate slow IO
56 | ... # this function runs for five seconds for the purpose of demonstration, but realistically it might take hours
57 | ... import time; time.sleep(1)
58 | ... yield Link(url=f'http://link{i}.org', text=f'text {i}')
59 | ...
60 | >>> list(extract_links(archive_path='wikipedia_20190830.zip')) # that would take about 5 seconds on first run
61 | [Link(url='http://link0.org', text='text 0'), Link(url='http://link1.org', text='text 1'), Link(url='http://link2.org', text='text 2'), Link(url='http://link3.org', text='text 3'), Link(url='http://link4.org', text='text 4')]
62 |
63 | >>> from timeit import Timer
64 | >>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20190830.zip'))).timeit(number=1)
65 | ... # second run is cached, so should take less time
66 | >>> print(f"call took {int(res)} seconds")
67 | call took 0 seconds
68 |
69 | >>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20200101.zip'))).timeit(number=1)
70 | ... # now file has changed, so the cache will be discarded
71 | >>> print(f"call took {int(res)} seconds")
72 | call took 5 seconds
73 | ```
74 |
75 |
76 | When you call `extract_links` with the same archive, you start getting results in a matter of milliseconds, as fast as sqlite reads it.
77 |
78 | When you use newer archive, `archive_path` changes, which will make cachew invalidate old cache and recompute it, so you don't need to think about maintaining it separately.
79 |
80 | ## Incremental data exports
81 | This is my most common usecase for cachew, which I'll illustrate with an example.
82 |
83 | I'm using an [environment sensor](https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger) to log stats about temperature and humidity.
84 | Data is synchronized via bluetooth into an sqlite database, which is easy to access. However, the sensor has limited memory (e.g. the 1000 latest measurements).
85 | That means that I end up with a new database every few days, each of them containing only a slice of the data I need, e.g.:
86 |
87 | ...
88 | 20190715100026.db
89 | 20190716100138.db
90 | 20190717101651.db
91 | 20190718100118.db
92 | 20190719100701.db
93 | ...
94 |
95 | To access **all** of historic temperature data, I have two options:
96 |
97 | - Go through all the data chunks every time I want to access them and 'merge' them into a unified stream of measurements, e.g. something like:
98 |
99 | def measurements(chunks: List[Path]) -> Iterator[Measurement]:
100 | for chunk in chunks:
101 | # read measurements from 'chunk' and yield unseen ones
102 |
103 | This is very **easy, but slow** and you waste CPU for no reason every time you need data.
104 |
105 | - Keep a 'master' database and write code to merge chunks in it.
106 |
107 | This is very **efficient, but tedious**:
108 |
109 | - requires serializing/deserializing data -- boilerplate
110 | - requires manually managing sqlite database -- error prone, hard to get right every time
111 | - requires careful scheduling, ideally you want to access new data without having to refresh cache
112 |
113 |
114 | Cachew gives you the best of both worlds and makes it both **easy and efficient**. The only thing you have to do is decorate your function:
115 |
116 | @cachew
117 | def measurements(chunks: List[Path]) -> Iterator[Measurement]:
118 | # ...
119 |
120 | - as long as `chunks` stays the same, the data stays the same, so you always read from the sqlite cache, which is very fast
121 | - you don't need to maintain the database, the cache is automatically refreshed when `chunks` changes (i.e. you got new data)
122 |
123 |   All the complexity of handling the database is hidden in the `cachew` implementation.
124 |
125 |
126 | # How it works
127 |
128 | - first your objects get [converted](src/cachew/marshall/cachew.py#L30) into a simpler JSON-like representation
129 | - after that, they are mapped into byte blobs via [`orjson`](https://github.com/ijl/orjson).
130 |
131 | When the function is called, cachew [computes the hash of your function's arguments](src/cachew/__init__.py#L587)
132 | and compares it against the previously stored hash value.
133 |
134 | - If they match, it would deserialize and yield whatever is stored in the cache database
135 | - If the hash mismatches, the original function is called and new data is stored along with the new hash
136 |
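
To make the flow above concrete, here is a minimal sketch of the same read/write decision. This is a toy illustration, not cachew's actual internals: the table layout, hashing, and the `cached_call` helper are made up, and real cachew additionally maps the decoded dicts back onto your annotated datatype via the marshall step linked above.

```python
import hashlib
import sqlite3

import orjson  # cachew uses orjson for the dict <-> bytes step


def cached_call(db_path, func, *args):
    # hash the arguments (cachew's default is based on their string representation)
    arghash = hashlib.md5(repr(args).encode()).hexdigest()

    conn = sqlite3.connect(db_path)
    conn.execute('CREATE TABLE IF NOT EXISTS hash (value TEXT)')
    conn.execute('CREATE TABLE IF NOT EXISTS cache (blob BLOB)')

    stored = conn.execute('SELECT value FROM hash').fetchone()
    if stored is not None and stored[0] == arghash:
        # hash matches: deserialize and yield whatever is stored in the cache
        for (blob,) in conn.execute('SELECT blob FROM cache'):
            yield orjson.loads(blob)
    else:
        # hash mismatch: call the original function, store the new blobs and the new hash
        conn.execute('DELETE FROM cache')
        for obj in func(*args):
            conn.execute('INSERT INTO cache VALUES (?)', (orjson.dumps(obj),))
            yield obj
        conn.execute('DELETE FROM hash')
        conn.execute('INSERT INTO hash VALUES (?)', (arghash,))
        conn.commit()
    conn.close()
```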
137 |
138 |
139 | # Features
140 |
141 |
142 |
143 | * automatic schema inference: [1](src/cachew/tests/test_cachew.py#L387), [2](src/cachew/tests/test_cachew.py#L401)
144 | * supported types:
145 |
146 | * primitive: `str`, `int`, `float`, `bool`, `datetime`, `date`, `Exception`
147 |
148 | See [tests.test_types](src/cachew/tests/test_cachew.py#L687), [tests.test_primitive](src/cachew/tests/test_cachew.py#L725), [tests.test_dates](src/cachew/tests/test_cachew.py#L637), [tests.test_exceptions](src/cachew/tests/test_cachew.py#L1124)
149 | * [@dataclass and NamedTuple](src/cachew/tests/test_cachew.py#L602)
150 | * [Optional](src/cachew/tests/test_cachew.py#L531) types
151 | * [Union](src/cachew/tests/test_cachew.py#L832) types
152 | * [nested datatypes](src/cachew/tests/test_cachew.py#L447)
153 |
154 | * detects [datatype schema changes](src/cachew/tests/test_cachew.py#L477) and discards old data automatically
155 |
156 |
157 | # Performance
158 | Updating the cache incurs some overhead, but that depends on how complicated your datatype is in the first place, so I'd suggest measuring if you're not sure.
159 |
160 | When reading from the cache, all that happens is reading blobs from sqlite, decoding them as JSON, and mapping them onto your target datatype, so the overhead depends on each of these steps.
161 |
162 | It would almost certainly make your program faster if your computations take more than several seconds.
163 |
164 | You can find some of my performance tests in [benchmarks/](benchmarks) dir, and the tests themselves in [src/cachew/tests/marshall.py](src/cachew/tests/marshall.py).
165 |
166 |
167 | # Using
168 | See [docstring](src/cachew/__init__.py#L292) for up-to-date documentation on parameters and return types.
169 | You can also use [extensive unit tests](src/cachew/tests/test_cachew.py) as a reference.
170 |
171 | Some useful (but optional) arguments of `@cachew` decorator:
172 |
173 | * `cache_path` can be a directory, or a callable that [returns a path](src/cachew/tests/test_cachew.py#L424) and depends on the function's arguments.
174 |
175 | By default, `settings.DEFAULT_CACHEW_DIR` is used.
176 |
177 | * `depends_on` is a function which determines whether your inputs have changed, and the cache needs to be invalidated.
178 |
179 |    By default it just uses the string representation of the arguments, but you can also specify a custom callable.
180 |
181 |    For instance, it can be used to [discard the cache](src/cachew/tests/test_cachew.py#L119) if the input file was modified (see the sketch after this list).
182 |
183 | * `cls` is the type that would be serialized.
184 |
185 | By default, it is inferred from return type annotations, but can be specified explicitly if you don't control the code you want to cache.
186 |
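
For example, here is a small sketch of the mtime-based invalidation mentioned above. `parse` is a made-up function, and this assumes `depends_on` receives the same arguments as the decorated function (as in the linked test):

```python
from pathlib import Path
from typing import Iterator

from cachew import cachew


# invalidate the cache when the input file's modification time changes,
# rather than only when the path itself changes
@cachew(depends_on=lambda path: (path, path.stat().st_mtime))
def parse(path: Path) -> Iterator[str]:
    yield from path.read_text().splitlines()
```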
187 |
188 | # Installing
189 | Package is available on [pypi](https://pypi.org/project/cachew/).
190 |
191 | pip3 install --user cachew
192 |
193 | ## Developing
194 | I'm using [tox](tox.ini) to run tests, and [Github Actions](.github/workflows/main.yml) for CI.
195 |
196 | # Implementation
197 |
198 | * why NamedTuples and dataclasses?
199 |
200 |    `NamedTuple` and `dataclass` provide a very straightforward and self-documenting way to represent data in Python.
201 |    The very compact syntax makes them extremely convenient even as a one-off means of communicating between a couple of functions.
202 |
203 |    If you want to find out more about why you should use dataclasses in your code, I suggest these links:
204 |
205 | - [What are data classes?](https://stackoverflow.com/questions/47955263/what-are-data-classes-and-how-are-they-different-from-common-classes)
206 | - [basic data classes](https://realpython.com/python-data-classes/#basic-data-classes)
207 |
208 | * why not `pandas.DataFrame`?
209 |
210 | DataFrames are great and can be serialised to csv or pickled.
211 |   They are good to have as one of the ways you can interface with your data; however, they are hardly convenient to reason about abstractly due to their dynamic nature.
212 | They also can't be nested.
213 |
214 | * why not [ORM](https://en.wikipedia.org/wiki/Object-relational_mapping)?
215 |
216 | ORMs tend to be pretty invasive, which might complicate your scripts or even ruin performance. It's also somewhat an overkill for such a specific purpose.
217 |
218 |    * E.g. [SQLAlchemy](https://docs.sqlalchemy.org/en/13/orm/tutorial.html#declare-a-mapping) requires using custom sqlalchemy-specific types and inheriting from a base class.
219 | Also it doesn't support nested types.
220 |
221 | * why not [pickle](https://docs.python.org/3/library/pickle.html) or [`marshmallow`](https://marshmallow.readthedocs.io/en/3.0/nesting.html) or `pydantic`?
222 |
223 |   Pickling is kinda heavyweight for a plain data class; it's slower than just using JSON. Lastly, it can only be loaded via Python, whereas JSON + sqlite has numerous bindings and tools to explore and interface with.
224 |
225 |   Marshmallow is a common way to map data into a db-friendly format, but it requires an explicit schema, which is an overhead when you already have it in the form of type annotations. I've looked at existing projects that utilize type annotations, but didn't find them covering all I wanted:
226 |
227 | * https://marshmallow-annotations.readthedocs.io/en/latest/ext/namedtuple.html#namedtuple-type-api
228 | * https://pypi.org/project/marshmallow-dataclass
229 |
230 | I wrote up an extensive review of alternatives I considered: see [doc/serialization.org](doc/serialization.org).
231 |   So far it looks like only `cattrs` comes somewhere close to the feature set I need, but still not quite.
232 |
233 | * why `sqlite` database for storage?
234 |
235 | It's pretty efficient and iterables (i.e. sequences) map onto database rows in a very straightforward manner, plus we get some concurrency guarantees.
236 |
237 |   There is also a somewhat experimental backend which uses a simple file (jsonl-like) for storage; you can use it via `@cachew(backend='file')`, or via `settings.DEFAULT_BACKEND` (see the sketch after this list).
238 | It's slightly faster than sqlite judging by benchmarks, but unless you're caching millions of items this shouldn't really be noticeable.
239 |
240 | It would also be interesting to experiment with in-RAM storages.
241 |
242 | I had [a go](https://github.com/karlicoss/cachew/issues/9) at Redis as well, but performance for writing to cache was pretty bad. That said it could still be interesting for distributed caching if you don't care too much about performance.
243 |
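
Based on the description above, switching backends looks roughly like this (a sketch; check the `@cachew` docstring for the authoritative parameter names):

```python
from typing import Iterator

from cachew import cachew, settings


@cachew(backend='file')  # use the experimental jsonl-like file backend for this function
def items() -> Iterator[int]:
    yield from range(10)


# or change the default for all cached functions:
settings.DEFAULT_BACKEND = 'file'
```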
244 |
245 | # Tips and tricks
246 | ## Optional dependency
247 | You can benefit from `cachew` even if you don't want to bloat your app's dependencies. Just use the following snippet:
248 |
249 |
250 | ```python
251 | def mcachew(*args, **kwargs):
252 | """
253 | Stands for 'Maybe cachew'.
254 | Defensive wrapper around @cachew to make it an optional dependency.
255 | """
256 | try:
257 | import cachew
258 | except ModuleNotFoundError:
259 | import warnings
260 |
261 | warnings.warn('cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew')
262 | return lambda orig_func: orig_func
263 | else:
264 | return cachew.cachew(*args, **kwargs)
265 |
266 | ```
267 |
268 |
269 | Now you can use `@mcachew` in place of `@cachew`, and be certain things don't break if `cachew` is missing.
270 |
271 | ## Settings
272 |
273 |
274 | [cachew.settings](src/cachew/__init__.py#L64) exposes some parameters that allow you to control `cachew` behaviour:
275 | - `ENABLE`: set to `False` if you want to disable caching without removing the decorators (useful for testing and debugging).
276 | You can also use [cachew.extra.disabled_cachew](src/cachew/extra.py#L21) context manager to do it temporarily.
277 | - `DEFAULT_CACHEW_DIR`: override to set a different base directory. The default is the "user cache directory" (see [platformdirs docs](https://github.com/tox-dev/platformdirs?tab=readme-ov-file#example-output)).
278 | - `THROW_ON_ERROR`: by default, cachew is defensive and simply attempts to call the original function on caching issues.
279 | Set to `True` to catch errors earlier.
280 | - `DEFAULT_BACKEND`: currently supported are `sqlite` and `file` (file is somewhat experimental, although should work too).
281 |
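
For instance (a sketch; the directory here is just a placeholder):

```python
from pathlib import Path

import cachew
import cachew.extra

cachew.settings.ENABLE = False  # all @cachew decorators become no-ops
cachew.settings.DEFAULT_CACHEW_DIR = Path('/tmp/my-cachew')  # hypothetical cache dir
cachew.settings.THROW_ON_ERROR = True  # raise on caching issues instead of falling back

# or disable caching only temporarily:
with cachew.extra.disabled_cachew():
    ...  # calls in here bypass the cache
```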
282 |
283 | ## Updating this readme
284 | This is a literate readme, implemented as a Jupyter notebook: [README.ipynb](README.ipynb). To update the (autogenerated) [README.md](README.md), use the [generate-readme](generate-readme) script.
285 |
--------------------------------------------------------------------------------
/benchmarks/20230912-comparison-with-legacy.org:
--------------------------------------------------------------------------------
1 | Running on @karlicoss desktop PC, =python3.10=.
2 |
3 | This is basically to justify switching to the new serialization method
4 |
5 | - the old way, =legacy=, used to 'flatten' the type into an sqlite row
6 | - the new way, =cachew=, just dumps it as a dict, then to bytes via =orjson=, and stores it in a single sqlite column
7 |
8 | The numbers between legacy and cachew can't be directly compared though.
9 | Legacy =serializing= step emits a tuple, which can be inserted directly into the database.
10 | So to compare it with the new way, we need to compare with the sum of =serializing= + =json dump=.
11 | That said, this won't be an exact comparison either, since the legacy binder relied on sqlalchemy to dump custom types to sqlite types (e.g. =datetime= or =Exception=). So legacy has a slight advantage this way, but it's fine.
12 |
13 | So we can see that for:
14 | - =test_union_str_dataclass=
15 | - new implementation: =0.53 + 0.45s= to serialize; =0.29 + 0.48= to deserialize
16 | - old implementation: =2.38s= to serialize; =1.92= to deserialize
17 | - =test_nested_dataclass=
18 | - new implementation: =1.05 + 0.26s= to serialize; =0.50 + 1.42= to deserialize
19 | - old implementation: =1.92s= to serialize; =1.88= to deserialize
20 |
21 | For both tests, serialization is quite a bit faster with the new implementation.
22 | On the second test, they are on par for deserialization, but as mentioned above, these numbers are skewed in favor of the legacy implementation.
23 |
24 | In addition, keeping everything in one column unlocks some other optimizations which wouldn't be possible with multiple columns.
25 |
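To illustrate the difference between the two strategies (a toy example; =Measurement= and the exact column layout are made up):

#+begin_src python
from dataclasses import dataclass
from datetime import datetime

import orjson


@dataclass
class Measurement:
    dt: datetime
    temp: float


m = Measurement(dt=datetime(2023, 9, 12), temp=21.5)

# legacy: 'flatten' into a tuple of sqlite-friendly columns, one per field
legacy_row = (m.dt.isoformat(), m.temp)

# new: convert to a json-ish dict, then dump it to bytes via orjson,
# stored in a single sqlite column
blob = orjson.dumps({'dt': m.dt.isoformat(), 'temp': m.temp})
#+end_src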
26 |
27 | #+begin_example
28 | $ pytest --pyargs cachew.tests.marshall -k 'gc_off and 1000000 and not cattrs' -s
29 | =========================================================== test session starts ============================================================
30 | platform linux -- Python 3.10.12, pytest-7.3.1, pluggy-1.0.0 -- /usr/bin/python3
31 | cachedir: .pytest_cache
32 | rootdir: /code/cachew_jsonpickle
33 | configfile: pytest.ini
34 | plugins: anyio-3.6.2
35 | collected 100 items / 95 deselected / 5 selected
36 |
37 | src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cachew]
38 | building 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.34s
39 | serializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.53s
40 | json dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.45s
41 | sqlite dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 1.08s
42 | sqlite load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.45s
43 | jsonl dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.18s
44 | jsonl load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.13s
45 | json load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.29s
46 | deserializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.48s
47 | PASSED
48 | src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-legacy]
49 | building 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.35s
50 | serializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 2.38s
51 | json dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.22s
52 | sqlite dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 1.06s
53 | sqlite load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.29s
54 | jsonl dump 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.12s
55 | jsonl load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.12s
56 | json load 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 0.23s
57 | deserializing 1000000 objects of type typing.Union[str, cachew.tests.marshall.Name]: 1.92s
58 | PASSED
59 | src/cachew/tests/marshall.py::test_nested_dataclass[gc_off-1000000-cachew]
60 | building 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.58s
61 | serializing 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 1.05s
62 | json dump 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.26s
63 | sqlite dump 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 1.03s
64 | sqlite load 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.30s
65 | jsonl dump 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.14s
66 | jsonl load 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.14s
67 | json load 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.50s
68 | deserializing 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 1.42s
69 | PASSED
70 | src/cachew/tests/marshall.py::test_nested_dataclass[gc_off-1000000-legacy]
71 | building 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.56s
72 | serializing 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 1.92s
73 | json dump 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.21s
74 | sqlite dump 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.99s
75 | sqlite load 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.29s
76 | jsonl dump 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.12s
77 | jsonl load 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.12s
78 | json load 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 0.24s
79 | deserializing 1000000 objects of type <class 'cachew.tests.marshall.test_nested_dataclass.<locals>.TE2'>: 1.88s
80 | PASSED
81 | #+end_example
82 |
--------------------------------------------------------------------------------
/benchmarks/20230912.org:
--------------------------------------------------------------------------------
1 | Running on @karlicoss desktop PC, =python3.10=
2 |
3 | - serializing/deserializing here refers to converting an object to a json-ish python dictionary and back (not an actual json string!)
4 | - json dump/json load refers to converting the dict above to a json string and back
5 | - sqlite dump/load and jsonl dump/load refer to saving these strings to persistent storage and reading them back (see the sketch below)
6 |
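7 | A minimal sketch of the measured pipeline (illustrative only; =marshall= is a stand-in for the implementation under test, and a =cache= table with a single blob column is assumed):
8 |
9 | #+begin_src python
10 | import sqlite3
11 |
12 | import orjson
13 |
14 | def roundtrip(marshall, objs, db: sqlite3.Connection):
15 |     dicts = [marshall.dump(o) for o in objs]  # "serializing"
16 |     blobs = [orjson.dumps(d) for d in dicts]  # "json dump"
17 |     db.executemany('INSERT INTO cache VALUES (?)', [(b,) for b in blobs])  # "sqlite dump"
18 |     blobs2 = [row[0] for row in db.execute('SELECT * FROM cache')]  # "sqlite load"
19 |     dicts2 = [orjson.loads(b) for b in blobs2]  # "json load"
20 |     return [marshall.load(d) for d in dicts2]  # "deserializing"
21 | #+end_src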
7 |
8 | #+begin_example
9 | $ pytest --pyargs --ignore-glob '*test_cachew*' -k marshall -s
10 | =========================================================== test session starts ============================================================
11 | platform linux -- Python 3.10.6, pytest-7.3.1, pluggy-1.0.0 -- /usr/bin/python3
12 | cachedir: .pytest_cache
13 | configfile: pytest.ini
14 | plugins: anyio-3.6.2
15 | collected 37 items / 8 deselected / 29 selected
16 |
17 | src/cachew/marshall/cachew.py::test_serialize_and_deserialize PASSED
18 | src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cachew]
19 | building 1000000 objects of type str | cachew.tests.marshall.Name: 0.60s
20 | serializing 1000000 objects of type str | cachew.tests.marshall.Name: 0.85s
21 | json dump 1000000 objects of type str | cachew.tests.marshall.Name: 0.46s
22 | sqlite dump 1000000 objects of type str | cachew.tests.marshall.Name: 1.11s
23 | sqlite load 1000000 objects of type str | cachew.tests.marshall.Name: 0.31s
24 | jsonl dump 1000000 objects of type str | cachew.tests.marshall.Name: 0.13s
25 | jsonl load 1000000 objects of type str | cachew.tests.marshall.Name: 0.13s
26 | json load 1000000 objects of type str | cachew.tests.marshall.Name: 1.04s
27 | deserializing 1000000 objects of type str | cachew.tests.marshall.Name: 0.86s
28 | PASSED
29 | src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cattrs] SKIPPED (TODO need to adjust the handling of Union ...)
30 | src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cachew]
31 | building 5000000 objects of type str | cachew.tests.marshall.Name: 3.00s
32 | serializing 5000000 objects of type str | cachew.tests.marshall.Name: 4.38s
33 | json dump 5000000 objects of type str | cachew.tests.marshall.Name: 2.14s
34 | sqlite dump 5000000 objects of type str | cachew.tests.marshall.Name: 5.43s
35 | sqlite load 5000000 objects of type str | cachew.tests.marshall.Name: 1.47s
36 | jsonl dump 5000000 objects of type str | cachew.tests.marshall.Name: 0.62s
37 | jsonl load 5000000 objects of type str | cachew.tests.marshall.Name: 0.64s
38 | json load 5000000 objects of type str | cachew.tests.marshall.Name: 4.74s
39 | deserializing 5000000 objects of type str | cachew.tests.marshall.Name: 4.06s
40 | PASSED
41 | src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cattrs] SKIPPED (TODO need to adjust the handling of Union ...)
42 | src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cattrs] SKIPPED (TODO need to adjust the handling of Union...)
43 | src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cachew]
44 | building 5000000 objects of type str | cachew.tests.marshall.Name: 1.77s
45 | serializing 5000000 objects of type str | cachew.tests.marshall.Name: 2.59s
46 | json dump 5000000 objects of type str | cachew.tests.marshall.Name: 1.22s
47 | sqlite dump 5000000 objects of type str | cachew.tests.marshall.Name: 5.28s
48 | sqlite load 5000000 objects of type str | cachew.tests.marshall.Name: 1.58s
49 | jsonl dump 5000000 objects of type str | cachew.tests.marshall.Name: 0.64s
50 | jsonl load 5000000 objects of type str | cachew.tests.marshall.Name: 0.66s
51 | json load 5000000 objects of type str | cachew.tests.marshall.Name: 1.53s
52 | deserializing 5000000 objects of type str | cachew.tests.marshall.Name: 2.60s
53 | PASSED
54 | src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cattrs] SKIPPED (TODO need to adjust the handling of Union...)
55 | src/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cachew]
56 | building 1000000 objects of type <class 'datetime.datetime'>: 1.05s
57 | serializing 1000000 objects of type <class 'datetime.datetime'>: 1.28s
58 | json dump 1000000 objects of type <class 'datetime.datetime'>: 0.22s
59 | sqlite dump 1000000 objects of type <class 'datetime.datetime'>: 1.14s
60 | sqlite load 1000000 objects of type <class 'datetime.datetime'>: 0.30s
61 | jsonl dump 1000000 objects of type <class 'datetime.datetime'>: 0.14s
62 | jsonl load 1000000 objects of type <class 'datetime.datetime'>: 0.14s
63 | json load 1000000 objects of type <class 'datetime.datetime'>: 0.70s
64 | deserializing 1000000 objects of type <class 'datetime.datetime'>: 2.20s
65 | PASSED
66 | src/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs)
67 | src/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cachew]
68 | building 5000000 objects of type <class 'datetime.datetime'>: 5.08s
69 | serializing 5000000 objects of type <class 'datetime.datetime'>: 6.35s
70 | json dump 5000000 objects of type <class 'datetime.datetime'>: 1.13s
71 | sqlite dump 5000000 objects of type <class 'datetime.datetime'>: 5.58s
72 | sqlite load 5000000 objects of type <class 'datetime.datetime'>: 1.47s
73 | jsonl dump 5000000 objects of type <class 'datetime.datetime'>: 0.69s
74 | jsonl load 5000000 objects of type <class 'datetime.datetime'>: 0.70s
75 | json load 5000000 objects of type <class 'datetime.datetime'>: 6.85s
76 | deserializing 5000000 objects of type <class 'datetime.datetime'>: 11.10s
77 | PASSED
78 | src/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs)
79 | src/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cachew]
80 | building 1000000 objects of type <class 'datetime.datetime'>: 1.37s
81 | serializing 1000000 objects of type <class 'datetime.datetime'>: 1.25s
82 | json dump 1000000 objects of type <class 'datetime.datetime'>: 0.24s
83 | sqlite dump 1000000 objects of type <class 'datetime.datetime'>: 1.12s
84 | sqlite load 1000000 objects of type <class 'datetime.datetime'>: 0.29s
85 | jsonl dump 1000000 objects of type <class 'datetime.datetime'>: 0.14s
86 | jsonl load 1000000 objects of type <class 'datetime.datetime'>: 0.14s
87 | json load 1000000 objects of type <class 'datetime.datetime'>: 0.24s
88 | deserializing 1000000 objects of type <class 'datetime.datetime'>: 2.17s
89 | PASSED
90 | src/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs)
91 | src/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cachew]
92 | building 5000000 objects of type <class 'datetime.datetime'>: 5.10s
93 | serializing 5000000 objects of type <class 'datetime.datetime'>: 6.22s
94 | json dump 5000000 objects of type <class 'datetime.datetime'>: 1.17s
95 | sqlite dump 5000000 objects of type <class 'datetime.datetime'>: 5.43s
96 | sqlite load 5000000 objects of type <class 'datetime.datetime'>: 1.54s
97 | jsonl dump 5000000 objects of type <class 'datetime.datetime'>: 0.70s
98 | jsonl load 5000000 objects of type <class 'datetime.datetime'>: 0.71s
99 | json load 5000000 objects of type <class 'datetime.datetime'>: 1.22s
100 | deserializing 5000000 objects of type <class 'datetime.datetime'>: 10.97s
101 | PASSED
102 | src/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cattrs] SKIPPED (TODO support datetime with pytz for cattrs)
103 | src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cachew]
104 | building 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.64s
105 | serializing 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.43s
106 | json dump 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.30s
107 | sqlite dump 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.16s
108 | sqlite load 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.30s
109 | jsonl dump 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.15s
110 | jsonl load 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.15s
111 | json load 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.02s
112 | deserializing 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 2.78s
113 | PASSED
114 | src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cattrs]
115 | building 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.88s
116 | serializing 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.80s
117 | json dump 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.31s
118 | sqlite dump 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.39s
119 | sqlite load 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.31s
120 | jsonl dump 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.15s
121 | jsonl load 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.15s
122 | json load 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.03s
123 | deserializing 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 2.61s
124 | PASSED
125 | src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cachew]
126 | building 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.57s
127 | serializing 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.08s
128 | json dump 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.29s
129 | sqlite dump 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.09s
130 | sqlite load 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.30s
131 | jsonl dump 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.15s
132 | jsonl load 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.15s
133 | json load 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.50s
134 | deserializing 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.43s
135 | PASSED
136 | src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cattrs]
137 | building 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.57s
138 | serializing 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.39s
139 | json dump 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.29s
140 | sqlite dump 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.16s
141 | sqlite load 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.32s
142 | jsonl dump 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.16s
143 | jsonl load 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.15s
144 | json load 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 0.50s
145 | deserializing 1000000 objects of type <class 'cachew.tests.marshall.test_many_from_cachew.<locals>.TE2'>: 1.29s
146 | PASSED
147 |
148 | ============================================================ slowest durations =============================================================
149 | 44.87s call src/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cachew]
150 | 38.76s call src/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cachew]
151 | 28.65s call src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cachew]
152 | 20.05s call src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cachew]
153 | 9.82s call src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cachew]
154 | 9.51s call src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cattrs]
155 | 8.37s call src/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cachew]
156 | 8.20s call src/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cachew]
157 | 6.45s call src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cachew]
158 | 5.93s call src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cachew]
159 | 5.78s call src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cattrs]
160 | 3.98s call src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cachew]
161 | 0.01s call src/cachew/marshall/cachew.py::test_serialize_and_deserialize
162 |
163 | (68 durations < 0.005s hidden. Use -vv to show these durations.)
164 | ========================================================= short test summary info ==========================================================
165 | SKIPPED [6] src/cachew/tests/marshall.py:171: TODO need to adjust the handling of Union types..
166 | SKIPPED [4] src/cachew/tests/marshall.py:194: TODO support datetime with pytz for cattrs
167 | PASSED src/cachew/marshall/cachew.py::test_serialize_and_deserialize
168 | PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-1000000-cachew]
169 | PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_on-5000000-cachew]
170 | PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-1000000-cachew]
171 | PASSED src/cachew/tests/marshall.py::test_union_str_dataclass[gc_off-5000000-cachew]
172 | PASSED src/cachew/tests/marshall.py::test_datetimes[gc_on-1000000-cachew]
173 | PASSED src/cachew/tests/marshall.py::test_datetimes[gc_on-5000000-cachew]
174 | PASSED src/cachew/tests/marshall.py::test_datetimes[gc_off-1000000-cachew]
175 | PASSED src/cachew/tests/marshall.py::test_datetimes[gc_off-5000000-cachew]
176 | PASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cachew]
177 | PASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_on-1000000-cattrs]
178 | PASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cachew]
179 | PASSED src/cachew/tests/marshall.py::test_many_from_cachew[gc_off-1000000-cattrs]
180 | #+end_example
181 |
--------------------------------------------------------------------------------
/benchmarks/20230917.org:
--------------------------------------------------------------------------------
1 | Running on @karlicoss desktop PC, =python3.10=
2 |
3 | Just a comparison of =sqlite= and =file= backends.
4 |
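5 | The backend can be chosen per function; a sketch (assuming the decorator's =backend= argument, which defaults to sqlite):
6 |
7 | #+begin_src python
8 | from collections.abc import Iterator
9 |
10 | from cachew import cachew
11 |
12 | @cachew('/tmp/cache/items', backend='file')  # or backend='sqlite'
13 | def items() -> Iterator[int]:
14 |     yield from range(3)
15 | #+end_src
16 |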
5 | #+begin_example
6 | $ pytest --pyargs -k 'test_many and gc_off and 3000000' -s
7 | src/cachew/tests/test_cachew.py::test_many[sqlite-gc_off-3000000] [INFO 2023-09-17 02:02:09,946 cachew __init__.py:657 ] cachew.tests.test_cachew:test_many.<locals>.iter_data: wrote 3000000 objects to cachew (sqlite:/tmp/pytest-of-karlicos/pytest-129/test_many_sqlite_gc_off_3000000/test_many)
8 | test_many: initial write to cache took 13.6s
9 | test_many: cache size is 229.220352Mb
10 | [INFO 2023-09-17 02:02:10,780 cachew __init__.py:662 ] cachew.tests.test_cachew:test_many.<locals>.iter_data: loading 3000000 objects from cachew (sqlite:/tmp/pytest-of-karlicos/pytest-129/test_many_sqlite_gc_off_3000000/test_many)
11 | test_many: reading from cache took 7.0s
12 | PASSED
13 | src/cachew/tests/test_cachew.py::test_many[file-gc_off-3000000] [INFO 2023-09-17 02:02:23,944 cachew __init__.py:657 ] cachew.tests.test_cachew:test_many.<locals>.iter_data: wrote 3000000 objects to cachew (file:/tmp/pytest-of-karlicos/pytest-129/test_many_file_gc_off_3000000_0/test_many)
14 | test_many: initial write to cache took 6.1s
15 | test_many: cache size is 202.555667Mb
16 | [INFO 2023-09-17 02:02:23,945 cachew __init__.py:662 ] cachew.tests.test_cachew:test_many.<locals>.iter_data: loading objects from cachew (file:/tmp/pytest-of-karlicos/pytest-129/test_many_file_gc_off_3000000_0/test_many)
17 | test_many: reading from cache took 5.4s
18 | #+end_example
19 |
--------------------------------------------------------------------------------
/conftest.py:
--------------------------------------------------------------------------------
1 | # this is a hack to monkey patch pytest so it handles tests inside namespace packages without __init__.py properly
2 | # without it, pytest can't discover the package root for some reason
3 | # also see https://github.com/karlicoss/pytest_namespace_pkgs for more
4 |
5 | import os
6 | import pathlib
7 | from typing import Optional
8 |
9 | import _pytest.main
10 | import _pytest.pathlib
11 |
12 | # we consider all dirs in repo/ to be namespace packages
13 | root_dir = pathlib.Path(__file__).absolute().parent.resolve() / 'src'
14 | assert root_dir.exists(), root_dir
15 |
16 | # TODO assert it contains package name?? maybe get it via setuptools..
17 |
18 | namespace_pkg_dirs = [str(d) for d in root_dir.iterdir() if d.is_dir()]
19 |
20 | # resolve_package_path is called from _pytest.pathlib.import_path
21 | # takes a full abs path to the test file and needs to return the path to the 'root' package on the filesystem
22 | resolve_pkg_path_orig = _pytest.pathlib.resolve_package_path
23 |
24 |
25 | def resolve_package_path(path: pathlib.Path) -> Optional[pathlib.Path]:
26 | result = path # search from the test file upwards
27 | for parent in result.parents:
28 | if str(parent) in namespace_pkg_dirs:
29 | return parent
30 | if os.name == 'nt':
31 | # ??? for some reason on windows it is trying to call this against conftest? but not on linux/osx
32 | if path.name == 'conftest.py':
33 | return resolve_pkg_path_orig(path)
34 | raise RuntimeError("Couldn't determine path for ", path)
35 |
36 |
37 | _pytest.pathlib.resolve_package_path = resolve_package_path
38 |
39 |
40 | # without patching, the orig function returns just a package name for some reason
41 | # (I think it's used as a sort of fallback)
42 | # so we need to point it at the absolute path properly
43 | # not sure what are the consequences.. maybe it wouldn't be able to run against installed packages? not sure..
44 | search_pypath_orig = _pytest.main.search_pypath
45 |
46 |
47 | def search_pypath(module_name: str) -> str:
48 | mpath = root_dir / module_name.replace('.', os.sep)
49 | if not mpath.is_dir():
50 | mpath = mpath.with_suffix('.py')
51 | assert mpath.exists(), mpath # just in case
52 | return str(mpath)
53 |
54 |
55 | _pytest.main.search_pypath = search_pypath
56 |
--------------------------------------------------------------------------------
/doc/cachew_disable.md:
--------------------------------------------------------------------------------
1 | Can put this in the README.md once it's been tested a bit
2 |
3 | ### Disable through Environment Variables
4 |
5 | To disable a `cachew` function in some module, you can use the `CACHEW_DISABLE` environment variable. This is a colon-delimited (like a `$PATH`) list of module names to disable. Matching is recursive (disabling a module also disables all of its submodules), and [unix-style globs](https://docs.python.org/3/library/fnmatch.html) are supported.
6 |
7 | For example, say you were using [HPI](https://github.com/karlicoss/HPI) which internally uses a snippet like `mcachew` above. You may want to enable `cachew` for _most_ modules, but disable them for specific ones. For example take:
8 |
9 | ```
10 | my/browser
11 | ├── active_browser.py
12 | ├── all.py
13 | ├── common.py
14 | └── export.py
15 | my/reddit
16 | ├── __init__.py
17 | ├── all.py
18 | ├── common.py
19 | ├── pushshift.py
20 | └── rexport.py
21 | ```
22 |
23 | To disable `cachew` in all of these files: `export CACHEW_DISABLE=my.browser:my.reddit` (disables for all submodules)
24 |
25 | To disable just for a particular module: `export CACHEW_DISABLE='my.browser.export'`
26 |
27 | Similarly to `$PATH` manipulations, you can do this in your shell configuration incrementally:
28 |
29 | ```
30 | CACHEW_DISABLE='my.reddit.rexport'
31 | if some condition...; then
32 | CACHEW_DISABLE="my.browser.export:$CACHEW_DISABLE"
33 | fi
34 | export CACHEW_DISABLE
35 | ```
36 |
37 | You can also use globs, e.g. `CACHEW_DISABLE='my.*.gdpr'`
38 |
39 | To disable `cachew` everywhere, you could set `export CACHEW_DISABLE='*'`
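40 |
41 | For a rough idea of the semantics, here's a sketch (illustrative, not the actual `cachew` code) of how a module name might be checked against `CACHEW_DISABLE`, assuming recursive matching plus unix-style globs via `fnmatch`:
42 |
43 | ```
44 | import fnmatch
45 | import os
46 |
47 | def is_disabled(module: str) -> bool:
48 |     patterns = [p for p in os.environ.get('CACHEW_DISABLE', '').split(':') if p]
49 |     # 'my.browser' should also disable e.g. 'my.browser.export', hence the '.*' variant
50 |     return any(
51 |         fnmatch.fnmatch(module, pat) or fnmatch.fnmatch(module, pat + '.*')
52 |         for pat in patterns
53 |     )
54 | ```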
40 |
--------------------------------------------------------------------------------
/doc/serialization.org:
--------------------------------------------------------------------------------
1 | Cachew works kinda like =functools.lru_cache=, but it also works in-between program runs.
2 | For that, it needs to somehow persist the objects on the disk (unlike =lru_cache= which just keeps references to the objects already in process memory).
3 |
4 | While persisting objects to the cache, essentially cachew needs to map them into simpler types, i.e. ones you can keep in a database like strings/ints/binary blobs.
5 |
6 | At the moment (as of =v0.13.0=), we use sqlite as the cache store, with =sqlalchemy= as the interface to interact with it.
7 |
8 | The way cachew works now is, to save an object in the cache:
9 |
10 | - first it's "flattened out" to conform to the database row model, so individual fields (including recursive fields) become database columns
11 | - python types are mapped into sqlalchemy types, with extra =sqlalchemy.TypeDecorator= instances to support custom types like =datetime= or =Exception=
12 |
13 | You can find a more detailed example [[https://github.com/karlicoss/cachew/blob/175afade0a417bfd533ced174365d246b8a7dabc/src/cachew/__init__.py#L319-L353][here]].
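14 |
15 | Roughly, the flattening looks like this (a condensed sketch, not the actual cachew code):
16 |
17 | #+begin_src python
18 | from dataclasses import dataclass
19 |
20 | @dataclass
21 | class Inner:
22 |     value: int
23 |
24 | @dataclass
25 | class Outer:
26 |     name: str
27 |     inner: Inner
28 |
29 | # Outer(name='x', inner=Inner(value=1)) is "flattened out" into a single row
30 | # with columns (name, inner_value), i.e. recursive fields become columns too
31 | #+end_src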
14 |
15 | A big problem is that in general it's not really possible to serialize, and especially to deserialize back, an arbitrary object in Python, unless you resort to binary serialization like =pickle= (which is very slow and comes with its own host of issues).
16 |
17 | However in cachew we require the user to supply the *type signature* for the functions that are cached, so we can benefit from it for serializing and deserializing.
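18 |
19 | For illustration, this is the kind of signature cachew gets to work with (a sketch):
20 |
21 | #+begin_src python
22 | from collections.abc import Iterator
23 | from dataclasses import dataclass
24 |
25 | from cachew import cachew
26 |
27 | @dataclass
28 | class Person:
29 |     name: str
30 |
31 | @cachew('/tmp/cache/people')
32 | def people() -> Iterator[Person]:  # this annotation drives (de)serialization
33 |     yield Person('Ann')
34 | #+end_src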
18 |
19 | A few years ago, when I first implemented =cachew=, there weren't really many options for serialization driven by type signatures, so I implemented the custom code I mentioned above to support that. In 2023, however, more and more libraries are benefiting from type signatures, in particular for serializing stuff.
20 |
21 | So I decided to give it another go, in the hope of using some mature library, simplifying cachew's code, and possibly getting a performance boost.
22 | It's possible that I missed some documentation so if you think the problems I am describing can actually be worked around, please don't hesitate to let me know.
23 |
24 | * Comparison
25 |
26 | In cachew, the very minimum we're aiming to support is:
27 |
28 | - all json-ish types, e.g. =int=/=str=/=dict=/=list= etc
29 | - =dataclass= and =NamedTuple=
30 | - =Optional= and =Union=
31 | - custom types, e.g. =datetime=, =Exception= (e.g. at least preserve exception message)
32 |
33 | See [[file:test_serialization.py]] for more specific examples and supporting evidence for my summary here.
34 |
35 | ** [[https://docs.python.org/3.10/library/pickle.html][pickle]]
36 | Builtin pickle module can handle any objects, without even needing type annotations.
37 |
38 | However, it's [[https://www.benfrederickson.com/dont-pickle-your-data/][famously very slow]], so I even didn't consider using it.
39 |
40 | It's also not secure in general, although in our case we control the objects we save/load from cache, so it's not a big issue.
41 |
42 | ** [[https://github.com/jsonpickle/jsonpickle#readme][jsonpickle]]
43 | Jsonpickle -- similar to pickle, can handle any types.
44 |
45 | I [[https://github.com/karlicoss/cachew/commit/048df33e65560205d63845f022b027a27719ff48][gave it a go]] just in case, and it's an order of magnitude slower than the custom serialization code I already had, which is a no-go.
46 |
47 | ** [[https://github.com/lidatong/dataclasses-json/#readme][dataclasses-json]]
48 | # TODO link to code
49 | - CON: requires annotating all dataclasses involved with =@dataclass_json=, recursively.
50 | This is a blocker from using it in =cachew=.
51 | - CON: requires the annotated type to be a =@dataclass=
52 | So if you have something simpler you'll have to wrap it into a dummy dataclass or something.
53 | - PRO: supports =Union= correctly
54 |
55 | ** [[https://github.com/marshmallow-code/marshmallow][marshmallow]]
56 |
57 | By default marshmallow doesn't support dataclasses or unions, but there are some extra packages:
58 |
59 | - for dataclasses https://github.com/lovasoa/marshmallow_dataclass
60 | - PRO: doesn't require modifying the original class, handles recursion out of the box
61 | - CON: doesn't handle =Union= correctly
62 | This is a blocker for cachew.
63 | In addition it has a custom implementation of Union handling (rather than e.g. relying on =python-marshmallow-union=).
64 | - https://github.com/adamboche/python-marshmallow-union
65 | I didn't even get to try it since if dataclasses don't work marshmallow is a no-go for me.
66 | Plus for some reason =marshmallow_dataclass= has a custom Union handling implementation which is different from this one, so it's going to be a huge mess.
67 |
68 | ** [[https://github.com/pydantic/pydantic#readme][pydantic]]
69 | - PRO: if you use =TypeAdapter=, you can serialize/deserialize arbitrary types without decorating/inheriting from =BaseModel=
70 | - CON: doesn't handle =Union= correctly
71 | Again, this is a big blocker. I've created an issue on the pydantic bug tracker here: https://github.com/pydantic/pydantic/issues/7391
72 |
73 | Kind of sad, because otherwise pydantic seemed promising!
74 |
75 | ** [[https://github.com/python-attrs/cattrs#features][cattrs]]
76 | - PRO: doesn't require modifying the classes you serialise
77 | - PRO: rich feature set, clearly aiming to comply with standard python's typing annotations
78 | - CON: there is an issue with handling =NamedTuple=
79 |
80 | It isn't converted to a dictionary the way a =dataclass= is, [[https://github.com/python-attrs/cattrs/issues/425][likely a bug]]?
81 | - =Union= types are supported, but require some extra configuration
82 |
83 | Unions work, but you have to 'register' them first.
84 | A bit annoying that this is necessary even for simple unions like =int | str=, although [[https://github.com/python-attrs/cattrs/issues/423][possible]] to work around.
85 |
86 | The plus side is that cattrs has a builtin utility for Union type discrimination.
87 |
88 | I guess for my application I could traverse the type and register all necessary Unions with =cattrs=?
89 | # TODO create an issue to support opting in everywhere by default?
90 |
91 |
92 | Since the above seems quite good, I did a quick cachew hack on [[https://github.com/karlicoss/cachew/tree/cattrs][cattrs branch]] to try and use it.
93 |
94 | The pipeline is the following:
95 | - serialize the object to a dictionary of primitive types via =cattrs=
96 | - serialize the dictionary to a byte string via =orjson=
97 | - persist the byte string as an sqlite database row
98 |
99 | (for deserializing we just do the same in reverse)
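100 |
101 | A minimal sketch of that pipeline (assuming a =Converter= already configured for the type, and a =cache= table with a single blob column):
102 |
103 | #+begin_src python
104 | import sqlite3
105 |
106 | import orjson
107 | from cattrs import Converter
108 |
109 | def save(conn: sqlite3.Connection, converter: Converter, objs, Type) -> None:
110 |     rows = [(orjson.dumps(converter.unstructure(o, Type)),) for o in objs]
111 |     conn.executemany('INSERT INTO cache VALUES (?)', rows)
112 |
113 | def load(conn: sqlite3.Connection, converter: Converter, Type):
114 |     for (blob,) in conn.execute('SELECT * FROM cache'):
115 |         yield converter.structure(orjson.loads(blob), Type)
116 | #+end_src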
100 |
101 | You can find the results [[https://github.com/karlicoss/cachew/commit/82691b10cd1d4ced4862dff21cf038fb83f9525c][here]] -- cattrs proved to give a huge speedup over my custom serialization code!
102 |
103 | It needs a bit more work and evaluation for use in =cachew=, however it's super promising!
104 |
105 | # TODO https://catt.rs/en/stable/preconf.html#orjson
106 |
107 | Some interesting reading about cattrs:
108 | - https://threeofwands.com/why-cattrs-is-so-fast/#v2-the-genconverter
109 | - https://threeofwands.com/why-i-use-attrs-instead-of-pydantic
110 |
111 | * Verdict
112 |
113 | The biggest shared issues are that most of these libraries:
114 | - require modifying the original class definitions, either by inheriting or decorating
115 | - don't handle =Union= at all, or don't handle it correctly (usually relying on structural equivalence rather than actual types)
116 |
117 | So for most of them, I didn't even get to trying to support custom types and measuring performance with =cachew=.
118 |
119 | Of all of them, only =cattrs= stood out: it takes builtin python typing and performance very seriously, and it's very configurable.
120 | So if you need no-bullshit serialization in python, I can definitely recommend it.
121 | I might switch to it in [[https://github.com/karlicoss/promnesia][promnesia]] (where we have full control over the type we serialize in the database), and it could potentially be used in HPI for [[https://github.com/karlicoss/HPI/blob/master/my/core/serialize.py][my.core.serialize]].
122 |
--------------------------------------------------------------------------------
/doc/test_serialization.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from dataclasses import dataclass
3 | from typing import NamedTuple, Union
4 |
5 |
6 | def test_dataclasses_json():
7 | # pip install dataclasses-json
8 | from dataclasses_json import dataclass_json
9 |
10 | @dataclass
11 | class Inner:
12 | value: int
13 |
14 | @dataclass
15 | class Outer:
16 | inner: Inner
17 |
18 | ### issue 1: requires @dataclass_json annotation on all involved dataclasses
19 | obj = Outer(inner=Inner(value=123)) # noqa: F841
20 |
21 | # we don't control the types that are passed to us, so we can't use the @dataclass_json
22 | # but we can just call the decorator directly
23 |
24 | # HOWEVER: this modifies the original class, Outer!!
25 | OuterJson = dataclass_json(Outer) # noqa: F841
26 | # it adds 'from_dict', 'from_json', 'schema', 'to_dict', 'to_json' attributes to it
27 |
28 | # now if you try
29 | # print(OuterJson.schema().dump(obj))
30 | # you get a warning that it wants you to add annotations to Inner classes too.
31 | # this isn't really an option for us.
32 | ###
33 |
34 | ### issue 2: can't dump anything unless the top level type is a dataclass?
35 | ### could wrap it into a dummy dataclass or something, but that's wasteful in terms of performance
36 | ###
37 |
38 | ### nice thing: correctly serializes Union types, even if they share the same attributes
39 | @dataclass_json
40 | @dataclass
41 | class City:
42 | name: str
43 |
44 | @dataclass_json
45 | @dataclass
46 | class Country:
47 | name: str
48 |
49 | @dataclass_json
50 | @dataclass
51 | class WithUnion:
52 | union: Union[City, Country]
53 |
54 | objs = [
55 | WithUnion(union=City(name='London')),
56 | WithUnion(union=Country(name='UK')),
57 | ]
58 |
59 | schema = WithUnion.schema()
60 | json = schema.dumps(objs, many=True)
61 | objs2 = schema.loads(json, many=True)
62 | print("objects ", objs)
63 | print("json ", json)
64 | # NOTE: it dumps [{"union": {"name": "London", "__type": "City"}}, {"union": {"name": "UK", "__type": "Country"}}]
65 | # so types are correctly distinguished
66 | print("restored ", objs2)
67 | assert objs == objs2, (objs, objs2)
68 | ###
69 |
70 |
71 | def test_marshmallow_dataclass():
72 | # pip3 install --user marshmallow-dataclass[union]
73 | import marshmallow_dataclass
74 |
75 | ### issue 1: the top level type has to be a dataclass?
76 | ### although possible that we could use regular marshmallow for that instead
77 | ###
78 |
79 | ### issue 2: doesn't handle unions correctly
80 | @dataclass
81 | class City:
82 | name: str
83 |
84 | @dataclass
85 | class Country:
86 | name: str
87 |
88 | @dataclass
89 | class WithUnion:
90 | union: Union[City, Country]
91 |
92 | objs = [
93 | WithUnion(union=City(name="London")),
94 | WithUnion(union=Country(name="UK")),
95 | ]
96 |
97 | # NOTE: good, doesn't require adding annotations on the original classes
98 | schema = marshmallow_dataclass.class_schema(WithUnion)()
99 |
100 | json = schema.dumps(objs, many=True)
101 | objs2 = schema.loads(json, many=True)
102 | print("objects ", objs)
103 | print("json ", json)
104 | # NOTE: it dumps [{"union": {"name": "London"}}, {"union": {"name": "UK"}}]
105 | # so it doesn't distinguish based on types => won't deserialize correctly
106 | print("restored ", objs2)
107 | # assert objs == objs2, (objs, objs2)
108 | # ^ this assert fails!
109 | ###
110 |
111 |
112 | def test_pydantic():
113 | from pydantic import TypeAdapter
114 |
115 | ### issue: doesn't handle Unions correctly
116 | @dataclass
117 | class City:
118 | name: str
119 |
120 | @dataclass
121 | class Country:
122 | name: str
123 |
124 | @dataclass
125 | class WithUnion:
126 | union: Union[City, Country]
127 |
128 | objs = [
129 | WithUnion(union=City(name="London")),
130 | WithUnion(union=Country(name="UK")),
131 | ]
132 |
133 | # NOTE: nice, doesn't require annotating the original classes with anything
134 | Schema = TypeAdapter(list[WithUnion])
135 |
136 | json = Schema.dump_python(
137 | objs,
138 | # round_trip: Whether to output the serialized data in a way that is compatible with deserialization
139 | # not sure, doesn't seem to impact anything..
140 | round_trip=True,
141 | )
142 | objs2 = Schema.validate_python(json)
143 |
144 | print("objects ", objs)
145 | print("json ", json)
146 | print("restored ", objs2)
147 |
148 | # assert objs == objs2, (objs, objs2)
149 | # ^ this assert fails!
150 | # created an issue https://github.com/pydantic/pydantic/issues/7391
151 | ###
152 |
153 |
154 | def test_cattrs():
155 | from cattrs import Converter
156 | from cattrs.strategies import configure_tagged_union
157 |
158 | converter = Converter()
159 |
160 | ### issue: NamedTuples aren't unstructured? asked here https://github.com/python-attrs/cattrs/issues/425
161 | class X(NamedTuple):
162 | value: int
163 |
164 | d = converter.unstructure(X(value=123), X) # noqa: F841
165 | # NOTE: this assert doesn't pass!
166 | # assert isinstance(d, dict)
167 | ###
168 |
169 | ### good: handles Union correctly (although some extra configuring required)
170 | @dataclass
171 | class City:
172 | name: str
173 |
174 | @dataclass
175 | class Country:
176 | name: str
177 |
178 | @dataclass
179 | class WithUnion:
180 | union: Union[City, Country]
181 |
182 | objs = [
183 | WithUnion(union=City(name="London")),
184 | WithUnion(union=Country(name="UK")),
185 | ]
186 |
187 | configure_tagged_union(
188 | union=City | Country,
189 | converter=converter,
190 | )
191 | # NOTE: nice -- doesn't require decorating original classes
192 | json = converter.unstructure(objs, list[WithUnion])
193 | assert isinstance(json, list)
194 | objs2 = converter.structure(json, list[WithUnion])
195 |
196 | print("objects ", objs)
197 | # NOTE: dumps it as [{'union': {'name': 'London', '_type': 'City'}}, {'union': {'name': 'UK', '_type': 'Country'}}]
198 | print("json ", json)
199 | print("restored ", objs2)
200 |
201 | assert objs == objs2, (objs, objs2)
202 | ###
203 |
204 | ### issue: unions of simple types aren't supported?
205 | # see https://github.com/python-attrs/cattrs/issues/423
206 | mixed: list[int | str] = [
207 | 123,
208 | 'Jakarta',
209 | ]
210 | json = converter.unstructure(mixed, list[int | str])
211 | # NOTE: this fails
212 | # mixed2 = converter.structure(json , list[int | str])
213 | ###
214 |
215 |
216 | test_dataclasses_json()
217 | test_marshmallow_dataclass()
218 | test_pydantic()
219 | test_cattrs()
220 |
--------------------------------------------------------------------------------
/generate-readme:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | cd "$(dirname "$0")"
4 |
5 | # '--TagRemovePreprocessor.remove_cell_tags={"noexport"}'
6 | uv run --with=jupyter --group=testing jupyter nbconvert --execute --to markdown --template readme.tpl README.ipynb
7 |
8 | # TODO run it on CI to make sure it renders and is up to date?
9 |
--------------------------------------------------------------------------------
/github-issues.org:
--------------------------------------------------------------------------------
1 | #+todo: OPEN | CLOSED
2 | * Issues of cachew
3 | :PROPERTIES:
4 | :since:
5 | :url: https://api.github.com/repos/karlicoss/cachew
6 | :END:
7 | ** OPEN keep hash along each cached entity instead of separate table?
8 | :PROPERTIES:
9 | :tags: ("prio-B")
10 | :id: 15
11 | :date-modification: 2020-01-08T22:26:04+0000
12 | :date-creation: 2020-01-08T22:26:04+0000
13 | :author: "karlicoss"
14 | :END:
15 | : At the moment there are two separate tables: one for latest hash value, another for cached entities.
16 | : It might be simpler and safer to keep a single table, with hash along with each cached entity.
17 | :
18 | ** OPEN support multiple cached values?
19 | :PROPERTIES:
20 | :tags: ("prio-B")
21 | :id: 14
22 | :date-modification: 2020-01-08T22:26:03+0000
23 | :date-creation: 2020-01-08T22:26:02+0000
24 | :author: "karlicoss"
25 | :END:
26 | : At the moment it's an LRU(1) cache, in some usecases it makes sense to cache more values though
27 | :
28 | ** OPEN support pathlib.Path
29 | :PROPERTIES:
30 | :tags: ("prio-C")
31 | :id: 13
32 | :date-modification: 2020-01-08T22:26:02+0000
33 | :date-creation: 2020-01-08T22:26:01+0000
34 | :author: "karlicoss"
35 | :END:
36 | : Path is a trivial wrapper around str. I guess generally think of a good way to allow adhoc mapping of simple types.
37 | : Perhaps current Exception makes sense.
38 | :
39 | ** OPEN support defensive behaviour
40 | :PROPERTIES:
41 | :tags: ("prio-C")
42 | :id: 12
43 | :date-modification: 2020-01-08T22:26:01+0000
44 | :date-creation: 2020-01-08T22:26:00+0000
45 | :author: "karlicoss"
46 | :END:
47 | : E.g. if we can't serialize for some reason, bail on the database but at least yield values anyway
48 | :
49 | ** OPEN Add Redis support
50 | :PROPERTIES:
51 | :id: 9
52 | :date-modification: 2020-01-06T00:48:59+0000
53 | :date-creation: 2020-01-06T00:48:59+0000
54 | :author: "softinio"
55 | :END:
56 | : Add Redis support as an alternative to sqlite
57 | :
58 | : This would be a great feature as it will make this solution easier to use in an enterprise production environment as getting a redis instance shared amongst multiple instances of your app is very easy and cost effective to use.
59 | :
60 | ** OPEN better pytz support?
61 | :PROPERTIES:
62 | :tags: ("prio-C")
63 | :id: 6
64 | :date-modification: 2020-01-05T13:34:51+0000
65 | :date-creation: 2020-01-05T13:33:25+0000
66 | :author: "karlicoss"
67 | :END:
68 | ** CLOSED Optional feature: Exception support
69 | :PROPERTIES:
70 | :id: 11
71 | :date-modification: 2020-01-08T21:56:56+0000
72 | :date-creation: 2020-01-08T21:34:03+0000
73 | :author: "karlicoss"
74 | :END:
75 | ** CLOSED Add doc on defensive/optional usage
76 | :PROPERTIES:
77 | :id: 10
78 | :date-modification: 2020-01-06T23:48:54+0000
79 | :date-creation: 2020-01-06T23:47:39+0000
80 | :author: "karlicoss"
81 | :END:
82 | ** CLOSED Safer concurrent writes handling
83 | :PROPERTIES:
84 | :id: 8
85 | :date-modification: 2020-01-05T22:32:13+0000
86 | :date-creation: 2020-01-05T22:08:24+0000
87 | :author: "karlicoss"
88 | :END:
89 | ** CLOSED Update readme
90 | :PROPERTIES:
91 | :id: 7
92 | :date-modification: 2020-01-05T15:29:37+0000
93 | :date-creation: 2020-01-05T15:24:38+0000
94 | :author: "karlicoss"
95 | :END:
96 | ** CLOSED support for dataclasses
97 | :PROPERTIES:
98 | :id: 1
99 | :date-modification: 2020-01-05T13:34:50+0000
100 | :date-creation: 2019-07-30T21:45:30+0100
101 | :author: "karlicoss"
102 | :END:
103 | ** CLOSED Fix Json support for python3.6
104 | :PROPERTIES:
105 | :id: 2
106 | :date-modification: 2020-01-05T13:33:28+0000
107 | :date-creation: 2019-12-08T12:21:58+0000
108 | :author: "karlicoss"
109 | :END:
110 | ** CLOSED Fix bug when default argument is explicitly specified
111 | :PROPERTIES:
112 | :id: 3
113 | :date-modification: 2020-01-05T13:33:27+0000
114 | :date-creation: 2019-12-08T17:56:51+0000
115 | :author: "karlicoss"
116 | :END:
117 | ** CLOSED Union types
118 | :PROPERTIES:
119 | :id: 4
120 | :date-modification: 2020-01-05T13:33:27+0000
121 | :date-creation: 2019-12-19T23:32:55+0000
122 | :author: "karlicoss"
123 | :END:
124 | ** CLOSED support top level primitive types
125 | :PROPERTIES:
126 | :id: 5
127 | :date-modification: 2020-01-05T13:33:26+0000
128 | :date-creation: 2019-12-20T00:09:00+0000
129 | :author: "karlicoss"
130 | :END:
131 |
--------------------------------------------------------------------------------
/misc/profile.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sqlite3
3 | from collections.abc import Iterator
4 | from pathlib import Path
5 |
6 | import sqlalchemy
7 | from codetiming import Timer
8 | from more_itertools import ilen
9 |
10 | from cachew import cachew
11 |
12 | # todo not sure it really helps much?
13 | import gc # isort: skip
14 |
15 | gc.disable()
16 |
17 |
18 | def timer(name: str) -> Timer:
19 | return Timer(name=name, text=name + ': ' + '{:.2f}s')
20 |
21 |
22 | def test_ints() -> None:
23 | N = 5_000_000
24 |
25 | base = Path('/tmp/cachew_profiling/')
26 | # shutil.rmtree(base)
27 | base.mkdir(exist_ok=True, parents=True)
28 |
29 | cache_path = base / 'ints'
30 |
31 | def fun_nocachew(n) -> Iterator[int]:
32 | yield from range(n)
33 |
34 | @cachew(cache_path=cache_path, force_file=True)
35 | def fun(n) -> Iterator[int]:
36 | yield from range(n)
37 |
38 | # with timer('no caching'):
39 | # ilen(fun_nocachew(N))
40 |
41 | # with timer('initial call'):
42 | # ilen(fun(N))
43 |
44 | assert cache_path.exists() # just in case
45 | with timer('reading directly via sqlite'):
46 | total = 0
47 | with sqlite3.connect(cache_path) as conn:
48 | for (_x,) in conn.execute('SELECT * FROM cache'):
49 | total += 1
50 | assert total == N # just in case
51 |
52 | with timer('reading directly via sqlalchemy'):
53 | total = 0
54 | engine = sqlalchemy.create_engine(f'sqlite:///{cache_path}')
55 |
56 | from sqlalchemy import Column, MetaData, Table
57 |
58 | meta = MetaData()
59 | table_cache = Table('cache', meta, Column('_cachew_primitive', sqlalchemy.Integer))
60 | with engine.connect() as conn:
61 | with timer('sqlalchemy querying'):
62 | rows = conn.execute(table_cache.select())
63 | for (_x,) in rows:
64 | total += 1
65 | engine.dispose()
66 | assert total == N # just in case
67 |
68 | cache_size_mb = cache_path.stat().st_size / 10**6
69 | print(f'cache size: {cache_size_mb:.1f} Mb')
70 |
71 | with timer('subsequent call'):
72 | ilen(fun(N))
73 |
74 |
75 | test_ints()
76 |
--------------------------------------------------------------------------------
/misc/test_redis/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 | redis:
3 | image: "redis:alpine"
4 | # restart: always
5 | command:
6 | - "sh"
7 | - "-euc"
8 | - |
9 | exec redis-server
10 | # - |
11 | # echo "requirepass '$$REDIS_PASSWORD'" > /etc/redis.conf
12 | # exec redis-server /etc/redis.conf
13 | # environment:
14 | # REDIS_PASSWORD: "password"
15 | ports:
16 | - 6379:6379
17 | volumes:
18 | - "redis-cachew:/data:rw"
19 |
20 | volumes:
21 | redis-cachew:
22 |
--------------------------------------------------------------------------------
/misc/test_redis/test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from time import time
3 |
4 | import redis
5 | from loguru import logger
6 | from more_itertools import ilen
7 |
8 | r = redis.Redis(host='localhost', port=6379, db=0)
9 |
10 |
11 | N = 1_000_000
12 |
13 |
14 | def items():
15 | yield from map(str, range(N))
16 |
17 |
18 | TAG = 'keys'
19 |
20 |
21 | def reset():
22 | r.delete(TAG)
23 |
24 |
25 | def write():
26 | for i, obj in enumerate(items()):
27 | key = f'obj:{i}'
28 | r.hset(key, 'data', obj)
29 | r.lpush(TAG, key)
30 |
31 |
32 | def read():
33 | keys = r.lrange(TAG, 0, -1)
34 | result = (r.hget(key, 'data') for key in keys)
35 | print('total', ilen(result))
36 |
37 |
38 | # TODO could use lmove for atomic operations?
39 | def write2():
40 | for obj in items():
41 | r.lpush(TAG, obj)
42 |
43 |
44 | def read2():
45 | result = r.lrange(TAG, 0, -1)
46 | print('total', ilen(result))
47 |
48 |
49 | reset()
50 |
51 | a = time()
52 | write2()
53 | b = time()
54 | logger.info(f'writing took {b - a:.1f}s')
55 |
56 | a = time()
57 | read2()
58 | b = time()
59 | logger.info(f'reading took {b - a:.1f}s')
60 |
61 |
62 | # with read()/write()
63 | # 100000 strings:
64 | # 2023-09-09 01:50:23.498 | INFO | __main__:<module>:37 - writing took 13.1s
65 | # 2023-09-09 01:50:30.052 | INFO | __main__:<module>:42 - reading took 6.6s
66 | # hmm kinda slow..
67 |
68 |
69 | # with read2/write2, writing about 7secs, and reading is instantaneous??
70 | # for 1M objects, writing took 60 secs, and reading 0.2s?
71 | # lol could be promising...
72 | # I guess it's not iterative, but could retrieve items in batches?
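73 |
74 | # a sketch of batched retrieval (untested, just to illustrate the batching idea above):
75 | def read2_batched(batch_size: int = 10_000):
76 |     total = 0
77 |     start = 0
78 |     while True:
79 |         chunk = r.lrange(TAG, start, start + batch_size - 1)
80 |         if not chunk:
81 |             break
82 |         total += len(chunk)
83 |         start += batch_size
84 |     print('total', total)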
73 |
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | pretty = True
3 | show_error_context = True
4 | show_column_numbers = True
5 | show_error_end = True
6 |
7 | check_untyped_defs = True
8 |
9 | # see https://mypy.readthedocs.io/en/stable/error_code_list2.html
10 | warn_redundant_casts = True
11 | strict_equality = True
12 | warn_unused_ignores = True
13 | enable_error_code = deprecated,redundant-expr,possibly-undefined,truthy-bool,truthy-iterable,ignore-without-code,unused-awaitable
14 |
15 |
16 | # an example of suppressing
17 | # [mypy-my.config.repos.pdfannots.pdfannots]
18 | # ignore_errors = True
19 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | # see https://github.com/karlicoss/pymplate for up-to-date reference
2 | [project]
3 | dynamic = ["version"] # version is managed by build backend
4 | name = "cachew"
5 | dependencies = [
6 | "platformdirs", # default cache dir
7 | "sqlalchemy>=1.0", # cache DB interaction
8 | "orjson", # fast json serialization
9 | ]
10 | requires-python = ">=3.9"
11 |
12 | ## these need to be set if you're planning to upload to pypi
13 | # description = "TODO"
14 | license = {file = "LICENSE.txt"}
15 | authors = [
16 | {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"},
17 | ]
18 | maintainers = [
19 | {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"},
20 | ]
21 | # keywords = []
22 | # # see: http://pypi.python.org/pypi?%3Aaction=list_classifiers
23 | # classifiers = [
24 | # ]
25 |
26 |
27 | [project.urls]
28 | Homepage = "https://github.com/karlicoss/cachew"
29 | ##
30 |
31 |
32 | [project.optional-dependencies]
33 | optional = [
34 | "colorlog",
35 | ]
36 | [dependency-groups]
37 | testing = [
38 | "pytz", "types-pytz", # optional runtime only dependency
39 |
40 | "pytest",
41 | "more-itertools",
42 | "patchy", # for injecting sleeps and testing concurrent behaviour
43 | "enlighten", # used in logging helper, but not really required
44 | "cattrs", # benchmarking alternative marshalling implementation
45 | "pyinstrument", # for profiling from within tests
46 | "codetiming", # Timer context manager
47 |
48 | "ruff",
49 | "mypy",
50 | "lxml", # for mypy html coverage
51 | ]
52 |
53 |
54 | # workaround for error during uv publishing
55 | # see https://github.com/astral-sh/uv/issues/9513#issuecomment-2519527822
56 | [tool.setuptools]
57 | license-files = []
58 |
59 |
60 | [build-system]
61 | requires = ["hatchling", "hatch-vcs"]
62 | build-backend = "hatchling.build"
63 |
64 | # unfortunately have to duplicate project name here atm, see https://github.com/pypa/hatch/issues/1894
65 | [tool.hatch.build.targets.wheel]
66 | packages = ["src/cachew"]
67 |
68 | [tool.hatch.version]
69 | source = "vcs"
70 |
71 | [tool.hatch.version.raw-options]
72 | version_scheme = "python-simplified-semver"
73 | local_scheme = "dirty-tag"
74 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | # discover files that don't follow test_ naming. Useful to keep tests along with the source code
3 | python_files = *.py
4 |
5 | # this setting only impacts package/module naming under pytest, not the discovery
6 | consider_namespace_packages = true
7 |
8 | addopts =
9 | # prevent pytest cache from being created... it craps into project dir and I never use it anyway
10 | -p no:cacheprovider
11 |
12 | # -rap to print tests summary even when they are successful
13 | -rap
14 | --verbose
15 |
16 | # otherwise it won't discover doctests
17 | --doctest-modules
18 |
19 | # show all test durations (unless they are too short)
20 | --durations=0
21 |
--------------------------------------------------------------------------------
/readme.tpl:
--------------------------------------------------------------------------------
1 | {# disable code used to generate readme #}
2 | {# based on https://stackoverflow.com/a/55305881/706389 #}
3 |
4 | {%- extends 'markdown/index.md.j2' -%}
5 |
6 | {% block input_group %}
7 | {%- if cell.metadata.get('nbconvert', {}).get('show_code', False) -%}
8 | ((( super() )))
9 | {%- endif -%}
10 | {% endblock input_group %}
11 |
--------------------------------------------------------------------------------
/ruff.toml:
--------------------------------------------------------------------------------
1 | lint.extend-select = [
2 | "F", # flakes rules -- default, but extend just in case
3 | "E", # pycodestyle -- default, but extend just in case
4 | "W", # various warnings
5 |
6 | "B", # 'bugbear' set -- various possible bugs
7 | "C4", # flake8-comprehensions -- unnecessary list/map/dict calls
8 | "COM", # trailing commas
9 | "EXE", # various checks wrt executable files
10 | "I", # sort imports
11 | "ICN", # various import conventions
12 | "FBT", # detect use of boolean arguments
13 | "FURB", # various rules
14 | "PERF", # various potential performance speedups
15 | "PD", # pandas rules
16 | "PIE", # 'misc' lints
17 | "PLC", # pylint convention rules
18 | "PLR", # pylint refactor rules
19 | "PLW", # pylint warnings
20 | "PT", # pytest stuff
21 | "PYI", # various type hinting rules
22 | "RET", # early returns
23 | "RUF", # various ruff-specific rules
24 | "TID", # various imports suggestions
25 | "TRY", # various exception handling rules
26 | "UP", # detect deprecated python stdlib stuff
27 | "FA", # suggest using from __future__ import annotations
28 | "PTH", # pathlib migration
29 | "ARG", # unused argument checks
30 | "A", # builtin shadowing
31 | "G", # logging stuff
32 | # "EM", # TODO hmm could be helpful to prevent duplicate err msg in traceback.. but kinda annoying
33 |
34 | # "ALL", # uncomment this to check for new rules!
35 | ]
36 |
37 | # Preserve types, even if a file imports `from __future__ import annotations`
38 | # we need this for cachew to work with HPI types on 3.9
39 | # can probably remove after 3.10?
40 | lint.pyupgrade.keep-runtime-typing = true
41 |
42 | lint.ignore = [
43 | "D", # annoying nags about docstrings
44 | "N", # pep naming
45 | "TCH", # type checking rules, mostly just suggests moving imports under TYPE_CHECKING
46 | "S", # bandit (security checks) -- tends to be not very useful, lots of nitpicks
47 | "DTZ", # datetimes checks -- complaining about missing tz and mostly false positives
48 | "FIX", # complains about fixmes/todos -- annoying
49 | "TD", # complains about todo formatting -- too annoying
50 | "ANN", # missing type annotations? seems way to strict though
51 |
52 | ### too opinionated style checks
53 | "E501", # too long lines
54 | "E702", # Multiple statements on one line (semicolon)
55 | "E731", # assigning lambda instead of using def
56 | "E741", # Ambiguous variable name: `l`
57 | "E742", # Ambiguous class name: `O
58 | "E401", # Multiple imports on one line
59 | "F403", # import *` used; unable to detect undefined names
60 | ###
61 |
62 | ###
63 | "E722", # Do not use bare `except` ## Sometimes it's useful for defensive imports and that sort of thing..
64 | "F811", # Redefinition of unused # this gets in the way of pytest fixtures (e.g. in cachew)
65 |
66 | ## might be nice .. but later and I don't wanna make it strict
67 | "E402", # Module level import not at top of file
68 |
69 | "RUF100", # unused noqa -- handle later
70 | "RUF012", # mutable class attrs should be annotated with ClassVar... ugh pretty annoying for user configs
71 |
72 | ### these are just nitpicky, we usually know better
73 | "PLR0911", # too many return statements
74 | "PLR0912", # too many branches
75 | "PLR0913", # too many function arguments
76 | "PLR0915", # too many statements
77 | "PLR1714", # consider merging multiple comparisons
78 | "PLR2044", # line with empty comment
79 | "PLR5501", # use elif instead of else if
80 | "PLR2004", # magic value in comparison -- super annoying in tests
81 | ###
82 | "PLR0402", # import X.Y as Y -- TODO maybe consider enabling it, but double check
83 |
84 | "B009", # calling gettattr with constant attribute -- this is useful to convince mypy
85 | "B010", # same as above, but setattr
86 | "B011", # complains about assert False
87 | "B017", # pytest.raises(Exception)
88 | "B023", # seems to result in false positives?
89 | "B028", # suggest using explicit stacklevel? TODO double check later, but not sure it's useful
90 |
91 | # complains about useless pass, but has sort of a false positive if the function has a docstring?
92 | # this is common for click entrypoints (e.g. in __main__), so disable
93 | "PIE790",
94 |
95 | # a bit too annoying, offers to convert for loops to list comprehensions,
96 | # which may hurt readability
97 | "PERF401",
98 |
99 | # suggests not using exceptions in for loops
100 | # we do use this technique a lot, plus in 3.11 happy path exception handling is "zero-cost"
101 | "PERF203",
102 |
103 | "RET504", # unnecessary assignment before returning -- that can be useful for readability
104 | "RET505", # unnecessary else after return -- can hurt readability
105 |
106 | "PLW0603", # global variable update.. we usually know why we are doing this
107 | "PLW2901", # for loop variable overwritten, usually this is intentional
108 |
109 | "PT011", # pytest raises should is too broad
110 | "PT012", # pytest raises should contain a single statement
111 |
112 | "COM812", # trailing comma missing -- mostly just being annoying with long multiline strings
113 |
114 | "PD901", # generic variable name df
115 |
116 | "TRY003", # suggests defining exception messages in exception class -- kinda annoying
117 | "TRY004", # prefer TypeError -- don't see the point
118 | "TRY201", # raise without specifying exception name -- sometimes hurts readability
119 | "TRY400", # TODO double check this, might be useful
120 | "TRY401", # redundant exception in logging.exception call? TODO double check, might result in excessive logging
121 |
122 | "PGH", # TODO force error code in mypy instead? although it also has blanket noqa rule
123 |
124 | "TID252", # Prefer absolute imports over relative imports from parent modules
125 |
126 | "UP038", # suggests using | (union) in isisntance checks.. but it results in slower code
127 |
128 | ## too annoying
129 | "T20", # just complains about prints and pprints
130 | "Q", # flake quotes, too annoying
131 | "C90", # some complexity checking
132 | "G004", # logging statement uses f string
133 | "ERA001", # commented out code
134 | "SLF001", # private member accessed
135 | "BLE001", # do not catch 'blind' Exception
136 | "INP001", # complains about implicit namespace packages
137 | "SIM", # some if statements crap
138 | "RSE102", # complains about missing parens in exceptions
139 | ##
140 | ]
141 |
--------------------------------------------------------------------------------
/src/cachew/__init__.py:
--------------------------------------------------------------------------------
1 | import fnmatch
2 | import functools
3 | import importlib.metadata
4 | import inspect
5 | import json
6 | import logging
7 | import os
8 | import stat
9 | import sys
10 | import warnings
11 | from collections.abc import Iterable
12 | from dataclasses import dataclass
13 | from pathlib import Path
14 | from typing import (
15 | TYPE_CHECKING,
16 | Any,
17 | Callable,
18 | Generic,
19 | Literal,
20 | Optional,
21 | TypeVar,
22 | Union,
23 | cast,
24 | get_args,
25 | get_origin,
26 | get_type_hints,
27 | overload,
28 | )
29 |
30 | try:
31 | # orjson might not be available on some architectures, so let's make it defensive just in case
32 | from orjson import dumps as orjson_dumps
33 | from orjson import loads as orjson_loads
34 | except:
35 | warnings.warn("orjson couldn't be imported. It's _highly_ recommended for better caching performance")
36 |
37 | def orjson_dumps(*args, **kwargs): # type: ignore[misc]
38 | # sqlite needs a blob
39 | return json.dumps(*args, **kwargs).encode('utf8')
40 |
41 | orjson_loads = json.loads
42 |
43 | import platformdirs
44 |
45 | from .backend.common import AbstractBackend
46 | from .backend.file import FileBackend
47 | from .backend.sqlite import SqliteBackend
48 | from .common import SourceHash
49 | from .logging_helper import make_logger
50 | from .marshall.cachew import CachewMarshall, build_schema
51 | from .utils import (
52 | CachewException,
53 | TypeNotSupported,
54 | )
55 |
56 | # in case of changes in the way cachew stores data, this should be changed to discard old caches
57 | CACHEW_VERSION: str = importlib.metadata.version(__name__)
58 |
59 | PathIsh = Union[Path, str]
60 |
61 | Backend = Literal['sqlite', 'file']
62 |
63 |
64 | class settings:
65 | '''
66 | Global settings, you can override them after importing cachew
67 | '''
68 |
69 | '''
70 | Toggle to disable caching
71 | '''
72 | ENABLE: bool = True
73 |
74 | DEFAULT_CACHEW_DIR: PathIsh = Path(platformdirs.user_cache_dir('cachew'))
75 |
76 | '''
77 | Set to true if you want to fail early. Otherwise falls back to non-cached version
78 | '''
79 | THROW_ON_ERROR: bool = False
80 |
81 | DEFAULT_BACKEND: Backend = 'sqlite'
82 |
83 |
84 | def get_logger() -> logging.Logger:
85 | return make_logger(__name__)
86 |
87 |
88 | BACKENDS: dict[Backend, type[AbstractBackend]] = {
89 | 'file': FileBackend,
90 | 'sqlite': SqliteBackend,
91 | }
92 |
93 |
94 | R = TypeVar('R')
95 | # ugh. python < 3.10 doesn't have ParamSpec and it seems tricky to backport it in compatible manner
96 | if sys.version_info[:2] >= (3, 10) or TYPE_CHECKING:
97 | if sys.version_info[:2] >= (3, 10):
98 | from typing import ParamSpec
99 | else:
100 | from typing_extensions import ParamSpec
101 | P = ParamSpec('P')
102 |     CC = Callable[P, R]  # need to give it a name, if inlined into bound=, mypy runs into a bug
103 | PathProvider = Union[PathIsh, Callable[P, PathIsh]]
104 | HashFunction = Callable[P, SourceHash]
105 | else:
106 | # just use some dummy types so runtime is happy
107 | P = TypeVar('P')
108 | CC = Any
109 | PathProvider = Union[P, Any]
110 | HashFunction = Union[P, Any]
111 |
112 | F = TypeVar('F', bound=CC)
113 |
114 |
115 | def default_hash(*args, **kwargs) -> SourceHash:
116 | # TODO eh, demand hash? it's not safe either... ugh
117 |     # can lead to weird consequences otherwise..
118 | return str(args + tuple(sorted(kwargs.items()))) # good enough??
119 |
120 |
121 | # TODO give it as an example in docs
122 | def mtime_hash(path: Path, *args, **kwargs) -> SourceHash:
123 | mt = path.stat().st_mtime
124 | return default_hash(f'{path}.{mt}', *args, **kwargs)
125 |
126 |
127 | Failure = str
128 | Kind = Literal['single', 'multiple']
129 | Inferred = tuple[Kind, type[Any]]
130 |
131 |
132 | def infer_return_type(func) -> Union[Failure, Inferred]:
133 | """
134 | >>> def const() -> int:
135 | ... return 123
136 | >>> infer_return_type(const)
137 |     ('single', <class 'int'>)
138 |
139 | >>> from typing import Optional
140 | >>> def first_character(s: str) -> Optional[str]:
141 | ... return None if len(s) == 0 else s[0]
142 | >>> kind, opt = infer_return_type(first_character)
143 | >>> # in 3.8, Optional[str] is printed as Union[str, None], so need to hack around this
144 | >>> (kind, opt is Optional[str])
145 | ('single', True)
146 |
147 | # tuple is an iterable.. but presumably should be treated as a single value
148 | >>> from typing import Tuple
149 | >>> def a_tuple() -> Tuple[int, str]:
150 | ... return (123, 'hi')
151 | >>> infer_return_type(a_tuple)
152 | ('single', typing.Tuple[int, str])
153 |
154 | >>> from typing import Collection, NamedTuple
155 | >>> class Person(NamedTuple):
156 | ... name: str
157 | ... age: int
158 | >>> def person_provider() -> Collection[Person]:
159 | ... return []
160 | >>> infer_return_type(person_provider)
161 |     ('multiple', <class 'cachew.Person'>)
162 |
163 | >>> def single_str() -> str:
164 | ... return 'hello'
165 | >>> infer_return_type(single_str)
166 |     ('single', <class 'str'>)
167 |
168 | >>> def single_person() -> Person:
169 | ... return Person(name="what", age=-1)
170 | >>> infer_return_type(single_person)
171 |     ('single', <class 'cachew.Person'>)
172 |
173 | >>> from typing import Sequence
174 | >>> def int_provider() -> Sequence[int]:
175 | ... return (1, 2, 3)
176 | >>> infer_return_type(int_provider)
177 |     ('multiple', <class 'int'>)
178 |
179 | >>> from typing import Iterator, Union
180 | >>> def union_provider() -> Iterator[Union[str, int]]:
181 | ... yield 1
182 | ... yield 'aaa'
183 | >>> infer_return_type(union_provider)
184 | ('multiple', typing.Union[str, int])
185 |
186 | # a bit of an edge case
187 | >>> from typing import Tuple
188 | >>> def empty_tuple() -> Iterator[Tuple[()]]:
189 | ... yield ()
190 | >>> infer_return_type(empty_tuple)
191 | ('multiple', typing.Tuple[()])
192 |
193 | ... # doctest: +ELLIPSIS
194 |
195 | >>> def untyped():
196 | ... return 123
197 | >>> infer_return_type(untyped)
198 | 'no return type annotation...'
199 |
200 | >>> from typing import List
201 | >>> class Custom:
202 | ... pass
203 | >>> def unsupported() -> Custom:
204 | ... return Custom()
205 | >>> infer_return_type(unsupported)
206 |     "can't infer type from <class 'cachew.Custom'>: can't cache <class 'cachew.Custom'>"
207 |
208 | >>> def unsupported_list() -> List[Custom]:
209 | ... return [Custom()]
210 | >>> infer_return_type(unsupported_list)
211 |     "can't infer type from typing.List[cachew.Custom]: can't cache <class 'cachew.Custom'>"
212 | """
213 | try:
214 | hints = get_type_hints(func)
215 | except Exception as ne:
216 | # get_type_hints might fail if types are forward defined or missing
217 | # see test_future_annotation for an example
218 | return str(ne)
219 | rtype = hints.get('return', None)
220 | if rtype is None:
221 | return f"no return type annotation on {func}"
222 |
223 | def bail(reason: str) -> str:
224 | return f"can't infer type from {rtype}: " + reason
225 |
226 |     # first we wanna check if the top level type is some sort of iterable that makes sense to cache
227 | # e.g. List/Sequence/Iterator etc
228 | return_multiple = _returns_multiple(rtype)
229 |
230 | if return_multiple:
231 | # then the actual type to cache will be the argument of the top level one
232 | args = get_args(rtype)
233 | if args is None:
234 | return bail("has no __args__")
235 |
236 | if len(args) != 1:
237 | return bail(f"wrong number of __args__: {args}")
238 |
239 | (cached_type,) = args
240 | else:
241 | cached_type = rtype
242 |
243 | try:
244 | build_schema(Type=cached_type)
245 | except TypeNotSupported as ex:
246 | return bail(f"can't cache {ex.type_}")
247 |
248 | return ('multiple' if return_multiple else 'single', cached_type)
249 |
250 |
251 | def _returns_multiple(rtype) -> bool:
252 | origin = get_origin(rtype)
253 | if origin is None:
254 | return False
255 | if origin is tuple:
256 | # usually tuples are more like single values rather than a sequence? (+ this works for namedtuple)
257 | return False
258 | try:
259 | return issubclass(origin, Iterable)
260 | except TypeError:
261 | # that would happen if origin is not a 'proper' type, e.g. is a Union or something
262 | # seems like exception is the easiest way to check
263 | return False
264 |
265 |
266 | # https://stackoverflow.com/questions/653368/how-to-create-a-python-decorator-that-can-be-used-either-with-or-without-paramet
267 | def doublewrap(f):
268 | @functools.wraps(f)
269 | def new_dec(*args, **kwargs):
270 | if len(args) == 1 and len(kwargs) == 0 and callable(args[0]):
271 | # actual decorated function
272 | return f(args[0])
273 | else:
274 | # decorator arguments
275 | return lambda realf: f(realf, *args, **kwargs)
276 |
277 | return new_dec
278 |
279 |
280 | def cachew_error(e: Exception, *, logger: logging.Logger) -> None:
281 | if settings.THROW_ON_ERROR:
282 | # TODO would be nice to throw from the original code line -- maybe mess with the stack here?
283 | raise e
284 | logger.error("error while setting up cache, falling back to non-cached version")
285 | logger.exception(e)
286 |
287 |
288 | use_default_path = cast(Path, object())
289 |
290 |
291 | # using cachew_impl here just to use different signatures during type checking (see below)
292 | @doublewrap
293 | def cachew_impl(
294 | func=None,
295 | cache_path: Optional[PathProvider[P]] = use_default_path,
296 | *,
297 | force_file: bool = False,
298 | cls: Optional[Union[type, tuple[Kind, type]]] = None,
299 | depends_on: HashFunction[P] = default_hash,
300 | logger: Optional[logging.Logger] = None,
301 | chunk_by: int = 100,
302 | # NOTE: allowed values for chunk_by depend on the system.
303 |     # on some systems (to be more specific, some sqlite builds), it might be too large and cause issues
304 | # ideally this would be more defensive/autodetected, maybe with a warning?
305 | # you can use 'test_many' to experiment
306 | # - too small values (e.g. 10) are slower than 100 (presumably, too many sql statements)
307 | # - too large values (e.g. 10K) are slightly slower as well (not sure why?)
308 | synthetic_key: Optional[str] = None,
309 | backend: Optional[Backend] = None,
310 | **kwargs,
311 | ):
312 | r"""
313 | Database-backed cache decorator. TODO more description?
314 | # TODO use this doc in readme?
315 |
316 | :param cache_path: if not set, `cachew.settings.DEFAULT_CACHEW_DIR` will be used.
317 | :param force_file: if set to True, assume `cache_path` is a regular file (instead of a directory)
318 | :param cls: if not set, cachew will attempt to infer it from return type annotation. See :func:`infer_return_type` and :func:`cachew.tests.test_cachew.test_return_type_inference`.
319 |     :param depends_on: hash function to determine whether the underlying data has changed. Can potentially benefit from the use of side effects (e.g. file modification time). TODO link to test?
320 | :param logger: custom logger, if not specified will use logger named `cachew`. See :func:`get_logger`.
321 | :return: iterator over original or cached items
322 |
323 | Usage example:
324 | >>> from typing import NamedTuple, Iterator
325 | >>> class Link(NamedTuple):
326 | ... url : str
327 | ... text: str
328 | ...
329 | >>> @cachew
330 | ... def extract_links(archive_path: str) -> Iterator[Link]:
331 | ... for i in range(5):
332 | ... # simulate slow IO
333 | ... # this function runs for five seconds for the purpose of demonstration, but realistically it might take hours
334 | ... import time; time.sleep(1)
335 | ... yield Link(url=f'http://link{i}.org', text=f'text {i}')
336 | ...
337 | >>> list(extract_links(archive_path='wikipedia_20190830.zip')) # that would take about 5 seconds on first run
338 | [Link(url='http://link0.org', text='text 0'), Link(url='http://link1.org', text='text 1'), Link(url='http://link2.org', text='text 2'), Link(url='http://link3.org', text='text 3'), Link(url='http://link4.org', text='text 4')]
339 |
340 | >>> from timeit import Timer
341 | >>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20190830.zip'))).timeit(number=1)
342 | ... # second run is cached, so should take less time
343 | >>> print(f"call took {int(res)} seconds")
344 | call took 0 seconds
345 |
346 | >>> res = Timer(lambda: list(extract_links(archive_path='wikipedia_20200101.zip'))).timeit(number=1)
347 | ... # now file has changed, so the cache will be discarded
348 | >>> print(f"call took {int(res)} seconds")
349 | call took 5 seconds
350 | """
351 | if logger is None:
352 | module_name = getattr(func, '__module__', None)
353 | if module_name is not None and module_name in logging.Logger.manager.loggerDict:
354 | # if logger for the function's module already exists, reuse it
355 | logger = logging.getLogger(module_name)
356 | else:
357 | # rely on default cachew logger
358 | logger = get_logger()
359 |
360 | class AddFuncName(logging.LoggerAdapter):
361 | def process(self, msg, kwargs):
362 | extra = self.extra
363 | assert extra is not None
364 | func_name = extra['func_name']
365 | return f'[{func_name}] {msg}', kwargs
366 |
367 | func_name = callable_name(func)
368 | adapter = AddFuncName(logger, {'func_name': func_name})
369 | logger = cast(logging.Logger, adapter)
370 |
371 | hashf = kwargs.get('hashf', None)
372 | if hashf is not None:
373 | warnings.warn("'hashf' is deprecated. Please use 'depends_on' instead")
374 | depends_on = hashf
375 |
376 | # todo not very nice that ENABLE check is scattered across two places
377 | if not settings.ENABLE or cache_path is None:
378 | logger.debug('cache explicitly disabled (settings.ENABLE is False or cache_path is None)')
379 | return func
380 |
381 | if cache_path is use_default_path:
382 | cache_path = settings.DEFAULT_CACHEW_DIR
383 | logger.debug(f'no cache_path specified, using the default {cache_path}')
384 |
385 | use_kind: Optional[Kind] = None
386 | use_cls: Optional[type] = None
387 | if cls is not None:
388 | # defensive here since typing. objects passed as cls might fail on isinstance
389 | try:
390 | is_tuple = isinstance(cls, tuple)
391 | except:
392 | is_tuple = False
393 | if is_tuple:
394 | use_kind, use_cls = cls # type: ignore[misc]
395 | else:
396 | use_kind = 'multiple'
397 | use_cls = cls # type: ignore[assignment]
398 |
399 | # TODO fuzz infer_return_type, should never crash?
400 | inference_res = infer_return_type(func)
401 | if isinstance(inference_res, Failure):
402 | msg = f"failed to infer cache type: {inference_res}. See https://github.com/karlicoss/cachew#features for the list of supported types."
403 | if use_cls is None:
404 | ex = CachewException(msg)
405 | cachew_error(ex, logger=logger)
406 | return func
407 | else:
408 | # it's ok, assuming user knows better
409 | logger.debug(msg)
410 | assert use_kind is not None
411 | else:
412 | (inferred_kind, inferred_cls) = inference_res
413 | if use_cls is None:
414 | logger.debug(f'using inferred type {inferred_kind} {inferred_cls}')
415 | (use_kind, use_cls) = (inferred_kind, inferred_cls)
416 | else:
417 | assert use_kind is not None
418 | if (use_kind, use_cls) != inference_res:
419 | logger.warning(f"inferred type {inference_res} mismatches explicitly specified type {(use_kind, use_cls)}")
420 | # TODO not sure if should be more serious error...
421 |
422 | if use_kind == 'single':
423 | # pretend it's an iterable, this is just simpler for cachew_wrapper
424 | @functools.wraps(func)
425 | def _func(*args, **kwargs):
426 | return [func(*args, **kwargs)]
427 |
428 | else:
429 | _func = func
430 |
431 | # fmt: off
432 | ctx = Context(
433 | func =_func,
434 | cache_path =cache_path,
435 | force_file =force_file,
436 | cls_ =use_cls,
437 | depends_on =depends_on,
438 | logger =logger,
439 | chunk_by =chunk_by,
440 | synthetic_key=synthetic_key,
441 | backend =backend,
442 | )
443 | # fmt: on
444 |
445 | # hack to avoid extra stack frame (see test_recursive*)
446 | @functools.wraps(func)
447 | def binder(*args, **kwargs):
448 | kwargs['_cachew_context'] = ctx
449 | res = cachew_wrapper(*args, **kwargs)
450 |
451 | if use_kind == 'single':
452 | lres = list(res)
453 | assert len(lres) == 1, lres # shouldn't happen
454 | return lres[0]
455 | return res
456 |
457 | return binder
458 |
459 |
460 | if TYPE_CHECKING:
461 | # we need two versions due to @doublewrap
462 | # this is when we just annotate as @cachew without any args
463 | @overload # type: ignore[no-overload-impl]
464 | def cachew(fun: F) -> F: ...
465 |
466 | # NOTE: we won't really be able to make sure the args of cache_path are the same as args of the wrapped function
467 | # because when cachew() is called, we don't know anything about the wrapped function yet
468 |     # but at least it works for checking that cache_path and depends_on have the same args :shrug:
469 | @overload
470 | def cachew(
471 | cache_path: Optional[PathProvider[P]] = ...,
472 | *,
473 | force_file: bool = ...,
474 | cls: Optional[Union[type, tuple[Kind, type]]] = ...,
475 | depends_on: HashFunction[P] = ...,
476 | logger: Optional[logging.Logger] = ...,
477 | chunk_by: int = ...,
478 | synthetic_key: Optional[str] = ...,
479 | backend: Optional[Backend] = ...,
480 | ) -> Callable[[F], F]: ...
481 |
482 | else:
483 | cachew = cachew_impl
484 |
485 |
486 | def callable_name(func: Callable) -> str:
487 | # some functions don't have __module__
488 | mod = getattr(func, '__module__', None) or ''
489 | return f'{mod}:{func.__qualname__}'
490 |
491 |
492 | def callable_module_name(func: Callable) -> Optional[str]:
493 | return getattr(func, '__module__', None)
494 |
495 |
496 | # could cache this, but it might be better not to, so the user can change it on the fly?
497 | def _parse_disabled_modules(logger: Optional[logging.Logger] = None) -> list[str]:
498 | # e.g. CACHEW_DISABLE=my.browser:my.reddit
499 | if 'CACHEW_DISABLE' not in os.environ:
500 | return []
501 | disabled = os.environ['CACHEW_DISABLE']
502 | if disabled.strip() == '':
503 | return []
504 | if ',' in disabled and logger:
505 | logger.warning(
506 | 'CACHEW_DISABLE contains a comma, but this expects a $PATH-like, colon-separated list; '
507 | f'try something like CACHEW_DISABLE={disabled.replace(",", ":")}'
508 | )
509 |     # remove any empty strings in case the user did something like CACHEW_DISABLE=my.module:$CACHEW_DISABLE
510 | return [p for p in disabled.split(':') if p.strip() != '']
511 |
512 |
513 | def _matches_disabled_module(module_name: str, pattern: str) -> bool:
514 | '''
515 | >>> _matches_disabled_module('my.browser', 'my.browser')
516 | True
517 | >>> _matches_disabled_module('my.browser', 'my.*')
518 | True
519 | >>> _matches_disabled_module('my.browser', 'my')
520 | True
521 | >>> _matches_disabled_module('my.browser', 'my.browse*')
522 | True
523 | >>> _matches_disabled_module('my.browser.export', 'my.browser')
524 | True
525 | >>> _matches_disabled_module('mysomething.else', '*') # CACHEW_DISABLE='*' disables everything
526 | True
527 | >>> _matches_disabled_module('my.browser', 'my.br?????') # fnmatch supports unix-like patterns
528 | True
529 | >>> _matches_disabled_module('my.browser', 'my.browse')
530 | False
531 | >>> _matches_disabled_module('mysomething.else', 'my') # since not at '.' boundary, doesn't match
532 | False
533 | >>> _matches_disabled_module('mysomething.else', '')
534 | False
535 | >>> _matches_disabled_module('my.browser', 'my.browser.export')
536 | False
537 | '''
538 |
539 | if module_name == pattern:
540 | return True
541 |
542 | module_parts = module_name.split('.')
543 | pattern_parts = pattern.split('.')
544 |
545 | # e.g. if pattern is 'module.submod.inner_module' and module is just 'module.submod'
546 |     # there's no possible way for it to match
547 | if len(module_parts) < len(pattern_parts):
548 | return False
549 |
550 | for mp, pp in zip(module_parts, pattern_parts):
551 | if fnmatch.fnmatch(mp, pp):
552 | continue
553 | return False
554 | return True
555 |
556 |
557 | def _module_is_disabled(module_name: str, logger: logging.Logger) -> bool:
558 | disabled_modules = _parse_disabled_modules(logger)
559 | for pat in disabled_modules:
560 | if _matches_disabled_module(module_name, pat):
561 |             logger.debug(f"caching disabled for {module_name} (matched '{pat}' from 'CACHEW_DISABLE={os.environ['CACHEW_DISABLE']}')")
562 | return True
563 | return False
564 |
565 |
566 | # fmt: off
567 | _CACHEW_CACHED = 'cachew_cached' # TODO add to docs
568 | _SYNTHETIC_KEY = 'synthetic_key'
569 | _SYNTHETIC_KEY_VALUE = 'synthetic_key_value'
570 | _DEPENDENCIES = 'dependencies'
571 | # fmt: on
572 |
573 |
574 | @dataclass
575 | class Context(Generic[P]):
576 | # fmt: off
577 | func : Callable
578 | cache_path : PathProvider[P]
579 | force_file : bool
580 | cls_ : type
581 | depends_on : HashFunction[P]
582 | logger : logging.Logger
583 | chunk_by : int
584 | synthetic_key: Optional[str]
585 | backend : Optional[Backend]
586 |
587 | def composite_hash(self, *args, **kwargs) -> dict[str, Any]:
588 | fsig = inspect.signature(self.func)
589 |         # defaults wouldn't be passed in kwargs, but they can be an implicit dependency (especially in between program runs)
590 | defaults = {
591 | k: v.default
592 | for k, v in fsig.parameters.items()
593 | if v.default is not inspect.Parameter.empty
594 | }
595 | # but only pass default if the user wants it in the hash function?
596 | hsig = inspect.signature(self.depends_on)
597 | defaults = {
598 | k: v
599 | for k, v in defaults.items()
600 | if k in hsig.parameters or 'kwargs' in hsig.parameters
601 | }
602 | kwargs = {**defaults, **kwargs}
603 | schema = str(self.cls_)
604 | hash_parts = {
605 | 'cachew' : CACHEW_VERSION,
606 | 'schema' : schema,
607 | _DEPENDENCIES : str(self.depends_on(*args, **kwargs)),
608 | }
609 | synthetic_key = self.synthetic_key
610 | if synthetic_key is not None:
611 | hash_parts[_SYNTHETIC_KEY ] = synthetic_key
612 | hash_parts[_SYNTHETIC_KEY_VALUE] = kwargs[synthetic_key]
613 | # FIXME assert it's in kwargs in the first place?
614 | # FIXME support positional args too? maybe extract the name from signature somehow? dunno
615 | # need to test it
616 | return hash_parts
617 | # fmt: on
618 |
619 |
620 | def cachew_wrapper(
621 | *args,
622 | _cachew_context: Context[P],
623 | **kwargs,
624 | ):
625 | C = _cachew_context
626 | # fmt: off
627 | func = C.func
628 | cache_path = C.cache_path
629 | force_file = C.force_file
630 | cls = C.cls_
631 | logger = C.logger
632 | chunk_by = C.chunk_by
633 | synthetic_key = C.synthetic_key
634 | backend_name = C.backend
635 | # fmt: on
636 |
637 | used_backend = backend_name or settings.DEFAULT_BACKEND
638 |
639 | func_name = callable_name(func)
640 | if not settings.ENABLE:
641 | logger.debug('cache explicitly disabled (settings.ENABLE is False)')
642 | yield from func(*args, **kwargs)
643 | return
644 |
645 | mod_name = callable_module_name(func)
646 | if mod_name is not None and _module_is_disabled(mod_name, logger):
647 | yield from func(*args, **kwargs)
648 | return
649 |
650 | def get_db_path() -> Optional[Path]:
651 | db_path: Path
652 | if callable(cache_path):
653 | pp = cache_path(*args, **kwargs)
654 | if pp is None:
655 | logger.debug('cache explicitly disabled (cache_path is None)')
656 | # early return, in this case we just yield the original items from the function
657 | return None
658 | else:
659 | db_path = Path(pp)
660 | else:
661 | db_path = Path(cache_path)
662 |
663 | db_path.parent.mkdir(parents=True, exist_ok=True)
664 |
665 | # need to be atomic here, hence calling stat() once and then just using the results
666 | try:
667 | # note: stat follows symlinks (which is what we want)
668 | st = db_path.stat()
669 | except FileNotFoundError:
670 | # doesn't exist. then it's controlled by force_file
671 | if force_file:
672 | # just use db_path as is
673 | pass
674 | else:
675 | db_path.mkdir(parents=True, exist_ok=True)
676 | db_path = db_path / func_name
677 | else:
678 | # already exists, so just use callable name if it's a dir
679 | if stat.S_ISDIR(st.st_mode):
680 | db_path = db_path / func_name
681 |
682 | logger.debug(f'using {used_backend}:{db_path} for cache')
683 | return db_path
684 |
685 | def try_use_synthetic_key() -> None:
686 | if synthetic_key is None:
687 | return
688 | # attempt to use existing cache if possible, as a 'prefix'
689 |
690 | old_hash_d: dict[str, Any] = {}
691 | if old_hash is not None:
692 | try:
693 | old_hash_d = json.loads(old_hash)
694 | except json.JSONDecodeError:
695 | # possible if we used old cachew version (<=0.8.1), hash wasn't json
696 | pass
697 |
698 | hash_diffs = {
699 | k: new_hash_d.get(k) == old_hash_d.get(k)
700 | for k in (*new_hash_d.keys(), *old_hash_d.keys())
701 | # the only 'allowed' differences for hash, otherwise need to recompute (e.g. if schema changed)
702 | if k not in {_SYNTHETIC_KEY_VALUE, _DEPENDENCIES}
703 | }
704 | cache_compatible = all(hash_diffs.values())
705 | if not cache_compatible:
706 | return
707 |
708 | def missing_keys(cached: list[str], wanted: list[str]) -> Optional[list[str]]:
709 | # FIXME assert both cached and wanted are sorted? since we rely on it
710 | # if not, then the user could use some custom key for caching (e.g. normalise filenames etc)
711 | # although in this case passing it into the function wouldn't make sense?
712 |
713 | if len(cached) == 0:
714 | # no point trying to reuse anything, cache should be empty?
715 | return None
716 | if len(wanted) == 0:
717 | # similar, no way to reuse cache
718 | return None
719 | if cached[0] != wanted[0]:
720 | # there is no common prefix, so no way to reuse cache really
721 | return None
722 | last_cached = cached[-1]
723 | # ok, now actually figure out which items are missing
724 | for i, k in enumerate(wanted):
725 | if k > last_cached:
726 | # ok, rest of items are missing
727 | return wanted[i:]
728 |             # otherwise too many things are cached, and we seem to want less
729 | return None
730 |
731 | new_values: list[str] = new_hash_d[_SYNTHETIC_KEY_VALUE]
732 | old_values: list[str] = old_hash_d[_SYNTHETIC_KEY_VALUE]
733 | missing = missing_keys(cached=old_values, wanted=new_values)
734 | if missing is not None:
735 | # can reuse cache
736 | kwargs[_CACHEW_CACHED] = cached_items()
737 | kwargs[synthetic_key] = missing
738 |
739 | early_exit = False
740 |
741 | def written_to_cache():
742 | nonlocal early_exit
743 |
744 | datas = func(*args, **kwargs)
745 |
746 | if isinstance(backend, FileBackend):
747 | # FIXME uhhh.. this is a bit crap
748 | # but in sqlite mode we don't want to publish new hash before we write new items
749 | # maybe should use tmp table for hashes as well?
750 | backend.write_new_hash(new_hash)
751 | else:
752 | # happens later for sqlite
753 | pass
754 |
755 | flush_blobs = backend.flush_blobs
756 |
757 | chunk: list[Any] = []
758 |
759 | def flush() -> None:
760 | nonlocal chunk
761 | if len(chunk) > 0:
762 | flush_blobs(chunk=chunk)
763 | chunk = []
764 |
765 | total_objects = 0
766 | for obj in datas:
767 | try:
768 | total_objects += 1
769 | yield obj
770 | except GeneratorExit:
771 | early_exit = True
772 | return
773 |
774 | dct = marshall.dump(obj)
775 | blob = orjson_dumps(dct)
776 | chunk.append(blob)
777 | if len(chunk) >= chunk_by:
778 | flush()
779 | flush()
780 |
781 | backend.finalize(new_hash)
782 | logger.info(f'wrote {total_objects} objects to cachew ({used_backend}:{db_path})')
783 |
784 | def cached_items():
785 | total_cached = backend.cached_blobs_total()
786 | total_cached_s = '' if total_cached is None else f'{total_cached} '
787 | logger.info(f'loading {total_cached_s}objects from cachew ({used_backend}:{db_path})')
788 |
789 | for blob in backend.cached_blobs():
790 | j = orjson_loads(blob)
791 | obj = marshall.load(j)
792 | yield obj
793 |
794 | # NOTE: annoyingly huge try/catch ahead...
795 | # but it lets us save a function call, hence a stack frame
796 | # see test_recursive*
797 | try:
798 | db_path = get_db_path()
799 | if db_path is None:
800 | yield from func(*args, **kwargs)
801 | return
802 |
803 | BackendCls = BACKENDS[used_backend]
804 |
805 | new_hash_d = C.composite_hash(*args, **kwargs)
806 | new_hash: SourceHash = json.dumps(new_hash_d)
807 | logger.debug(f'new hash: {new_hash}')
808 |
809 | marshall: CachewMarshall[Any] = CachewMarshall(Type_=cls)
810 |
811 | with BackendCls(cache_path=db_path, logger=logger) as backend:
812 | old_hash = backend.get_old_hash()
813 | logger.debug(f'old hash: {old_hash}')
814 |
815 | if new_hash == old_hash:
816 | logger.debug('hash matched: loading from cache')
817 | yield from cached_items()
818 | return
819 |
820 | logger.debug('hash mismatch: computing data and writing to db')
821 |
822 | try_use_synthetic_key()
823 |
824 | got_write = backend.get_exclusive_write()
825 | if not got_write:
826 | # NOTE: this is the bit we really have to watch out for and not put in a helper function
827 | # otherwise it's causing an extra stack frame on every call
828 | # the rest (reading from cachew or writing to cachew) happens once per function call? so not a huge deal
829 | yield from func(*args, **kwargs)
830 | return
831 |
832 | # at this point we're guaranteed to have an exclusive write transaction
833 | yield from written_to_cache()
834 | except Exception as e:
835 | # sigh... see test_early_exit_shutdown...
836 | if early_exit and 'Cannot operate on a closed database' in str(e):
837 | return
838 |
839 | # todo hmm, kinda annoying that it tries calling the function twice?
840 | # but gonna require some sophisticated cooperation with the cached wrapper otherwise
841 | cachew_error(e, logger=logger)
842 | yield from func(*args, **kwargs)
843 |
844 |
845 | __all__ = [
846 | 'CachewException',
847 | 'HashFunction',
848 | 'SourceHash',
849 | 'cachew',
850 | 'get_logger',
851 | ]
852 |
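
For illustration, a short usage sketch of the decorator above (not from the repo; the paths, file names and classes are made up). It shows mtime_hash, which the TODO above suggests documenting, used as depends_on, plus the two ways to switch caching off:

    from pathlib import Path
    from typing import Iterator, NamedTuple

    from cachew import cachew, mtime_hash, settings

    class Measurement(NamedTuple):
        t: float
        value: int

    # cache_path may be a callable computed from the call arguments;
    # depends_on=mtime_hash discards the cache whenever the input file's mtime changes
    @cachew(
        cache_path=lambda path: f'/tmp/cachew-demo/{path.name}.cache',
        depends_on=mtime_hash,
    )
    def parse(path: Path) -> Iterator[Measurement]:
        for i, _line in enumerate(path.read_text().splitlines()):
            yield Measurement(t=float(i), value=i)

    settings.ENABLE = False  # disables caching globally...
    # ...or set e.g. CACHEW_DISABLE='my.module:other.*' in the environment
    # to disable it per module tree (see _module_is_disabled above)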
--------------------------------------------------------------------------------
/src/cachew/backend/common.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from abc import abstractmethod
3 | from collections.abc import Iterator, Sequence
4 | from pathlib import Path
5 | from typing import (
6 | Optional,
7 | )
8 |
9 | from ..common import SourceHash
10 |
11 |
12 | class AbstractBackend:
13 | @abstractmethod
14 | def __init__(self, cache_path: Path, *, logger: logging.Logger) -> None:
15 | raise NotImplementedError
16 |
17 | @abstractmethod
18 | def __enter__(self):
19 | raise NotImplementedError
20 |
21 | def __exit__(self, *args) -> None:
22 | raise NotImplementedError
23 |
24 | def get_old_hash(self) -> Optional[SourceHash]:
25 | raise NotImplementedError
26 |
27 | def cached_blobs_total(self) -> Optional[int]:
28 | raise NotImplementedError
29 |
30 | def cached_blobs(self) -> Iterator[bytes]:
31 | raise NotImplementedError
32 |
33 | def get_exclusive_write(self) -> bool:
34 | '''
35 | Returns whether it actually managed to get it
36 | '''
37 | raise NotImplementedError
38 |
39 | def write_new_hash(self, new_hash: SourceHash) -> None:
40 | raise NotImplementedError
41 |
42 | def flush_blobs(self, chunk: Sequence[bytes]) -> None:
43 | raise NotImplementedError
44 |
45 | def finalize(self, new_hash: SourceHash) -> None:
46 | raise NotImplementedError
47 |
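
To make the protocol above concrete: cachew_wrapper first calls get_old_hash, then either iterates cached_blobs on a hash match, or calls get_exclusive_write / flush_blobs / finalize to rewrite the cache. Below is a hypothetical in-memory backend satisfying this interface; it is not part of the repo and not pluggable via the backend= argument (which only accepts 'sqlite' or 'file'), it just illustrates the calls:

    import logging
    from collections.abc import Iterator, Sequence
    from pathlib import Path
    from typing import Optional

    from cachew.backend.common import AbstractBackend
    from cachew.common import SourceHash

    # maps cache key -> (source hash, serialized blobs)
    _STORE: dict[str, tuple[Optional[SourceHash], list[bytes]]] = {}

    class MemoryBackend(AbstractBackend):
        def __init__(self, cache_path: Path, *, logger: logging.Logger) -> None:
            self.key = str(cache_path)
            self.logger = logger
            self.pending: list[bytes] = []

        def __enter__(self) -> 'MemoryBackend':
            return self

        def __exit__(self, *args) -> None:
            pass

        def get_old_hash(self) -> Optional[SourceHash]:
            return _STORE.get(self.key, (None, []))[0]

        def cached_blobs_total(self) -> Optional[int]:
            return len(_STORE[self.key][1])

        def cached_blobs(self) -> Iterator[bytes]:
            yield from _STORE[self.key][1]

        def get_exclusive_write(self) -> bool:
            return True  # single process, so there is no contention to lose

        def write_new_hash(self, new_hash: SourceHash) -> None:
            pass  # the hash is published atomically in finalize instead

        def flush_blobs(self, chunk: Sequence[bytes]) -> None:
            self.pending.extend(chunk)

        def finalize(self, new_hash: SourceHash) -> None:
            _STORE[self.key] = (new_hash, self.pending)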
--------------------------------------------------------------------------------
/src/cachew/backend/file.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from collections.abc import Iterator, Sequence
3 | from pathlib import Path
4 | from typing import (
5 | BinaryIO,
6 | Optional,
7 | )
8 |
9 | from ..common import SourceHash
10 | from .common import AbstractBackend
11 |
12 |
13 | class FileBackend(AbstractBackend):
14 | jsonl: Path
15 | jsonl_tmp: Path
16 | jsonl_fr: Optional[BinaryIO]
17 | jsonl_tmp_fw: Optional[BinaryIO]
18 |
19 | def __init__(self, cache_path: Path, *, logger: logging.Logger) -> None:
20 | self.logger = logger
21 | self.jsonl = cache_path
22 | self.jsonl_tmp = Path(str(self.jsonl) + '.tmp')
23 |
24 | self.jsonl_fr = None
25 | self.jsonl_tmp_fw = None
26 |
27 | def __enter__(self) -> 'FileBackend':
28 | try:
29 | self.jsonl_fr = self.jsonl.open('rb')
30 | except FileNotFoundError:
31 | self.jsonl_fr = None
32 | return self
33 |
34 | def __exit__(self, *args) -> None:
35 | if self.jsonl_tmp_fw is not None:
36 | # might still exist in case of early exit
37 | self.jsonl_tmp.unlink(missing_ok=True)
38 |
39 | # NOTE: need to unlink first
40 |             # otherwise it's possible that someone else might open the file before we unlink it
41 | self.jsonl_tmp_fw.close()
42 |
43 | if self.jsonl_fr is not None:
44 | self.jsonl_fr.close()
45 |
46 | def get_old_hash(self) -> Optional[SourceHash]:
47 | if self.jsonl_fr is None:
48 | return None
49 | hash_line = self.jsonl_fr.readline().rstrip(b'\n')
50 | return hash_line.decode('utf8')
51 |
52 | def cached_blobs_total(self) -> Optional[int]:
53 | # not really sure how to support that for a plaintext file?
54 | # could wc -l but it might be costly..
55 | return None
56 |
57 | def cached_blobs(self) -> Iterator[bytes]:
58 | assert self.jsonl_fr is not None # should be guaranteed by get_old_hash
59 | yield from self.jsonl_fr # yields line by line
60 |
61 | def get_exclusive_write(self) -> bool:
62 | # NOTE: opening in x (exclusive write) mode just in case, so it throws if file exists
63 | try:
64 | self.jsonl_tmp_fw = self.jsonl_tmp.open('xb')
65 | except FileExistsError:
66 | self.jsonl_tmp_fw = None
67 | return False
68 | else:
69 | return True
70 |
71 | def write_new_hash(self, new_hash: SourceHash) -> None:
72 | assert self.jsonl_tmp_fw is not None
73 | self.jsonl_tmp_fw.write(new_hash.encode('utf8') + b'\n')
74 |
75 | def flush_blobs(self, chunk: Sequence[bytes]) -> None:
76 | fw = self.jsonl_tmp_fw
77 | assert fw is not None
78 | for blob in chunk:
79 | fw.write(blob)
80 | fw.write(b'\n')
81 |
82 | def finalize(self, new_hash: SourceHash) -> None: # noqa: ARG002
83 | # TODO defensive??
84 | self.jsonl_tmp.rename(self.jsonl)
85 |
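
The on-disk format this produces is deliberately simple: the first line of the file is the source hash, and every following line is one JSON blob; the '.tmp' sibling doubles as the write lock (via the exclusive 'xb' open) and is renamed over the real file in finalize. A sketch reading such a cache by hand (the path is made up):

    import json
    from pathlib import Path

    def dump_cache(path: Path) -> None:
        with path.open('rb') as f:
            source_hash = f.readline().rstrip(b'\n').decode('utf8')
            print('hash:', source_hash)
            for line in f:  # one JSON blob per line
                print(json.loads(line))

    dump_cache(Path('/tmp/cachew-demo/some_func.cache'))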
--------------------------------------------------------------------------------
/src/cachew/backend/sqlite.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sqlite3
3 | import time
4 | import warnings
5 | from collections.abc import Iterator, Sequence
6 | from pathlib import Path
7 | from typing import (
8 | Optional,
9 | )
10 |
11 | import sqlalchemy
12 | from sqlalchemy import Column, Table, event, text
13 | from sqlalchemy.dialects import sqlite
14 |
15 | from ..common import SourceHash
16 | from .common import AbstractBackend
17 |
18 |
19 | class SqliteBackend(AbstractBackend):
20 | def __init__(self, cache_path: Path, *, logger: logging.Logger) -> None:
21 | self.logger = logger
22 | self.engine = sqlalchemy.create_engine(f'sqlite:///{cache_path}', connect_args={'timeout': 0})
23 | # NOTE: timeout is necessary so we don't lose time waiting during recursive calls
24 | # by default, it's several seconds? you'd see 'test_recursive' test performance degrade
25 |
26 | @event.listens_for(self.engine, 'connect')
27 | def set_sqlite_pragma(dbapi_connection, connection_record): # noqa: ARG001
28 | # without wal, concurrent reading/writing is not gonna work
29 |
30 | # ugh. that's odd, how are we supposed to set WAL if the very fact of setting wal might lock the db?
31 | while True:
32 | try:
33 | dbapi_connection.execute('PRAGMA journal_mode=WAL')
34 | break
35 | except sqlite3.OperationalError as oe:
36 | if 'database is locked' not in str(oe):
37 | # ugh, pretty annoying that exception doesn't include database path for some reason
38 | raise RuntimeError(f'Error while setting WAL on {cache_path}') from oe
39 | time.sleep(0.1)
40 |
41 | self.connection = self.engine.connect()
42 |
43 | """
44 | Erm... this is pretty confusing.
45 | https://docs.sqlalchemy.org/en/13/dialects/sqlite.html#transaction-isolation-level
46 |
47 | Somehow without this thing sqlalchemy logs BEGIN (implicit) instead of BEGIN TRANSACTION which actually works in sqlite...
48 |
49 | Judging by sqlalchemy/dialects/sqlite/base.py, looks like some sort of python sqlite driver problem??
50 |
51 | test_transaction should check this behaviour
52 | """
53 |
54 | @event.listens_for(self.connection, 'begin')
55 | # pylint: disable=unused-variable
56 | def do_begin(conn):
57 | # NOTE there is also BEGIN CONCURRENT in newer versions of sqlite. could use it later?
58 | conn.execute(text('BEGIN DEFERRED'))
59 |
60 | self.meta = sqlalchemy.MetaData()
61 | self.table_hash = Table('hash', self.meta, Column('value', sqlalchemy.String))
62 |
63 | # fmt: off
64 | # actual cache
65 | self.table_cache = Table('cache' , self.meta, Column('data', sqlalchemy.BLOB))
66 | # temporary table, we use it to insert and then (atomically?) rename to the above table at the very end
67 | self.table_cache_tmp = Table('cache_tmp', self.meta, Column('data', sqlalchemy.BLOB))
68 | # fmt: on
69 |
70 | def __enter__(self) -> 'SqliteBackend':
71 | # NOTE: deferred transaction
72 | self.transaction = self.connection.begin()
73 | # FIXME this is a bit crap.. is there a nicer way to use another ctx manager here?
74 | self.transaction.__enter__()
75 | return self
76 |
77 | def __exit__(self, *args) -> None:
78 | self.transaction.__exit__(*args)
79 | self.connection.close()
80 | self.engine.dispose()
81 |
82 | def get_old_hash(self) -> Optional[SourceHash]:
83 | # first, try to do as much as possible read-only, benefiting from deferred transaction
84 | old_hashes: Sequence
85 | try:
86 | # not sure if there is a better way...
87 | cursor = self.connection.execute(self.table_hash.select())
88 | except sqlalchemy.exc.OperationalError as e:
89 | # meh. not sure if this is a good way to handle this..
90 | if 'no such table: hash' in str(e):
91 | old_hashes = []
92 | else:
93 | raise e
94 | else:
95 | old_hashes = cursor.fetchall()
96 |
97 | assert len(old_hashes) <= 1, old_hashes # shouldn't happen
98 |
99 | old_hash: Optional[SourceHash]
100 | if len(old_hashes) == 0:
101 | old_hash = None
102 | else:
103 | old_hash = old_hashes[0][0] # returns a tuple...
104 | return old_hash
105 |
106 | def cached_blobs_total(self) -> Optional[int]:
107 | [(total,)] = self.connection.execute(sqlalchemy.select(sqlalchemy.func.count()).select_from(self.table_cache))
108 | return total
109 |
110 | def cached_blobs(self) -> Iterator[bytes]:
111 | rows = self.connection.execute(self.table_cache.select())
112 | # by default, sqlalchemy wraps all results into Row object
113 | # this can cause quite a lot of overhead if you're reading many rows
114 | # it seems that in principle, sqlalchemy supports just returning bare underlying tuple from the dbapi
115 | # but from browsing the code it doesn't seem like this functionality exposed
116 | # if you're looking for cues, see
117 | # - ._source_supports_scalars
118 | # - ._generate_rows
119 | # - ._row_getter
120 | # by using this raw iterator we speed up reading the cache quite a bit
121 | # asked here https://github.com/sqlalchemy/sqlalchemy/discussions/10350
122 | raw_row_iterator = getattr(rows, '_raw_row_iterator', None)
123 | if raw_row_iterator is None:
124 | warnings.warn("CursorResult._raw_row_iterator method isn't found. This could lead to degraded cache reading performance.")
125 | row_iterator = rows
126 | else:
127 | row_iterator = raw_row_iterator()
128 |
129 | for (blob,) in row_iterator:
130 | yield blob
131 |
132 | def get_exclusive_write(self) -> bool:
133 | # NOTE on recursive calls
134 | # somewhat magically, they should work as expected with no extra database inserts?
135 | # the top level call 'wins' the write transaction and once it's gathered all data, will write it
136 | # the 'intermediate' level calls fail to get it and will pass data through
137 | # the cached 'bottom' level is read only and will be yielded without a write transaction
138 | try:
139 | # first 'write' statement will upgrade transaction to write transaction which might fail due to concurrency
140 | # see https://www.sqlite.org/lang_transaction.html
141 | # NOTE: because of 'checkfirst=True', only the last .create will guarantee the transaction upgrade to write transaction
142 | self.table_hash.create(self.connection, checkfirst=True)
143 |
144 | # 'table' used to be old 'cache' table name, so we just delete it regardless
145 |             # otherwise it might overinflate the cache db with stale values
146 | self.connection.execute(text('DROP TABLE IF EXISTS `table`'))
147 |
148 | # NOTE: we have to use .drop and then .create (e.g. instead of some sort of replace)
149 | # since it's possible to have schema changes inbetween calls
150 | # checkfirst=True because it might be the first time we're using cache
151 | self.table_cache_tmp.drop(self.connection, checkfirst=True)
152 | self.table_cache_tmp.create(self.connection)
153 | except sqlalchemy.exc.OperationalError as e:
154 | if e.code == 'e3q8' and 'database is locked' in str(e):
155 |                 # someone else must have won the write lock
156 | # not much we can do here
157 | # NOTE: important to close early, otherwise we might hold onto too many file descriptors during yielding
158 | # see test_recursive_deep
159 | # (normally connection is closed in SqliteBackend.__exit__)
160 | self.connection.close()
161 | # in this case all the callee can do is just to call the actual function
162 | return False
163 | else:
164 | raise e
165 | return True
166 |
167 | def flush_blobs(self, chunk: Sequence[bytes]) -> None:
168 | # uhh. this gives a huge speedup for inserting
169 | # since we don't have to create intermediate dictionaries
170 | # TODO move this to __init__?
171 | insert_into_table_cache_tmp_raw = str(self.table_cache_tmp.insert().compile(dialect=sqlite.dialect(paramstyle='qmark')))
172 | # I also tried setting paramstyle='qmark' in create_engine, but it seems to be ignored :(
173 | # idk what benefit sqlalchemy gives at this point, seems to just complicate things
174 | self.connection.exec_driver_sql(insert_into_table_cache_tmp_raw, [(c,) for c in chunk])
175 |
176 | def finalize(self, new_hash: SourceHash) -> None:
177 | # delete hash first, so if we are interrupted somewhere, it mismatches next time and everything is recomputed
178 | # pylint: disable=no-value-for-parameter
179 | self.connection.execute(self.table_hash.delete())
180 |
181 | # checkfirst is necessary since it might not have existed in the first place
182 | # e.g. first time we use cache
183 | self.table_cache.drop(self.connection, checkfirst=True)
184 |
185 | # meh https://docs.sqlalchemy.org/en/14/faq/metadata_schema.html#does-sqlalchemy-support-alter-table-create-view-create-trigger-schema-upgrade-functionality
186 | # also seems like sqlalchemy doesn't have any primitives to escape table names.. sigh
187 | self.connection.execute(text(f"ALTER TABLE `{self.table_cache_tmp.name}` RENAME TO `{self.table_cache.name}`"))
188 |
189 | # pylint: disable=no-value-for-parameter
190 | self.connection.execute(self.table_hash.insert().values([{'value': new_hash}]))
191 |
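
The resulting database therefore holds two tables: 'hash' with a single row, and 'cache' with one BLOB row per serialized object ('cache_tmp' only exists while a write is in flight). A sketch inspecting such a cache with the stdlib (the path is made up):

    import sqlite3

    conn = sqlite3.connect('/tmp/cachew-demo/some_func.cache')
    try:
        [(old_hash,)] = conn.execute('SELECT value FROM hash')
        [(total,)] = conn.execute('SELECT count(*) FROM cache')
        print(f'hash={old_hash!r}, cached objects: {total}')
    finally:
        conn.close()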
--------------------------------------------------------------------------------
/src/cachew/common.py:
--------------------------------------------------------------------------------
1 | # TODO better name to represent what it means?
2 | SourceHash = str
3 |
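
In practice a SourceHash is the JSON string produced by Context.composite_hash in __init__.py. A hypothetical illustration of its shape (all concrete values here are made up):

    import json

    example_hash: SourceHash = json.dumps({
        'cachew': '0.0.0',                      # CACHEW_VERSION
        'schema': "<class 'mymodule.Link'>",    # str() of the cached type
        'dependencies': "('some_input.zip',)",  # str() of the depends_on result
    })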
--------------------------------------------------------------------------------
/src/cachew/compat.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/karlicoss/cachew/a25017531afbbeccbbd66d77bd6cc1bcf41d184f/src/cachew/compat.py
--------------------------------------------------------------------------------
/src/cachew/experimental.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 |
4 | def enable_exceptions():
5 | warnings.warn("Exceptions are not an experimental feature anymore and enabled by default.")
6 |
7 |
8 | def disable_exceptions():
9 | warnings.warn("Exceptions are not an experimental feature anymore and enabled by default.")
10 |
--------------------------------------------------------------------------------
/src/cachew/extra.py:
--------------------------------------------------------------------------------
1 | # todo Ideally, needs doublewraps as well? also typing helpers
2 | def mcachew(*args, **kwargs):
3 | """
4 | Stands for 'Maybe cachew'.
5 | Defensive wrapper around @cachew to make it an optional dependency.
6 | """
7 | try:
8 | import cachew
9 | except ModuleNotFoundError:
10 | import warnings
11 |
12 | warnings.warn('cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew')
13 | return lambda orig_func: orig_func
14 | else:
15 | return cachew.cachew(*args, **kwargs)
16 |
17 |
18 | from contextlib import contextmanager
19 |
20 |
21 | @contextmanager
22 | def disabled_cachew():
23 | from . import settings
24 |
25 | orig = settings.ENABLE
26 | try:
27 | settings.ENABLE = False
28 | yield
29 | finally:
30 | settings.ENABLE = orig
31 |
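
A usage sketch for the two helpers above (the cache path is made up):

    from typing import Iterator

    from cachew.extra import disabled_cachew, mcachew

    @mcachew(cache_path='/tmp/cachew-demo/numbers.cache')  # no-op if cachew isn't installed
    def numbers() -> Iterator[int]:
        yield from range(3)

    with disabled_cachew():  # e.g. in tests, to temporarily bypass the cache
        assert list(numbers()) == [0, 1, 2]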
--------------------------------------------------------------------------------
/src/cachew/legacy.py:
--------------------------------------------------------------------------------
1 | import typing
2 | import warnings
3 | from collections.abc import Iterable, Iterator, Sequence
4 | from dataclasses import dataclass
5 | from datetime import date, datetime
6 | from itertools import chain, islice
7 | from pathlib import Path
8 | from typing import (
9 | Any,
10 | Generic,
11 | NamedTuple,
12 | Optional,
13 | TypeVar,
14 | Union,
15 | )
16 |
17 | import sqlalchemy
18 | from sqlalchemy import Column
19 |
20 | from .pytest import parametrize
21 | from .utils import CachewException
22 |
23 |
24 | def get_union_args(cls) -> Optional[tuple[type]]:
25 | if getattr(cls, '__origin__', None) != Union:
26 | return None
27 |
28 | args = cls.__args__
29 | args = [e for e in args if e is not type(None)]
30 | assert len(args) > 0
31 | return args
32 |
33 |
34 | def is_union(cls) -> bool:
35 | return get_union_args(cls) is not None
36 |
37 |
38 | Types = Union[
39 | type[str],
40 | type[int],
41 | type[float],
42 | type[bool],
43 | type[datetime],
44 | type[date],
45 | type[dict],
46 | type[list],
47 | type[Exception],
48 | type[NamedTuple],
49 | ]
50 |
51 | Values = Union[
52 | str,
53 | int,
54 | float,
55 | bool,
56 | datetime,
57 | date,
58 | dict,
59 | list,
60 | Exception,
61 | NamedTuple,
62 | ]
63 |
64 | PRIMITIVE_TYPES = {
65 | str,
66 | int,
67 | float,
68 | bool,
69 | datetime,
70 | date,
71 | dict,
72 | list,
73 | Exception,
74 | }
75 |
76 |
77 | def is_primitive(cls: type) -> bool:
78 | """
79 | >>> from typing import Dict, Any
80 | >>> is_primitive(int)
81 | True
82 | >>> is_primitive(set)
83 | False
84 | >>> is_primitive(dict)
85 | True
86 | """
87 | return cls in PRIMITIVE_TYPES
88 |
89 |
90 | class IsoDateTime(sqlalchemy.TypeDecorator):
91 |     # in theory could use something more efficient? e.g. blob for encoded datetime and tz?
92 | # but practically, the difference seems to be pretty small, so perhaps fine for now
93 | impl = sqlalchemy.String
94 |
95 | cache_ok = True
96 |
97 | @property
98 | def python_type(self):
99 | return datetime
100 |
101 | def process_literal_param(self, value, dialect):
102 | raise NotImplementedError() # make pylint happy
103 |
104 | def process_bind_param(self, value: Optional[datetime], dialect) -> Optional[str]: # noqa: ARG002
105 | if value is None:
106 | return None
107 |         # ok, it's a bit hacky... attempt to preserve pytz information
108 | iso = value.isoformat()
109 | tz = getattr(value, 'tzinfo', None)
110 | if tz is None:
111 | return iso
112 | try:
113 | import pytz
114 | except ImportError:
115 | self.warn_pytz()
116 | return iso
117 | else:
118 | if isinstance(tz, pytz.BaseTzInfo):
119 | zone = tz.zone
120 | # should be present: https://github.com/python/typeshed/blame/968fd6d01d23470e0c8368e7ee7c43f54aaedc0e/stubs/pytz/pytz/tzinfo.pyi#L6
121 | assert zone is not None, tz
122 | return iso + ' ' + zone
123 | else:
124 | return iso
125 |
126 | def process_result_value(self, value: Optional[str], dialect) -> Optional[datetime]: # noqa: ARG002
127 | if value is None:
128 | return None
129 | spl = value.split(' ')
130 | dt = datetime.fromisoformat(spl[0])
131 | if len(spl) <= 1:
132 | return dt
133 | zone = spl[1]
134 |         # else attempt to decipher pytz tzinfo
135 | try:
136 | import pytz
137 | except ImportError:
138 | self.warn_pytz()
139 | return dt
140 | else:
141 | tz = pytz.timezone(zone)
142 | return dt.astimezone(tz)
143 |
144 | def warn_pytz(self) -> None:
145 | warnings.warn('install pytz for better timezone support while serializing with cachew')
146 |
147 |
148 | # a bit hacky, but works...
149 | class IsoDate(IsoDateTime):
150 | impl = sqlalchemy.String
151 |
152 | cache_ok = True
153 |
154 | @property
155 | def python_type(self):
156 | return date
157 |
158 | def process_literal_param(self, value, dialect):
159 | raise NotImplementedError() # make pylint happy
160 |
161 | def process_result_value(self, value: Optional[str], dialect) -> Optional[date]: # type: ignore[explicit-override,override]
162 | res = super().process_result_value(value, dialect)
163 | if res is None:
164 | return None
165 | return res.date()
166 |
167 |
168 | jtypes = (int, float, bool, type(None))
169 |
170 |
171 | class ExceptionAdapter(sqlalchemy.TypeDecorator):
172 | '''
173 | Enables support for caching Exceptions. Exception is treated as JSON and serialized.
174 |
175 | It's useful for defensive error handling, in case of cachew in particular for preserving error state.
176 |
177 | I elaborate on it here: [mypy-driven error handling](https://beepb00p.xyz/mypy-error-handling.html#kiss).
178 | '''
179 |
180 | impl = sqlalchemy.JSON
181 |
182 | cache_ok = True
183 |
184 | @property
185 | def python_type(self):
186 | return Exception
187 |
188 | def process_literal_param(self, value, dialect):
189 | raise NotImplementedError() # make pylint happy
190 |
191 | def process_bind_param(self, value: Optional[Exception], dialect) -> Optional[list[Any]]: # noqa: ARG002
192 | if value is None:
193 | return None
194 | sargs: list[Any] = []
195 | for a in value.args:
196 | if any(isinstance(a, t) for t in jtypes):
197 | sargs.append(a)
198 | elif isinstance(a, date):
199 | sargs.append(a.isoformat())
200 | else:
201 | sargs.append(str(a))
202 | return sargs
203 |
204 | def process_result_value(self, value: Optional[str], dialect) -> Optional[Exception]: # noqa: ARG002
205 | if value is None:
206 | return None
207 | # sadly, can't do much to convert back from the strings? Unless I serialize the type info as well?
208 | return Exception(*value)
209 |
210 |
211 | # fmt: off
212 | PRIMITIVES = {
213 | str : sqlalchemy.String,
214 | int : sqlalchemy.Integer,
215 | float : sqlalchemy.Float,
216 | bool : sqlalchemy.Boolean,
217 | datetime : IsoDateTime,
218 | date : IsoDate,
219 | dict : sqlalchemy.JSON,
220 | list : sqlalchemy.JSON,
221 | Exception: ExceptionAdapter,
222 | }
223 | # fmt: on
224 | assert set(PRIMITIVES.keys()) == PRIMITIVE_TYPES
225 |
226 |
227 | def strip_optional(cls) -> tuple[type, bool]:
228 | """
229 | >>> from typing import Optional, NamedTuple
230 | >>> strip_optional(Optional[int])
231 |     (<class 'int'>, True)
232 | >>> class X(NamedTuple):
233 | ... x: int
234 | >>> strip_optional(X)
235 |     (<class 'cachew.legacy.X'>, False)
236 | """
237 | is_opt: bool = False
238 |
239 | args = get_union_args(cls)
240 | if args is not None and len(args) == 1:
241 | cls = args[0] # meh
242 | is_opt = True
243 |
244 | return (cls, is_opt)
245 |
246 |
247 | def strip_generic(tp):
248 | """
249 | >>> from typing import List
250 | >>> strip_generic(List[int])
251 |     <class 'list'>
252 | >>> strip_generic(str)
253 |     <class 'str'>
254 | """
255 | GA = getattr(typing, '_GenericAlias') # ugh, can't make both mypy and pylint happy here?
256 | if isinstance(tp, GA):
257 | return tp.__origin__
258 | return tp
259 |
260 |
261 | NT = TypeVar('NT')
262 | # sadly, bound=NamedTuple is not working yet in mypy
263 | # https://github.com/python/mypy/issues/685
264 | # also needs to support dataclasses?
265 |
266 |
267 | @dataclass
268 | class NTBinder(Generic[NT]):
269 | """
270 | >>> class Job(NamedTuple):
271 | ... company: str
272 | ... title: Optional[str]
273 | >>> class Person(NamedTuple):
274 | ... name: str
275 | ... age: int
276 | ... job: Optional[Job]
277 |
278 |     NTBinder is a helper class for interacting with the sqlite database.
279 | Hierarchy is flattened:
280 | >>> binder = NTBinder.make(Person)
281 | >>> [(c.name, type(c.type)) for c in binder.columns]
282 | ... # doctest: +NORMALIZE_WHITESPACE
283 |     [('name', <class 'sqlalchemy.sql.sqltypes.String'>),
284 |      ('age', <class 'sqlalchemy.sql.sqltypes.Integer'>),
285 |      ('_job_is_null', <class 'sqlalchemy.sql.sqltypes.Boolean'>),
286 |      ('job_company', <class 'sqlalchemy.sql.sqltypes.String'>),
287 |      ('job_title', <class 'sqlalchemy.sql.sqltypes.String'>)]
288 |
289 |
290 | >>> person = Person(name='alan', age=40, job=None)
291 |
292 | to_row converts object to a sql-friendly tuple. job=None, so we end up with True in _job_is_null field
293 | >>> tuple(binder.to_row(person))
294 | ('alan', 40, True, None, None)
295 |
296 | from_row does reverse conversion
297 | >>> binder.from_row(('alan', 40, True, None, None))
298 | Person(name='alan', age=40, job=None)
299 |
300 | >>> binder.from_row(('ann', 25, True, None, None, 'extra'))
301 | Traceback (most recent call last):
302 | ...
303 | cachew.utils.CachewException: unconsumed items in iterator ['extra']
304 | """
305 |
306 | name: Optional[str] # None means toplevel
307 | type_: Types
308 | span: int # not sure if span should include optional col?
309 | primitive: bool
310 | optional: bool
311 | union: Optional[type] # helper, which isn't None if type is Union
312 | fields: Sequence[Any] # mypy can't handle cyclic definition at this point :(
313 |
314 | @staticmethod
315 | def make(tp: type[NT], name: Optional[str] = None) -> 'NTBinder[NT]':
316 | tp, optional = strip_optional(tp)
317 | union: Optional[type]
318 | fields: tuple[Any, ...]
319 | primitive: bool
320 |
321 | union_args = get_union_args(tp)
322 | if union_args is not None:
323 | CachewUnion = NamedTuple('_CachewUnionRepr', [(x.__name__, Optional[x]) for x in union_args]) # type: ignore[misc]
324 | union = CachewUnion
325 | primitive = False
326 | fields = (NTBinder.make(tp=CachewUnion, name='_cachew_union_repr'),)
327 | span = 1
328 | else:
329 | union = None
330 | tp = strip_generic(tp)
331 | primitive = is_primitive(tp)
332 |
333 | if primitive:
334 | if name is None:
335 | name = '_cachew_primitive' # meh. presumably, top level
336 | if primitive:
337 | fields = ()
338 | span = 1
339 | else:
340 | annotations = typing.get_type_hints(tp)
341 | if annotations == {}:
342 | raise CachewException(
343 | f"{tp} (field '{name}'): doesn't look like a supported type to cache. See https://github.com/karlicoss/cachew#features for the list of supported types."
344 | )
345 | fields = tuple(NTBinder.make(tp=ann, name=fname) for fname, ann in annotations.items())
346 | span = sum(f.span for f in fields) + (1 if optional else 0)
347 | return NTBinder(
348 | name=name,
349 | type_=tp, # type: ignore[arg-type]
350 | span=span,
351 | primitive=primitive,
352 | optional=optional,
353 | union=union,
354 | fields=fields,
355 | )
356 |
357 | @property
358 | def columns(self) -> list[Column]:
359 | return list(self.iter_columns())
360 |
361 | # TODO not necessarily namedtuple? could be primitive type
362 | def to_row(self, obj: NT) -> tuple[Optional[Values], ...]:
363 | return tuple(self._to_row(obj))
364 |
365 | def from_row(self, row: Iterable[Any]) -> NT:
366 | riter = iter(row)
367 | res = self._from_row(riter)
368 | remaining = list(islice(riter, 0, 1))
369 | if len(remaining) != 0:
370 | raise CachewException(f'unconsumed items in iterator {remaining}')
371 | assert res is not None # nosec # help mypy; top level will not be None
372 | return res
373 |
374 | def _to_row(self, obj) -> Iterator[Optional[Values]]:
375 | if self.primitive:
376 | yield obj
377 | elif self.union is not None:
378 | CachewUnion = self.union
379 | (uf,) = self.fields
380 | # TODO assert only one of them matches??
381 | union = CachewUnion(**{f.name: obj if isinstance(obj, f.type_) else None for f in uf.fields})
382 | yield from uf._to_row(union)
383 | else:
384 | if self.optional:
385 | is_none = obj is None
386 | yield is_none
387 | else:
388 | is_none = False
389 | assert obj is not None # TODO hmm, that last assert is not very symmetric...
390 |
391 | if is_none:
392 | for _ in range(self.span - 1):
393 | yield None
394 | else:
395 | yield from chain.from_iterable(f._to_row(getattr(obj, f.name)) for f in self.fields)
396 |
397 | def _from_row(self, row_iter):
398 | if self.primitive:
399 | return next(row_iter)
400 | elif self.union is not None:
401 | CachewUnion = self.union # noqa: F841
402 | (uf,) = self.fields
403 | # TODO assert only one of them is not None?
404 | union_params = [r for r in uf._from_row(row_iter) if r is not None]
405 | assert len(union_params) == 1, union_params
406 | return union_params[0]
407 | else:
408 | if self.optional:
409 | is_none = next(row_iter)
410 | else:
411 | is_none = False
412 |
413 | if is_none:
414 | for _ in range(self.span - 1):
415 | x = next(row_iter)
416 | assert x is None, x # huh. assert is kinda opposite of producing value
417 | return None
418 | else:
419 | return self.type_(*(f._from_row(row_iter) for f in self.fields))
420 |
421 | # TODO not sure if we want to allow optionals on top level?
422 | def iter_columns(self) -> Iterator[Column]:
423 | used_names: set[str] = set()
424 |
425 | def col(name: str, tp) -> Column:
426 | while name in used_names:
427 | name = '_' + name
428 | used_names.add(name)
429 | return Column(name, tp)
430 |
431 | if self.primitive:
432 | if self.name is None:
433 | raise AssertionError
434 | yield col(self.name, PRIMITIVES[self.type_])
435 | else:
436 | prefix = '' if self.name is None else self.name + '_'
437 | if self.optional:
438 | yield col(f'_{prefix}is_null', sqlalchemy.Boolean)
439 | for f in self.fields:
440 | for c in f.iter_columns():
441 | yield col(f'{prefix}{c.name}', c.type)
442 |
443 | def __str__(self):
444 |         lines = [' ' * level + str(x.name) + ('?' if x.optional else '') + f' <{x.type_}>' for level, x in self.flatten()]
445 | return '\n'.join(lines)
446 |
447 | def __repr__(self):
448 | return str(self)
449 |
450 | def flatten(self, level=0):
451 | yield (level, self)
452 | for f in self.fields:
453 | yield from f.flatten(level=level + 1)
454 |
455 |
456 | def test_mypy_annotations() -> None:
457 | # mypy won't handle, so this has to be dynamic
458 | vs = []
459 | for t in Types.__args__: # type: ignore[attr-defined]
460 | (arg,) = t.__args__
461 | vs.append(arg)
462 |
463 | def types(ts):
464 | return sorted(ts, key=lambda t: str(t))
465 |
466 | assert types(vs) == types(Values.__args__) # type: ignore[attr-defined]
467 |
468 | for p in PRIMITIVE_TYPES:
469 | assert p in Values.__args__ # type: ignore[attr-defined]
470 |
471 |
472 | @parametrize(
473 | ('tp', 'val'),
474 | [
475 | (int, 22),
476 | (bool, False),
477 | (Optional[str], 'abacaba'),
478 | (Union[str, int], 1),
479 | ],
480 | )
481 | def test_ntbinder_primitive(tp, val) -> None:
482 | b = NTBinder.make(tp, name='x')
483 | row = b.to_row(val)
484 | vv = b.from_row(list(row))
485 | assert vv == val
486 |
487 |
488 | def test_unique_columns(tmp_path: Path) -> None: # noqa: ARG001
489 | class Job(NamedTuple):
490 | company: str
491 | title: Optional[str]
492 |
493 | class Breaky(NamedTuple):
494 | job_title: int
495 | job: Optional[Job]
496 |
497 | assert [c.name for c in NTBinder.make(Breaky).columns] == [
498 | 'job_title',
499 | '_job_is_null',
500 | 'job_company',
501 | '_job_title',
502 | ]
503 |
--------------------------------------------------------------------------------
/src/cachew/logging_helper.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import logging
4 | import os
5 | import warnings
6 | from functools import lru_cache
7 | from typing import Union
8 |
9 |
10 | def test() -> None:
11 | import sys
12 | from typing import Callable
13 |
14 | M: Callable[[str], None] = lambda s: print(s, file=sys.stderr)
15 |
16 | ## prepare exception for later
17 | try:
18 | None.whatever # type: ignore[attr-defined] # noqa: B018
19 | except Exception as e:
20 | ex = e
21 | ##
22 |
23 | M(" Logging module's defaults are not great:")
24 | l = logging.getLogger('default_logger')
25 | l.error("For example, this should be logged as error. But it's not even formatted properly, doesn't have logger name or level")
26 |
27 | M("\n The reason is that you need to remember to call basicConfig() first. Let's do it now:")
28 | logging.basicConfig()
29 | l.error("OK, this is better. But the default format kinda sucks, I prefer having timestamps and the file/line number")
30 |
31 | M("\n Also exception logging is kinda lame, doesn't print traceback by default unless you remember to pass exc_info:")
32 | l.exception(ex) # type: ignore[possibly-undefined] # pylint: disable=used-before-assignment
33 |
34 | M("\n\n With make_logger you get a reasonable logging format, colours (via colorlog library) and other neat things:")
35 |
36 | ll = make_logger('test') # No need for basicConfig!
37 | ll.info("default level is INFO")
38 | ll.debug("... so this shouldn't be displayed")
39 | ll.warning("warnings are easy to spot!")
40 |
41 | M("\n Exceptions print traceback by default now:")
42 | ll.exception(ex)
43 |
44 | M("\n You can (and should) use it via regular logging.getLogger after that, e.g. let's set logging level to DEBUG now")
45 | logging.getLogger('test').setLevel(logging.DEBUG)
46 | ll.debug("... now debug messages are also displayed")
47 |
48 |
49 | DEFAULT_LEVEL = 'INFO'
50 | FORMAT = '{start}[%(levelname)-7s %(asctime)s %(name)s %(filename)s:%(lineno)-4d]{end} %(message)s'
51 | FORMAT_NOCOLOR = FORMAT.format(start='', end='')
52 |
53 |
54 | Level = int
55 | LevelIsh = Union[Level, str, None]
56 |
57 |
58 | def mklevel(level: LevelIsh) -> Level:
59 | if level is None:
60 | return logging.NOTSET
61 | if isinstance(level, int):
62 | return level
63 | return getattr(logging, level.upper())
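# e.g. mklevel('debug') == logging.DEBUG, mklevel(10) == 10, mklevel(None) == logging.NOTSET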
64 |
65 |
66 | def get_collapse_level() -> Level | None:
67 | # TODO not sure if should be specific to logger name?
68 | cl = os.environ.get('LOGGING_COLLAPSE', None)
69 | if cl is not None:
70 | return mklevel(cl)
71 | # legacy name, maybe deprecate?
72 | cl = os.environ.get('COLLAPSE_DEBUG_LOGS', None)
73 | if cl is not None:
74 | return logging.DEBUG
75 | return None
76 |
77 |
78 | def get_env_level(name: str) -> Level | None:
79 | PREFIX = 'LOGGING_LEVEL_' # e.g. LOGGING_LEVEL_my_hypothesis=debug
80 | # shell doesn't allow using dots in var names without escaping, so also support underscore syntax
81 | lvl = os.environ.get(PREFIX + name, None) or os.environ.get(PREFIX + name.replace('.', '_'), None)
82 | if lvl is not None:
83 | return mklevel(lvl)
84 | return None
85 |
86 |
87 | def setup_logger(logger: str | logging.Logger, *, level: LevelIsh = None) -> None:
88 | """
89 | Wrapper to simplify logging setup.
90 | """
91 | if isinstance(logger, str):
92 | logger = logging.getLogger(logger)
93 |
94 | if level is None:
95 | level = DEFAULT_LEVEL
96 |
97 | # env level always takes precedence
98 | env_level = get_env_level(logger.name)
99 | if env_level is not None:
100 | lvl = env_level
101 | else:
102 | lvl = mklevel(level)
103 |
104 | if logger.level == logging.NOTSET:
105 | # if it's already set, the user requested a different logging level, let's respect that
106 | logger.setLevel(lvl)
107 |
108 | _setup_handlers_and_formatters(name=logger.name)
109 |
110 |
111 | # cached since this should only be done once per logger instance
112 | @lru_cache(None)
113 | def _setup_handlers_and_formatters(name: str) -> None:
114 | logger = logging.getLogger(name)
115 |
116 | logger.addFilter(AddExceptionTraceback())
117 |
118 |     collapse_level = get_collapse_level()
119 |     # plain stream handler by default; collapsing handler if LOGGING_COLLAPSE (or legacy COLLAPSE_DEBUG_LOGS) is set
120 |     ch = logging.StreamHandler() if collapse_level is None else CollapseLogsHandler(maxlevel=collapse_level)
121 |
122 | # default level for handler is NOTSET, which will make it process all messages
123 | # we rely on the logger to actually accept/reject log msgs
124 | logger.addHandler(ch)
125 |
126 | # this attribute is set to True by default, which causes log entries to be passed to root logger (e.g. if you call basicConfig beforehand)
127 |     # even if the log entry is handled by this logger ... not sure what the point of this behaviour is?
128 | logger.propagate = False
129 |
130 | try:
131 | # try colorlog first, so user gets nice colored logs
132 | import colorlog
133 | except ModuleNotFoundError:
134 | warnings.warn("You might want to 'pip install colorlog' for nice colored logs")
135 | formatter = logging.Formatter(FORMAT_NOCOLOR)
136 | else:
137 | # log_color/reset are specific to colorlog
138 | FORMAT_COLOR = FORMAT.format(start='%(log_color)s', end='%(reset)s')
139 | # colorlog should detect tty in principle, but doesn't handle everything for some reason
140 | # see https://github.com/borntyping/python-colorlog/issues/71
141 | if ch.stream.isatty():
142 | formatter = colorlog.ColoredFormatter(FORMAT_COLOR)
143 | else:
144 | formatter = logging.Formatter(FORMAT_NOCOLOR)
145 |
146 | ch.setFormatter(formatter)
147 |
148 |
149 | # by default, logging.exception doesn't log the traceback unless called inside an exception handler
150 | # which is a bit annoying since we have to pass exc_info explicitly
151 | # also see https://stackoverflow.com/questions/75121925/why-doesnt-python-logging-exception-method-log-traceback-by-default
152 | # todo also amend by post about defensive error handling?
153 | class AddExceptionTraceback(logging.Filter):
154 | def filter(self, record: logging.LogRecord) -> bool:
155 | if record.levelname == 'ERROR':
156 | exc = record.msg
157 | if isinstance(exc, BaseException):
158 | if record.exc_info is None or record.exc_info == (None, None, None):
159 | exc_info = (type(exc), exc, exc.__traceback__)
160 | record.exc_info = exc_info
161 | return True
162 |
163 |
164 | # todo also save full log in a file?
165 | class CollapseLogsHandler(logging.StreamHandler):
166 | '''
167 | Collapses subsequent debug log lines and redraws on the same line.
168 | Hopefully this gives both a sense of progress and doesn't clutter the terminal as much?
169 | '''
170 |
171 | last: bool = False
172 |
173 |     maxlevel: Level = logging.DEBUG  # everything at this level or below will be collapsed
174 |
175 | def __init__(self, *args, maxlevel: Level, **kwargs) -> None:
176 | super().__init__(*args, **kwargs)
177 | self.maxlevel = maxlevel
178 |
179 | def emit(self, record: logging.LogRecord) -> None:
180 | try:
181 | msg = self.format(record)
182 | cur = record.levelno <= self.maxlevel and '\n' not in msg
183 | if cur:
184 | if self.last:
185 |                     self.stream.write('\033[K' + '\r')  # clear line + carriage return
186 | else:
187 | if self.last:
188 | self.stream.write('\n') # clean up after the last line
189 | self.last = cur
190 | columns, _ = os.get_terminal_size(0)
191 | # ugh. the columns thing is meh. dunno I guess ultimately need curses for that
192 | # TODO also would be cool to have a terminal post-processor? kinda like tail but aware of logging keywords (INFO/DEBUG/etc)
193 | self.stream.write(msg + ' ' * max(0, columns - len(msg)) + ('' if cur else '\n'))
194 | self.flush()
195 |         except Exception:  # logging handlers shouldn't propagate errors; delegate to handleError
196 | self.handleError(record)
197 |
198 |
199 | def make_logger(name: str, *, level: LevelIsh = None) -> logging.Logger:
200 | logger = logging.getLogger(name)
201 | setup_logger(logger, level=level)
202 | return logger
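# minimal usage sketch ('my.module' is just an illustrative name):
#   logger = make_logger('my.module')
#   logger.info('nicely formatted, coloured if colorlog is installed')
#   # the level can also be overridden externally, e.g. LOGGING_LEVEL_my_module=DEBUG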
203 |
204 |
205 | # ughh. hacky way to have a single enlighten instance per interpreter, so it can be shared between modules
206 | # not sure about this. I guess this should definitely be behind some flag
207 | # OK, when stdout is not a tty, enlighten doesn't log anything, good
208 | def get_enlighten():
209 | # TODO could add env variable to disable enlighten for a module?
210 | from unittest.mock import (
211 |         Mock,  # Mock returns a stub so clients don't have to think about it
212 | )
213 |
214 | # for now hidden behind the flag since it's a little experimental
215 | if os.environ.get('ENLIGHTEN_ENABLE', None) is None:
216 | return Mock()
217 |
218 | try:
219 | import enlighten # type: ignore[import-untyped]
220 | except ModuleNotFoundError:
221 | warnings.warn("You might want to 'pip install enlighten' for a nice progress bar")
222 |
223 | return Mock()
224 |
225 | # dirty, but otherwise a bit unclear how to share enlighten manager between packages that call each other
226 | instance = getattr(enlighten, 'INSTANCE', None)
227 | if instance is not None:
228 | return instance
229 | instance = enlighten.get_manager()
230 | setattr(enlighten, 'INSTANCE', instance)
231 | return instance
232 |
233 |
234 | if __name__ == '__main__':
235 | test()
236 |
237 |
238 | ## legacy/deprecated methods for backwards compatibility
239 | LazyLogger = make_logger
240 | logger = make_logger
241 | ##
242 |
--------------------------------------------------------------------------------
/src/cachew/marshall/cachew.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import sys
4 | import types
5 | from abc import abstractmethod
6 | from collections import abc
7 | from collections.abc import Sequence
8 | from dataclasses import dataclass, is_dataclass
9 | from datetime import date, datetime, timezone
10 | from numbers import Real
11 | from typing import (
12 | Any,
13 | NamedTuple,
14 | Optional,
15 | Union,
16 | get_args,
17 | get_origin,
18 | get_type_hints,
19 | )
20 | from zoneinfo import ZoneInfo
21 |
22 | from ..utils import TypeNotSupported, is_namedtuple
23 | from .common import (
24 | AbstractMarshall,
25 | Json,
26 | T,
27 | )
28 |
29 |
30 | class CachewMarshall(AbstractMarshall[T]):
31 | def __init__(self, Type_: type[T]) -> None:
32 | self.schema = build_schema(Type_)
33 |
34 | def dump(self, obj: T) -> Json:
35 | return self.schema.dump(obj)
36 |
37 | def load(self, dct: Json) -> T:
38 | return self.schema.load(dct)
39 |
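# minimal usage sketch (Point is an illustrative dataclass, not part of this module):
#   @dataclass
#   class Point:
#       x: int
#       y: int
#
#   marshall = CachewMarshall(Point)
#   as_json = marshall.dump(Point(x=1, y=2))  # -> {'x': 1, 'y': 2}, ready for orjson.dumps
#   marshall.load(as_json)                    # -> Point(x=1, y=2)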
40 |
41 | # TODO add generic types later?
42 |
43 |
44 | # NOTE: using slots gives a small speedup (maybe 5%?)
45 | # presumably due to faster attribute access
46 |
47 | SLOTS: dict[str, bool]
48 | if sys.version_info[:2] >= (3, 10):
49 | SLOTS = {'slots': True}
50 | else:
51 | # not available :(
52 | SLOTS = {}
53 |
54 |
55 | @dataclass(**SLOTS)
56 | class Schema:
57 | type: Any
58 |
59 | @abstractmethod
60 | def dump(self, obj):
61 | raise NotImplementedError
62 |
63 | @abstractmethod
64 | def load(self, dct):
65 | raise NotImplementedError
66 |
67 |
68 | @dataclass(**SLOTS)
69 | class SPrimitive(Schema):
70 | def dump(self, obj):
71 | # NOTE: returning here directly (instead of calling identity lambda) gives about 20% speedup
72 | # I think custom types should have their own Schema subclass
73 | return obj
74 | # prim = primitives_to.get(self.type)
75 | # assert prim is not None
76 | # return prim(o)
77 |
78 | def load(self, dct):
79 | return dct
80 | # prim = primitives_from.get(self.type)
81 | # assert prim is not None
82 | # return prim(d)
83 |
84 |
85 | @dataclass(**SLOTS)
86 | class SDataclass(Schema):
87 | # using list of tuples instead of dict gives about 5% speedup
88 | fields: tuple[tuple[str, Schema], ...]
89 |
90 | def dump(self, obj):
91 | # TODO would be nice if we didn't create a dictionary here
92 | # considering it is going to be serialized to json anyway
93 | # maybe we need to yield json bits actually?
94 | return {
95 | # would be kinda nice if we didn't have to use getattr here
96 | # but I think for dataclass this is actually the fastest way
97 | # TODO for NamedTuples could just use them as tuples.. think about separating
98 | k: ks.dump(getattr(obj, k))
99 | for k, ks in self.fields
100 | }
101 |
102 | def load(self, dct):
103 | # dict comprehension is meh, but not sure if there is a faster way?
104 | # fmt: off
105 | return self.type(**{
106 | k: ks.load(dct[k])
107 | for k, ks in self.fields
108 | })
109 | # fmt: on
110 |
111 |
112 | @dataclass(**SLOTS)
113 | class SUnion(Schema):
114 | # it's a bit faster to cache indices here, gives about 15% speedup
115 | args: tuple[tuple[int, Schema], ...]
116 |
117 | def dump(self, obj):
118 | if obj is None:
119 |             # if it's None, it doesn't matter which union member it's tagged as -- load() special-cases it
120 | return (0, None)
121 |
122 | # TODO could do a bit of magic here and remember the last index that worked?
123 | # that way if some objects dominate the Union, the first isinstance would always work
124 | for tidx, a in self.args:
125 | if isinstance(obj, a.type): # this takes quite a lot of time (sort of expected?)
126 | # using lists instead of dicts gives a bit of a speedup (about 15%)
127 | # so probably worth it even though a bit cryptic
128 | # also could add a tag or something?
129 | # NOTE: using tuple instead of list gives a tiiny speedup
130 | jj = a.dump(obj)
131 | return (tidx, jj)
132 | # {
133 | # '__union_index__': tidx,
134 | # '__value__': jj,
135 | # }
136 | raise RuntimeError(f"shouldn't happen: {self.args} {obj}")
137 |
138 | def load(self, dct):
139 | # tidx = d['__union_index__']
140 | # s = self.args[tidx]
141 | # return s.load(d['__value__'])
142 | tidx, val = dct
143 | if val is None:
144 | # counterpart for None handling in .dump method
145 | return None
146 |
147 | _, s = self.args[tidx]
148 | return s.load(val)
149 |
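# e.g. for Union[str, int]: dump('abc') -> (0, 'abc'), dump(123) -> (1, 123), dump(None) -> (0, None)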
150 |
151 | @dataclass(**SLOTS)
152 | class SList(Schema):
153 | arg: Schema
154 |
155 | def dump(self, obj):
156 | return tuple(self.arg.dump(i) for i in obj)
157 |
158 | def load(self, dct):
159 | return [self.arg.load(i) for i in dct]
160 |
161 |
162 | @dataclass(**SLOTS)
163 | class STuple(Schema):
164 | args: tuple[Schema, ...]
165 |
166 | def dump(self, obj):
167 | return tuple(a.dump(i) for a, i in zip(self.args, obj))
168 |
169 | def load(self, dct):
170 | return tuple(a.load(i) for a, i in zip(self.args, dct))
171 |
172 |
173 | @dataclass(**SLOTS)
174 | class SSequence(Schema):
175 | arg: Schema
176 |
177 | def dump(self, obj):
178 | return tuple(self.arg.dump(i) for i in obj)
179 |
180 | def load(self, dct):
181 | return tuple(self.arg.load(i) for i in dct)
182 |
183 |
184 | @dataclass(**SLOTS)
185 | class SDict(Schema):
186 | ft: SPrimitive
187 | tt: Schema
188 |
189 | def dump(self, obj):
190 | # fmt: off
191 | return {
192 | k: self.tt.dump(v)
193 | for k, v in obj.items()
194 | }
195 | # fmt: on
196 |
197 | def load(self, dct):
198 | # fmt: off
199 | return {
200 | k: self.tt.load(v)
201 | for k, v in dct.items()
202 | }
203 | # fmt: on
204 |
205 |
206 | # TODO unify with primitives?
207 | JTypes = {int, str, type(None), float, bool}
208 |
209 |
210 | def _exc_helper(args):
211 | for a in args:
212 | at = type(a)
213 | if at in JTypes:
214 | yield a
215 | elif issubclass(at, date):
216 | # TODO would be nice to restore datetime from cache too
217 | # maybe generally save exception as a union? or intact and let orjson save it?
218 | yield a.isoformat()
219 | else:
220 | yield str(a) # not much we can do..
221 |
222 |
223 | @dataclass(**SLOTS)
224 | class SException(Schema):
225 | def dump(self, obj: Exception) -> Json:
226 | return tuple(_exc_helper(obj.args))
227 |
228 | def load(self, dct: Json):
229 | return self.type(*dct)
230 |
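# e.g. RuntimeError('boom', 123) round-trips as RuntimeError('boom', 123);
# args that aren't json-friendly are stringified on dump, so they come back as strings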
231 |
232 | try:
233 |     # defensive import: pytz is only needed to deserialize pytz timezones, so don't require it
234 | import pytz
235 | except ModuleNotFoundError:
236 | # dummy, this is only needed for isinstance check below
237 | class pytz_BaseTzInfo:
238 | zone: str
239 |
240 | def make_tz_pytz(zone: str):
241 | raise RuntimeError(f"Install pytz to deserialize {zone}")
242 |
243 | else:
244 | pytz_BaseTzInfo = pytz.BaseTzInfo # type: ignore[misc,assignment]
245 |
246 | make_tz_pytz = pytz.timezone
247 |
248 |
249 | # just ints to avoid inflating db size
250 | # for now, we try to preserve actual timezone object just in case since they do have somewhat incompatible apis
251 | _TZTAG_ZONEINFO = 1
252 | _TZTAG_PYTZ = 2
253 |
254 |
255 | @dataclass(**SLOTS)
256 | class SDatetime(Schema):
257 | def dump(self, obj: datetime) -> Json:
258 | iso = obj.isoformat()
259 | tz = obj.tzinfo
260 | if tz is None:
261 | return (iso, None, None)
262 |
263 | if isinstance(tz, ZoneInfo):
264 | return (iso, tz.key, _TZTAG_ZONEINFO)
265 | elif isinstance(tz, pytz_BaseTzInfo):
266 | zone = tz.zone
267 | # should be present: https://github.com/python/typeshed/blame/968fd6d01d23470e0c8368e7ee7c43f54aaedc0e/stubs/pytz/pytz/tzinfo.pyi#L6
268 | assert zone is not None, (obj, tz)
269 | return (iso, zone, _TZTAG_PYTZ)
270 | else:
271 | return (iso, None, None)
272 |
273 | def load(self, dct: tuple):
274 | iso, zone, zone_tag = dct
275 | dt = datetime.fromisoformat(iso)
276 | if zone is None:
277 | return dt
278 |
279 | make_tz = ZoneInfo if zone_tag == _TZTAG_ZONEINFO else make_tz_pytz
280 | tz = make_tz(zone)
281 | return dt.astimezone(tz)
282 |
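# e.g. datetime(2020, 8, 3, 1, 2, 3, tzinfo=ZoneInfo('Europe/London'))
# dumps as ('2020-08-03T01:02:03+01:00', 'Europe/London', _TZTAG_ZONEINFO)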
283 |
284 | @dataclass(**SLOTS)
285 | class SDate(Schema):
286 | def dump(self, obj: date) -> Json:
287 | return obj.isoformat()
288 |
289 | def load(self, dct: str):
290 | return date.fromisoformat(dct)
291 |
292 |
293 | PRIMITIVES = {
294 | # int and float are handled a bit differently to allow implicit casts
295 | # isinstance(.., Real) works both for int and for float
296 | # Real can't be serialized back, but if you look in SPrimitive, it leaves the values intact anyway
297 | # since the actual serialization of primitives is handled by orjson
298 | int: Real,
299 | float: Real,
300 | str: str,
301 | type(None): type(None),
302 | bool: bool,
303 |     # if type is Any, there isn't much we can do to dump it -- just dump it into json and hope for the best
304 |     # so in this sense it works exactly like primitives
305 | Any: Any,
306 | }
307 |
308 |
309 | def build_schema(Type) -> Schema:
310 | assert not isinstance(Type, str), Type # just to avoid confusion in case of weirdness with stringish type annotations
311 |
312 | ptype = PRIMITIVES.get(Type)
313 | if ptype is not None:
314 | return SPrimitive(type=ptype)
315 |
316 | origin = get_origin(Type)
317 |
318 | # if origin not none, it's some sort of generic type?
319 | if origin is None:
320 | if issubclass(Type, Exception):
321 | return SException(type=Type)
322 |
323 | if issubclass(Type, datetime):
324 | return SDatetime(type=Type)
325 |
326 | if issubclass(Type, date):
327 | return SDate(type=Type)
328 |
329 | if not (is_dataclass(Type) or is_namedtuple(Type)):
330 | raise TypeNotSupported(type_=Type)
331 | try:
332 | hints = get_type_hints(Type)
333 | except TypeError as te:
334 | # this can happen for instance on 3.9 if pipe syntax was used for Union types
335 | # would be nice to provide a friendlier error though
336 | raise TypeNotSupported(type_=Type) from te
337 | fields = tuple((k, build_schema(t)) for k, t in hints.items())
338 | return SDataclass(
339 | type=Type,
340 | fields=fields,
341 | )
342 |
343 | args = get_args(Type)
344 |
345 | if sys.version_info[:2] >= (3, 10):
346 | is_uniontype = origin is types.UnionType
347 | else:
348 | is_uniontype = False
349 |
350 | is_union = origin is Union or is_uniontype
351 |
352 | if is_union:
353 | # fmt: off
354 | return SUnion(
355 | type=Type,
356 | args=tuple(
357 | (tidx, build_schema(a))
358 | for tidx, a in enumerate(args)
359 | ),
360 | )
361 | # fmt: on
362 |
363 | is_listish = origin is list
364 | if is_listish:
365 | (t,) = args
366 | return SList(
367 | type=Type,
368 | arg=build_schema(t),
369 | )
370 |
371 |     # hmm, checking origin against typing.Sequence doesn't pass for some reason
372 | # perhaps because it's a deprecated alias?
373 | is_tuplish = origin is tuple or origin is abc.Sequence
374 | if is_tuplish:
375 | if origin is tuple:
376 | # this is for Tuple[()], which is the way to represent empty tuple
377 | # before python 3.11, get_args for that gives ((),) instead of an empty tuple () as one might expect
378 | if args == ((),):
379 | args = ()
380 | return STuple(
381 | type=Type,
382 | args=tuple(build_schema(a) for a in args),
383 | )
384 | else:
385 | (t,) = args
386 | return SSequence(
387 | type=Type,
388 | arg=build_schema(t),
389 | )
390 |
391 | is_dictish = origin is dict
392 | if is_dictish:
393 | (ft, tt) = args
394 | fts = build_schema(ft)
395 | tts = build_schema(tt)
396 | assert isinstance(fts, SPrimitive)
397 | return SDict(
398 | type=Type,
399 | ft=fts,
400 | tt=tts,
401 | )
402 |
403 | raise RuntimeError(f"unsupported: {Type} {origin} {args}")
404 |
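# e.g. build_schema(int) -> SPrimitive(type=Real)
#      build_schema(list[int]) -> SList(type=list[int], arg=SPrimitive(type=Real))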
405 |
406 | ######### tests
407 |
408 |
409 | def _test_identity(obj, Type_, expected=None):
410 | if expected is None:
411 | expected = obj
412 |
413 | m = CachewMarshall(Type_)
414 |
415 | j = m.dump(obj)
416 | obj2 = m.load(j)
417 |
418 |     # Exceptions don't support equality by default, so we need to do some hacks..
419 | def normalise(x):
420 | if isinstance(x, Exception):
421 | return (type(x), x.args)
422 | if type(x) is list: # noqa: E721
423 | return [(type(i), i.args) if isinstance(i, Exception) else i for i in x]
424 | return x
425 |
426 | # ugh that doesn't work
427 | # def exc_eq(s, other):
428 | # return (type(s), s.args) == (type(other), other.args)
429 | # Exception.__eq__ = exc_eq
430 |
431 | assert normalise(expected) == normalise(obj2), (expected, obj2)
432 | return (j, obj2)
433 |
434 |
435 | # TODO customise with cattrs
436 | def test_serialize_and_deserialize() -> None:
437 | import pytest
438 |
439 | helper = _test_identity
440 |
441 | # primitives
442 | helper(1, int)
443 | helper('aaa', str)
444 | helper(None, type(None))
445 | # TODO emit other value as none type? not sure what should happen
446 |
447 | # implicit casts, simple version
448 | helper(None, int)
449 | helper(None, str)
450 | helper(1, float)
451 |
452 | # unions
453 | helper(1, Union[str, int])
454 | if sys.version_info[:2] >= (3, 10):
455 | helper('aaa', str | int)
456 |
457 | # implicit casts, inside other types
458 | # technically not type safe, but might happen in practice
459 | # doesn't matter how to deserialize None anyway so let's allow this
460 | helper(None, Union[str, int])
461 |
462 | # even though 1 is not isinstance(float), often it ends up as float in data
463 | # see https://github.com/karlicoss/cachew/issues/54
464 | helper(1, Union[float, str])
465 | helper(2, Union[float, int])
466 | helper(2.0, Union[float, int])
467 | helper((1, 2), tuple[int, float])
468 |
469 | # optionals
470 | helper('aaa', Optional[str])
471 | helper('aaa', Union[str, None])
472 | helper(None, Union[str, None])
473 | if sys.version_info[:2] >= (3, 10):
474 | helper('aaa', str | None)
475 |
476 | # lists/tuples/sequences
477 | helper([1, 2, 3], list[int])
478 | helper([1, 2, 3], Sequence[int], expected=(1, 2, 3))
479 | helper((1, 2, 3), Sequence[int])
480 | helper((1, 2, 3), tuple[int, int, int])
481 | # TODO test with from __future__ import annotations..
482 | helper([1, 2, 3], list[int])
483 | helper((1, 2, 3), tuple[int, int, int])
484 |
485 | # dicts
486 | helper({'a': 'aa', 'b': 'bb'}, dict[str, str])
487 | helper({'a': None, 'b': 'bb'}, dict[str, Optional[str]])
488 | helper({'a': 'aa', 'b': 'bb'}, dict[str, str])
489 |
490 | # compounds of simple types
491 | helper(['1', 2, '3'], list[Union[str, int]])
492 |
493 | # TODO need to add test for equivalent dataclasses
494 |
495 | @dataclass
496 | class Point:
497 | x: int
498 | y: int
499 |
500 | # dataclasses
501 | helper(Point(x=1, y=2), Point)
502 |
503 | # Namedtuple
504 | class NT(NamedTuple):
505 | first: str
506 | last: str
507 |
508 | helper(NT(first='aaa', last='bbb'), NT)
509 |
510 | @dataclass
511 | class WithJson:
512 | id: int
513 | raw_data: dict[str, Any]
514 |
515 | # json-ish stuff
516 | helper({}, dict[str, Any])
517 | helper(WithJson(id=123, raw_data={'payload': 'whatever', 'tags': ['a', 'b', 'c']}), WithJson)
518 | helper([], list[Any])
519 |
520 | # exceptions
521 | helper(RuntimeError('whatever!'), RuntimeError)
522 | # fmt: off
523 | helper([
524 | RuntimeError('I', 'am', 'exception', 123),
525 | Point(x=1, y=2),
526 | Point(x=11, y=22),
527 | RuntimeError('more stuff'),
528 | RuntimeError(),
529 | ], list[Union[RuntimeError, Point]])
530 |
531 |     exc_with_datetime     = Exception('I happened on', datetime.fromisoformat('2021-04-03T10:11:12'))
532 |     exc_with_datetime_exp = Exception('I happened on', '2021-04-03T10:11:12')
533 | helper(exc_with_datetime, Exception, expected=exc_with_datetime_exp)
534 | # fmt: on
535 |
536 | # datetimes
537 | import pytz
538 |
539 | tz_london = pytz.timezone('Europe/London')
540 | dwinter = datetime.strptime('20200203 01:02:03', '%Y%m%d %H:%M:%S')
541 | dsummer = datetime.strptime('20200803 01:02:03', '%Y%m%d %H:%M:%S')
542 | dwinter_tz = tz_london.localize(dwinter)
543 | dsummer_tz = tz_london.localize(dsummer)
544 |
545 | dates_tz = [
546 | dwinter_tz,
547 | dsummer_tz,
548 | ]
549 |
550 | tz_sydney = ZoneInfo('Australia/Sydney')
551 |     ## these will have the same local time (2025-04-06 02:01:00) in Sydney due to the DST shift!
552 | ## the second one will have fold=1 set to disambiguate
553 | utc_before_shift = datetime.fromisoformat('2025-04-05T15:01:00+00:00')
554 | utc_after__shift = datetime.fromisoformat('2025-04-05T16:01:00+00:00')
555 | ##
556 | sydney_before = utc_before_shift.astimezone(tz_sydney)
557 | sydney__after = utc_after__shift.astimezone(tz_sydney)
558 |
559 | dates_tz.extend([sydney_before, sydney__after])
560 |
561 | dates = [
562 | *dates_tz,
563 | dwinter,
564 | dsummer,
565 | dsummer.replace(tzinfo=timezone.utc),
566 | ]
567 | for d in dates:
568 | jj, dd = helper(d, datetime)
569 | assert str(d) == str(dd)
570 |
571 | # test that we preserve zone names
572 | if d in dates_tz:
573 | # this works both with pytz and zoneinfo without getting .zone or .key attributes
574 | assert str(d.tzinfo) == str(dd.tzinfo)
575 |
576 | assert helper(dsummer_tz, datetime)[0] == ('2020-08-03T01:02:03+01:00', 'Europe/London', _TZTAG_PYTZ)
577 | assert helper(dwinter, datetime)[0] == ('2020-02-03T01:02:03', None, None)
578 |
579 | assert helper(sydney_before, datetime)[0] == ('2025-04-06T02:01:00+11:00', 'Australia/Sydney', _TZTAG_ZONEINFO)
580 | assert helper(sydney__after, datetime)[0] == ('2025-04-06T02:01:00+10:00', 'Australia/Sydney', _TZTAG_ZONEINFO)
581 |
582 | assert helper(dwinter.date(), date)[0] == '2020-02-03'
583 |
584 | # unsupported types
585 | class NotSupported:
586 | pass
587 |
588 | with pytest.raises(RuntimeError, match=".*NotSupported.* isn't supported by cachew"):
589 | helper([NotSupported()], list[NotSupported])
590 |
591 | # edge cases
592 | helper((), tuple[()])
593 |
594 |
595 | # TODO test type aliases and such??
596 |
--------------------------------------------------------------------------------
/src/cachew/marshall/common.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 | from typing import (
3 | Any,
4 | Generic,
5 | TypeVar,
6 | Union,
7 | )
8 |
9 | Json = Union[dict[str, Any], tuple[Any, ...], str, float, int, bool, None]
10 |
11 |
12 | T = TypeVar('T')
13 |
14 |
15 | class AbstractMarshall(Generic[T]):
16 | @abstractmethod
17 | def dump(self, obj: T) -> Json:
18 | raise NotImplementedError
19 |
20 | @abstractmethod
21 | def load(self, dct: Json) -> T:
22 | raise NotImplementedError
23 |
--------------------------------------------------------------------------------
/src/cachew/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/karlicoss/cachew/a25017531afbbeccbbd66d77bd6cc1bcf41d184f/src/cachew/py.typed
--------------------------------------------------------------------------------
/src/cachew/pytest.py:
--------------------------------------------------------------------------------
1 | """
2 | Helpers to prevent depending on pytest in runtime
3 | """
4 |
5 | import sys
6 | import typing
7 |
8 | under_pytest = 'pytest' in sys.modules
9 |
10 | if typing.TYPE_CHECKING or under_pytest:
11 | import pytest
12 |
13 | parametrize = pytest.mark.parametrize
14 | else:
15 |
16 | def parametrize(*_args, **_kwargs):
17 | def wrapper(f):
18 | return f
19 |
20 | return wrapper
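# so modules can do `from .pytest import parametrize` unconditionally:
# under pytest it's the real pytest.mark.parametrize, otherwise a no-op decorator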
21 |
--------------------------------------------------------------------------------
/src/cachew/tests/marshall.py:
--------------------------------------------------------------------------------
1 | # ruff: noqa: ARG001 # ruff thinks pytest fixtures are unused arguments
2 | import shutil
3 | import sqlite3
4 | import sys
5 | from dataclasses import dataclass
6 | from datetime import datetime, timezone
7 | from pathlib import Path
8 | from typing import (
9 | Any,
10 | Literal,
11 | Union,
12 | )
13 |
14 | import orjson
15 | import pytest
16 |
17 | from ..marshall.cachew import CachewMarshall
18 | from ..marshall.common import Json
19 | from .utils import (
20 | gc_control, # noqa: F401
21 | profile,
22 | running_on_ci,
23 | timer,
24 | )
25 |
26 | Impl = Literal[
27 | 'cachew', # our custom deserialization
28 | 'cattrs',
29 | 'legacy', # our legacy deserialization
30 | ]
31 | # don't include legacy by default; it's only here for a one-off comparison before the switch
32 | Impls: list[Impl] = ['cachew', 'cattrs']
33 |
34 |
35 | def do_test(*, test_name: str, Type, factory, count: int, impl: Impl = 'cachew') -> None:
36 | if count > 100 and running_on_ci:
37 | pytest.skip("test too heavy for CI, only meant to run manually")
38 |
39 | to_json: Any
40 | from_json: Any
41 | if impl == 'cachew':
42 | marshall = CachewMarshall(Type_=Type)
43 | to_json = marshall.dump
44 | from_json = marshall.load
45 | elif impl == 'legacy':
46 | from ..legacy import NTBinder
47 |
48 | # NOTE: legacy binder emits a tuple which can be inserted directly into the database
49 | # so 'json dump' and 'json load' should really be disregarded for this flavor
50 |         # if you're comparing against this implementation, you should count
51 |         # legacy serializing as the sum of serializing + json dump
52 | # that said, this way legacy will have a bit of an advantage since custom types (e.g. datetime)
53 | # would normally be handled by sqlalchemy instead
54 | binder = NTBinder.make(Type)
55 | to_json = binder.to_row
56 | from_json = binder.from_row
57 | elif impl == 'cattrs':
58 | from cattrs import Converter
59 |
60 | converter = Converter()
61 |
62 | from typing import get_args
63 |
64 | # TODO use later
65 | # from typing import Union, get_origin
66 | # import types
67 | # def is_union(type_) -> bool:
68 | # origin = get_origin(type_)
69 | # return origin is Union or origin is types.UnionType
70 |
71 | def union_structure_hook_factory(_):
72 | def union_hook(data, type_):
73 | args = get_args(type_)
74 |
75 | if data is None: # we don't try to coerce None into anything
76 | return None
77 |
78 | for t in args:
79 | try:
80 | res = converter.structure(data, t)
81 | except Exception:
82 | continue
83 | else:
84 | return res
85 | raise ValueError(f"Could not cast {data} to {type_}")
86 |
87 | return union_hook
88 |
89 | # borrowed from https://github.com/python-attrs/cattrs/issues/423
90 | # uhh, this doesn't really work straightaway...
91 | # likely need to combine what cattr does with configure_tagged_union
92 | # converter.register_structure_hook_factory(is_union, union_structure_hook_factory)
93 | # configure_tagged_union(
94 | # union=Type,
95 | # converter=converter,
96 | # )
97 | # NOTE: this seems to give a bit of speedup... maybe raise an issue or something?
98 | # fmt: off
99 | unstruct_func = converter._unstructure_func.dispatch(Type) # type: ignore[call-arg, misc] # about 20% speedup
100 | struct_func = converter._structure_func .dispatch(Type) # type: ignore[call-arg, misc] # TODO speedup
101 | # fmt: on
102 |
103 | to_json = unstruct_func
104 | # todo would be nice to use partial? but how do we bind a positional arg?
105 | from_json = lambda x: struct_func(x, Type)
106 | else:
107 | raise RuntimeError(impl)
108 |
109 | print(file=sys.stderr) # kinda annoying, pytest starts printing on the same line as test name
110 |
111 | with profile(test_name + ':baseline'), timer(f'building {count} objects of type {Type}'):
112 | objects = list(factory(count=count))
113 |
114 | jsons: list[Json] = [None for _ in range(count)]
115 | with profile(test_name + ':serialize'), timer(f'serializing {count} objects of type {Type}'):
116 | for i in range(count):
117 | jsons[i] = to_json(objects[i])
118 |
119 | strs: list[bytes] = [None for _ in range(count)] # type: ignore[misc]
120 | with profile(test_name + ':json_dump'), timer(f'json dump {count} objects of type {Type}'):
121 | for i in range(count):
122 | # TODO any orjson options to speed up?
123 | strs[i] = orjson.dumps(jsons[i])
124 |
125 | db = Path('/tmp/cachew_test/db.sqlite')
126 | if db.parent.exists():
127 | shutil.rmtree(db.parent)
128 | db.parent.mkdir()
129 |
130 | with profile(test_name + ':sqlite_dump'), timer(f'sqlite dump {count} objects of type {Type}'):
131 | with sqlite3.connect(db) as conn:
132 | conn.execute('CREATE TABLE data (value BLOB)')
133 | conn.executemany('INSERT INTO data (value) VALUES (?)', [(s,) for s in strs])
134 | conn.close()
135 |
136 | strs2: list[bytes] = [None for _ in range(count)] # type: ignore[misc]
137 | with profile(test_name + ':sqlite_load'), timer(f'sqlite load {count} objects of type {Type}'):
138 | with sqlite3.connect(db) as conn:
139 | i = 0
140 | for (value,) in conn.execute('SELECT value FROM data'):
141 | strs2[i] = value
142 | i += 1
143 | conn.close()
144 |
145 | cache = db.parent / 'cache.jsonl'
146 |
147 | with profile(test_name + ':jsonl_dump'), timer(f'jsonl dump {count} objects of type {Type}'):
148 | with cache.open('wb') as fw:
149 | for s in strs:
150 | fw.write(s + b'\n')
151 |
152 | strs3: list[bytes] = [None for _ in range(count)] # type: ignore[misc]
153 | with profile(test_name + ':jsonl_load'), timer(f'jsonl load {count} objects of type {Type}'):
154 | i = 0
155 | with cache.open('rb') as fr:
156 | for l in fr:
157 | l = l.rstrip(b'\n')
158 | strs3[i] = l
159 | i += 1
160 |
161 | assert strs2[:100] + strs2[-100:] == strs3[:100] + strs3[-100:] # just in case
162 |
163 | jsons2: list[Json] = [None for _ in range(count)]
164 | with profile(test_name + ':json_load'), timer(f'json load {count} objects of type {Type}'):
165 | for i in range(count):
166 | # TODO any orjson options to speed up?
167 | jsons2[i] = orjson.loads(strs2[i])
168 |
169 | objects2 = [None for _ in range(count)]
170 | with profile(test_name + ':deserialize'), timer(f'deserializing {count} objects of type {Type}'):
171 | for i in range(count):
172 | objects2[i] = from_json(jsons2[i])
173 |
174 | assert objects[:100] + objects[-100:] == objects2[:100] + objects2[-100:]
175 |
176 |
177 | @dataclass
178 | class Name:
179 | first: str
180 | last: str
181 |
182 |
183 | @pytest.mark.parametrize('impl', Impls)
184 | @pytest.mark.parametrize('count', [99, 1_000_000, 5_000_000])
185 | @pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
186 | def test_union_str_dataclass(impl: Impl, count: int, gc_control, request) -> None:
187 | # NOTE: previously was union_str_namedtuple, but adapted to work with cattrs for now
188 |     # perf difference between dataclass/namedtuple here seems negligible so old benchmark results should apply
189 |
190 | if impl == 'cattrs':
191 | pytest.skip('TODO need to adjust the handling of Union types..')
192 |
193 | def factory(count: int):
194 | objects: list[Union[str, Name]] = []
195 | for i in range(count):
196 | if i % 2 == 0:
197 | objects.append(str(i))
198 | else:
199 | objects.append(Name(first=f'first {i}', last=f'last {i}'))
200 | return objects
201 |
202 | do_test(test_name=request.node.name, Type=Union[str, Name], factory=factory, count=count, impl=impl)
203 |
204 |
205 | # OK, performance with calling this manually (not via pytest) is the same
206 | # do_test_union_str_dataclass(count=1_000_000, test_name='adhoc')
207 |
208 |
209 | @pytest.mark.parametrize('impl', Impls)
210 | @pytest.mark.parametrize('count', [99, 1_000_000, 5_000_000])
211 | @pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
212 | def test_datetimes(impl: Impl, count: int, gc_control, request) -> None:
213 | if impl == 'cattrs':
214 | pytest.skip('TODO support datetime with pytz for cattrs')
215 |
216 | import pytz
217 |
218 | def factory(*, count: int):
219 | tzs = [
220 | pytz.timezone('Europe/Berlin'),
221 | timezone.utc,
222 | pytz.timezone('America/New_York'),
223 | ]
224 | start = datetime.fromisoformat('1990-01-01T00:00:00')
225 | end = datetime.fromisoformat('2030-01-01T00:00:00')
226 | step = (end - start) / count
227 | for i in range(count):
228 | dt = start + step * i
229 | tz = tzs[i % len(tzs)]
230 | yield dt.replace(tzinfo=tz)
231 |
232 | do_test(test_name=request.node.name, Type=datetime, factory=factory, count=count, impl=impl)
233 |
234 |
235 | @pytest.mark.parametrize('impl', Impls)
236 | @pytest.mark.parametrize('count', [99, 1_000_000])
237 | @pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
238 | def test_nested_dataclass(impl: Impl, count: int, gc_control, request) -> None:
239 | # NOTE: was previously named test_many_from_cachew
240 | @dataclass
241 | class UUU:
242 | xx: int
243 | yy: int
244 |
245 | @dataclass
246 | class TE2:
247 | value: int
248 | uuu: UUU
249 | value2: int
250 |
251 | def factory(*, count: int):
252 | for i in range(count):
253 | yield TE2(value=i, uuu=UUU(xx=i, yy=i), value2=i)
254 |
255 | do_test(test_name=request.node.name, Type=TE2, factory=factory, count=count, impl=impl)
256 |
257 |
258 | # TODO next test should probs be runtimeerror?
259 |
--------------------------------------------------------------------------------
/src/cachew/tests/test_future_annotations.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 | import sys
5 | import textwrap
6 | from collections.abc import Iterator
7 | from dataclasses import dataclass
8 | from pathlib import Path
9 | from subprocess import check_output
10 | from typing import Any
11 |
12 | import pytest
13 | from more_itertools import one
14 |
15 | from .. import cachew
16 |
17 |
18 | # fmt: off
19 | @dataclass
20 | class NewStyleTypes1:
21 | a_str : str
22 | a_dict : dict[str, Any]
23 | a_list : list[Any]
24 | a_tuple : tuple[float, str]
25 | # fmt: on
26 |
27 |
28 | def test_types1(tmp_path: Path) -> None:
29 | # fmt: off
30 | obj = NewStyleTypes1(
31 | a_str = 'abac',
32 | a_dict = {'a': True, 'x': {'whatever': 3.14}},
33 | a_list = ['aba', 123, None],
34 | a_tuple = (1.23, '3.2.1'),
35 | )
36 | # fmt: on
37 |
38 | @cachew(tmp_path)
39 | def get() -> Iterator[NewStyleTypes1]:
40 | yield obj
41 |
42 | assert one(get()) == obj
43 | assert one(get()) == obj
44 |
45 |
46 | # fmt: off
47 | @dataclass
48 | class NewStyleTypes2:
49 | an_opt : str | None
50 | a_union : str | int
51 | # fmt: on
52 |
53 |
54 | def test_types2(tmp_path: Path) -> None:
55 | if sys.version_info[:2] <= (3, 9):
56 | pytest.skip("can only use new style union types from 3.10")
57 |
58 | # fmt: off
59 | obj = NewStyleTypes2(
60 | an_opt = 'hello',
61 | a_union = 999,
62 | )
63 | # fmt: on
64 |
65 | @cachew(tmp_path)
66 | def get() -> Iterator[NewStyleTypes2]:
67 | yield obj
68 |
69 | assert one(get()) == obj
70 | assert one(get()) == obj
71 |
72 |
73 | @pytest.mark.parametrize('use_future_annotations', [False, True])
74 | @pytest.mark.parametrize('local', [False, True])
75 | @pytest.mark.parametrize('throw', [False, True])
76 | def test_future_annotations(
77 | *,
78 | use_future_annotations: bool,
79 | local: bool,
80 | throw: bool,
81 | tmp_path: Path,
82 | ) -> None:
83 | """
84 | Checks handling of postponed evaluation of annotations (from __future__ import annotations)
85 | """
86 |
87 |     # NOTE: to avoid weird interactions with the existing interpreter in which pytest is running,
88 |     # we compose a program and run it in python directly instead
89 | # (also not sure if it's even possible to tweak postponed annotations without doing that)
90 |
91 | if use_future_annotations and local and throw:
92 |         # when the annotation is local (like an inner class), it ends up as a string
93 |         # which we can't eval, since we don't have access to a class defined inside a function
94 | # keeping this test just to keep track of whether this is fixed at some point
95 | # possibly relevant:
96 | # - https://peps.python.org/pep-0563/#keeping-the-ability-to-use-function-local-state-when-defining-annotations
97 | pytest.skip("local aliases/classses don't work with from __future__ import annotations")
98 |
99 | _PREAMBLE = f'''
100 | from pathlib import Path
101 | import tempfile
102 |
103 | from cachew import cachew, settings
104 | settings.THROW_ON_ERROR = {throw}
105 |
106 | temp_dir = tempfile.TemporaryDirectory()
107 | td = Path(temp_dir.name)
108 |
109 | '''
110 |
111 | _TEST = '''
112 | T = int
113 |
114 | @cachew(td)
115 | def fun() -> list[T]:
116 | print("called")
117 | return [1, 2]
118 |
119 | assert list(fun()) == [1, 2]
120 | assert list(fun()) == [1, 2]
121 | '''
122 |
123 | if use_future_annotations:
124 | code = '''
125 | from __future__ import annotations
126 | '''
127 | else:
128 | code = ''
129 |
130 | code += _PREAMBLE
131 |
132 | if local:
133 | code += f'''
134 | def test() -> None:
135 | {textwrap.indent(_TEST, prefix=" ")}
136 |
137 | test()
138 | '''
139 | else:
140 | code += _TEST
141 |
142 | run_py = tmp_path / 'run.py'
143 | run_py.write_text(code)
144 |
145 | cache_dir = tmp_path / 'cache'
146 | cache_dir.mkdir()
147 |
148 | res = check_output(
149 | [sys.executable, run_py],
150 | env={'TMPDIR': str(cache_dir), **os.environ},
151 | text=True,
152 | )
153 |     called = res.count('called')  # str.count already returns an int
154 | if use_future_annotations and local and not throw:
155 | # cachew fails to set up, so no caching but at least it works otherwise
156 | assert called == 2
157 | else:
158 | assert called == 1
159 |
--------------------------------------------------------------------------------
/src/cachew/tests/utils.py:
--------------------------------------------------------------------------------
1 | import gc
2 | import os
3 | import sys
4 | from contextlib import contextmanager
5 | from pathlib import Path
6 |
7 | import pytest
8 |
9 | PROFILES = Path(__file__).absolute().parent / 'profiles'
10 |
11 |
12 | @contextmanager
13 | def profile(name: str):
14 | # ugh. seems like pyinstrument slows down code quite a bit?
15 | if os.environ.get('PYINSTRUMENT') is None:
16 | yield
17 | return
18 |
19 | from pyinstrument import Profiler
20 |
21 | with Profiler() as profiler:
22 | yield
23 |
24 | PROFILES.mkdir(exist_ok=True)
25 | results_file = PROFILES / f"{name}.html"
26 |
27 | print("results for " + name, file=sys.stderr)
28 | profiler.print()
29 |
30 | results_file.write_text(profiler.output_html())
31 |
32 |
33 | def timer(name: str):
34 | from codetiming import Timer
35 |
36 | return Timer(name=name, text=name + ': ' + '{:.2f}s')
37 |
38 |
39 | @pytest.fixture
40 | def gc_control(*, gc_on: bool):
41 | if gc_on:
42 | # no need to do anything, should be on by default
43 | yield
44 | return
45 |
46 | gc.disable()
47 | try:
48 | yield
49 | finally:
50 | gc.enable()
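# note: the gc_on argument comes from @pytest.mark.parametrize('gc_on', ...) on the test
# using this fixture (see tests/marshall.py)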
51 |
52 |
53 | running_on_ci = 'CI' in os.environ
54 |
--------------------------------------------------------------------------------
/src/cachew/utils.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 |
3 |
4 | class CachewException(RuntimeError):
5 | pass
6 |
7 |
8 | @dataclass
9 | class TypeNotSupported(CachewException):
10 | type_: type
11 |
12 | def __str__(self) -> str:
13 | return f"{self.type_} isn't supported by cachew. See https://github.com/karlicoss/cachew#features for the list of supported types."
14 |
15 |
16 | # https://stackoverflow.com/a/2166841/706389
17 | def is_namedtuple(t) -> bool:
18 | b = getattr(t, '__bases__', None)
19 | if b is None:
20 | return False
21 | if len(b) != 1 or b[0] is not tuple:
22 | return False
23 | f = getattr(t, '_fields', None)
24 | if not isinstance(f, tuple):
25 | return False
26 | # pylint: disable=unidiomatic-typecheck
27 | return all(type(n) == str for n in f) # noqa: E721
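# e.g. is_namedtuple(collections.namedtuple('X', ['a', 'b'])) is True,
# but is_namedtuple(tuple) is False (its only base is object, and it has no _fields)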
28 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | minversion = 3.21
3 | # relies on the correct version of Python installed
4 | envlist = ruff,tests,mypy
5 | # https://github.com/tox-dev/tox/issues/20#issuecomment-247788333
6 | # hack to prevent .tox from crapping into the project directory
7 | toxworkdir = {env:TOXWORKDIR_BASE:}{toxinidir}/.tox
8 |
9 | [testenv]
10 | # TODO how to get package name from setuptools?
11 | package_name = "cachew"
12 | passenv =
13 | # useful for tests to know they are running under ci
14 | CI
15 | CI_*
16 | # respect user's cache dirs to prevent tox from crapping into project dir
17 | PYTHONPYCACHEPREFIX
18 | MYPY_CACHE_DIR
19 | RUFF_CACHE_DIR
20 |
21 | # default is 'editable', in which tox builds a wheel first for some reason? not sure if that makes much sense
22 | package = uv-editable
23 |
24 |
25 | [testenv:ruff]
26 | skip_install = true
27 | dependency_groups = testing
28 | commands =
29 | {envpython} -m ruff check \
30 | {posargs}
31 |
32 |
33 | [testenv:tests]
34 | dependency_groups = testing
35 | extras = optional
36 | commands =
37 | # posargs allow test filtering, e.g. tox ... -- -k test_name
38 | {envpython} -m pytest \
39 | --pyargs {[testenv]package_name} \
40 | {posargs}
41 |
42 |
43 | [testenv:mypy]
44 | dependency_groups = testing
45 | extras = optional
46 | commands =
47 | {envpython} -m mypy --no-install-types \
48 | -p {[testenv]package_name} \
49 | # txt report is a bit more convenient to view on CI
50 | --txt-report .coverage.mypy \
51 | --html-report .coverage.mypy \
52 | {posargs}
53 |
--------------------------------------------------------------------------------