├── .flake8 ├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── dependabot.yml ├── release-drafter.yml ├── workflows.md └── workflows │ ├── constraints.txt │ ├── dependabot-auto-merge.yml │ ├── release-please.yml │ ├── release.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .prettierignore ├── .readthedocs.yml ├── CONTRIBUTING.rst ├── LICENSE ├── README.rst ├── build_docs.sh ├── codecov.yml ├── dev ├── __init__.py ├── btree experiments.ipynb ├── display.ipynb ├── measure_ram.py ├── multi.ipynb ├── multicolumn.md ├── munchmark.ipynb ├── output.png ├── perf.ipynb └── wrdl-dev.ipynb ├── docs ├── Makefile ├── conf.py ├── demos.rst ├── ducks.concurrent.rst ├── ducks.frozen.rst ├── ducks.mutable.rst ├── ducks.rst ├── favicon.ico ├── how_it_works.rst ├── img │ ├── ducks-main.png │ └── perf_bench.png ├── index.rst ├── make.bat ├── modules.rst ├── quick_start.rst └── requirements.txt ├── ducks ├── __init__.py ├── btree.py ├── concurrent │ ├── __init__.py │ └── main.py ├── constants.py ├── exceptions.py ├── frozen │ ├── __init__.py │ ├── frozen_attr.py │ ├── init_helpers.py │ ├── main.py │ └── utils.py ├── mutable │ ├── __init__.py │ ├── main.py │ └── mutable_attr.py ├── pickling.py └── utils.py ├── examples ├── __init__.py ├── collision.py ├── concurrent_perf.ipynb ├── data │ ├── crossword_words.txt │ └── wordle_words.csv ├── img │ ├── word0.png │ ├── word1.png │ ├── word2.png │ ├── word3.png │ ├── word4.png │ └── word5.png ├── pandas_index.py ├── percentile.py ├── perf_demo.ipynb ├── update.py └── wordle.ipynb ├── noxfile.py ├── poetry.lock ├── pyproject.toml ├── test ├── __init__.py ├── concurrent │ ├── __init__.py │ ├── concurrent_utils.py │ ├── test_multi_writer.py │ └── test_read_update.py ├── conftest.py ├── mutable │ ├── __init__.py │ └── test_soak.py ├── test_basic_operations.py ├── test_btree.py ├── test_container_ops.py ├── test_edge_cases.py ├── test_examples.py ├── test_exceptions.py ├── 
test_fancy_gets.py ├── test_missing_attribute.py ├── test_mixed_cardinality.py ├── test_multiple_operations.py ├── test_mutations.py ├── test_nones.py ├── test_pickling.py ├── test_range_queries.py ├── test_stale_objects.py └── test_wrong_type.py └── tmp ├── Makefile ├── conf.py ├── index.rst └── make.bat /.flake8: -------------------------------------------------------------------------------- 1 | 2 | [flake8] 3 | # Eventually would add D and DAR 4 | select = B,B9,C,E,F,N,RST,S,W 5 | ignore = E203,E501,RST201,RST203,RST301,W503,B902,N805,DAR402,S403,S404,S605,S603,S403,S301,B015,N818,F401,F811 6 | max-line-length = 119 7 | max-complexity = 10 8 | docstring-convention = google 9 | per-file-ignores = tests/*:S101 10 | rst-roles = class,const,func,meth,mod,ref 11 | rst-directives = deprecated 12 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | dev/* linguist-vendored 2 | examples/* linguist-vendored 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 
22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: github-actions 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | - package-ecosystem: pip 8 | directory: "/.github/workflows" 9 | schedule: 10 | interval: daily 11 | - package-ecosystem: pip 12 | directory: "/docs" 13 | schedule: 14 | interval: daily 15 | - package-ecosystem: pip 16 | directory: "/" 17 | schedule: 18 | interval: daily 19 | -------------------------------------------------------------------------------- /.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - title: ":boom: Breaking Changes" 3 | label: "breaking" 4 | - title: ":rocket: Features" 5 | label: "enhancement" 6 | - title: ":fire: Removals and Deprecations" 7 | label: "removal" 8 | - title: ":beetle: Fixes" 9 | label: "bug" 10 | - title: ":racehorse: Performance" 11 | label: "performance" 12 | - title: ":rotating_light: Testing" 13 | label: "testing" 14 | - title: ":construction_worker: Continuous Integration" 15 | label: "ci" 16 | - title: ":books: Documentation" 17 | label: "documentation" 18 | - title: ":hammer: Refactoring" 19 | label: "refactoring" 20 | - title: ":lipstick: Style" 21 | label: "style" 22 | - title: ":package: Dependencies" 23 | labels: 24 | - "dependencies" 25 | - "build" 26 | template: | 27 | ## Changes 28 | 29 | $CHANGES 30 | -------------------------------------------------------------------------------- /.github/workflows.md: -------------------------------------------------------------------------------- 1 | # .github/workflows 2 | 3 | ## ISSUE_TEMPLATE 4 | 5 | Used as templates when creating new issues in the repo 6 | 7 | ## Workflows 8 | 9 | ### constraints.txt 10 | 11 | Constraints.txt is a pip install file that constrains some python 
requirements outside of poetry. 12 | 13 | ### dependabot-auto-merge.yml 14 | 15 | A workflow that runs on PRs that will automatically merge dependabot update PRs that pass testing 16 | 17 | ### release-please.yml 18 | 19 | A workflow that manages releases for the repo. On merges to the main branch, it scans them for [conventional commits](https://www.conventionalcommits.org/en/v1.0.0/) 20 | and will then create or update a release pr with those changes. The [release-please](https://github.com/googleapis/release-please) docs have more info. 21 | 22 | When that release pr is merged, it will build and upload to pypi. 23 | 24 | ### release.yml 25 | 26 | A workflow that runs on the creation of releases to upload the package to pypi 27 | 28 | ### Required Secrets 29 | 30 | * THIS_PAT - a personal access token that has access to create releases on this repo and edit the repo's settings. Used with release-please and repo-manager 31 | * PYPI_TOKEN - a pypi token that can upload to pypi for this package. 
Used with release 32 | 33 | ## dependabot.yml 34 | 35 | Configures dependabot updates and alerts for this repo 36 | 37 | ## release-drafter.yml 38 | 39 | Configures how release notes are written by release-please 40 | -------------------------------------------------------------------------------- /.github/workflows/constraints.txt: -------------------------------------------------------------------------------- 1 | pip==22.3.1 2 | nox==2022.8.7 3 | nox-poetry==1.0.3 4 | poetry==1.4.2 5 | virtualenv==20.21.0 6 | poetry-dynamic-versioning==0.25.0 7 | toml==0.10.2 8 | -------------------------------------------------------------------------------- /.github/workflows/dependabot-auto-merge.yml: -------------------------------------------------------------------------------- 1 | name: Auto Merge Dependabot 2 | 3 | on: 4 | pull_request: 5 | 6 | jobs: 7 | auto-merge: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v3.6.0 11 | - uses: ahmadnassri/action-dependabot-auto-merge@v2 12 | with: 13 | target: minor 14 | github-token: ${{ secrets.THIS_PAT }} 15 | -------------------------------------------------------------------------------- /.github/workflows/release-please.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | name: release-please 6 | jobs: 7 | release-please: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: google-github-actions/release-please-action@v3 11 | with: 12 | token: ${{ secrets.THIS_PAT }} 13 | release-type: python 14 | package-name: ducks 15 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release to pypi 2 | 3 | on: 4 | release: 5 | types: [released] 6 | 7 | jobs: 8 | release: 9 | name: Release 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Check out the repository 13 | uses: 
actions/checkout@v3.6.0 14 | with: 15 | fetch-depth: 2 16 | 17 | - name: Set up Python 18 | uses: actions/setup-python@v4.8.0 19 | with: 20 | python-version: "3.10" 21 | 22 | - name: Upgrade pip 23 | run: | 24 | pip install --constraint=.github/workflows/constraints.txt pip 25 | pip --version 26 | 27 | - name: Install Poetry 28 | run: | 29 | pip install --constraint=.github/workflows/constraints.txt poetry poetry-dynamic-versioning 30 | poetry --version 31 | 32 | - name: Build package 33 | run: | 34 | poetry build --ansi 35 | 36 | - name: Publish package on PyPI 37 | uses: pypa/gh-action-pypi-publish@v1.8.11 38 | with: 39 | user: __token__ 40 | password: ${{ secrets.PYPI_TOKEN }} 41 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | - push 5 | 6 | jobs: 7 | 8 | coverage: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Check out the repository 12 | uses: actions/checkout@v3.6.0 13 | 14 | - name: Set up Python 15 | uses: actions/setup-python@v4.8.0 16 | with: 17 | python-version: "3.10" 18 | 19 | - name: Upgrade pip 20 | run: | 21 | pip install --constraint=.github/workflows/constraints.txt pip 22 | pip --version 23 | 24 | - name: Upgrade pip in virtual environments 25 | shell: python 26 | run: | 27 | import os 28 | import pip 29 | 30 | with open(os.environ["GITHUB_ENV"], mode="a") as io: 31 | print(f"VIRTUALENV_PIP={pip.__version__}", file=io) 32 | 33 | - name: Run tests and collect coverage 34 | run: | 35 | pip install pytest pytest-cov pytest-xdist 36 | pip install . 
37 | pytest 38 | 39 | - name: Upload coverage report 40 | uses: codecov/codecov-action@v3.1.4 41 | 42 | tests: 43 | name: ${{ matrix.session }} ${{ matrix.python }} / ${{ matrix.os }} 44 | runs-on: ${{ matrix.os }} 45 | strategy: 46 | fail-fast: false 47 | matrix: 48 | include: 49 | - { python: "3.10", os: "ubuntu-latest", session: "pre-commit" } 50 | - { python: "3.10", os: "ubuntu-latest", session: "safety" } 51 | - { python: "3.10", os: "ubuntu-latest", session: "tests" } 52 | - { python: "3.9", os: "ubuntu-latest", session: "tests" } 53 | - { python: "3.8", os: "ubuntu-latest", session: "tests" } 54 | - { python: "3.7", os: "ubuntu-latest", session: "tests" } 55 | - { python: "3.10", os: "ubuntu-latest", session: "docs-build" } 56 | 57 | env: 58 | NOXSESSION: ${{ matrix.session }} 59 | FORCE_COLOR: "1" 60 | PRE_COMMIT_COLOR: "always" 61 | 62 | steps: 63 | - name: Check out the repository 64 | uses: actions/checkout@v3.6.0 65 | 66 | - name: Set up Python ${{ matrix.python }} 67 | uses: actions/setup-python@v4.8.0 68 | with: 69 | python-version: ${{ matrix.python }} 70 | 71 | - name: Upgrade pip 72 | run: | 73 | pip install --constraint=.github/workflows/constraints.txt pip 74 | pip --version 75 | 76 | - name: Upgrade pip in virtual environments 77 | shell: python 78 | run: | 79 | import os 80 | import pip 81 | 82 | with open(os.environ["GITHUB_ENV"], mode="a") as io: 83 | print(f"VIRTUALENV_PIP={pip.__version__}", file=io) 84 | 85 | - name: Install Poetry, nox, and requirements 86 | run: | 87 | pip install --upgrade -r .github/workflows/constraints.txt 88 | poetry --version 89 | nox --version 90 | 91 | - name: Compute pre-commit cache key 92 | if: matrix.session == 'pre-commit' 93 | id: pre-commit-cache 94 | shell: python 95 | run: | 96 | import hashlib 97 | import sys 98 | 99 | python = "py{}.{}".format(*sys.version_info[:2]) 100 | payload = sys.version.encode() + sys.executable.encode() 101 | digest = hashlib.sha256(payload).hexdigest() 102 | result = "${{ 
runner.os }}-{}-{}-pre-commit".format(python, digest[:8]) 103 | 104 | print("::set-output name=result::{}".format(result)) 105 | 106 | - name: Restore pre-commit cache 107 | uses: actions/cache@v3.3.3 108 | if: matrix.session == 'pre-commit' 109 | with: 110 | path: ~/.cache/pre-commit 111 | key: ${{ steps.pre-commit-cache.outputs.result }}-${{ hashFiles('.pre-commit-config.yaml') }} 112 | restore-keys: | 113 | ${{ steps.pre-commit-cache.outputs.result }}- 114 | 115 | - name: Run Nox 116 | run: | 117 | nox --force-color --python=${{ matrix.python }} 118 | 119 | - name: Upload coverage data 120 | if: always() && matrix.session == 'tests' 121 | uses: "actions/upload-artifact@v3.1.3" 122 | with: 123 | name: coverage-data 124 | path: ".coverage" 125 | 126 | - name: Upload documentation 127 | if: matrix.session == 'docs-build' 128 | uses: actions/upload-artifact@v3.1.3 129 | with: 130 | name: docs 131 | path: docs/_build 132 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | .idea/ 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: black 5 | name: black 6 | entry: black 7 | language: system 8 | types: [python] 9 | require_serial: true 10 | - id: check-added-large-files 11 | name: Check for added large files 12 | entry: check-added-large-files 13 | language: system 14 | - id: check-toml 15 | name: Check Toml 16 | entry: check-toml 17 | language: system 18 | types: [toml] 19 | - id: check-yaml 20 | name: Check Yaml 21 | entry: check-yaml 22 | language: system 23 | types: [yaml] 24 | - id: end-of-file-fixer 25 | name: Fix End of Files 26 | entry: end-of-file-fixer 27 | language: system 28 | types: [text] 29 | stages: [commit, push, manual] 30 | - id: flake8 31 | name: flake8 32 | entry: flake8 33 | language: system 34 | types: [python] 35 | exclude: "^(test/*|examples/*|noxfile.py|docs/*|tmp/*)" 36 | require_serial: true 37 | args: ["--config=.flake8"] 38 | - id: pyupgrade 39 | name: pyupgrade 40 | description: Automatically upgrade syntax for newer versions. 
41 | entry: pyupgrade 42 | language: system 43 | types: [python] 44 | args: [--py37-plus] 45 | - id: reorder-python-imports 46 | name: Reorder python imports 47 | entry: reorder-python-imports 48 | language: system 49 | types: [python] 50 | args: [--application-directories=src] 51 | - repo: https://github.com/pre-commit/mirrors-prettier 52 | rev: v2.7.1 53 | hooks: 54 | - id: prettier 55 | - repo: https://github.com/rhysd/actionlint 56 | rev: v1.6.15 57 | hooks: 58 | - id: actionlint-docker 59 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | .github/* 2 | CHANGELOG.md 3 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-20.04 5 | tools: 6 | python: "3.9" 7 | 8 | python: 9 | install: 10 | - method: pip 11 | path: . 12 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | Contributor Guide 2 | ================= 3 | 4 | Thank you for your interest in improving this project. 5 | This project is open-source under the `MIT license`_ and 6 | welcomes contributions in the form of bug reports, feature requests, and pull requests. 7 | 8 | Here is a list of important resources for contributors: 9 | 10 | - `Source Code`_ 11 | - `Documentation`_ 12 | - `Issue Tracker`_ 13 | - `Code of Conduct`_ 14 | 15 | .. _MIT license: https://opensource.org/licenses/MIT 16 | .. _Source Code: https://github.com/manimino/ducks 17 | .. _Documentation: https://ducks.readthedocs.io/ 18 | .. _Issue Tracker: https://github.com/manimino/ducks/issues 19 | 20 | How to report a bug 21 | ------------------- 22 | 23 | Report bugs on the `Issue Tracker`_. 
24 | 25 | When filing an issue, make sure to answer these questions: 26 | 27 | - Which operating system and Python version are you using? 28 | - Which version of this project are you using? 29 | - What did you do? 30 | - What did you expect to see? 31 | - What did you see instead? 32 | 33 | The best way to get your bug fixed is to provide a test case, 34 | and/or steps to reproduce the issue. 35 | 36 | 37 | How to request a feature 38 | ------------------------ 39 | 40 | Request features on the `Issue Tracker`_. 41 | 42 | 43 | How to set up your development environment 44 | ------------------------------------------ 45 | 46 | You need Python 3.7+ and the following tools: 47 | 48 | - Poetry_ 49 | - Nox_ 50 | - nox-poetry_ 51 | 52 | Install the package with development requirements: 53 | 54 | .. code:: console 55 | 56 | $ poetry install 57 | 58 | You can now run an interactive Python session, 59 | or the command-line interface: 60 | 61 | .. code:: console 62 | 63 | $ poetry run python 64 | 65 | .. _Poetry: https://python-poetry.org/ 66 | .. _Nox: https://nox.thea.codes/ 67 | .. _nox-poetry: https://nox-poetry.readthedocs.io/ 68 | 69 | 70 | How to test the project 71 | ----------------------- 72 | 73 | Run the full test suite: 74 | 75 | .. code:: console 76 | 77 | $ nox 78 | 79 | List the available Nox sessions: 80 | 81 | .. code:: console 82 | 83 | $ nox --list-sessions 84 | 85 | You can also run a specific Nox session. 86 | For example, invoke the unit test suite like this: 87 | 88 | .. code:: console 89 | 90 | $ nox --session=tests 91 | 92 | Unit tests are located in the ``tests`` directory, 93 | and are written using the pytest_ testing framework. 94 | 95 | .. _pytest: https://pytest.readthedocs.io/ 96 | 97 | 98 | How to submit changes 99 | --------------------- 100 | 101 | Open a `pull request`_ to submit changes to this project. 
102 | 103 | Your pull request needs to meet the following guidelines for acceptance: 104 | 105 | - The Nox test suite must pass without errors and warnings. 106 | - Include unit tests. This project maintains 100% code coverage. 107 | - If your changes add functionality, update the documentation accordingly. 108 | 109 | Feel free to submit early, though—we can always iterate on this. 110 | 111 | To run linting and code formatting checks before committing your change, you can install pre-commit as a Git hook by running the following command: 112 | 113 | .. code:: console 114 | 115 | $ nox --session=pre-commit -- install 116 | 117 | It is recommended to open an issue before starting work on anything. 118 | This will allow a chance to talk it over with the owners and validate your approach. 119 | 120 | .. _pull request: https://github.com/manimino/ducks/pulls 121 | .. github-only 122 | .. _Code of Conduct: CODE_OF_CONDUCT.rst 123 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Theo Walker 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://raw.githubusercontent.com/manimino/ducks/main/docs/img/ducks-main.png 2 | :alt: Ducks, the Python object indexer 3 | 4 | ========= 5 | ducks 🦆 6 | ========= 7 | 8 | Index your Python objects for fast lookup by their attributes. 9 | 10 | .. image:: https://img.shields.io/github/stars/manimino/ducks.svg?style=social&label=Star&maxAge=2592000 11 | :target: https://github.com/manimino/ducks 12 | :alt: GitHub stars 13 | .. image:: https://github.com/manimino/ducks/workflows/tests/badge.svg 14 | :target: https://github.com/manimino/ducks/actions 15 | :alt: tests Actions Status 16 | .. image:: https://codecov.io/github/manimino/ducks/coverage.svg?branch=main 17 | :target: https://codecov.io/gh/manimino/ducks 18 | :alt: Coverage 19 | .. image:: https://img.shields.io/static/v1?label=license&message=MIT&color=2ea44f 20 | :target: https://github.com/manimino/ducks/blob/main/LICENSE 21 | :alt: license - MIT 22 | .. image:: https://img.shields.io/static/v1?label=python&message=3.7%2B&color=2ea44f 23 | :target: https://github.com/manimino/ducks/ 24 | :alt: python - 3.7+ 25 | 26 | ------- 27 | Install 28 | ------- 29 | 30 | .. code-block:: 31 | 32 | pip install ducks 33 | 34 | ----- 35 | Usage 36 | ----- 37 | 38 | The main container in ducks is called Dex. 39 | 40 | .. code-block:: 41 | 42 | from ducks import Dex 43 | 44 | # make some objects 45 | objects = [ 46 | {'x': 3, 'y': 'a'}, 47 | {'x': 6, 'y': 'b'}, 48 | {'x': 9, 'y': 'c'} 49 | ] 50 | 51 | # Create a Dex containing the objects. 
52 | # Index on x and y. 53 | dex = Dex(objects, ['x', 'y']) 54 | 55 | # match objects 56 | dex[{ 57 | 'x': {'>': 5, '<': 10}, # where 5 < x < 10 58 | 'y': {'in': ['a', 'b']} # and y is 'a' or 'b' 59 | }] 60 | # result: [{'x': 6, 'y': 'b'}] 61 | 62 | This is a Dex of dicts, but the objects can be any type, even primitives like strings. 63 | 64 | Dex supports ==, !=, in, not in, <, <=, >, >=. 65 | 66 | The indexes can be dict keys, object attributes, or custom functions. 67 | 68 | See `Quick Start `_ for more examples of all of these. 69 | 70 | -------------- 71 | Is ducks fast? 72 | -------------- 73 | 74 | Yes. Here's how the ducks containers compare to other datastores on an example task. 75 | 76 | .. image:: https://raw.githubusercontent.com/manimino/ducks/main/docs/img/perf_bench.png 77 | :width: 600 78 | 79 | In this benchmark, two million objects are generated. Each datastore is used to find the subset of 200 of them that match 80 | four constraints. The ducks containers Dex and FrozenDex are shown to be very efficient at this, outperforming by 5x and 81 | and 10x respectively. 82 | 83 | Benchmark code is `in the Jupyter notebook `_. 84 | 85 | ---- 86 | Docs 87 | ---- 88 | 89 | `Quick Start `_ covers all the features you need, like 90 | pickling, nested attribute handling, and thread concurrency. 91 | 92 | `How It Works `_ is a deep dive on the implementation details. 93 | 94 | `Demos `_ has short scripts showing example uses. 
95 | -------------------------------------------------------------------------------- /build_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pushd docs/ 4 | sphinx-apidoc ../ducks -o .; make html 5 | popd 6 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | coverage: 3 | status: 4 | project: 5 | default: 6 | target: "100" 7 | patch: 8 | default: 9 | target: "100" 10 | -------------------------------------------------------------------------------- /dev/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manimino/ducks/0217a0e9673fde155a81ac9ab23dfd3538fcd235/dev/__init__.py -------------------------------------------------------------------------------- /dev/measure_ram.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script was used to measure RAM usage of different collection sizes. 3 | This was used during design; it's not relevant to users of Dex. 
4 | """ 5 | import os 6 | import subprocess 7 | import sys 8 | from array import array 9 | 10 | import numpy as np 11 | from cykhash import Int64Set 12 | 13 | 14 | TOT_ITEMS = 10**6 15 | 16 | print_names = { 17 | "pytup": "tuple", 18 | "pyset": "set", 19 | "pyarr": "array (int64)", 20 | "cyk": "cykhash Int64Set", 21 | "nparr": "numpy array (int64)", 22 | "btree": "BTrees.LLBTree", 23 | } 24 | 25 | 26 | def get_ram(): 27 | return ( 28 | int( 29 | os.popen(f"ps -o pid,rss -p {os.getpid()}").read().split("\n")[1].split()[1] 30 | ) 31 | * 1024 32 | ) 33 | 34 | 35 | def cyk(items_per=10): 36 | n_sets = TOT_ITEMS // items_per 37 | ls = [None for _ in range(n_sets)] 38 | baseline = get_ram() 39 | for i in range(n_sets): 40 | offset = i * items_per 41 | iset = Int64Set(range(offset, offset + items_per)) 42 | ls[i] = iset 43 | used = get_ram() - baseline 44 | ram = round(used / TOT_ITEMS, 1) 45 | print("cykhash_set", items_per, ram) 46 | 47 | 48 | def nparr(items_per=10): 49 | n_sets = TOT_ITEMS // items_per 50 | ls = [None for _ in range(n_sets)] 51 | baseline = get_ram() 52 | for i in range(n_sets): 53 | offset = i * items_per 54 | ls[i] = np.array(range(offset, offset + items_per)) 55 | used = get_ram() - baseline 56 | ram = round(used / TOT_ITEMS, 1) 57 | print("Numpy_array", items_per, ram) 58 | 59 | 60 | def pyset(items_per=10): 61 | n_sets = TOT_ITEMS // items_per 62 | ls = [None for _ in range(n_sets)] 63 | baseline = get_ram() 64 | for i in range(n_sets): 65 | offset = i * items_per 66 | iset = set(range(offset, offset + items_per)) 67 | ls[i] = iset 68 | used = get_ram() - baseline 69 | ram = round(used / TOT_ITEMS, 1) 70 | print("python_set", items_per, ram) 71 | 72 | 73 | def pytup(items_per=10): 74 | n_tups = TOT_ITEMS // items_per 75 | baseline = get_ram() 76 | ls = [None for _ in range(n_tups)] 77 | for i in range(n_tups): 78 | offset = i * items_per 79 | ls[i] = tuple(range(offset, offset + items_per)) 80 | used = get_ram() - baseline 81 | ram = round(used / 
TOT_ITEMS, 1) 82 | print("python_tuple", items_per, ram) 83 | 84 | 85 | def pyarr(items_per=10): 86 | n_arrs = TOT_ITEMS // items_per 87 | baseline = get_ram() 88 | ls = [None for _ in range(n_arrs)] 89 | for i in range(n_arrs): 90 | arr = array("q") 91 | offset = i * items_per 92 | arr.extend(range(offset, offset + items_per)) 93 | ls[i] = arr 94 | used = get_ram() - baseline 95 | ram = round(used / TOT_ITEMS, 1) 96 | print("python_array", items_per, ram) 97 | 98 | 99 | def main(method, items_per): 100 | iper = int(items_per) 101 | if method == "pytup": 102 | f = pytup 103 | elif method == "pyset": 104 | f = pyset 105 | elif method == "cyk": 106 | f = cyk 107 | elif method == "nparr": 108 | f = nparr 109 | elif method == "pyarr": 110 | f = pyarr 111 | else: 112 | print("what?!", method) 113 | raise ValueError() 114 | f(iper) 115 | 116 | 117 | def row_dict_to_table(rd): 118 | # makes a github markdown table out of a dict of {row: {column: value}} 119 | # kinda jank looking but pycharm's autoformatter will fix it 120 | for r in rd: 121 | header = "| |" + " | ".join(str(x) for x in rd[r]) 122 | print() 123 | break 124 | print(header + " |") 125 | dashes = ["|---"] 126 | for r in rd: 127 | for k in rd[r]: 128 | dashes.append("-" * (2 + len(str(rd[r][k])))) 129 | break 130 | print("|".join(dashes) + "---|") 131 | for r in rd: 132 | s = "| " + print_names[r] + " | " 133 | s += " | ".join(str(x) for x in rd[r].values()) 134 | print(s + " |") 135 | 136 | 137 | if __name__ == "__main__": 138 | if len(sys.argv) > 1: 139 | main(sys.argv[1], sys.argv[2]) 140 | else: 141 | results = dict() 142 | for method in ["pyset", "pytup", "pyarr", "cyk", "nparr"]: 143 | m_result = dict() 144 | for items_per in [1, 2, 3, 4, 5, 10, 25, 50, 100, 1000, 10000]: 145 | txt = subprocess.check_output( 146 | f"python measure_ram.py {method} {items_per}".split() 147 | ) 148 | res = txt.decode().strip() 149 | _, _, bytes_per = res.split() 150 | m_result[items_per] = bytes_per 151 | results[method] = 
m_result 152 | print(results) 153 | row_dict_to_table(results) 154 | -------------------------------------------------------------------------------- /dev/multi.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "e8f13c30", 6 | "metadata": {}, 7 | "source": [ 8 | "#### Multi-attribute indexing\n", 9 | "\n", 10 | "The one place SQLite will still have a speed edge is in multidimensional range queries using a multi-attribute index. For equality, no prob - concatenate the values into a tuple and you're good to go. That beats SQLite by a lot, and works on both index types. But `a < 5 and b < 6`, not so much.\n", 11 | "\n", 12 | "Here, let's demo." 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 24, 18 | "id": "a7a50a4f", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "\n", 23 | "import random\n", 24 | "from litebox import LiteBox\n", 25 | "from ducks import Dex, FrozenDex\n", 26 | "\n", 27 | "objs = [{'a': random.random(), 'b': random.random()} for _ in range(10**6)]\n", 28 | "lb = LiteBox(objs, {'a': float, 'b': float})\n", 29 | "lb_multi = LiteBox(objs, {'a': float, 'b': float}, index=[('a', 'b')])\n", 30 | "fb = Dex(objs, ['a', 'b'])\n", 31 | "ffb = FrozenDex(objs, ['a', 'b'])\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 25, 37 | "id": "496467f9", 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "528 µs ± 10.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "%%timeit\n", 50 | "lb.find(\"a < 0.001 and b < 0.001\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 26, 56 | "id": "13264dd0", 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "677 µs ± 12.9 µs per loop (mean ± std. dev. 
of 7 runs, 1000 loops each)\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "%%timeit\n", 69 | "fb[{'a': {'<': 0.001}, 'b': {'<': 0.001}}]" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 27, 75 | "id": "9533ee92", 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "99.1 µs ± 780 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "%%timeit\n", 88 | "ffb[{'a': {'<': 0.001}, 'b': {'<': 0.001}}]" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 29, 94 | "id": "3cd618f2", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# and now the multi-attribute indexing, blam" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 28, 104 | "id": "3b7529fa", 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "46.7 µs ± 322 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "%%timeit\n", 117 | "lb_multi.find(\"a < 0.001 and b < 0.001\")" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 23, 123 | "id": "b3ba56c6", 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "204 µs ± 3.33 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "%%timeit\n", 136 | "# not gonna beat it with something naive either\n", 137 | "[o for o in ffb[{'a': {'<': 0.001}}] if o['b'] < 0.001]" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "id": "7cfe9510", 143 | "metadata": {}, 144 | "source": [ 145 | "Unfortunately, there's not really a good way to implement a multi-attr index here. \n", 146 | "BTree doesn't support multi-attribute lookups afaik.\n", 147 | "\n", 148 | "So we're kinda stuck." 
149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "id": "dab0ec6e", 154 | "metadata": {}, 155 | "source": [ 156 | "Contrary to popular belief, you can't just \"concatenate the keys and use a regular BTree\". At least, not with this implementation; it doesn't support separate lookups for \"parts\" of a key, so you'd be treating the whole key as one object. \n", 157 | "\n", 158 | "But! We can make a BTree of `{obj: obj}`, so `BTree{key1: BTree{key2: values}}` could work. Except that when `key1`'s values are all unique... you get a whole ton of BTrees.\n", 159 | "\n", 160 | "OK, so we still don't have a good idea. Making a multi-attribute BTree out of a single-attribute one doesn't seem doable.\n", 161 | "\n", 162 | "The best hack I can think of is:\n", 163 | " - Build the tree on concatenated keys `{(key1, key2): values}` \n", 164 | " - Get {keys: values} in the range `(k1_min, -inf) < (k1, k2) < (k1_max, inf)`. \n", 165 | " - Post-filter keys that don't match the k2 constraint.\n", 166 | " - Return only the values with matching keys.\n", 167 | "\n", 168 | "The order bound isn't living up to tree standards, but I bet it would be passable most of the time anyway. Probably beats doing a separate search on key2 and intersecting the results.\n", 169 | "\n", 170 | "The `-inf / inf` values would need to be some type-independent thing. `None` is always small in BTrees so that could be the lower bound.\n", 171 | "\n", 172 | "Could cram it in at the value level instead? `BTree({key1: [(key2, val), (key2, val) ...]` Avoids the awkward comparisons. Burns some RAM though. And it's really equivalent to just using one index and doing the rest in a list comprehension outside the container. \n", 173 | "\n", 174 | "### todo\n", 175 | "think about the frozen arrays and how you would implement it there. That might give good insights.\n", 176 | "Sparse ndarrays maybe? Quad / octrees?" 
177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 3, 182 | "id": "5f91c00a", 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "from ducks.btree import BTree\n", 187 | "from random import random\n" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 5, 193 | "id": "7b796fee", 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "objs = [\n", 198 | " {'i': i, 'a': random()*10, 'b': random()} for i in range(10**3)\n", 199 | "]\n", 200 | "\n", 201 | "# Task: Find objs where 1 < a < 2 and 0.5 < b < 0.6." 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 6, 207 | "id": "ad908e8a", 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "tree = BTree()\n", 212 | "for o in objs:\n", 213 | " tree[o['a']] = (o['b'], o)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 10, 219 | "id": "6274a4a2", 220 | "metadata": { 221 | "scrolled": true 222 | }, 223 | "outputs": [ 224 | { 225 | "name": "stdout", 226 | "output_type": "stream", 227 | "text": [ 228 | "{'i': 191, 'a': 1.0570674936431124, 'b': 0.5649662294471903}\n", 229 | "{'i': 437, 'a': 1.3542792185455155, 'b': 0.5753256982901156}\n", 230 | "{'i': 185, 'a': 1.401839984653963, 'b': 0.5310477476841865}\n", 231 | "{'i': 772, 'a': 1.44039489179562, 'b': 0.5176671572926902}\n", 232 | "{'i': 457, 'a': 1.469287583082859, 'b': 0.5475469700864543}\n", 233 | "{'i': 943, 'a': 1.5722080241319658, 'b': 0.5615369447345585}\n", 234 | "{'i': 231, 'a': 1.6165395202332788, 'b': 0.5551452004632332}\n", 235 | "{'i': 92, 'a': 1.7698873658963565, 'b': 0.5834212111319615}\n", 236 | "{'i': 392, 'a': 1.834056255838259, 'b': 0.545838844154715}\n", 237 | "{'i': 691, 'a': 1.8549647165079397, 'b': 0.5517766855664482}\n" 238 | ] 239 | } 240 | ], 241 | "source": [ 242 | "for b, o in tree.get_range_expr({'>': 1, '<': 2}):\n", 243 | " if b < 0.6 and b > 0.5:\n", 244 | " print(o)" 245 | ] 246 | }, 247 | { 248 | 
"cell_type": "code", 249 | "execution_count": 12, 250 | "id": "634cfaf3", 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "# attributes are hashable, so this could work as a dict too. But that's less general.\n", 255 | "# Or parallel arrays, one for each attribute, plus one for the object ID. Nah, too hard to add/remove items.\n" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "id": "7c03ff5d", 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [] 265 | } 266 | ], 267 | "metadata": { 268 | "kernelspec": { 269 | "display_name": "Python 3 (ipykernel)", 270 | "language": "python", 271 | "name": "python3" 272 | }, 273 | "language_info": { 274 | "codemirror_mode": { 275 | "name": "ipython", 276 | "version": 3 277 | }, 278 | "file_extension": ".py", 279 | "mimetype": "text/x-python", 280 | "name": "python", 281 | "nbconvert_exporter": "python", 282 | "pygments_lexer": "ipython3", 283 | "version": "3.9.7" 284 | } 285 | }, 286 | "nbformat": 4, 287 | "nbformat_minor": 5 288 | } 289 | -------------------------------------------------------------------------------- /dev/multicolumn.md: -------------------------------------------------------------------------------- 1 | ### Multicolumn indexing idea 2 | 3 | User supplies a tuple of fields, like [a, b, c]. 4 | 5 | We don't simply store the tuple of values, no no. We store a class, with like **slots** or something to make it ram 6 | efficient. 7 | 8 | The key here is that the class has **gt** defined, like so: 9 | 10 | ``` 11 | class MultiObject(): 12 | 13 | def __init__(self, tuple): 14 | self.tuple = tuple # holds a value for each field 15 | 16 | def __gt__(self, tuple): 17 | # TODO: this is possible totally wrong, just wanted to jot down the idea real quick. 18 | # Will evaluate it more srsly later. 
19 | return tuple[0] > self.tuple[0] and tuple[1] > self.tuple[1] and tuple[2] > self.tuple[2] 20 | ``` 21 | 22 | Then you can use an ordinary BTree or numpy array to compare these objects. 23 | 24 | Update works intuitively, you've got add / remove, EZPZ. 25 | 26 | I think you can define **gt** in such a way as it allows prefix queries. Not sure if that's a good design choice. 27 | 28 | But then what is **lt**? I think there's too many constraints on **gt** here, it doesn't quite work as written. 29 | 30 | Yeah, this seems like a bad idea. Leaving it here for now as it might be the beginnings of a good idea. 31 | 32 | TODO: Look at the Pandas multicolumn index implementation, that should be a much better starting point. 33 | -------------------------------------------------------------------------------- /dev/munchmark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5e17014a", 6 | "metadata": {}, 7 | "source": [ 8 | "## Benchmarks\n", 9 | " - Range query, 1 attribute\n", 10 | " - Range query, 2 attributes\n", 11 | " " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "id": "9c246cee", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "from dexer import Dexer, FrozenDexer\n", 22 | "import time\n", 23 | "import pandas as pd\n", 24 | "from timeit import timeit, repeat\n", 25 | "from litebox import LiteBox\n", 26 | "from statistics import stdev, mean" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 156, 32 | "id": "b7f1d9b9", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 178, 40 | "id": "6c0cce05", 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "1000000\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "cookies = []\n", 53 | "FLAVORS = ['Peanut', 
'Chocolate', 'Macadamia', 'Almond', 'Cinnamon', \n", 54 | " 'Butter', 'Caramel', 'Fudge', 'Candy', 'Mystery']\n", 55 | "COLORS = ['Red', 'Orange', 'Yellow', 'Green', 'Blue',\n", 56 | " 'Purple', 'Rainbow', 'Black', 'White', 'Invisible']\n", 57 | "TAGS = [str(i).zfill(4) for i in range(10**4)]\n", 58 | "cookies = []\n", 59 | "num = 0\n", 60 | "for i in range(10**6):\n", 61 | " \n", 62 | " cookie = {\n", 63 | " 'num': num,\n", 64 | " 'size': i % 10**6, \n", 65 | " 'chips': i % 10**5, \n", 66 | " 'sugar': i % 10**4, \n", 67 | " 'flavor': FLAVORS[i % 10], \n", 68 | " 'tag': TAGS[(i // 10) % len(TAGS)],\n", 69 | " 'color': COLORS[(i // 100) % 10]\n", 70 | " }\n", 71 | " cookies.append(cookie)\n", 72 | " num += 1\n", 73 | "\n", 74 | "\n", 75 | "print(len(cookies))" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 179, 81 | "id": "7d3c129b", 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "CPU times: user 1.03 s, sys: 23.3 ms, total: 1.05 s\n", 89 | "Wall time: 1.05 s\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "%%time\n", 95 | "df = pd.DataFrame(cookies)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 180, 101 | "id": "30f155ea", 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "name": "stdout", 106 | "output_type": "stream", 107 | "text": [ 108 | "CPU times: user 6.67 s, sys: 113 ms, total: 6.78 s\n", 109 | "Wall time: 6.79 s\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "%%time\n", 115 | "lb = LiteBox(cookies, {'num': int, 'size': int, 'chips': int, 'sugar': int, 'flavor': str, 'tag': str, 'color': str})" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 181, 121 | "id": "b8087f4e", 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "CPU times: user 9.47 s, sys: 52.8 ms, total: 9.52 s\n", 129 | "Wall time: 9.53 s\n" 130 | ] 131 | } 132 
| ], 133 | "source": [ 134 | "%%time\n", 135 | "dex = Dexer(cookies, on=['num', 'size', 'chips', 'sugar' 'flavor', 'tag', 'color'])" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 182, 141 | "id": "4d6ac177", 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "CPU times: user 6.14 s, sys: 32.9 ms, total: 6.17 s\n", 149 | "Wall time: 6.17 s\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "%%time\n", 155 | "fdex = FrozenDexer(cookies, on=['num', 'size', 'chips', 'sugar' 'flavor', 'tag', 'color'])" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "e8c325f7", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 171, 169 | "id": "f9ccfa9c", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "# One-attribute range query returning 100 results\n", 174 | "\n", 175 | "def find_range1_df():\n", 176 | " return df[df['size'] < 100]\n", 177 | "\n", 178 | "def find_range1_lc():\n", 179 | " return [o for o in cookies if o['size'] < 100]\n", 180 | "\n", 181 | "def find_range1_lb():\n", 182 | " return lb.find('size < 100')\n", 183 | "\n", 184 | "def find_range1_dex():\n", 185 | " return dex.find({\n", 186 | " 'size': {'<': 100},\n", 187 | " })\n", 188 | "\n", 189 | "def find_range1_fdex():\n", 190 | " return fdex.find({\n", 191 | " 'size': {'<': 100},\n", 192 | " })\n", 193 | "\n", 194 | "RANGE1 = [find_range1_df, find_range1_lc, find_range1_lb, find_range1_dex, find_range1_fdex]" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 172, 200 | "id": "420d4187", 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | "100\n", 208 | "100\n", 209 | "100\n", 210 | "100\n", 211 | "100\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "for f in 
RANGE1:\n", 217 | " print(len(f()))\n" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 173, 223 | "id": "884272fd", 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "# Two-attribute range query returning 100 results\n", 228 | "def find_range2_df():\n", 229 | " return df[(df['size'] < 100) & (df['chips'] < 100)]\n", 230 | "\n", 231 | "def find_range2_lc():\n", 232 | " return [o for o in cookies if o['size'] < 1000 and o['chips'] < 100]\n", 233 | "\n", 234 | "def find_range2_lb():\n", 235 | " return lb.find('size < 1000 and chips < 100')\n", 236 | "\n", 237 | "def find_range2_dex():\n", 238 | " return dex.find({\n", 239 | " 'size': {'<': 1000},\n", 240 | " 'chips': {'<': 100},\n", 241 | " })\n", 242 | "\n", 243 | "def find_range2_fdex():\n", 244 | " return fdex.find({\n", 245 | " 'size': {'<': 1000},\n", 246 | " 'chips': {'<': 100},\n", 247 | " })\n", 248 | "\n", 249 | "RANGE2 = [find_range2_df, find_range2_lc, find_range2_lb, find_range2_dex, find_range2_fdex]" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 174, 255 | "id": "534be384", 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "100\n", 263 | "100\n", 264 | "100\n", 265 | "100\n", 266 | "100\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "for f in RANGE2:\n", 272 | " print(len(f()))\n" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "id": "59675ff1", 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "id": "5e8fad87", 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 187, 294 | "id": "3b443b72", 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "# One-attribute exact match query returning 100 results\n", 299 | "def 
find_eq1_df():\n", 300 | " return df[(df['tag'] == '1111')]\n", 301 | "\n", 302 | "def find_eq1_lc():\n", 303 | " return [o for o in cookies if o['tag'] == '1111']\n", 304 | "\n", 305 | "def find_eq1_lb():\n", 306 | " return lb.find('tag == \"1111\"')\n", 307 | "\n", 308 | "def find_eq1_dex():\n", 309 | " return dex.find({\n", 310 | " 'tag': '1111'\n", 311 | " })\n", 312 | "\n", 313 | "def find_eq1_fdex():\n", 314 | " return fdex.find({\n", 315 | " 'tag': '1111'\n", 316 | " })\n", 317 | "\n", 318 | "EQ1 = [find_eq1_df, find_eq1_lc, find_eq1_lb, find_eq1_dex, find_eq1_fdex]" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "id": "c3bbca8a", 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "id": "e1cbc5b2", 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 148, 340 | "id": "07b5c96c", 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [ 344 | "def run_timings(f, result_len, n_times=10):\n", 345 | " assert len(f()) == result_len\n", 346 | " return min(repeat(f, number=n_times))/n_times\n" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 188, 352 | "id": "84c6ec42", 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "name": "stdout", 357 | "output_type": "stream", 358 | "text": [ 359 | "find_range1_df\n", 360 | "find_range1_lc\n", 361 | "find_range1_lb\n", 362 | "find_range1_dex\n", 363 | "find_range1_fdex\n", 364 | "find_range2_df\n", 365 | "find_range2_lc\n", 366 | "find_range2_lb\n", 367 | "find_range2_dex\n", 368 | "find_range2_fdex\n", 369 | "find_eq1_df\n", 370 | "find_eq1_lc\n", 371 | "find_eq1_lb\n", 372 | "find_eq1_dex\n", 373 | "find_eq1_fdex\n" 374 | ] 375 | } 376 | ], 377 | "source": [ 378 | "n_repeat = 100\n", 379 | "results = dict()\n", 380 | "for f in RANGE1 + RANGE2 + EQ1:\n", 381 | " 
print(f.__name__)\n", 382 | " results[f.__name__] = run_timings(f, 100)" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "id": "635245dc", 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 189, 396 | "id": "6783869e", 397 | "metadata": {}, 398 | "outputs": [ 399 | { 400 | "name": "stdout", 401 | "output_type": "stream", 402 | "text": [ 403 | "find_range1_df 0.0009101113071665167\n", 404 | "find_range1_lc 0.051943747501354665\n", 405 | "find_range1_lb 5.443879636004567e-05\n", 406 | "find_range1_dex 4.8754794988781214e-05\n", 407 | "find_range1_fdex 1.6833702102303504e-05\n", 408 | "find_range2_df 0.0022733806050382554\n", 409 | "find_range2_lc 0.0498123213998042\n", 410 | "find_range2_lb 0.0003733630990609527\n", 411 | "find_range2_dex 0.0005058821989223361\n", 412 | "find_range2_fdex 6.0588796623051165e-05\n", 413 | "find_eq1_df 0.04267703660298139\n", 414 | "find_eq1_lc 0.04795632830355316\n", 415 | "find_eq1_lb 5.073370411992073e-05\n", 416 | "find_eq1_dex 1.3539998326450586e-05\n", 417 | "find_eq1_fdex 1.1276802979409695e-05\n" 418 | ] 419 | } 420 | ], 421 | "source": [ 422 | "for r, t in results.items():\n", 423 | " print(r, t)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "id": "acfa6774", 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "id": "9d5b4eb5", 438 | "metadata": {}, 439 | "outputs": [], 440 | "source": [] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "id": "d57de5f1", 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "# One-attribute 'in' query, 1000 results\n", 450 | "match_tags = [str(t).zfill(4) for t in range(1000, 1010)]\n", 451 | "\n", 452 | "def find_in1_df():\n", 453 | " return df[(df['tag'] in match_tags)]\n", 454 | "\n", 455 | "def 
find_in1_lc():\n", 456 | " return [o for o in cookies if o['tag'] in match_tags]\n", 457 | "\n", 458 | "def find_in1_lb():\n", 459 | " return lb.find(f'tag in {match_tags}')\n", 460 | "\n", 461 | "def find_in1_dex():\n", 462 | " return dex.find({\n", 463 | " 'tag': {'in': match_tags}\n", 464 | " })\n", 465 | "\n", 466 | "def find_in1_fdex():\n", 467 | " return fdex.find({\n", 468 | " 'tag': {'in': match_tags}\n", 469 | " })\n", 470 | "\n", 471 | "IN1 = [find_in1_df, find_in1_lc, find_in1_lb, find_in1_dex, find_in1_fdex]" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "id": "6954be5b", 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [] 481 | } 482 | ], 483 | "metadata": { 484 | "kernelspec": { 485 | "display_name": "Python 3 (ipykernel)", 486 | "language": "python", 487 | "name": "python3" 488 | }, 489 | "language_info": { 490 | "codemirror_mode": { 491 | "name": "ipython", 492 | "version": 3 493 | }, 494 | "file_extension": ".py", 495 | "mimetype": "text/x-python", 496 | "name": "python", 497 | "nbconvert_exporter": "python", 498 | "pygments_lexer": "ipython3", 499 | "version": "3.9.7" 500 | } 501 | }, 502 | "nbformat": 4, 503 | "nbformat_minor": 5 504 | } 505 | -------------------------------------------------------------------------------- /dev/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manimino/ducks/0217a0e9673fde155a81ac9ab23dfd3538fcd235/dev/output.png -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 
9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | # -- Path setup -------------------------------------------------------------- 7 | # If extensions (or modules to document with autodoc) are in another directory, 8 | # add these directories to sys.path here. If the directory is relative to the 9 | # documentation root, use os.path.abspath to make it absolute, like shown here. 10 | # 11 | import os 12 | import sys 13 | 14 | sys.path.insert(0, os.path.abspath(".")) 15 | sys.path.insert(0, os.path.abspath("..")) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = "ducks" 21 | copyright = "2022, Theo Walker" 22 | author = "Theo Walker" 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = "0.5.1" 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 
33 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon"] 34 | 35 | # Add any paths that contain templates here, relative to this directory. 36 | templates_path = ["_templates"] 37 | 38 | # List of patterns, relative to source directory, that match files and 39 | # directories to ignore when looking for source files. 40 | # This pattern also affects html_static_path and html_extra_path. 41 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 42 | 43 | 44 | # -- Options for HTML output ------------------------------------------------- 45 | 46 | # The theme to use for HTML and HTML Help pages. See the documentation for 47 | # a list of builtin themes. 48 | # 49 | html_theme = "sphinx_rtd_theme" 50 | 51 | # Add any paths that contain custom static files (such as style sheets) here, 52 | # relative to this directory. They are copied after the builtin static files, 53 | # so a file named "default.css" will overwrite the builtin "default.css". 54 | html_static_path = ["_static"] 55 | 56 | html_favicon = "favicon.ico" 57 | 58 | 59 | def skip(app, what, name, obj, would_skip, options): 60 | if name in ["__getitem__", "__init__"]: 61 | return False 62 | return would_skip 63 | 64 | 65 | def setup(app): 66 | app.connect("autodoc-skip-member", skip) 67 | -------------------------------------------------------------------------------- /docs/demos.rst: -------------------------------------------------------------------------------- 1 | .. _demos: 2 | 3 | ===== 4 | Demos 5 | ===== 6 | 7 | These are mini-projects that demonstrate using ``ducks`` in applications. 8 | 9 | * `Auto-updating `_ - Keep Dex updated when objects change 10 | * `Wordle solver `_ - Solve string matching problems faster than regex 11 | * `Collision detection `_ - Find objects based on type and proximity (grid-based) 12 | * `Percentiles `_ - Find by percentile (median, p99, etc.) 
13 | -------------------------------------------------------------------------------- /docs/ducks.concurrent.rst: -------------------------------------------------------------------------------- 1 | ducks.concurrent package 2 | ======================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | ducks.concurrent.main module 8 | ---------------------------- 9 | 10 | .. automodule:: ducks.concurrent.main 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: ducks.concurrent 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/ducks.frozen.rst: -------------------------------------------------------------------------------- 1 | ducks.frozen package 2 | ==================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | ducks.frozen.frozen\_attr module 8 | -------------------------------- 9 | 10 | .. automodule:: ducks.frozen.frozen_attr 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | ducks.frozen.init\_helpers module 16 | --------------------------------- 17 | 18 | .. automodule:: ducks.frozen.init_helpers 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | ducks.frozen.main module 24 | ------------------------ 25 | 26 | .. automodule:: ducks.frozen.main 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | ducks.frozen.utils module 32 | ------------------------- 33 | 34 | .. automodule:: ducks.frozen.utils 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. 
automodule:: ducks.frozen 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /docs/ducks.mutable.rst: -------------------------------------------------------------------------------- 1 | ducks.mutable package 2 | ===================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | ducks.mutable.main module 8 | ------------------------- 9 | 10 | .. automodule:: ducks.mutable.main 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | ducks.mutable.mutable\_attr module 16 | ---------------------------------- 17 | 18 | .. automodule:: ducks.mutable.mutable_attr 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: ducks.mutable 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/ducks.rst: -------------------------------------------------------------------------------- 1 | ducks package 2 | ============= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | ducks.concurrent 11 | ducks.frozen 12 | ducks.mutable 13 | 14 | Submodules 15 | ---------- 16 | 17 | ducks.btree module 18 | ------------------ 19 | 20 | .. automodule:: ducks.btree 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | ducks.constants module 26 | ---------------------- 27 | 28 | .. automodule:: ducks.constants 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | 33 | ducks.exceptions module 34 | ----------------------- 35 | 36 | .. automodule:: ducks.exceptions 37 | :members: 38 | :undoc-members: 39 | :show-inheritance: 40 | 41 | ducks.pickling module 42 | --------------------- 43 | 44 | .. automodule:: ducks.pickling 45 | :members: 46 | :undoc-members: 47 | :show-inheritance: 48 | 49 | ducks.utils module 50 | ------------------ 51 | 52 | .. 
automodule:: ducks.utils 53 | :members: 54 | :undoc-members: 55 | :show-inheritance: 56 | 57 | Module contents 58 | --------------- 59 | 60 | .. automodule:: ducks 61 | :members: 62 | :undoc-members: 63 | :show-inheritance: 64 | -------------------------------------------------------------------------------- /docs/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manimino/ducks/0217a0e9673fde155a81ac9ab23dfd3538fcd235/docs/favicon.ico -------------------------------------------------------------------------------- /docs/how_it_works.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | How ducks works 3 | =============== 4 | 5 | For each attribute in a Dex, it holds a B-tree that maps every unique value to the objects with that value. 6 | 7 | This is a rough idea of the data structure: 8 | 9 | .. code-block:: 10 | 11 | class Dex: 12 | indexes = { 13 | 'attribute1': BTree({10: set(some_obj_ids), 20: set(other_obj_ids)}), 14 | 'attribute2': BTree({'abc': set(some_obj_ids), 'def': set(other_obj_ids)}), 15 | } 16 | obj_map = {obj_ids: objects} 17 | } 18 | 19 | During a lookup, the object ID sets matching each query value are retrieved. The set operations `union`, 20 | `intersect`, and `difference` are applied to get the matching object IDs. Finally, the object IDs are converted 21 | to objects and returned. 22 | 23 | In practice, Dex and FrozenDex have a bit more to them, as they are optimized to have much better 24 | memory usage and speed than a naive implementation. For example, FrozenDex makes heavy use of sorted Numpy arrays. 25 | 26 | ------------- 27 | Dex internals 28 | ------------- 29 | 30 | Here's more detailed pseudocode of a Dex: 31 | 32 | .. 
code-block:: 33 | 34 | class Dex: 35 | # holds each attribute index and the id-to-object map 36 | indexes = { 37 | 'attr1': MutableAttrIndex(), 38 | 'attr2': MutableAttrIndex() 39 | } 40 | 'obj_map': {obj_ids: objects} 41 | } 42 | 43 | 44 | class MutableAttrIndex: 45 | # maps the values for one attribute to object IDs 46 | tree = BTree({ 47 | val1: set_like(some_obj_ids), 48 | val2: set_like(other_obj_ids) 49 | }) 50 | 51 | 52 | To run a query: 53 | 54 | #. Dex breaks the query down into individual attribute value lookups. 55 | #. The object IDs associated with the query attribute values are retrieved from MutableAttrIndex. 56 | #. The set-like containers are converted to sets if needed. 57 | #. Operations like `intersect` are performed on the sets to get the final object IDs. 58 | #. The object IDs are mapped to objects, which are then returned. 59 | 60 | Memory efficiency 61 | ================= 62 | 63 | That "set-like container" is there for memory efficiency reasons. Imagine building an index on a million distinct 64 | values. If actual sets were used, we'd get a million sets of size 1. Collections have a lot of overhead, so that would 65 | be a poor choice. We can do better. 66 | 67 | Memory usage of different collections 68 | ===================================== 69 | 70 | First, let's do some measuring of collection overhead. We'll store a large number of distinct int64s in collections of 71 | each type, vary the size of the collections, and check the memory usage per object. 72 | 73 | We expect bigger collections to be more efficient (fewer bytes per object). Ten million sets of size 1 should 74 | take up more RAM than ten sets of size 1 million.
75 | 76 | Bytes per entry for each collection type and size: 77 | 78 | 79 | +-----------------------+---------+---------+---------+---------+---------+--------+---------+ 80 | | | 1 | 2 | 5 | 10 | 25 | 50 | 100 | 81 | +=======================+=========+=========+=========+=========+=========+========+=========+ 82 | | set | 260.1 | 146.3 | 195.0 | 113.8 | 124.2 | 78.3 | 116.9 | 83 | +-----------------------+---------+---------+---------+---------+---------+--------+---------+ 84 | | tuple | 89.7 | 69.4 | 50.9 | 47.0 | 43.1 | 41.8 | 41.1 | 85 | +-----------------------+---------+---------+---------+---------+---------+--------+---------+ 86 | | cykhash Int64Set | 160.1 | 79.9 | 38.1 | 25.3 | 15.5 | 23.5 | 22.4 | 87 | +-----------------------+---------+---------+---------+---------+---------+--------+---------+ 88 | | numpy array (int64) | 161.1 | 80.3 | 35.0 | 22.1 | 13.5 | 10.9 | 9.4 | 89 | +-----------------------+---------+---------+---------+---------+---------+--------+---------+ 90 | | array (int64) | 106.0 | 53.2 | 28.0 | 21.0 | 11.6 | 10.6 | 9.1 | 91 | +-----------------------+---------+---------+---------+---------+---------+--------+---------+ 92 | 93 | That table tells us a story. 94 | 95 | * Small collections of any type are extremely inefficient. Don't make collections of size 1. 96 | * Immutable collections are cheaper. Tuples, arrays, and numpy arrays cost less memory than the set types. 97 | * Typed collections are cheaper. Numpy arrays and `cykhash `_ Int64Sets are cheaper 98 | than tuples or Python sets. 99 | 100 | The best collection in terms of memory usage is a big array. But Dex is mutable; we need to add and remove 101 | objects in a few microseconds. Rewriting a big array on change is too slow. So we'll save the arrays for 102 | FrozenDex. So the single best one for Dex is cykhash Int64Set. By why pick just one? 
103 | 104 | Blending collection types 105 | ========================= 106 | 107 | For smaller collections, below ~10 numbers, cykhash is a bit inefficient, so we use Python 108 | int64 arrays there instead. The arrays are immutable, but it's fast to discard a small array and make another one when 109 | changes occur. 110 | 111 | And for collections of size 1, we just store the number, no container needed! That takes 28 bytes. 112 | 113 | So the code is a bit more complex than the pseudocode above, in order to keep collection overhead from filling RAM. 114 | 115 | Here is the table again. Dex (bottom line) uses cykhash, array, and integer types to stay RAM-efficient at all 116 | collection sizes. 117 | 118 | +--------------------+---------+---------+---------+--------+---------+--------+---------+ 119 | | | 1 | 2 | 5 | 10 | 25 | 50 | 100 | 120 | +====================+=========+=========+=========+========+=========+========+=========+ 121 | | set | 260.1 | 146.3 | 195.0 | 113.8 | 124.2 | 78.3 | 116.9 | 122 | +--------------------+---------+---------+---------+--------+---------+--------+---------+ 123 | | cykhash Int64Set | 160.1 | 79.9 | 38.1 | 25.3 | 15.5 | 23.5 | 22.4 | 124 | +--------------------+---------+---------+---------+--------+---------+--------+---------+ 125 | | array (int64) | 106.0 | 53.2 | 28.0 | 21.0 | 11.6 | 10.6 | 9.1 | 126 | +--------------------+---------+---------+---------+--------+---------+--------+---------+ 127 | | FilterBox storage | 28.0 | 53.2 | 28.0 | 21.0 | 15.5 | 23.5 | 22.4 | 128 | +--------------------+---------+---------+---------+--------+---------+--------+---------+ 129 | 130 | That's 4 to 10 times better than naively using Python sets to store ints. There's no tradeoff; 131 | Int64Set operations are about as fast as Python sets. 132 | 133 | ------------------- 134 | FrozenDex Internals 135 | ------------------- 136 | 137 | The FrozenDex implementation is very different from Dex. 
It is able to achieve better speed and lower memory usage 138 | by using data structures that don't support changes. 139 | 140 | FrozenDex pseudocode: 141 | 142 | .. code-block:: 143 | 144 | class FrozenDex: 145 | # holds each attribute index and an array of objects 146 | indexes = { 147 | 'attr1': FrozenAttrIndex(), 148 | 'attr2': FrozenAttrIndex() 149 | } 150 | 'objects': np.array(dtype="O") 151 | } 152 | 153 | class FrozenAttrIndex: 154 | # maps the values for a single attribute to indexes in the 'objects' array 155 | 156 | # parallel arrays store attribute values and object indices 157 | val_arr = np.array(attribute value for each object) # sorted by val_arr 158 | obj_idx_arr = np.array(index in objects array for each object) # sorted by val_arr 159 | 160 | # but if a value has lots of objects, store it in this tree instead 161 | tree = BTree({ 162 | value: np.array(sorted_obj_arr_indexes) 163 | }) 164 | 165 | Key points: 166 | 167 | * The objects are stored in a Numpy array in FrozenDex 168 | * Each FrozenAttrIndex maps values to object array indexes 169 | * FrozenAttrIndex has two different ways to do that mapping - parallel arrays and BTree 170 | 171 | Note that there are no "set" types anywhere here - so how do set operations like intersect work? 172 | 173 | Sorted arrays are sets 174 | ====================== 175 | 176 | If you have the arrays: 177 | 178 | .. code-block:: 179 | 180 | [1, 3, 5, 7, 9] 181 | [1, 2, 3, 4, 5, 6, 7] 182 | 183 | What is their intersection? Do you need to convert them to sets to figure it out? 184 | 185 | Of course not -- sorted array intersection is easy. It can be solved by iterating over both lists, advancing 186 | the pointer of the smaller value each time, and outputting the matches. 187 | `Galloping search `_ can make this even faster. It is faster than 188 | computing the intersection of hashsets. 189 | 190 | FrozenDex uses a great package called 191 | `sortednp `_ that implements fast set operations on sorted numpy arrays. 
192 | So once we have the object indexes for each part of a query, ``sortednp.intersect`` and friends will get us the final 193 | object indexes. 194 | 195 | Sorted arrays are trees 196 | ======================= 197 | 198 | FrozenDex uses sorted arrays in another way - to store values. Bisecting an array to find a value is similar to 199 | traversing a tree. Range queries are easy on sorted value arrays as well. 200 | 201 | So, a FrozenAttrIndex has a pair of arrays, one containing values in sorted order, and the other containing 202 | the object indexes for those values. Looking up the object indexes for a value or range of values is straightforward. 203 | 204 | That's not the only way FrozenDex maps values to objects, though. Just as Dex uses different containers depending on 205 | length, so too does FrozenDex. 206 | 207 | When a value has many associated objects, storing the value repeatedly in an array is clearly inefficient. 208 | So values that have many objects are stored in a BTree lookup instead. The BTree maps values to arrays of object 209 | indexes. 210 | 211 | We can't use the BTree for everything -- if a value is associated with only a few objects, allocating a numpy array to 212 | store the object indexes would incur lots of overhead. So having both data structures is the right way to go. 213 | 214 | Integer types 215 | ============= 216 | 217 | And there's one last optimization. The indexes are stored in `uint32` arrays if there are less than a few 218 | billion objects, which is usually the case. `uint32` operations are a little faster than `uint64`, in addition to being 219 | more RAM-efficient. FrozenDex will automatically select `uint64` when there are too many objects for 32-bit addressing. 220 | 221 | Thanks to these optimizations, FrozenDex is a very efficient tool. 
222 | 223 | ----------------------- 224 | ConcurrentDex Internals 225 | ----------------------- 226 | 227 | ConcurrentDex contains: 228 | 229 | * an instance of Dex 230 | * a `readerwriterlock `_ 231 | 232 | It exposes each method of the Dex, wrapped in the appropriate lock type using `with read_lock()` or 233 | `with write_lock()`. 234 | 235 | Performance 236 | =========== 237 | 238 | Each lock operation adds about 5µs. Not huge, but it does add up when doing many operations in a row. 239 | 240 | For this reason, the ``read_lock()`` and ``write_lock()`` methods are exposed. 241 | 242 | This allows patterns like: 243 | 244 | .. code-block:: 245 | 246 | cdex = ConcurrentDex(...) 247 | with cdex.write_lock(): 248 | for item in a_million_items: 249 | cdex.box.add(item) # cdex.box is the underlying Dex. 250 | 251 | which are faster than calling ``cdex.add()`` many times. 252 | 253 | By default, ConcurrentDex favors readers, allowing multiple readers to share a lock. Writers wait for all 254 | readers to release the lock. This behavior is customizable on init via the ``priority`` kwarg. 255 | 256 | Reasons to trust it 257 | =================== 258 | 259 | Concurrency bugs are notoriously tricky to find. ConcurrentDex is unlikely to have them because: 260 | 261 | * It uses a very simple, coarse-grained concurrency that locks the whole object at once 262 | * It's built on a widely-used lock library 263 | * There are concurrent operation tests that succeed on ConcurrentDex and fail on Dex, proving the 264 | locks are working properly (see ``test/concurrent``).
265 | -------------------------------------------------------------------------------- /docs/img/ducks-main.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manimino/ducks/0217a0e9673fde155a81ac9ab23dfd3538fcd235/docs/img/ducks-main.png -------------------------------------------------------------------------------- /docs/img/perf_bench.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manimino/ducks/0217a0e9673fde155a81ac9ab23dfd3538fcd235/docs/img/perf_bench.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. ducks documentation master file, created by 2 | sphinx-quickstart on Fri Aug 5 07:18:13 2022. 3 | 4 | .. include:: ../README.rst 5 | 6 | 7 | Contents 8 | ======== 9 | 10 | .. toctree:: 11 | :maxdepth: 1 12 | 13 | Home 14 | quick_start 15 | demos 16 | how_it_works 17 | ducks 18 | 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.https://www.sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | ducks 2 | ===== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | ducks 8 | -------------------------------------------------------------------------------- /docs/quick_start.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Quick Start 3 | =========== 4 | 5 | ----------- 6 | Basic Usage 7 | ----------- 8 | 9 | The main container in ducks is called Dex. 10 | 11 | .. code-block:: 12 | 13 | from ducks import Dex 14 | 15 | # make some objects 16 | objects = [ 17 | {'x': 3, 'y': 'a'}, 18 | {'x': 6, 'y': 'b'}, 19 | {'x': 9, 'y': 'c'} 20 | ] 21 | 22 | # Create a Dex containing the objects. 23 | # Index on x and y. 24 | dex = Dex(objects, ['x', 'y']) 25 | 26 | # match objects 27 | dex[{ 28 | 'x': {'>': 5, '<': 10}, # where 5 < x < 10 29 | 'y': {'in': ['a', 'b']} # and y is 'a' or 'b' 30 | }] 31 | # result: [{'x': 6, 'y': 'b'}] 32 | 33 | This is a Dex of dicts, but the objects can be any type. 34 | 35 | Dex supports ==, !=, in, not in, <, <=, >, >=. 36 | 37 | The indexes can be dict keys, object attributes, or custom functions. 38 | 39 | Alternative forms: 40 | * ``{'a': 1}`` may be used in place of ``{'a': {'==': 1}}`` 41 | * ``{'a': [1, 2, 3]}`` may be used in place of ``{'a': {'in': [1, 2, 3]}}`` 42 | * ``eq``, ``ge``, ``gt``, and so on can be used in place of ``==``, ``>=``, ``>`` 43 | 44 | ------------------- 45 | Add, remove, update 46 | ------------------- 47 | 48 | Dex supports add, remove, and update of objects. 
49 | 50 | .. code-block:: 51 | 52 | from ducks import Dex 53 | 54 | class Thing: 55 | def __init__(self): 56 | self.x = 1 57 | self.y = 1 58 | 59 | def __repr__(self): 60 | return f"Thing(x: {self.x}, y: {self.y})" 61 | 62 | # make an empty Dex 63 | dex = Dex([], ['x', 'y']) 64 | 65 | # add an object 66 | obj = Thing() 67 | dex.add(obj) 68 | print(dex[{'x': 1}]) # find it 69 | 70 | # update it 71 | obj.x = 2 72 | dex.update(obj) 73 | print(dex[{'x': 2}]) # find updated obj 74 | 75 | # remove it 76 | dex.remove(obj) 77 | print(list(dex)) # dex now contains no objects 78 | 79 | Update notifies Dex that an object's attributes have changed, so the index can be updated accordingly. 80 | There's an example in :ref:`demos` of how to automatically update Dex when objects change. 81 | 82 | --------- 83 | FrozenDex 84 | --------- 85 | 86 | If you don't need add, remove, or update, use a FrozenDex instead. 87 | It is used just like a Dex, but it's faster and more memory-efficient. 88 | 89 | .. code-block:: 90 | 91 | from ducks import FrozenDex 92 | 93 | dex = FrozenDex([{'a': 1, 'b': 2}], ['a']) 94 | dex[{'a': 1}] # result: [{'a': 1, 'b': 2}] 95 | 96 | FrozenDex is thread-safe because it does not allow writes. 97 | 98 | ------------- 99 | ConcurrentDex 100 | ------------- 101 | 102 | For multithreaded cases where writes are needed, use ConcurrentDex. It is a thin wrapper around a Dex 103 | that uses a lock to provide thread-safety. 104 | 105 | .. code-block:: 106 | 107 | from ducks import ConcurrentDex, FAIR, READERS, WRITERS 108 | 109 | objects = [{'a': 1, 'b': 2}] 110 | dex = ConcurrentDex(objects, ['a'], priority=READERS) 111 | dex[{'a': 1}] # result: [{'a': 1, 'b': 2}] 112 | 113 | The ConcurrentDex API is the same as Dex. An optional kwarg 'priority' allows prioritization of readers, 114 | writers, or neither; the default is to prioritize reads. 
115 | 116 | ------------------- 117 | Function attributes 118 | ------------------- 119 | 120 | Ducks can also index using functions evaluated on the objects. This allows indexing of object types such as strings. 121 | 122 | Let's find strings that are palindromes of length 3: 123 | 124 | .. code-block:: 125 | 126 | from ducks import Dex 127 | strings = [ 128 | 'ooh', 'wow', 129 | 'kayak', 'bob' 130 | ] 131 | 132 | # define a function that 133 | # takes the object as input 134 | def is_palindrome(s): 135 | return s == s[::-1] 136 | 137 | # make a Dex 138 | dex = Dex(strings, [is_palindrome, len]) 139 | dex[{ 140 | is_palindrome: True, 141 | len: 3 142 | }] 143 | # result: ['wow', 'bob'] 144 | 145 | Functions are evaluated on the object when it is added to the Dex. 146 | 147 | ----------- 148 | Nested data 149 | ----------- 150 | 151 | Use functions to get values from nested data structures. 152 | 153 | .. code-block:: 154 | 155 | from ducks import Dex 156 | 157 | objs = [ 158 | {'a': {'b': [1, 2, 3]}}, 159 | {'a': {'b': [4, 5, 6]}} 160 | ] 161 | 162 | def get_nested(obj): 163 | return obj['a']['b'][0] 164 | 165 | dex = Dex(objs, [get_nested]) 166 | dex[{get_nested: 4}] 167 | # result: {'a': {'b': [4, 5, 6]}} 168 | 169 | ------------------ 170 | Missing attributes 171 | ------------------ 172 | 173 | Objects don't need to have every attribute. 174 | 175 | Indexes are sparse. Objects that are missing an attribute will not be stored 176 | under that attribute. This saves lots of memory. 177 | 178 | * To find all objects that have an attribute, match the special value ``ANY``. 179 | * To find objects missing the attribute, do ``{'!=': ANY}``. 180 | * In functions, raise ``MissingAttribute`` to tell ducks the attribute is missing. 181 | 182 | Example: 183 | 184 | .. 
code-block:: 185 | 186 | from ducks import Dex, ANY, MissingAttribute 187 | 188 | objs = [{'a': 1}, {'a': 2}, {}] 189 | 190 | def get_a(obj): 191 | try: 192 | return obj['a'] 193 | except KeyError: 194 | raise MissingAttribute # tell Dex this attribute is missing 195 | 196 | dex = Dex(objs, ['a', get_a]) 197 | 198 | print(dex[{'a': ANY}]) # [{'a': 1}, {'a': 2}] 199 | print(dex[{get_a: ANY}]) # [{'a': 1}, {'a': 2}] 200 | print(dex[{'a': {'!=': ANY}}]) # [{}] 201 | 202 | Note that ``None`` is treated as a normal attribute value and is stored. 203 | 204 | -------- 205 | Pickling 206 | -------- 207 | 208 | Dex, ConcurrentDex, and FrozenDex can be pickled using the special functions ``save`` and ``load``. 209 | 210 | .. code-block:: 211 | 212 | from ducks import Dex, save, load 213 | dex = Dex([1.2, 1.8, 2.7], [round]) 214 | save(dex, 'numbers.dex') 215 | loaded_dex = load('numbers.dex') 216 | loaded_dex[{round: 2}] 217 | # result: 1.8 218 | 219 | Objects inside the dex will be saved along with it. 220 | 221 | ---------- 222 | Class APIs 223 | ---------- 224 | 225 | There are three container classes: 226 | 227 | * **Dex**: Can add, remove, and update objects after creation. 228 | `[API] `_ 229 | * **ConcurrentDex**: Same as Dex, but thread-safe. 230 | `[API] `_ 231 | * **FrozenDex**: Cannot be changed after creation, it's read-only. But it's super fast. 
232 | `[API] `_ 233 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | furo==2022.12.7 2 | sphinx==5.3.0 3 | sphinx-rtd-theme==1.3.0 4 | -------------------------------------------------------------------------------- /ducks/__init__.py: -------------------------------------------------------------------------------- 1 | from ducks.concurrent.main import ConcurrentDex # noqa: F401 2 | from ducks.concurrent.main import FAIR # noqa: F401 3 | from ducks.concurrent.main import READERS # noqa: F401 4 | from ducks.concurrent.main import WRITERS # noqa: F401 5 | from ducks.constants import ANY # noqa: F401 6 | from ducks.exceptions import MissingAttribute # noqa: F401 7 | from ducks.frozen.main import FrozenDex # noqa: F401 8 | from ducks.mutable.main import Dex # noqa: F401 9 | from ducks.pickling import load # noqa: F401 10 | from ducks.pickling import save # noqa: F401 11 | -------------------------------------------------------------------------------- /ducks/btree.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | from typing import Dict 3 | from typing import List 4 | from typing import Tuple 5 | 6 | from BTrees.OOBTree import OOBTree 7 | 8 | 9 | class BTree: 10 | """ 11 | Wraps an OOBTree instance. Tweaks it a bit: 12 | - BTrees len() does a full tree traversal, which is very slow. So we maintain a count instead. 13 | - BTrees stores None values as if they were just really really small. So "x < 1" will find the Nones. 14 | Here instead we disallow None entirely, make it throw TypeError. 15 | - Provide a nice interface for using >, >=, <, <= to get value ranges. 16 | """ 17 | 18 | def __init__(self, d: Dict[Any, Any] = None): 19 | if d: 20 | if None in d: 21 | raise TypeError( 22 | "None is not allowed in BTree because it breaks comparisons." 
23 | ) 24 | self.tree = OOBTree(d) 25 | self.length = len(d) 26 | else: 27 | self.tree = OOBTree() 28 | self.length = 0 29 | 30 | def get_range_expr(self, expr: Dict[str, Any]) -> List: 31 | """Get values matching a range expression like {'>': 3, '<=': 5}""" 32 | min_key, max_key, include_min, include_max = range_expr_to_args(expr) 33 | return self.get_range(min_key, max_key, include_min, include_max) 34 | 35 | def get_range( 36 | self, 37 | min_key=None, 38 | max_key=None, 39 | include_min: bool = True, 40 | include_max: bool = True, 41 | ) -> List: 42 | """ 43 | Get values in the range of [min_key, max_key]. include_min and include_max 44 | determine whether values for the start and end keys will be included. 45 | 46 | Examples: 47 | Get all values: None, None, True, True 48 | Get 1 < key < 10: 1, 10, False, False 49 | Get key >= 3: 3, None, True, True 50 | """ 51 | if len(self) == 0: 52 | return [] 53 | excludemin = not include_min 54 | excludemax = not include_max 55 | return self.tree.values( 56 | min_key, max_key, excludemin=excludemin, excludemax=excludemax 57 | ) 58 | 59 | def get(self, key, default=None): 60 | return self.tree.get(key, default) 61 | 62 | def keys(self): 63 | return self.tree.keys() 64 | 65 | def values(self): 66 | return self.tree.values() 67 | 68 | def items(self): 69 | return self.tree.items() 70 | 71 | def __len__(self): 72 | return self.length 73 | 74 | def __setitem__(self, key, value): 75 | if key is None: 76 | raise TypeError( 77 | "None is not allowed in BTree because it breaks comparisons." 78 | ) 79 | if len(self) == 0: 80 | # OOBTree oddity: it allows a non-comparable object on the first insert, but 81 | # if it gets one, all future inserts will fail. 82 | # So let's raise a TypeError if the very first insert is a non-comparable type. 
83 | key > key 84 | if key not in self.tree: 85 | self.length += 1 86 | self.tree[key] = value 87 | 88 | def __getitem__(self, key): 89 | return self.tree[key] 90 | 91 | def __delitem__(self, key): 92 | self.length -= 1 93 | del self.tree[key] 94 | 95 | def __contains__(self, item): 96 | return item in self.tree 97 | 98 | 99 | def range_expr_to_args(expr: Dict[str, Any]) -> Tuple[Any, Any, bool, bool]: 100 | """ 101 | Turn a range expr into (min_key, max_key, include_min, include_max), which are easier to use with BTrees. 102 | e.g., translates {'<': 3} into get_values(3, None, True, False). 103 | Will ignore keys in expr other than '<', '<=', '>', '>='. 104 | """ 105 | min_key = None 106 | max_key = None 107 | include_min = True 108 | include_max = True 109 | if ">" in expr: 110 | min_key = expr[">"] 111 | include_min = False 112 | if ">=" in expr: 113 | min_key = expr[">="] 114 | include_min = True 115 | if "<" in expr: 116 | max_key = expr["<"] 117 | include_max = False 118 | if "<=" in expr: 119 | max_key = expr["<="] 120 | include_max = True 121 | return min_key, max_key, include_min, include_max 122 | -------------------------------------------------------------------------------- /ducks/concurrent/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manimino/ducks/0217a0e9673fde155a81ac9ab23dfd3538fcd235/ducks/concurrent/__init__.py -------------------------------------------------------------------------------- /ducks/concurrent/main.py: -------------------------------------------------------------------------------- 1 | import pickle # nosec 2 | from contextlib import contextmanager 3 | from typing import Any 4 | from typing import Callable 5 | from typing import Dict 6 | from typing import Iterable 7 | from typing import Iterator 8 | from typing import List 9 | from typing import Optional 10 | from typing import Union 11 | 12 | from ducks.mutable.main import Dex 13 | from 
readerwriterlock.rwlock import RWLockFair 14 | from readerwriterlock.rwlock import RWLockRead 15 | from readerwriterlock.rwlock import RWLockWrite 16 | 17 | 18 | """Lock priority options""" 19 | READERS = "readers" 20 | WRITERS = "writers" 21 | FAIR = "fair" 22 | 23 | 24 | class ConcurrentDex: 25 | def __init__( 26 | self, 27 | objs: Optional[Iterable[Any]] = None, 28 | on: Iterable[Union[str, Callable]] = None, 29 | priority: str = READERS, 30 | ): 31 | """Contains a Dex instance and a readerwriterlock. Wraps each Dex method in a read or write lock. 32 | 33 | Args: 34 | objs: see Dex API 35 | on: see Dex API 36 | priority: 'readers', 'writers', or 'fair'. Default 'readers'. Change this according to your usage pattern. 37 | """ 38 | self.priority = priority 39 | self.box = Dex(objs, on) 40 | if priority == READERS: 41 | self.lock = RWLockRead() 42 | elif priority == WRITERS: 43 | self.lock = RWLockWrite() 44 | elif priority == FAIR: 45 | self.lock = RWLockFair() 46 | else: 47 | raise ValueError(f"priority must be {READERS}, {WRITERS}, or {FAIR}.") 48 | self._indexes = self.box._indexes # only used during testing 49 | 50 | @contextmanager 51 | def read_lock(self): 52 | """Lock the ConcurrentDex for reading.""" 53 | with self.lock.gen_rlock(): 54 | yield 55 | 56 | @contextmanager 57 | def write_lock(self): 58 | """Lock the ConcurrentDex for writing. 59 | 60 | When doing many write operations at once, it is more efficient to do:: 61 | with cfb.write_lock(): 62 | for item in items: 63 | cfb.box.add(item) # calls add() on the underlying Dex. 64 | 65 | This performs locking only once, versus calling cfb.add() which locks for each item. 66 | The same pattern works for update() and remove().
67 | """ 68 | with self.lock.gen_wlock(): 69 | yield 70 | 71 | def get_values(self, attr: Union[str, Callable]): 72 | """Get a read lock and perform Dex get_values().""" 73 | with self.read_lock(): 74 | return self.box.get_values(attr) 75 | 76 | def remove(self, obj: Any): 77 | """Get a write lock and perform Dex.remove().""" 78 | with self.write_lock(): 79 | self.box.remove(obj) 80 | 81 | def add(self, obj: Any): 82 | """Get a write lock and perform Dex.add().""" 83 | with self.write_lock(): 84 | self.box.add(obj) 85 | 86 | def update(self, obj: Any): 87 | """Get a write lock and perform Dex.update().""" 88 | with self.write_lock(): 89 | self.box.update(obj) 90 | 91 | def __len__(self) -> int: 92 | """Get a read lock and get length of Dex.""" 93 | with self.read_lock(): 94 | return len(self.box) 95 | 96 | def __contains__(self, obj: Any) -> bool: 97 | """Get a read lock and check if the item is in the Dex.""" 98 | with self.read_lock(): 99 | return obj in self.box 100 | 101 | def __iter__(self) -> Iterator: 102 | """Get a read lock, make a list of the objects in the Dex, and return an iter to the list.""" 103 | with self.read_lock(): 104 | return iter(list(self.box)) 105 | 106 | def __getitem__(self, query: Dict) -> List[Any]: 107 | """Get a read lock and perform Dex __getitem__.""" 108 | with self.read_lock(): 109 | return self.box[query] 110 | 111 | 112 | def save(c_box: ConcurrentDex, filepath: str): 113 | """Saves a ConcurrentDex to a pickle file.""" 114 | saved = { 115 | "objs": list(c_box.box.obj_map.values()), 116 | "on": list(c_box.box._indexes.keys()), 117 | "priority": c_box.priority, 118 | } 119 | with open(filepath, "wb") as fh: 120 | pickle.dump(saved, fh) 121 | 122 | 123 | def load(saved: Dict) -> ConcurrentDex: 124 | """Creates a ConcurrentDex from the pickle file contents.""" 125 | return ConcurrentDex(saved["objs"], saved["on"], saved["priority"]) 126 | -------------------------------------------------------------------------------- 
/ducks/constants.py: -------------------------------------------------------------------------------- 1 | SIZE_THRESH = 100 2 | 3 | ARR_TYPE = "q" # python array type meaning "int64": https://docs.python.org/3/library/array.html 4 | SET_SIZE_MIN = 10 5 | ARRAY_SIZE_MAX = 20 6 | 7 | 8 | class MatchAnything(set): 9 | pass 10 | 11 | 12 | """ 13 | ANY allows lookups like find({'attr': ANY}), which gives all objects that have an 'attr' attribute. 14 | 15 | Why is this a set()? 16 | We need a value that we can do "is" comparisons on, that will only be True 17 | when it's literally this object. set() is a simple object that satisfies this property. 18 | "ANY is ANY" evaluates to True, but "set() is ANY" evaluates to False. 19 | """ 20 | ANY = MatchAnything() 21 | 22 | VALID_OPERATORS = [ 23 | "==", 24 | "eq", 25 | "!=", 26 | "ne", 27 | "in", 28 | "not in", 29 | "<", 30 | "lt", 31 | "<=", 32 | "lte", 33 | "le", 34 | ">", 35 | "gt", 36 | ">=", 37 | "gte", 38 | "ge", 39 | "is", 40 | "is not", 41 | ] 42 | 43 | OPERATOR_MAP = { 44 | "eq": "==", 45 | "lt": "<", 46 | "le": "<=", # Python style <= 47 | "lte": "<=", # ElasticSearch style <= 48 | "gt": ">", 49 | "ge": ">=", # Python style >= 50 | "gte": ">=", # ElasticSearch style >= 51 | } 52 | 53 | EXCLUDE_OPERATORS = {"not in": "in", "!=": "=="} 54 | -------------------------------------------------------------------------------- /ducks/exceptions.py: -------------------------------------------------------------------------------- 1 | class FrozenError(Exception): 2 | """Raised when attempting to modify a FrozenDex""" 3 | 4 | 5 | class AttributeNotFoundError(Exception): 6 | """Raised when querying an attribute we don't have""" 7 | 8 | 9 | class MissingAttribute(Exception): 10 | """Raise this in your attribute functions to denote that the object is missing this attribute. Finds that 11 | match the attribute will never return this object. 
"""
Performs object lookup for a single attribute in a FrozenDex.
"""
from bisect import bisect_left
from bisect import bisect_right
from typing import Callable
from typing import Set
from typing import Union

import numpy as np
from ducks.btree import BTree
from ducks.constants import ANY
from ducks.constants import SIZE_THRESH
from ducks.frozen.init_helpers import get_vals
from ducks.frozen.init_helpers import run_length_encode
from ducks.utils import make_empty_array


class FrozenAttrIndex:
    """
    Stores data and handles requests that are relevant to a single attribute of a FrozenDex.

    There are three places where object indexes are stored.
    - none_ids stores all indexes for with the attribute value None
    - val_to_obj_ids stores object ids for attribute values that have many objects
    - val_arr + obj_id_arr store all the rest.
    """

    def __init__(self, attr: Union[str, Callable], objs: np.ndarray, dtype: str):
        """Build the index for ``attr`` over the object array.

        Args:
            attr: Attribute name or callable evaluated on each object.
            objs: Array of the objects, in their final (frozen) order.
            dtype: Numpy integer dtype used for object-index arrays.

        Raises:
            TypeError: if the attribute values are not mutually sortable.
        """
        # sort the objects by attribute value, using their hashes and handling collisions
        self.dtype = dtype
        self.attr = attr

        # Nones get stored in their own special spot so they don't break sortability.
        # A little convent for the Nones.
        self.none_ids = make_empty_array(self.dtype)

        # We will pull repeated attributes out into a BTree and pre-sort their indexes.
        # Saves memory, and makes object lookups *way* faster.
        self.val_to_obj_ids = BTree()

        # np.arange already fills 0..n-1; the original explicit fill loop was redundant.
        obj_id_arr = np.arange(len(objs), dtype=self.dtype)
        obj_id_arr, val_arr = get_vals(objs, obj_id_arr, self.attr)

        # extract Nones. These will make the array unsortable if left in.
        none_idx = np.array(
            [i for i in range(len(val_arr)) if val_arr[i] is None], dtype=self.dtype
        )
        if len(none_idx):
            none_flag = np.zeros_like(val_arr, dtype="bool")
            none_flag[none_idx] = True
            self.none_ids = np.sort(obj_id_arr[none_flag])
            obj_id_arr = obj_id_arr[~none_flag]
            val_arr = val_arr[~none_flag]

        # Attempt to sort the values.
        sort_order = np.argsort(val_arr)  # Throws TypeError if unsortable.
        val_arr = val_arr[sort_order]
        obj_id_arr = obj_id_arr[sort_order]

        # Values shared by many objects get a pre-sorted BTree entry;
        # the rest stay in the parallel arrays.
        val_starts, val_run_lengths, unique_vals = run_length_encode(val_arr)
        unused = np.ones_like(obj_id_arr, dtype="bool")
        for i, val in enumerate(unique_vals):
            if val_run_lengths[i] > SIZE_THRESH:
                # extract these
                start = val_starts[i]
                end = start + val_run_lengths[i]
                unused[start:end] = False
                self.val_to_obj_ids[val] = np.sort(obj_id_arr[start:end])
        self.val_arr = val_arr[unused]
        self.obj_id_arr = obj_id_arr[unused]

    def get(self, val) -> np.ndarray:
        """Get indexes of objects whose attribute is val."""
        if val is ANY:
            return self.get_all()
        if val is None:
            return self.none_ids
        if val in self.val_to_obj_ids:
            return self.val_to_obj_ids[val]
        # find by bisection
        left = bisect_left(self.val_arr, val)
        if left == len(self.val_arr) or self.val_arr[left] != val:
            return make_empty_array(self.dtype)
        right = bisect_right(self.val_arr, val)
        return np.sort(self.obj_id_arr[left:right])

    def get_all(self) -> np.ndarray:
        """Get indexes of every object with this attribute. Used when matching ANY."""
        arrs = [self.obj_id_arr]
        for v in self.val_to_obj_ids.values():
            arrs.append(v)
        arrs.append(self.none_ids)
        return np.sort(np.concatenate(arrs))

    def get_values(self) -> Set:
        """Get each value we have objects for."""
        vals = set(self.val_to_obj_ids.keys())
        vals = vals.union(self.val_arr)
        if len(self.none_ids):
            vals.add(None)
        return vals

    def _get_val_arr_matches(self, lo, hi, include_lo=False, include_hi=False):
        """Get the matches for this range query from the parallel arrays"""
        if len(self.val_arr) == 0:
            return make_empty_array(self.dtype)

        if lo is None:
            left = 0
            lo = self.val_arr[0]
            include_lo = True
        else:
            left = bisect_left(self.val_arr, lo)

        if hi is None:
            right = len(self.val_arr)
            hi = self.val_arr[right - 1]
            include_hi = True
        else:
            right = bisect_right(self.val_arr, hi)

        # move left pointer up to fit > constraint
        if not include_lo:
            while left < len(self.val_arr) and self.val_arr[left] == lo:
                left += 1
            if left == len(self.val_arr):
                return make_empty_array(self.dtype)

        # move right pointer down to fit < constraint
        if not include_hi:
            while right > left and self.val_arr[right - 1] == hi:
                right -= 1

        small_matches = self.obj_id_arr[left:right]
        return small_matches

    def get_ids_by_range(
        self, lo, hi, include_lo=False, include_hi=False
    ) -> np.ndarray:
        """Get the object IDs associated with this value range as a sorted array."""
        if len(self) == 0:
            return make_empty_array(self.dtype)

        # Get matches from the val_to_obj_ids BTree
        big_matches_list = list(
            self.val_to_obj_ids.get_range(lo, hi, include_lo, include_hi)
        )

        # Get matches from the parallel arrays
        small_matches = self._get_val_arr_matches(lo, hi, include_lo, include_hi)

        # do return
        if len(big_matches_list) == 1 and len(small_matches) == 0:
            # each big_matches is stored pre-sorted, no need to sort
            return big_matches_list[0]

        # concat all arrays and sort
        matches = np.sort(np.concatenate([small_matches] + big_matches_list))
        return matches

    def __len__(self):
        # Total object IDs stored across all three storage locations.
        return len(self.val_arr) + len(self.val_to_obj_ids) + len(self.none_ids)
from typing import Callable
from typing import Union

import numpy as np
from ducks.utils import get_attribute
from ducks.utils import make_empty_array


def get_vals(objs: np.ndarray, obj_id_arr: np.ndarray, attr: Union[Callable, str]):
    """Gets vals by attribute. Returned arrays will be shorter than input if objects are missing attributes."""
    n = len(objs)
    vals = np.empty(n, dtype="O")
    found = np.empty(n, dtype=bool)
    for i, obj in enumerate(objs):
        vals[i], found[i] = get_attribute(obj, attr)
    # Keep only the entries where the attribute was actually present.
    return obj_id_arr[found], vals[found]


def run_length_encode(arr: np.ndarray):
    """
    Find counts of each element in the arr (sorted) via run-length encoding.

    Takes 10ms for 1M objs.
    """
    if len(arr) == 0:
        return (
            make_empty_array("int64"),
            make_empty_array("int64"),
            make_empty_array("int64"),
        )
    # True wherever a new run of equal values begins (comparing neighbors).
    is_boundary = arr[1:] != arr[:-1]
    run_ends = np.append(np.where(is_boundary), len(arr) - 1)
    run_counts = np.diff(np.append(-1, run_ends))
    run_starts = np.cumsum(np.append(0, run_counts))[:-1]
    return run_starts, run_counts, arr[run_ends]
import pickle  # nosec
from bisect import bisect_left
from typing import Any
from typing import Callable
from typing import Dict
from typing import Iterable
from typing import Optional
from typing import Set
from typing import Union

import numpy as np
import sortednp as snp
from ducks.btree import range_expr_to_args
from ducks.frozen.frozen_attr import FrozenAttrIndex
from ducks.frozen.utils import snp_difference
from ducks.utils import make_empty_array
from ducks.utils import split_query
from ducks.utils import standardize_expr
from ducks.utils import validate_and_standardize_operators
from ducks.utils import validate_query


class FrozenDex:
    def __init__(self, objs: Iterable[Any], on: Iterable[Union[str, Callable]]):
        """Create a FrozenDex containing the ``objs``, queryable by the ``on`` attributes.

        Args:
            objs: The objects that FrozenDex will contain.
                Must support ``len()`` (a list, tuple, or array — not a generator).

            on: The attributes that will be used for finding objects.
                Must contain at least one.

        It's OK if the objects in ``objs`` are missing some or all of the attributes in ``on``.

        For the objects that do contain the attributes on ``on``, those attribute values must be hashable and sortable.
        Most Python objects are hashable. Implement the function ``__lt__(self, other)`` to make a class sortable.
        An attribute value of ``None`` is acceptable as well, even though None is not sortable.
        """
        if not on:
            raise ValueError("Need at least one attribute.")
        if isinstance(on, str):
            on = [on]

        self.obj_arr = np.empty(len(objs), dtype="O")
        # uint32 indexes are enough unless there are 2**32 or more objects.
        self.dtype = "uint32" if len(objs) < 2**32 else "uint64"
        for i, obj in enumerate(objs):
            self.obj_arr[i] = obj

        self._indexes = {}
        for attr in on:
            self._indexes[attr] = FrozenAttrIndex(attr, self.obj_arr, self.dtype)

        # only used during contains() checks
        self.sorted_obj_ids = np.sort([id(obj) for obj in self.obj_arr])

    def _find(  # noqa: C901
        self,
        match: Optional[Dict[Union[str, Callable], Any]] = None,
        exclude: Optional[Dict[Union[str, Callable], Any]] = None,
    ) -> np.ndarray:
        """Find objects in the FrozenDex that satisfy the match and exclude constraints.

        Args:
            match: Dict of ``{attribute: expression}`` defining the subset of objects that match.
                If ``None``, all objects will match.

                Each attribute is a string or Callable. Must be one of the attributes specified in the constructor.

                The expression can be any of the following:
                 - A dict of ``{operator: value}``, such as ``{'==': 1}`` ``{'>': 5}``, or ``{'in': [1, 2, 3]}``.
                 - A single value, which is a shorthand for `{'==': value}`.
                 - A list of values, which is a shorthand for ``{'in': [list_of_values]}``.
                 - ``ducks.ANY``, which matches all objects having the attribute.

                Valid operators are '==' 'in', '<', '<=', '>', '>='.
                The aliases 'eq' 'lt', 'le', 'lte', 'gt', 'ge', and 'gte' work too.
                To match a None value, use ``{'==': None}``. There is no separate operator for None values.

            exclude: Dict of ``{attribute: expression}`` defining the subset of objects that do not match.
                If ``None``, no objects will be excluded.

                Each attribute is a string or Callable. Must be one of the attributes specified in the constructor.
                Valid expressions are the same as in ``match``.

        Returns:
            Numpy array of objects matching the constraints. Array will be in the same order as the original objects.
        """
        # validate input and convert expressions to dict
        validate_query(self._indexes, match, exclude)
        for arg in [match, exclude]:
            if arg:
                for key in arg:
                    arg[key] = standardize_expr(arg[key])

        # perform 'match' query
        if match:
            hit_arrays = []
            for attr, expr in match.items():
                hit_array = self._match_attr_expr(attr, expr)
                if len(hit_array) == 0:
                    # this attr had no matches, therefore the intersection will be empty. We can stop here.
                    return make_empty_array("O")
                hit_arrays.append(hit_array)

            # intersect all the hit_arrays, starting with the smallest
            for i, hit_array in enumerate(sorted(hit_arrays, key=len)):
                if i == 0:
                    hits = hit_array
                else:
                    hits = snp.intersect(hits, hit_array)
        else:
            hits = np.arange(len(self.obj_arr), dtype=self.dtype)

        # perform 'exclude' query
        if exclude:
            exc_arrays = []
            for attr, expr in exclude.items():
                exc_arrays.append(self._match_attr_expr(attr, expr))

            # subtract each of the exc_arrays, starting with the largest
            for exc_array in sorted(exc_arrays, key=len, reverse=True):
                hits = snp_difference(hits, exc_array)
                if len(hits) == 0:
                    break

        return self.obj_arr[hits]

    def _match_attr_expr(self, attr: Union[str, Callable], expr: dict) -> np.ndarray:
        """Look at an attr, handle its expr appropriately"""
        validate_and_standardize_operators(expr)
        matches = None
        # handle 'in' and '=='
        eq_expr = {op: val for op, val in expr.items() if op in ["==", "in"]}
        for op, val in eq_expr.items():
            if op == "==":
                op_matches = self._indexes[attr].get(val)
            elif op == "in":
                op_matches = self._match_any_value_in(attr, expr["in"])
            matches = (
                op_matches if matches is None else snp.intersect(op_matches, matches)
            )

        # handle range query
        range_expr = {
            op: val for op, val in expr.items() if op in ["<", ">", "<=", ">="]
        }
        if range_expr:
            min_key, max_key, include_min, include_max = range_expr_to_args(range_expr)
            range_matches = self._indexes[attr].get_ids_by_range(
                min_key, max_key, include_min, include_max
            )
            matches = (
                range_matches
                if matches is None
                else snp.intersect(range_matches, matches)
            )
        return matches

    def get_values(self, attr: Union[str, Callable]) -> Set:
        """Get the set of unique values we have for the given attribute.

        Args:
            attr: The attribute to get values for.

        Returns:
            Set of all unique values for this attribute.
        """
        return self._indexes[attr].get_values()

    def _match_any_value_in(
        self, attr: Union[str, Callable], values: Iterable[Any]
    ) -> np.ndarray:
        """Get the union of object ID matches for the values."""
        matches = [self._indexes[attr].get(v) for v in values]
        if matches:
            return np.sort(np.concatenate(matches))
        else:
            return make_empty_array(self.dtype)

    def __contains__(self, obj):
        """True if ``obj`` (by identity) is one of the stored objects."""
        obj_id = id(obj)
        idx = bisect_left(self.sorted_obj_ids, obj_id)
        # bisect_left never returns a negative index, so only the upper bound
        # and equality need checking.
        if idx >= len(self.sorted_obj_ids) or self.sorted_obj_ids[idx] != obj_id:
            return False
        return True

    def __iter__(self):
        return iter(self.obj_arr)

    def __len__(self):
        return len(self.obj_arr)

    def __getitem__(self, query: Dict) -> np.ndarray:
        """Find objects in the FrozenDex that satisfy the constraints.

        Args:
            query: Dict of ``{attribute: expression}`` defining the subset of objects that match.
                If ``{}``, all objects will match.

                Each attribute is a string or Callable. Must be one of the attributes specified in the constructor.

                The expression can be any of the following:
                 - A dict of ``{operator: value}``, such as ``{'==': 1}`` ``{'>': 5}``, or ``{'in': [1, 2, 3]}``.
                 - A single value, which is a shorthand for `{'==': value}`.
                 - A list of values, which is a shorthand for ``{'in': [list_of_values]}``.

                The expression ``{'==': ducks.ANY}`` will match all objects having the attribute.
                The expression ``{'!=': ducks.ANY}`` will match all objects without the attribute.

                Valid operators are '==', '!=', 'in', 'not in', '<', '<=', '>', '>='.
                The aliases 'eq', 'ne', 'lt', 'le', 'lte', 'gt', 'ge', and 'gte' work too.
                To match a None value, use ``{'==': None}``. There is no separate operator for None values.

        Returns:
            Numpy array of objects matching the constraints. Array will be in the same order as the original objects.
        """
        if not isinstance(query, dict):
            raise TypeError(f"Got {type(query)}; expected a dict.")
        std_query = dict()
        for attr, expr in query.items():
            std_query[attr] = standardize_expr(expr)
        match_query, exclude_query = split_query(std_query)
        return self._find(match_query, exclude_query)


def save(box: FrozenDex, filepath: str):
    """Saves this object to a pickle file."""
    with open(filepath, "wb") as fh:
        pickle.dump(box, fh)


def load(box: FrozenDex):
    """Restores a just-unpickled FrozenDex in place by rebuilding its object-ID array."""
    # If this was created by one Python process and loaded by another, the object IDs will no longer
    # correspond to the objects. Re-create the object ID array with the correct IDs.
    box.sorted_obj_ids = np.sort([id(obj) for obj in box.obj_arr])


# --------------------------- ducks/frozen/utils.py ---------------------------
def snp_difference(left: np.ndarray, right: np.ndarray):
    """Return the elements of sorted ``left`` that do not appear in sorted ``right``."""
    # difference = left - indexes_in_intersection(left, right)
    _, indexes = snp.intersect(left, right, indices=True)
    indexes_to_discard = indexes[0]
    keep_these = np.ones_like(left, dtype=bool)
    keep_these[indexes_to_discard] = False
    return left[keep_these]
import pickle  # nosec
from operator import itemgetter
from typing import Any
from typing import Callable
from typing import Dict
from typing import Iterable
from typing import List
from typing import Optional
from typing import Set
from typing import Union

from cykhash import Int64Set
from ducks.mutable.mutable_attr import MutableAttrIndex
from ducks.utils import cyk_intersect
from ducks.utils import cyk_union
from ducks.utils import split_query
from ducks.utils import standardize_expr
from ducks.utils import validate_query


class Dex:
    def __init__(
        self,
        objs: Optional[Iterable[Any]] = None,
        on: Optional[Iterable[Union[str, Callable]]] = None,
    ):
        """
        Create a Dex containing the ``objs``, queryable by the ``on`` attributes.

        Args:
            objs: The objects that Dex will contain initially. Optional.

            on: The attributes that will be used for finding objects.
                Must contain at least one.

        It's OK if the objects in ``objs`` are missing some or all of the attributes in ``on``.

        For the objects that do contain the attributes in ``on``, those attribute values must be hashable and sortable.
        Most Python objects are hashable. Implement the function ``__lt__(self, other)`` to make a class sortable.
        An attribute value of ``None`` is acceptable as well, even though None is not sortable.
        """
        if not on:
            raise ValueError("Need at least one attribute.")
        if isinstance(on, str):
            on = [on]

        if objs:
            self.obj_map = {id(obj): obj for obj in objs}
        else:
            self.obj_map = dict()

        # Build an index for each attribute.
        # Index from obj_map.values(), not objs: a generator passed as objs is
        # exhausted by the dict comprehension above, and re-iterating it here
        # would silently build empty indexes. This also avoids double-counting
        # duplicate objects, since obj_map dedupes by id().
        self._indexes = {}
        for attr in on:
            self._indexes[attr] = MutableAttrIndex(attr, self.obj_map.values())

    def _find(
        self,
        match: Dict[Union[str, Callable], Dict[str, Any]],
        exclude: Dict[Union[str, Callable], Dict[str, Any]],
    ) -> List:
        """Find objects in the Dex that satisfy the match and exclude constraints.

        Args:
            match: Dict of ``{attribute: expression}`` defining the subset of objects that match.
                If ``None``, all objects will match.

                Each attribute is a string or Callable. Must be one of the attributes specified in the constructor.

                The expression can be any of the following:
                 - A dict of ``{operator: value}``, such as ``{'==': 1}`` ``{'>': 5}``, or ``{'in': [1, 2, 3]}``.
                 - A single value, which is a shorthand for `{'==': value}`.
                 - A list of values, which is a shorthand for ``{'in': [list_of_values]}``.

                The special value ``ducks.ANY`` will match all objects having the attribute.

                Valid operators are '==' 'in', '<', '<=', '>', '>='.
                The aliases 'eq' 'lt', 'le', 'lte', 'gt', 'ge', and 'gte' work too.
                To match a None value, use ``{'==': None}``. There is no separate operator for None values.

            exclude: Dict of ``{attribute: expression}`` defining the subset of objects that do not match.
                If ``None``, no objects will be excluded.

                Each attribute is a string or Callable. Must be one of the attributes specified in the constructor.
                Valid expressions are the same as in ``match``.

        Returns:
            List of objects matching the constraints. List will be unordered.
        """
        # validate input and convert expressions to dict
        validate_query(self._indexes, match, exclude)
        obj_ids = self._find_ids(match, exclude)
        return self._obj_ids_to_objs(obj_ids)

    def add(self, obj: Any):
        """Add the object, evaluating any attributes and storing the results.
        If the object is already present, it will not be updated."""
        ptr = id(obj)
        if ptr in self.obj_map:
            return
        self.obj_map[ptr] = obj
        for attr in self._indexes:
            self._indexes[attr].add(ptr, obj)

    def remove(self, obj: Any):
        """Remove the object. Raises KeyError if not present."""
        ptr = id(obj)
        if ptr not in self.obj_map:
            raise KeyError

        for attr in self._indexes:
            self._indexes[attr].remove(ptr, obj)
        del self.obj_map[ptr]

    def update(self, obj: Any):
        """Remove and re-add the object, updating all stored attributes. Raises KeyError if object not present."""
        self.remove(obj)
        self.add(obj)

    def get_values(self, attr: Union[str, Callable]) -> Set:
        """Get the unique values we have for the given attribute.

        Args:
            attr: The attribute to get values for.

        Returns:
            Set of all unique values for this attribute.
        """
        return self._indexes[attr].get_values()

    def _find_ids(
        self,
        match: Optional[Dict[Union[str, Callable], Dict]] = None,
        exclude: Optional[Dict[Union[str, Callable], Dict]] = None,
    ) -> Int64Set:
        """Perform lookup based on given constraints. Return a set of object IDs."""
        # perform 'match' query
        if match:
            # find intersection of each attr
            hit_sets = []
            for attr, expr in match.items():
                hit_set = self._match_attr_expr(attr, expr)
                if len(hit_set) == 0:
                    # this attr had no matches, therefore the intersection will be empty. We can stop here.
                    return Int64Set()
                hit_sets.append(hit_set)

            for i, hit_set in enumerate(sorted(hit_sets, key=len)):
                # intersect this attr's hits with our hits so far
                if i == 0:
                    hits = hit_set
                else:
                    hits = cyk_intersect(hits, hit_set)
        else:
            # 'match' is unspecified, so match all objects
            hits = Int64Set(self.obj_map.keys())

        # perform 'exclude' query
        if exclude:
            exc_sets = []
            for attr, expr in exclude.items():
                exc_sets.append(self._match_attr_expr(attr, expr))

            for exc_set in sorted(exc_sets, key=len, reverse=True):
                hits = Int64Set.difference(hits, exc_set)
                if len(hits) == 0:
                    break

        return hits

    def _match_attr_expr(
        self, attr: Union[str, Callable], expr: Dict[str, Any]
    ) -> Int64Set:
        """Look at an attr, handle its expr appropriately"""
        matches = None
        # handle 'in' and '=='
        eq_expr = {op: val for op, val in expr.items() if op in ["==", "in"]}
        for op, val in eq_expr.items():
            if op == "==":
                op_matches = self._indexes[attr].get_obj_ids(val)
            elif op == "in":
                op_matches = self._match_any_value_in(attr, expr["in"])
            matches = (
                op_matches if matches is None else cyk_intersect(op_matches, matches)
            )

        # handle range query
        range_expr = {
            op: val for op, val in expr.items() if op in ["<", ">", "<=", ">="]
        }
        if range_expr:
            range_matches = self._indexes[attr].get_ids_by_range(range_expr)
            matches = (
                range_matches
                if matches is None
                else cyk_intersect(range_matches, matches)
            )
        return matches

    def _match_any_value_in(
        self, attr: Union[str, Callable], values: Iterable[Any]
    ) -> Int64Set:
        """Handle 'in' queries. Return the union of object ID matches for the values."""
        matches = Int64Set()
        for v in values:
            v_matches = self._indexes[attr].get_obj_ids(v)
            matches = cyk_union(matches, v_matches)
        return Int64Set(matches)

    def _obj_ids_to_objs(self, obj_ids: Int64Set) -> List[Any]:
        """Look up each obj_id in self.obj_map, and return the list of objs."""
        # Using itemgetter is about 10% faster than doing a comprehension like [self.objs[ptr] for ptr in hits]
        if len(obj_ids) == 0:
            return []
        elif len(obj_ids) == 1:
            return [
                itemgetter(*obj_ids)(self.obj_map)
            ]  # itemgetter returns a single item here, not in a collection
        else:
            return list(
                itemgetter(*obj_ids)(self.obj_map)
            )  # itemgetter returns a tuple of items here, so make it a list

    def __contains__(self, obj: Any):
        return id(obj) in self.obj_map

    def __iter__(self):
        return iter(self.obj_map.values())

    def __len__(self):
        return len(self.obj_map)

    def __getitem__(self, query: Dict) -> List[Any]:
        """Find objects in the Dex that satisfy the constraints.

        Args:
            query: Dict of ``{attribute: expression}`` defining the subset of objects that match.
                If ``{}``, all objects will match.

                Each attribute is a string or Callable. Must be one of the attributes specified in the constructor.

                The expression can be any of the following:
                 - A dict of ``{operator: value}``, such as ``{'==': 1}`` ``{'>': 5}``, or ``{'in': [1, 2, 3]}``.
                 - A single value, which is a shorthand for `{'==': value}`.
                 - A list of values, which is a shorthand for ``{'in': [list_of_values]}``.

                The expression ``{'==': ducks.ANY}`` will match all objects having the attribute.
                The expression ``{'!=': ducks.ANY}`` will match all objects without the attribute.

                Valid operators are '==', '!=', 'in', 'not in', '<', '<=', '>', '>='.
                The aliases 'eq', 'ne', 'lt', 'le', 'lte', 'gt', 'ge', and 'gte' work too.
                To match a None value, use ``{'==': None}``. There is no separate operator for None values.

        Returns:
            List of objects matching the constraints. List will be unordered.
        """
        if not isinstance(query, dict):
            raise TypeError(f"Got {type(query)}; expected a dict.")
        std_query = dict()
        for attr, expr in query.items():
            std_query[attr] = standardize_expr(expr)
        match_query, exclude_query = split_query(std_query)
        return self._find(match_query, exclude_query)


def save(box: Dex, filepath: str):
    """Saves this object to a pickle file."""
    # We can't pickle this easily, because:
    # - Int64Sets cannot be pickled, so the MutableAttrIndex is hard to save.
    # - Object IDs are specific to the process that created them, so the object map will be invalid if saved.
    # Therefore, this just pickles the objects and the list of what to build indexes on.
    # The Dex container will be built anew with __init__ on load.
    # A bit slow, but it's simple, guaranteed to work, and is very robust against changes in the container code.
    saved = {"objs": list(box.obj_map.values()), "on": list(box._indexes.keys())}
    with open(filepath, "wb") as fh:
        pickle.dump(saved, fh)


def load(saved: Dict) -> Dex:
    """Creates a Dex from the pickle file."""
    return Dex(saved["objs"], saved["on"])
from array import array
from typing import Any
from typing import Callable
from typing import Dict
from typing import Hashable
from typing import Iterable
from typing import Optional
from typing import Set
from typing import Union

from cykhash import Int64Set
from ducks.btree import BTree
from ducks.constants import ANY
from ducks.constants import ARR_TYPE
from ducks.constants import ARRAY_SIZE_MAX
from ducks.constants import SET_SIZE_MIN
from ducks.utils import get_attribute


class MutableAttrIndex:
    """Stores data and handles requests that are relevant to a single attribute of a Dex."""

    def __init__(
        self,
        attr: Union[Callable, str],
        objs: Optional[Iterable[Any]] = None,
    ):
        self.attr = attr
        self.none_ids = Int64Set()  # Stores object IDs for the attribute value None
        self.tree = BTree()  # Stores object IDs for all other values
        self.n_obj_ids = 0
        if objs:
            for obj in objs:
                self.add(id(obj), obj)

    def add(self, ptr: int, obj: Any):
        """Add an object if it has this attribute."""
        val, success = get_attribute(obj, self.attr)
        if not success:
            return
        self._add_val(ptr, val)
        self.n_obj_ids += 1

    def get_obj_ids(self, val: Any) -> Int64Set:
        """Get the object IDs associated with this value as an Int64Set."""
        if val is ANY:
            return self.get_all_ids()
        if val is None:
            # NOTE: returns the internal set itself, not a copy; callers must not mutate it.
            return self.none_ids
        ids = self.tree.get(val, Int64Set())
        if type(ids) is array:
            return Int64Set(ids)
        elif type(ids) is Int64Set:
            return ids
        else:
            return Int64Set([ids])

    def remove(self, ptr: int, obj: Any):
        """Remove a single object from the index. ptr is already known to be in the Dex.
        Runs in O(1) if obj has this attr and the value of the attr hasn't changed. O(n_keys) otherwise."""
        removed = False
        val, success = get_attribute(obj, self.attr)
        if success:
            removed = self._try_remove(ptr, val)
        if not removed and ptr in self.none_ids:
            # The attribute may have been None at add() time and changed since.
            # The key scan below only covers tree keys, so check the None bucket
            # here to avoid leaving a stale ID behind.
            self.none_ids.remove(ptr)
            self.n_obj_ids -= 1
            removed = True
        if not removed:
            # do O(n) search
            for val in list(self.tree.keys()):
                removed = self._try_remove(ptr, val)
                if removed:
                    break

    def get_all_ids(self) -> Int64Set:
        """Get the ID of every object that has this attribute.
        Called when matching or excluding ``{attr: hashindex.ANY}``."""
        obj_ids = Int64Set(self.none_ids)
        for val in self.tree.values():
            self._add_val_to_set(val, obj_ids)
        return obj_ids

    def get_values(self) -> Set:
        """Get unique values we have objects for."""
        vals = set(self.tree.keys())
        if len(self.none_ids):
            vals.add(None)
        return vals

    def get_ids_by_range(self, expr: Dict[str, Any]):
        """Get object IDs based on less than / greater than some value"""
        obj_ids = Int64Set()
        vals = self.tree.get_range_expr(expr)
        for val in vals:
            self._add_val_to_set(val, obj_ids)
        return obj_ids

    def _add_val(self, ptr, val):
        # Storage adapts to cardinality: a bare int for one ID,
        # an array for a few, an Int64Set for many.
        if val is None:
            self.none_ids.add(ptr)
        elif val in self.tree:
            obj_ids = self.tree[val]
            if type(obj_ids) is Int64Set:
                self.tree[val].add(ptr)
            elif type(obj_ids) is array:
                if len(obj_ids) == ARRAY_SIZE_MAX:
                    # upgrade array -> set
                    obj_ids = Int64Set(obj_ids)
                    obj_ids.add(ptr)
                    self.tree[val] = obj_ids
                else:
                    obj_ids.append(ptr)
            else:
                # obj_ids was an int, now we have two. upgrade int -> array
                self.tree[val] = array(ARR_TYPE, [obj_ids, ptr])
        else:
            # new val, add the int
            self.tree[val] = ptr

    @staticmethod
    def _add_val_to_set(val: Any, obj_ids: Int64Set):
        """We need to do this a lot"""
        if type(val) in [array, Int64Set]:
            for v in val:
                obj_ids.add(v)
        else:
            obj_ids.add(val)

    def _try_remove(self, ptr: int, val: Hashable) -> bool:
        """Try to remove the object from self.tree[val]. Return True on success, False otherwise."""
        # handle None
        if val is None and ptr in self.none_ids:
            self.none_ids.remove(ptr)
            self.n_obj_ids -= 1
            return True

        # first, check that the ptr is in here
        if val not in self.tree:
            return False
        if type(self.tree[val]) in [array, Int64Set]:
            if ptr not in self.tree[val]:
                return False
        else:
            if self.tree[val] != ptr:
                return False

        # must be in the tree
        obj_ids = self.tree[val]
        if type(self.tree[val]) in [array, Int64Set]:
            self.tree[val].remove(ptr)
            if type(obj_ids) is array:
                if len(self.tree[val]) == 1:
                    # downgrade array -> int
                    self.tree[val] = self.tree[val][0]
            else:
                if len(self.tree[val]) < SET_SIZE_MIN:
                    # downgrade set -> array
                    self.tree[val] = array(ARR_TYPE, list(self.tree[val]))
        else:
            # downgrade int -> nothing
            del self.tree[val]
        self.n_obj_ids -= 1
        return True

    def __len__(self):
        return self.n_obj_ids
def save(box: Union[Dex, FrozenDex, ConcurrentDex], filepath: str):
    """Save a Dex, FrozenDex, or ConcurrentDex to a file.

    Args:
        box: the container to persist.
        filepath: destination path for the pickle file.

    Raises:
        TypeError: if box is not one of the three supported container types.
            (Previously this case silently wrote nothing, which made a
            misspelled call impossible to notice until load time.)
    """
    # type() rather than isinstance(): each concrete class has its own
    # serialization routine, and the three checks are mutually exclusive.
    if type(box) is Dex:
        m_save(box, filepath)
    elif type(box) is FrozenDex:
        f_save(box, filepath)
    elif type(box) is ConcurrentDex:
        c_save(box, filepath)
    else:
        raise TypeError(
            f"Cannot save object of type {type(box)}; expected Dex, FrozenDex, or ConcurrentDex."
        )


def load(filepath: str) -> Union[Dex, FrozenDex, ConcurrentDex]:
    """Load a Dex, FrozenDex, or ConcurrentDex from a pickle file.

    Warning: ``pickle.load`` can execute arbitrary code; only load trusted files.
    """
    with open(filepath, "rb") as fh:
        saved = pickle.load(fh)  # nosec
    if isinstance(saved, FrozenDex):
        f_load(saved)  # mutates saved in place
        return saved
    elif "priority" in saved:
        # saved is a dict snapshot; a 'priority' key marks a ConcurrentDex.
        return c_load(saved)
    else:
        return m_load(saved)
def get_attributes(cls) -> List[str]:
    """Helper function to grab the attributes of a class"""
    return [attr for attr in cls.__annotations__]


def split_query(query: Dict) -> Tuple[Dict, Dict]:
    """Split a query into its match terms and its exclude terms.

    Exclude operators are inverted to their positive counterparts (e.g.
    "not in" -> "in") so downstream code only sees positive operators.
    """
    match_query: Dict = {}
    exclude_query: Dict = {}
    for attr, expr in query.items():
        matches = {}
        excludes = {}
        for op, val in expr.items():
            if op in EXCLUDE_OPERATORS:
                excludes[EXCLUDE_OPERATORS[op]] = val  # invert "not in" -> "in", etc.
            else:
                matches[op] = val
        if matches:
            match_query[attr] = matches
        if excludes:
            exclude_query[attr] = excludes
    return match_query, exclude_query


def standardize_expr(expr: Any) -> Dict:
    """Turn a find() expr into a dict of {operator: value}."""
    if isinstance(expr, dict):
        return validate_and_standardize_operators(expr)
    elif isinstance(expr, list):
        # a bare list is shorthand for membership
        return {"in": expr}
    elif isinstance(expr, set) and expr is not ANY:
        raise ValueError(f"Expression {expr} is a set. Did you mean to make a dict?")
    else:
        # a bare value is shorthand for equality
        return {"==": expr}
87 | ) 88 | if "<" in std_expr and "<=" in std_expr: 89 | raise ValueError(f"Either '<' or '<=' may be used in {expr}, not both.") 90 | if ">" in std_expr and ">=" in std_expr: 91 | raise ValueError(f"Either '>' or '>=' may be used in {expr}, not both.") 92 | return std_expr 93 | 94 | 95 | def validate_query( 96 | indexes: Dict, 97 | match: Optional[Dict[Union[str, Callable], Any]] = None, 98 | exclude: Optional[Dict[Union[str, Callable], Any]] = None, 99 | ): 100 | # input validation -- check that we have an index for all desired lookups 101 | required_indexes = set() 102 | if match: 103 | required_indexes.update(match.keys()) 104 | if exclude: 105 | required_indexes.update(exclude.keys()) 106 | missing_indexes = required_indexes.difference(indexes) 107 | if missing_indexes: 108 | raise AttributeNotFoundError( 109 | f"Cannot find on: {list(missing_indexes)}. Attributes must be specified on creation." 110 | ) 111 | 112 | 113 | def make_empty_array(dtype: str): 114 | """Shorthand for making a length-0 numpy array.""" 115 | return np.empty(0, dtype=dtype) 116 | 117 | 118 | def cyk_intersect(s1: Int64Set, s2: Int64Set) -> Int64Set: 119 | """Cykhash intersections are faster on small.intersect(big); handle that appropriately. 120 | https://github.com/realead/cykhash/issues/7""" 121 | return s1.intersection(s2) if len(s1) < len(s2) else s2.intersection(s1) 122 | 123 | 124 | def cyk_union(s1: Int64Set, s2: Int64Set) -> Int64Set: 125 | """Cykhash unions are faster on big.union(small); handle that appropriately. 
class Cat:
    """A named cat at position (x, y)."""

    def __init__(self, name, x, y):
        self.name = name
        self.x = x
        self.y = y


class Mouse:
    """A named mouse at position (x, y)."""

    def __init__(self, name, x, y):
        self.name = name
        self.x = x
        self.y = y


def in_range(mouse: Mouse, cat: Cat, radius: float = 1.0):
    """True when the cat is strictly within `radius` (Euclidean) of the mouse."""
    dist = ((mouse.x - cat.x) ** 2 + (mouse.y - cat.y) ** 2) ** 0.5
    return dist < radius


def main():
    # Imported lazily so the classes and in_range() are importable on their own.
    from ducks import Dex

    mice = [
        Mouse("Mickey", 0.3, 0.5),
        Mouse("Minnie", 0.3, 0.6),
        Mouse("Hannah", 5.3, 5.5),
        Mouse("Jerry", 5.1, 1.5),
    ]
    cats = [
        Cat("Tab", 4.0, 3.6),
        Cat("Tom", 4.9, 1.1),
        Cat("Hobbes", 2.2, 2.2),
        Cat("Garfield", 3.6, 1.9),
    ]

    def grid_x(obj):
        return int(obj.x)

    def grid_y(obj):
        return int(obj.y)

    def get_type(obj):
        return type(obj).__name__

    fb = Dex(mice + cats, [grid_x, grid_y, get_type])
    for mouse in mice:
        # Only examine Cats in this mouse's grid square or the adjacent ones.
        gx, gy = grid_x(mouse), grid_y(mouse)
        nearby_cats = fb[
            {
                grid_x: [gx, gx - 1, gx + 1],
                grid_y: [gy, gy - 1, gy + 1],
                get_type: "Cat",
            }
        ]
        for cat in nearby_cats:
            if in_range(mouse, cat):
                print(f"Mouse {mouse.name} is in range of cat {cat.name}!")


if __name__ == "__main__":
    main()
4, 61 | "id": "200558db", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "fb = Dex(None, on='x')\n", 66 | "cfb_read = ConcurrentDex(None, on='x', priority=READERS)\n", 67 | "cfb_write = ConcurrentDex(None, on='x', priority=WRITERS)\n", 68 | "cfb_fair = ConcurrentDex(None, on='x', priority=FAIR)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 5, 74 | "id": "6dfedc2d", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "def adds(box):\n", 79 | " t0 = time.time()\n", 80 | " for d in data:\n", 81 | " box.add(d)\n", 82 | " return time.time() - t0" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 6, 88 | "id": "9dfcc06d", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "def finds(box):\n", 93 | " t0 = time.time()\n", 94 | " for d in data:\n", 95 | " box.find({'x' :d['x']})\n", 96 | " return time.time() - t0\n" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 7, 102 | "id": "c998ba85", 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "box_type, add, find\n", 110 | "Dex, 0.9054300785064697, 3.01261830329895\n", 111 | "readers, 3.5315771102905273, 8.01264214515686\n", 112 | "writers, 5.2841057777404785, 9.07332968711853\n", 113 | "fair, 4.2892725467681885, 8.714087963104248\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "print('box_type, add, find')\n", 119 | "for box in [fb, cfb_read, cfb_write, cfb_fair]:\n", 120 | " if type(box) is Dex:\n", 121 | " box_s = 'Dex'\n", 122 | " else:\n", 123 | " box_s = box.priority\n", 124 | " t_add = adds(box)\n", 125 | " t_read = finds(box)\n", 126 | " print(f'{box_s}, {t_add}, {t_read}')" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 8, 132 | "id": "84c5e70e", 133 | "metadata": { 134 | "scrolled": true 135 | }, 136 | "outputs": [ 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | 
"2.78 µs ± 23.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "%%timeit \n", 147 | "fb.find({'x': 1})" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 9, 153 | "id": "9b891e11", 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "name": "stdout", 158 | "output_type": "stream", 159 | "text": [ 160 | "8.09 µs ± 110 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "%%timeit \n", 166 | "cfb_read.find({'x': 1})" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 10, 172 | "id": "68be8c54", 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "name": "stdout", 177 | "output_type": "stream", 178 | "text": [ 179 | "8.84 µs ± 84.1 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n" 180 | ] 181 | } 182 | ], 183 | "source": [ 184 | "%%timeit \n", 185 | "cfb_write.find({'x': 1})" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 11, 191 | "id": "10781c93", 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "name": "stdout", 196 | "output_type": "stream", 197 | "text": [ 198 | "8.46 µs ± 86.8 ns per loop (mean ± std. dev. 
of 7 runs, 100000 loops each)\n" 199 | ] 200 | } 201 | ], 202 | "source": [ 203 | "%%timeit \n", 204 | "cfb_fair.find({'x': 1})" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "id": "3883fec9", 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [] 214 | } 215 | ], 216 | "metadata": { 217 | "kernelspec": { 218 | "display_name": "Python 3 (ipykernel)", 219 | "language": "python", 220 | "name": "python3" 221 | }, 222 | "language_info": { 223 | "codemirror_mode": { 224 | "name": "ipython", 225 | "version": 3 226 | }, 227 | "file_extension": ".py", 228 | "mimetype": "text/x-python", 229 | "name": "python", 230 | "nbconvert_exporter": "python", 231 | "pygments_lexer": "ipython3", 232 | "version": "3.9.7" 233 | } 234 | }, 235 | "nbformat": 4, 236 | "nbformat_minor": 5 237 | } 238 | -------------------------------------------------------------------------------- /examples/data/crossword_words.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manimino/ducks/0217a0e9673fde155a81ac9ab23dfd3538fcd235/examples/data/crossword_words.txt -------------------------------------------------------------------------------- /examples/img/word0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manimino/ducks/0217a0e9673fde155a81ac9ab23dfd3538fcd235/examples/img/word0.png -------------------------------------------------------------------------------- /examples/img/word1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manimino/ducks/0217a0e9673fde155a81ac9ab23dfd3538fcd235/examples/img/word1.png -------------------------------------------------------------------------------- /examples/img/word2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/manimino/ducks/0217a0e9673fde155a81ac9ab23dfd3538fcd235/examples/img/word2.png -------------------------------------------------------------------------------- /examples/img/word3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manimino/ducks/0217a0e9673fde155a81ac9ab23dfd3538fcd235/examples/img/word3.png -------------------------------------------------------------------------------- /examples/img/word4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manimino/ducks/0217a0e9673fde155a81ac9ab23dfd3538fcd235/examples/img/word4.png -------------------------------------------------------------------------------- /examples/img/word5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manimino/ducks/0217a0e9673fde155a81ac9ab23dfd3538fcd235/examples/img/word5.png -------------------------------------------------------------------------------- /examples/pandas_index.py: -------------------------------------------------------------------------------- 1 | """ 2 | Demo - Using Ducks as an indexer for Pandas 3 | 4 | Pandas allows index columns and even supports multi-column indexing. 5 | https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html 6 | However, its use is not very intuitive. If you'd rather use a Dex, here's how. 
7 | """ 8 | import random 9 | 10 | import pandas as pd 11 | from ducks import FrozenDex 12 | 13 | # make some objects 14 | objs = [ 15 | { 16 | "fruit": random.choice( 17 | ["apple", "banana", "cherry", "kiwi", "lime", "watermelon"] 18 | ), 19 | "size": i % 10, 20 | } 21 | for i in range(1000) 22 | ] 23 | 24 | # put them in a dataframe 25 | df = pd.DataFrame(objs) 26 | 27 | 28 | # make lookup functions that match attributes to dataframe rows 29 | def get_fruit(i): 30 | """Get the fruit for this position in the df""" 31 | return df.iloc[i]["fruit"] 32 | 33 | 34 | def get_size(i): 35 | return df.iloc[i]["size"] 36 | 37 | 38 | # Build index 39 | dex = FrozenDex(list(range(len(df))), [get_fruit, get_size]) 40 | 41 | 42 | # Perform index lookups 43 | rows = df.iloc[dex[{get_fruit: "apple", get_size: {">=": 8}}]] 44 | print(rows) 45 | -------------------------------------------------------------------------------- /examples/percentile.py: -------------------------------------------------------------------------------- 1 | """ 2 | Look up objects by the percentile rank of an attribute. 3 | 4 | In this example, we find requests with latency > p99 (99th percentile) 5 | and requests with median latency (50th percentile). 6 | """ 7 | import functools 8 | from bisect import bisect_left 9 | from typing import Any 10 | 11 | import numpy as np 12 | from ducks import Dex 13 | 14 | 15 | def percentile(cutoffs: np.ndarray, attr: str, obj: Any) -> int: 16 | """Compute percentile on obj[attr] according to the cutoffs.""" 17 | p = bisect_left(cutoffs, obj[attr]) 18 | # handle values that are outside the min and max of cutoffs. 19 | # can happen due to float precision errors, or when new data is added. 
20 | if p < 0: 21 | return 0 22 | if p > 99: 23 | return 99 24 | return p 25 | 26 | 27 | def main(): 28 | objs = [{"num": i, "latency": 1 + (i / 100) ** 3} for i in range(1000)] 29 | # make an array of size 100 containing the min cutoff values for each percentile 30 | latencies = np.array([obj["latency"] for obj in objs]) 31 | cutoffs = np.quantile(latencies, np.linspace(0, 1, 100)) 32 | p_latency = functools.partial(percentile, cutoffs, "latency") 33 | fb = Dex(objs, [p_latency]) 34 | print("requests with first-percentile latency:") 35 | for obj in fb[{p_latency: [0, 1]}]: 36 | print(obj) 37 | print("\nrequests with median (50th percentile) latency:") 38 | for obj in fb[{p_latency: 50}]: 39 | print(obj) 40 | print("\nrequests with 99th percentile latency:") 41 | for obj in fb[{p_latency: 99}]: 42 | print(obj) 43 | 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /examples/update.py: -------------------------------------------------------------------------------- 1 | from ducks import Dex 2 | 3 | 4 | class Changey: 5 | """A class containing a variable _n that changes. On change, it will update each Dex in its listeners.""" 6 | 7 | def __init__(self, n): 8 | self._n = n 9 | self.listeners = [] 10 | 11 | def add_listener(self, f: Dex): 12 | self.listeners.append(f) 13 | 14 | @property 15 | def n(self): 16 | return self._n 17 | 18 | @n.setter 19 | def n(self, new_n): 20 | for f in self.listeners: 21 | f.remove(self) 22 | self._n = new_n 23 | for f in self.listeners: 24 | f.add(self) 25 | 26 | 27 | def main(): 28 | objs = [Changey(1) for _ in range(10)] 29 | f = Dex(objs, ["n"]) 30 | for obj in objs: 31 | obj.add_listener(f) 32 | assert len(f[{"n": 1}]) == 10 33 | 34 | # change an object 35 | objs[0].n = 2 36 | 37 | # see that changes are propagated to Dex 38 | assert len(f[{"n": 1}]) == 9 39 | assert len(f[{"n": 2}]) == 1 40 | print("Completed. 
def activate_virtualenv_in_precommit_hooks(session: Session) -> None:
    """Activate virtualenv in hooks installed by pre-commit.

    This function patches git hooks installed by pre-commit to activate the
    session's virtual environment. This allows pre-commit to locate hooks in
    that environment when invoked from git.

    Args:
        session: The Session object.
    """
    assert session.bin is not None  # noqa: S101

    # The bindir may appear in hook scripts either Python-repr-quoted or
    # shell-quoted; strip the outermost quotes so substring checks on paths
    # inside the bindir (like /python) still match.
    quoted_forms = (repr(session.bin), shlex.quote(session.bin))
    bindirs = [form[1:-1] if form[0] in "'\"" else form for form in quoted_forms]

    virtualenv = session.env.get("VIRTUAL_ENV")
    if virtualenv is None:
        return

    headers = {
        # pre-commit < 2.16.0 writes Python hook scripts
        "python": f"""\
import os
os.environ["VIRTUAL_ENV"] = {virtualenv!r}
os.environ["PATH"] = os.pathsep.join((
    {session.bin!r},
    os.environ.get("PATH", ""),
))
""",
        # pre-commit >= 2.16.0 writes bash hook scripts
        "bash": f"""\
VIRTUAL_ENV={shlex.quote(virtualenv)}
PATH={shlex.quote(session.bin)}"{os.pathsep}$PATH"
""",
    }

    hookdir = Path(".git") / "hooks"
    if not hookdir.is_dir():
        return

    for hook in hookdir.iterdir():
        if hook.name.endswith(".sample") or not hook.is_file():
            continue
        if not hook.read_bytes().startswith(b"#!"):
            continue  # not a script

        text = hook.read_text()
        # Case-insensitive comparison on case-insensitive filesystems
        # (detected via Path("A") == Path("a")), exact match otherwise.
        mentions_bindir = any(
            Path("A") == Path("a") and bindir.lower() in text.lower() or bindir in text
            for bindir in bindirs
        )
        if not mentions_bindir:
            continue

        lines = text.splitlines()
        for executable, header in headers.items():
            if executable in lines[0].lower():
                # Splice the activation header right after the shebang line.
                lines.insert(1, dedent(header))
                hook.write_text("\n".join(lines))
                break
@session(python=python_versions[0])
def safety(session: Session) -> None:
    """Scan dependencies for insecure packages."""
    requirements = session.poetry.export_requirements()
    session.install("safety")
    # safety errors on some numpy vulnerabilities; fixing them needs python >=3.8.
    # Once numpy is upgraded to at least 1.2.2, remove --continue-on-error.
    session.run(
        "safety",
        "check",
        "--full-report",
        f"--file={requirements}",
        "--continue-on-error",
    )


@session(python=python_versions)
def mypy(session: Session) -> None:
    """Type-check using mypy."""
    cli_args = session.posargs or ["ducks"]
    session.install(".")
    session.install("mypy", "pytest")
    if mypy_type_packages:
        session.install(*mypy_type_packages)
    session.run("mypy", *cli_args)


@session(python=python_versions[0])
def bandit(session: Session) -> None:
    """Run bandit security tests"""
    cli_args = session.posargs or ["-r", "./ducks"]
    session.run("bandit", *cli_args)


@session(python=python_versions)
def tests(session: Session) -> None:
    """Run the test suite."""
    session.install(".")
    session.install(*test_requirements)
    session.run("poetry", "run", "pytest", *session.posargs)


@session(name="docs-build", python=python_versions[0])
def docs_build(session: Session) -> None:
    """Build the documentation."""
    cli_args = session.posargs or ["docs", "docs/_build"]
    if not session.posargs and "FORCE_COLOR" in os.environ:
        cli_args.insert(0, "--color")

    session.install(".")
    session.install("sphinx", "sphinx-rtd-theme")

    # Remove any stale build output before rebuilding.
    out_dir = Path("docs", "_build")
    if out_dir.exists():
        shutil.rmtree(out_dir)

    session.run("sphinx-build", *cli_args)
@session(python=python_versions[0])
def docs(session: Session) -> None:
    """Build and serve the documentation with live reloading on file changes."""
    cli_args = session.posargs or ["--open-browser", "docs", "docs/_build"]
    session.install(".")
    session.install("sphinx", "sphinx-autobuild", "sphinx-click", "furo")

    # Stale artifacts confuse sphinx-autobuild; start from a clean build dir.
    out_dir = Path("docs", "_build")
    if out_dir.exists():
        shutil.rmtree(out_dir)

    session.run("sphinx-autobuild", *cli_args)
def slow_wrapper(method):
    """Wrap `method` with a ~1 ms delay before each call.

    Useful in concurrency tests: the delay widens race windows, so race
    conditions that would otherwise be very rare trigger reliably.

    Args:
        method: any callable; its return value is passed through unchanged.

    Returns:
        A delayed version of `method`. Keyword arguments are now forwarded
        too (the previous version accepted only positional args, so wrapping
        a method called with kwargs raised TypeError).
    """

    def wrapped_method(*args, **kwargs):
        time.sleep(0.001)
        return method(*args, **kwargs)

    return wrapped_method
@pytest.mark.parametrize(
    "end_full, expected_len",
    [
        (True, 10),
        (False, 0),
    ],
)
def test_add_remove(priority, end_full, expected_len):
    """Hammer one ConcurrentDex from two writer threads; counts must stay consistent."""
    objs = [{"x": i % 2} for i in range(10)]

    box = ConcurrentDex(objs, ["x"], priority=priority)
    # box = Dex(objs, ['x'])  # <--- use this instead, and you will observe frequent failures on this test.

    # Patch the index's 'add' with a small delay, forcing race conditions to occur more often.
    box._indexes["x"].add = slow_wrapper(box._indexes["x"].add)

    deadline = time.time() + 0.2
    while time.time() < deadline:
        workers = [
            threading.Thread(
                target=worker_add_remove, args=[objs, box], kwargs={"end_full": end_full}
            )
            for _ in range(2)
        ]
        for w in workers:
            w.start()
        for w in workers:
            w.join()
    assert len(box) == expected_len
    assert len(box._indexes["x"]) == expected_len  # fails on Dex
def test_read_update(priority):
    """Five threads each bump every object's x once; all must end at 5."""
    objs = [{"x": 0} for _ in range(10)]
    cfb = ConcurrentDex(objs, ["x"], priority=priority)
    workers = [
        threading.Thread(target=worker_read_update, args=(cfb,)) for _ in range(5)
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    for obj in cfb:
        assert obj["x"] == 5


def worker_update(cfb, obj_write_lock):
    # Hold the objects' own lock for the whole pass; cfb guards itself.
    with obj_write_lock:
        for obj in cfb:
            obj["x"] += 1
            cfb.update(obj)


def test_two_lock_updating(priority):
    """A more sensible locking strategy: objs has its own lock and cfb just
    worries about itself. Which one is correct kinda depends on how sensitive
    the user is to stale results: this one allows stale reads to occur, but it
    also allows reads to happen between the writes, which is nice."""
    objs = [{"x": 0} for _ in range(10)]
    cfb = ConcurrentDex(objs, ["x"], priority=priority)
    obj_lock = threading.Lock()
    workers = [
        threading.Thread(target=worker_update, args=(cfb, obj_lock)) for _ in range(5)
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    for obj in cfb:
        assert obj["x"] == 5
15 | Cleaner to just implement an AssertRaises here. 16 | """ 17 | 18 | def __init__(self, exc_type): 19 | self.exc_type = exc_type 20 | 21 | def __enter__(self): 22 | pass 23 | 24 | def __exit__(self, exception_type, exception_value, exception_traceback): 25 | assert exception_type == self.exc_type 26 | return True # suppress the exception 27 | 28 | 29 | class Attr: 30 | def __init__(self, n: int): 31 | self.n = n 32 | 33 | def __hash__(self): 34 | return self.n 35 | 36 | def __eq__(self, other): 37 | return self.n == other.n 38 | 39 | def __repr__(self): 40 | return str(self.n) 41 | 42 | def __lt__(self, other): 43 | return self.n < other.n 44 | -------------------------------------------------------------------------------- /test/mutable/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manimino/ducks/0217a0e9673fde155a81ac9ab23dfd3538fcd235/test/mutable/__init__.py -------------------------------------------------------------------------------- /test/mutable/test_soak.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dex (mutable form) is pretty complex. 3 | Let's run a lengthy test to make sure all the pieces work as expected across many add / remove operations. 
4 | """ 5 | import random 6 | import time 7 | from datetime import datetime 8 | 9 | from ducks import Dex 10 | from ducks.utils import get_attribute 11 | 12 | 13 | PLANETS = ( 14 | ["mercury"] * 1 15 | + ["venus"] * 2 16 | + ["earth"] * 4 17 | + ["mars"] * 8 18 | + ["jupiter"] * 16 19 | + ["saturn"] * 32 20 | + ["uranus"] * 64 21 | + ["neptune"] * 128 22 | ) 23 | 24 | 25 | class Collider: 26 | 27 | VALS = list(range(10)) 28 | 29 | def __init__(self): 30 | self.n = random.choice(self.VALS) 31 | 32 | def __hash__(self): 33 | return self.n % 2 34 | 35 | def __eq__(self, other): 36 | return self.n == other.n 37 | 38 | def __lt__(self, other): 39 | return self.n < other.n 40 | 41 | 42 | class Thing: 43 | def __init__(self, id_num): 44 | self.id_num = id_num 45 | self.ts_sec = datetime.now().replace(microsecond=0) 46 | self.ts = datetime.now() 47 | self.planet = random.choice(PLANETS) 48 | self.collider = Collider() 49 | if random.random() > 0.5: 50 | self.sometimes = True 51 | 52 | 53 | def planet_len(obj): 54 | if isinstance(obj, dict): 55 | return len(obj["planet"]) 56 | else: 57 | return len(obj.planet) 58 | 59 | 60 | def make_dict_thing(id_num): 61 | t = Thing(id_num) 62 | return { 63 | "id_num": t.id_num, 64 | "ts_sec": t.ts_sec, 65 | "ts": t.ts, 66 | "planet": t.planet, 67 | "collider": t.collider, 68 | planet_len: planet_len(t), 69 | } 70 | 71 | 72 | class SoakTest: 73 | """ 74 | Keep running insert / update / remove operations at random for a long time. 75 | Check periodically to make sure[] results are correct. 
76 | """ 77 | 78 | def __init__(self): 79 | self.t0 = time.time() 80 | self.t_report = {5 * i for i in range(1000)} 81 | random.seed(time.time()) 82 | self.seed = random.choice(range(10**6)) 83 | print("running soak test with seed:", self.seed) 84 | random.seed(self.seed) 85 | self.f = Dex(on=["ts_sec", "ts", "planet", "collider", "sometimes", planet_len]) 86 | # self.f = Dex(on=[planet_len]) 87 | self.objs = dict() 88 | self.max_id_num = 0 89 | 90 | def run(self, duration): 91 | while time.time() - self.t0 < duration: 92 | op = random.choice( 93 | [ 94 | self.add, 95 | self.add_many, 96 | self.remove, 97 | self.remove_all, 98 | self.check_equal, 99 | ] 100 | ) 101 | op() 102 | 103 | def add(self): 104 | self.max_id_num += 1 105 | # randomly pick between a dict and a class instance 106 | if random.random() < 0.5: 107 | t = Thing(self.max_id_num) 108 | else: 109 | t = make_dict_thing(self.max_id_num) 110 | self.objs[self.max_id_num] = t 111 | self.f.add(t) 112 | 113 | def add_many(self): 114 | for _ in range(random.choice([10, 100, 1000])): 115 | self.add() 116 | 117 | def remove(self): 118 | if self.objs: 119 | key = random.choice(list(self.objs.keys())) 120 | obj = self.objs[key] 121 | self.f.remove(obj) 122 | del self.objs[key] 123 | 124 | def remove_all(self): 125 | for t in self.objs.values(): 126 | self.f.remove(t) 127 | self.objs = dict() 128 | 129 | def remove_all_but_one(self): 130 | key = random.choice(list(self.objs.keys())) 131 | for k in self.objs: 132 | if k != key: 133 | self.f.remove(self.objs[k]) 134 | del self.objs[k] 135 | 136 | def random_obj(self): 137 | if not len(self.objs): 138 | return None 139 | return random.choice(list(self.objs.values())) 140 | 141 | def check_equal(self): 142 | # check a string key 143 | ls = [ 144 | o for o in self.objs.values() if get_attribute(o, "planet")[0] == "saturn" 145 | ] 146 | f_ls = self.f[{"planet": "saturn"}] 147 | assert len(ls) == len(f_ls) 148 | assert len(self.objs) == len(self.f._indexes["planet"]) 
149 | # check a functional key 150 | ls = [o for o in self.objs.values() if get_attribute(o, planet_len)[0] == 6] 151 | f_ls = self.f[{planet_len: 6}] 152 | assert len(ls) == len(f_ls) 153 | assert len(self.objs) == len(self.f._indexes[planet_len]) 154 | # check a null-ish key 155 | ls = [ 156 | o for o in self.objs.values() if get_attribute(o, "sometimes")[1] is False 157 | ] 158 | f_ls = self.f[{"sometimes": {"!=": True}}] 159 | assert len(ls) == len(f_ls) 160 | # check a colliding key 161 | c = Collider() 162 | ls = [o for o in self.objs.values() if get_attribute(o, "collider")[0] == c] 163 | f_ls = self.f[{"collider": c}] 164 | assert len(ls) == len(f_ls) 165 | assert len(self.objs) == len(self.f._indexes["collider"]) 166 | # check an object-ish key 167 | t = self.random_obj() 168 | if t is not None: 169 | target_ts = get_attribute(t, "ts_sec") 170 | ls = [ 171 | o 172 | for o in self.objs.values() 173 | if get_attribute(o, "ts_sec")[0] == target_ts[0] 174 | ] 175 | f_ls = self.f[{"ts_sec": target_ts[0]}] 176 | assert len(ls) == len(f_ls) 177 | 178 | 179 | def test_soak(): 180 | st = SoakTest() 181 | st.run(3) 182 | -------------------------------------------------------------------------------- /test/test_basic_operations.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional 3 | from typing import Union 4 | 5 | import pytest 6 | from ducks import Dex 7 | from ducks import FrozenDex 8 | from ducks.utils import get_attributes 9 | 10 | from .conftest import AssertRaises 11 | 12 | 13 | @dataclass 14 | class Pokemon: 15 | name: str 16 | type1: str 17 | type2: Optional[str] 18 | 19 | def __repr__(self): 20 | if self.type2 is None: 21 | return f"{self.name}: {self.type1}" 22 | return f"{self.name}: {self.type1}/{self.type2}" 23 | 24 | def __hash__(self): 25 | t = (self.name, self.type1, self.type2) 26 | return hash(t) 27 | 28 | def __lt__(self, other): 29 | return 
self.name < other.name 30 | 31 | 32 | def make_test_ducks(box_class) -> Union[Dex, FrozenDex]: 33 | zapdos = Pokemon("Zapdos", "Electric", "Flying") 34 | pikachu_1 = Pokemon("Pikachu", "Electric", None) 35 | pikachu_2 = Pokemon("Pikachu", "Electric", None) 36 | eevee = Pokemon("Eevee", "Normal", None) 37 | f = box_class([zapdos, pikachu_1, pikachu_2, eevee], on=get_attributes(Pokemon)) 38 | return f 39 | 40 | 41 | def test_find_one(box_class): 42 | f = make_test_ducks(box_class) 43 | result = f[{"name": "Zapdos"}] 44 | assert len(result) == 1 45 | 46 | 47 | def test_find_union(box_class): 48 | f = make_test_ducks(box_class) 49 | result = f[{"name": ["Pikachu", "Eevee"]}] 50 | assert len(result) == 3 51 | 52 | 53 | def test_find_union_with_mismatch(box_class): 54 | f = make_test_ducks(box_class) 55 | result = f[{"name": ["Pikachu", "Shykadu"]}] 56 | assert len(result) == 2 57 | 58 | 59 | def test_find_in_iterable_of_one(box_class): 60 | f = make_test_ducks(box_class) 61 | result = f[{"name": {"in": {"Pikachu"}}}] 62 | assert len(result) == 2 63 | 64 | 65 | @pytest.mark.parametrize( 66 | "expr, expected_len", 67 | [ 68 | ({">": "Yapdos"}, 1), 69 | ({">=": "Yapdos"}, 1), 70 | ({">": "AAA", "<": "zzz"}, 4), 71 | ({">=": "Eevee", "<": "Pikachu"}, 1), 72 | ({">=": "Eevee", "<=": "Pikachu"}, 3), 73 | ({"ge": "Eevee", "le": "Pikachu"}, 3), 74 | ({">": "Eevee", "<": "Zapdos"}, 2), 75 | ({"gt": "Eevee", "lt": "Zapdos"}, 2), 76 | ({"<": "Eevee"}, 0), 77 | ({"<": "Eevee", ">": "Zapdos"}, 0), 78 | ], 79 | ) 80 | def test_find_greater_less(box_class, expr, expected_len): 81 | f = make_test_ducks(box_class) 82 | result = f[{"name": expr}] 83 | assert len(result) == expected_len 84 | 85 | 86 | def test_find_sub_obj(box_class): 87 | objs = [ 88 | {"p": Pokemon("Zapdos", "Electric", "Flying")}, 89 | {"p": Pokemon("Pikachu", "Electric", None)}, 90 | ] 91 | f = box_class(objs, on=["p"]) 92 | found_empty = f[{}] 93 | assert len(found_empty) == 2 94 | for obj in objs: 95 | assert obj in 
found_empty 96 | 97 | 98 | def test_find_exclude_only(box_class): 99 | f = make_test_ducks(box_class) 100 | result = f[{"type2": {"!=": None}}] # Zapdos is the only one with a type2 101 | assert len(result) == 1 102 | assert result[0].name == "Zapdos" 103 | 104 | 105 | def test_two_attrs(box_class): 106 | f = make_test_ducks(box_class) 107 | result = f[ 108 | { 109 | "name": {"in": ["Pikachu", "Zapdos"]}, 110 | "type1": "Electric", 111 | "type2": {"!=": "Flying"}, 112 | } 113 | ] 114 | assert len(result) == 2 115 | assert result[0].name == "Pikachu" 116 | assert result[1].name == "Pikachu" 117 | 118 | 119 | def test_three_attrs(box_class): 120 | f = make_test_ducks(box_class) 121 | result = f[ 122 | { 123 | "name": {"in": ["Pikachu", "Zapdos"]}, 124 | "type1": "Electric", 125 | "type2": "Flying", 126 | } 127 | ] 128 | assert len(result) == 1 129 | assert result[0].name == "Zapdos" 130 | 131 | 132 | def test_exclude_all(box_class): 133 | f = make_test_ducks(box_class) 134 | result = f[{"type1": {"not in": ["Electric", "Normal"]}}] 135 | assert len(result) == 0 136 | 137 | 138 | def test_remove(box_class): 139 | f = make_test_ducks(box_class) 140 | two_chus = f[{"name": "Pikachu"}] 141 | assert len(two_chus) == 2 142 | if box_class == FrozenDex: 143 | with AssertRaises(AttributeError): 144 | f.remove(two_chus[1]) 145 | else: 146 | f.remove(two_chus[1]) 147 | one_chu = f[{"name": "Pikachu"}] 148 | assert len(one_chu) == 1 149 | 150 | 151 | def test_add(box_class): 152 | f = make_test_ducks(box_class) 153 | glaceon = Pokemon("Glaceon", "Ice", None) 154 | if box_class == FrozenDex: 155 | with AssertRaises(AttributeError): 156 | f.add(glaceon) 157 | else: 158 | f.add(glaceon) 159 | res = f[{"name": "Glaceon"}] 160 | assert res == [glaceon] 161 | 162 | 163 | def test_multi_exclude(box_class): 164 | fb = make_test_ducks(box_class) 165 | res = fb[{"name": {"!=": "Pikachu"}, "type1": {"not in": ["Normal"]}}] 166 | zapdos_ls = [p for p in fb if p.name == "Zapdos"] 167 | 
assert res == zapdos_ls 168 | 169 | 170 | def test_get_values(box_class): 171 | fb = make_test_ducks(box_class) 172 | assert fb.get_values("name") == {"Zapdos", "Pikachu", "Eevee"} 173 | assert fb.get_values("type1") == {"Electric", "Normal"} 174 | assert fb.get_values("type2") == {"Flying", None} 175 | -------------------------------------------------------------------------------- /test/test_btree.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from ducks.btree import BTree 3 | 4 | from .conftest import AssertRaises 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "expr, result", 9 | [ 10 | ({">": 8}, [9]), 11 | ({">": 6}, [7, 8, 9]), 12 | ({"<": 1}, [0]), 13 | ({"<": 3}, [0, 1, 2]), 14 | ({">=": 9}, [9]), 15 | ({">=": 9, "<": 1}, []), 16 | ({">=": 5, "<": 6}, [5]), 17 | ({">=": 5, "<=": 5}, [5]), 18 | ({">": 6, "<=": 7}, [7]), 19 | ({">": 6, "<=": 8}, [7, 8]), 20 | ({">": 6, "<=": 6}, []), 21 | ({">=": 6, "<": 6}, []), 22 | ({">=": 999}, []), 23 | ({">": 999}, []), 24 | ({"<=": -1}, []), 25 | ({"<": -1}, []), 26 | ({">": 9.5}, []), 27 | ({">": 8.5, "<": 9.5}, [9]), 28 | ({">=": 7.5, "<=": 9.5}, [8, 9]), 29 | ({">": 4, "<": 3}, []), 30 | ({">=": 4, "<=": 3}, []), 31 | ({">=": 999, "<=": -1}, []), 32 | ({">": 999, "<": -1}, []), 33 | ({">": -100, "<": 100}, list(range(10))), 34 | ({">=": -100, "<=": 100}, list(range(10))), 35 | ], 36 | ) 37 | def test_get_range_expr(expr, result): 38 | bt = BTree({i: i for i in range(10)}) 39 | assert list(bt.get_range_expr(expr)) == result 40 | 41 | 42 | def test_init_with_none(): 43 | objs = {i: i for i in range(10)} 44 | objs[None] = 13 45 | with AssertRaises(TypeError): 46 | _ = BTree(objs) 47 | 48 | 49 | def test_add_none(): 50 | objs = {i: i for i in range(10)} 51 | bt = BTree(objs) 52 | with AssertRaises(TypeError): 53 | bt[None] = 13 54 | 55 | 56 | def test_get(): 57 | bt = BTree({1: "a"}) 58 | assert bt.get(1) == "a" 59 | assert bt[1] == "a" 60 | assert bt.get(2) is None 
61 | assert bt.get(3, 4) == 4 62 | 63 | 64 | def test_get_empty(): 65 | bt = BTree() 66 | assert len(bt.get_range_expr({">": 5})) == 0 67 | assert bt.get(3) is None 68 | assert bt.get(3, 45) == 45 69 | with AssertRaises(KeyError): 70 | _ = bt[3] 71 | 72 | 73 | def test_len_full_init(): 74 | bt = BTree({i: i for i in range(10)}) 75 | assert len(bt) == 10 76 | del bt[0] 77 | assert len(bt) == 9 78 | bt[0] = 0 79 | assert len(bt) == 10 80 | bt[1] = 99 # key already present 81 | assert len(bt) == 10 82 | 83 | 84 | def test_len_empty_init(): 85 | bt = BTree() 86 | assert len(bt) == 0 87 | bt[0] = 0 88 | assert len(bt) == 1 89 | bt[0] = 99 # key already present 90 | assert len(bt) == 1 91 | del bt[0] 92 | assert len(bt) == 0 93 | 94 | 95 | def test_keys_values(): 96 | bt = BTree({"a": 1, "b": 2}) 97 | assert list(bt.keys()) == ["a", "b"] 98 | assert list(bt.values()) == [1, 2] 99 | assert list(bt.items()) == [("a", 1), ("b", 2)] 100 | 101 | 102 | def test_bad_expr(): 103 | bt = BTree({"a": 1, "b": 2}) 104 | with AssertRaises(TypeError): 105 | bt.get_range_expr({"<=": 99}) 106 | 107 | 108 | def test_bad_first_insert(): 109 | bt = BTree() 110 | with AssertRaises(TypeError): 111 | bt[{"x": 1}] = 5 112 | bt = BTree() 113 | -------------------------------------------------------------------------------- /test/test_container_ops.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from ducks import Dex 3 | from ducks import FrozenDex 4 | from ducks.constants import SIZE_THRESH 5 | 6 | 7 | def test_iter_small(box_class): 8 | ls = [{"i": i} for i in range(5)] 9 | f = box_class(ls, ["i"]) 10 | assert len(f) == len(ls) 11 | f_ls = list(f) 12 | assert len(f_ls) == len(ls) 13 | for item in ls: 14 | assert item in f_ls 15 | assert len(f_ls) == len(ls) 16 | 17 | 18 | @pytest.mark.parametrize( 19 | "idx_order", 20 | [ 21 | ["i", "j"], 22 | ["j", "i"], 23 | ], 24 | ) 25 | def test_iter_large(box_class, idx_order): 26 | ls = [{"i": i, 
"j": -(i % 3)} for i in range(SIZE_THRESH * 3 + 3)] 27 | ls += [{"j": 16}] # make sure there's at least one hasfbucket 28 | f = box_class(ls, idx_order) 29 | assert len(f) == len(ls) 30 | f_ls = list(f) 31 | assert len(f_ls) == len(ls) 32 | for item in ls: 33 | assert item in f_ls 34 | assert len(f_ls) == len(ls) 35 | 36 | 37 | @pytest.mark.parametrize( 38 | "idx_order", 39 | [ 40 | ["i", "j"], 41 | ["j", "i"], 42 | ], 43 | ) 44 | def test_make_from(box_class, idx_order): 45 | """See if we can make one index type from the other type.""" 46 | make_type = Dex if box_class == FrozenDex else FrozenDex 47 | ls = [{"i": i, "j": -(i % 3)} for i in range(SIZE_THRESH * 3 + 3)] 48 | f = box_class(ls, on=idx_order) 49 | other_f = make_type(f, on=idx_order) 50 | assert len(other_f) == len(f) 51 | 52 | 53 | def test_box_contains(box_class): 54 | ls = [{"i": i} for i in range(5)] 55 | f = box_class(ls, ["i"]) 56 | for item in ls: 57 | assert item in f 58 | 59 | 60 | def test_box_not_contains(box_class): 61 | yes = {"i": 1} 62 | f = box_class([yes], "i") 63 | # test a ton of these because coverage can drop otherwise 64 | for i in [None, -1000, "apples", 1000, (1, 2, 3), 0.5] + list(range(100)): 65 | no = {"i": i} 66 | assert no not in f 67 | -------------------------------------------------------------------------------- /test/test_edge_cases.py: -------------------------------------------------------------------------------- 1 | from ducks import Dex 2 | 3 | from .conftest import AssertRaises 4 | 5 | 6 | def test_get_zero(box_class): 7 | def _f(x): 8 | return x[0] 9 | 10 | f = box_class(["a", "b", "c"], on=[_f]) 11 | assert f[{_f: "c"}] == ["c"] 12 | assert len(f[{_f: "d"}]) == 0 13 | 14 | 15 | def test_get_in_no_results(box_class): 16 | def _f(x): 17 | return x[0] 18 | 19 | f = box_class(["a", "b", "c"], on=[_f]) 20 | assert len(f[{_f: {"in": ["d"]}}]) == 0 21 | assert len(f[{_f: {"in": []}}]) == 0 22 | 23 | 24 | def test_double_add(): 25 | f = Dex(on="s") 26 | x = {"s": 
"hello"} 27 | f.add(x) 28 | f.add(x) 29 | assert len(f) == 1 30 | assert f[{"s": "hello"}] == [x] 31 | f.remove(x) 32 | assert len(f) == 0 33 | assert f[{"s": "hello"}] == [] 34 | 35 | 36 | def test_empty_index(box_class): 37 | f = box_class([], on=["stuff"]) 38 | result = f[{"stuff": 3}] 39 | assert len(result) == 0 40 | result = f[{"stuff": {"<": 3}}] 41 | assert len(result) == 0 42 | 43 | 44 | def test_arg_order(): 45 | data = [{"a": i % 5, "b": i % 3} for i in range(100)] 46 | f = Dex(data, ["a", "b"]) 47 | assert len(f[{"a": 1, "b": 2}]) == len(f[{"b": 2, "a": 1}]) 48 | 49 | 50 | class NoSort: 51 | def __init__(self, x): 52 | self.x = x 53 | 54 | def __hash__(self): 55 | return hash(self.x) 56 | 57 | def __eq__(self, other): 58 | return self.x == other.x 59 | 60 | 61 | def test_unsortable_values(box_class): 62 | """We need to support values that are hashable, even if they cannot be sorted.""" 63 | objs = [{"a": NoSort(0)}, {"a": NoSort(1)}] 64 | with AssertRaises(TypeError): 65 | box_class(objs, ["a"]) 66 | 67 | 68 | def test_not_in(box_class): 69 | """the things we do for 100% coverage""" 70 | f = box_class([{"a": 1}], on=["a"]) 71 | assert {"a": 0} not in f 72 | assert {"a": 2} not in f 73 | 74 | 75 | def test_in_with_greater(box_class): 76 | """ 77 | Technically someone could query a '<' along with an 'in'. Does that work properly? 
78 | """ 79 | f = box_class([{"a": 1}], on="a") 80 | assert len(f[{"a": {"<=": 1, "in": [1]}}]) == 1 81 | assert len(f[{"a": {">": 1, "in": [1]}}]) == 0 82 | assert len(f[{"a": {"<=": 1, "in": [0]}}]) == 0 83 | assert len(f[{"a": {"<": 1, "in": [0]}}]) == 0 84 | -------------------------------------------------------------------------------- /test/test_examples.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def test_get_nearby(box_class): 5 | # set of tuples 6 | t = {(random.random() * 10, random.random() * 10) for _ in range(10**4)} 7 | 8 | def _x(obj): 9 | return int(obj[0]) 10 | 11 | def _y(obj): 12 | return int(obj[1]) 13 | 14 | f = box_class(t, [_x, _y]) 15 | for pt in f[{_x: 0, _y: 0}]: 16 | assert _x(pt) < 1 and _y(pt) < 1 17 | 18 | 19 | def test_wordle(box_class): 20 | ws = [ 21 | ("ABOUT", 1226734006), 22 | ("OTHER", 978481319), 23 | ("WHICH", 810514085), 24 | ("THEIR", 782849411), 25 | ] 26 | 27 | def has_t(w): 28 | return "T" in w[0] 29 | 30 | def has_h(w): 31 | return "H" in w[0] 32 | 33 | f = box_class(ws, [has_t, has_h]) 34 | found = f[{}] 35 | found_ws = [f[0] for f in found] 36 | for w in ws: 37 | assert w[0] in found_ws 38 | -------------------------------------------------------------------------------- /test/test_exceptions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from ducks import ConcurrentDex 3 | from ducks import Dex 4 | from ducks import FrozenDex 5 | from ducks.constants import SIZE_THRESH 6 | from ducks.exceptions import AttributeNotFoundError 7 | 8 | from .conftest import AssertRaises 9 | from .conftest import Attr 10 | 11 | 12 | def test_remove_empty(): 13 | f = Dex([], on=["stuff"]) 14 | with AssertRaises(KeyError): 15 | f.remove("nope") 16 | 17 | 18 | def test_no_index(): 19 | with AssertRaises(ValueError): 20 | Dex(["a"]) 21 | 22 | 23 | def test_empty_index(): 24 | with AssertRaises(ValueError): 25 | 
FrozenDex(["a"], []) 26 | 27 | 28 | def test_bad_query(box_class): 29 | f = box_class([{"a": 1}], on=["a"]) 30 | with AssertRaises(TypeError): 31 | _ = f[[]] 32 | with AssertRaises(TypeError): 33 | _ = f[["a", 1]] 34 | with AssertRaises(AttributeNotFoundError): 35 | _ = f[{"b": 1}] 36 | 37 | 38 | @pytest.mark.parametrize("n_items", [1, 5, SIZE_THRESH + 1]) 39 | def test_remove_missing_value(n_items): 40 | """ 41 | When the value hashes to a bucket, but the bucket does not contain the value, is 42 | an empty result correctly retrieved? 43 | """ 44 | data = [Attr(i) for i in range(5)] 45 | f = Dex(data, ["n"]) 46 | assert len(f[{"n": -1}]) == 0 47 | with AssertRaises(KeyError): 48 | f.remove(Attr(-1)) 49 | 50 | 51 | def test_bad_priority(): 52 | with AssertRaises(ValueError): 53 | _ = ConcurrentDex(None, on=["x"], priority="lol") 54 | 55 | 56 | def test_bad_expr(box_class): 57 | f = box_class(["ok"], on="x") 58 | with AssertRaises(ValueError): 59 | _ = f[{"x": {">", 2}}] 60 | 61 | 62 | def test_bad_operator(box_class): 63 | f = box_class(["ok"], on="x") 64 | with AssertRaises(ValueError): 65 | _ = f[{"x": {"qq": 2}}] 66 | 67 | 68 | def test_bad_gt_lt(box_class): 69 | f = box_class(["ok"], on="x") 70 | with AssertRaises(ValueError): 71 | _ = f[{"x": {">": 2, ">=": 3}}] 72 | with AssertRaises(ValueError): 73 | _ = f[{"x": {"<": 2, "<=": 3}}] 74 | -------------------------------------------------------------------------------- /test/test_fancy_gets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test attribute lookups of different kinds 3 | e.g. 
getting dict attributes, or applying functions, or getting properties from namedtuples 4 | """ 5 | import pytest 6 | from ducks.constants import SIZE_THRESH 7 | 8 | 9 | def make_dict_data(): 10 | dicts = [ 11 | {"t0": 0.1, "t1": 0.2, "s": "ABC"}, 12 | {"t0": 0.3, "t1": 0.4, "s": "DEF"}, 13 | {"t0": 0.5, "t1": 0.6, "s": "GHI"}, 14 | ] 15 | return dicts 16 | 17 | 18 | def test_dicts(box_class): 19 | dicts = make_dict_data() 20 | f = box_class(dicts, ["t0", "t1", "s"]) 21 | result = f[ 22 | { 23 | "t0": {"in": [0.1, 0.3]}, 24 | "s": {"in": ["ABC", "DEF"]}, 25 | "t1": {"!=": 0.4}, 26 | } 27 | ] 28 | assert result == [dicts[0]] 29 | 30 | 31 | def test_getter_fn(box_class): 32 | def _middle_letter(obj): 33 | return obj["s"][1] 34 | 35 | dicts = make_dict_data() 36 | f = box_class(dicts, on=[_middle_letter]) 37 | result = f[{_middle_letter: "H"}] 38 | assert result == [dicts[2]] 39 | 40 | 41 | @pytest.mark.parametrize("n", [SIZE_THRESH + 1, 5]) 42 | def test_get_all(box_class, n): 43 | """There's a special fast-path when all items are being retrieved.""" 44 | f = box_class([{"a": 1} for _ in range(n)], ["a"]) 45 | result = f[{}] 46 | assert len(result) == n 47 | -------------------------------------------------------------------------------- /test/test_missing_attribute.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from ducks import ANY 3 | from ducks import Dex 4 | from ducks.constants import SIZE_THRESH 5 | from ducks.exceptions import MissingAttribute 6 | 7 | 8 | @pytest.mark.parametrize("n_items", [1, 5, SIZE_THRESH + 1]) 9 | def test_missing_function(box_class, n_items): 10 | def even(obj): 11 | if obj % 2: 12 | raise MissingAttribute 13 | return True 14 | 15 | objs = range(n_items) 16 | fb = box_class(objs, [even]) 17 | n_even = len([x for x in range(n_items) if x % 2 == 0]) 18 | n_odd = n_items - n_even 19 | assert len(fb) == n_items 20 | assert len(fb[{even: True}]) == n_even 21 | assert len(fb[{even: 
{"!=": True}}]) == n_odd 22 | for idx in fb._indexes.values(): 23 | assert len(idx) == n_even 24 | 25 | 26 | missing_attr_data = [ 27 | {"a": 1, "b": 2}, 28 | {"a": 3}, 29 | {"b": 4}, 30 | {}, 31 | ] 32 | 33 | 34 | def test_add_with_missing_attributes(): 35 | fb = Dex([], ["a", "b"]) 36 | for d in missing_attr_data: 37 | fb.add(d) 38 | assert len(fb) == 4 39 | assert len(fb._indexes["a"]) == 2 40 | assert len(fb._indexes["b"]) == 2 41 | assert len(fb[{"b": {"not in": [2, 4]}}]) == 2 42 | assert len(fb[{"a": {"not in": [1, 3]}}]) == 2 43 | 44 | 45 | def test_remove_with_missing_attributes(): 46 | fb = Dex(missing_attr_data, ["a", "b"]) 47 | for d in missing_attr_data: 48 | fb.remove(d) 49 | assert len(fb) == 0 50 | for idx in fb._indexes.values(): 51 | assert len(idx) == 0 52 | 53 | 54 | def test_missing_attributes(box_class): 55 | fb = box_class(missing_attr_data, ["a", "b"]) 56 | for d in missing_attr_data: 57 | assert d in fb 58 | assert len(fb._indexes["a"]) == 2 59 | assert len(fb._indexes["b"]) == 2 60 | 61 | 62 | def test_add_none(): 63 | f = Dex(on="s") 64 | f.add(None) 65 | result = f[{"s": None}] 66 | assert result == [] 67 | 68 | 69 | def test_empty_attribute(box_class): 70 | fb = box_class([None], on=["a"]) 71 | assert len(fb) == 1 72 | 73 | 74 | def test_find_having_attr(box_class): 75 | fb = box_class(missing_attr_data, ["a", "b"]) 76 | assert len(fb[{"a": ANY}]) == 2 77 | assert len(fb[{"b": ANY}]) == 2 78 | assert len(fb[{"a": 1, "b": ANY}]) == 1 79 | 80 | 81 | def test_find_missing_attr(box_class): 82 | fb = box_class(missing_attr_data, ["a", "b"]) 83 | assert len(fb[{"a": {"!=": ANY}}]) == 2 84 | assert len(fb[{"b": {"!=": ANY}}]) == 2 85 | assert len(fb[{"a": 3, "b": {"!=": ANY}}]) == 1 86 | assert len(fb[{"a": {"!=": ANY}, "b": {"!=": ANY}}]) == 1 87 | 88 | 89 | @pytest.mark.parametrize("n_items", [2, 10, SIZE_THRESH * 2 + 2]) 90 | def test_many_missing(box_class, n_items): 91 | data = [] 92 | for i in range(n_items): 93 | if i % 2: 94 | 
data.append({"a": 1}) 95 | else: 96 | data.append({}) 97 | fb = box_class(data, ["a"]) 98 | assert len(fb[{"a": ANY}]) == n_items // 2 99 | assert len(fb[{"a": {"!=": ANY}}]) == n_items // 2 100 | 101 | 102 | @pytest.mark.parametrize("n_items", [2, 10, SIZE_THRESH * 2 + 2]) 103 | def test_get_values(box_class, n_items): 104 | data = [] 105 | for i in range(n_items): 106 | if i % 2: 107 | data.append({"a": 1}) 108 | else: 109 | data.append({}) 110 | fb = box_class(data, ["a"]) 111 | assert fb.get_values("a") == {1} 112 | -------------------------------------------------------------------------------- /test/test_mixed_cardinality.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from ducks.constants import SIZE_THRESH 3 | 4 | from .conftest import Attr 5 | 6 | 7 | @pytest.mark.parametrize("thresh", [10**i for i in range(5)]) 8 | def test_thresh(box_class, thresh): 9 | def size_thresh_n(obj): 10 | return obj["size"] < thresh 11 | 12 | n_items = 10**4 13 | objs = [{"size": i} for i in range(n_items)] 14 | fb = box_class(objs, [size_thresh_n]) 15 | assert len(fb[{size_thresh_n: True}]) == thresh 16 | assert len(fb[{size_thresh_n: False}]) == n_items - thresh 17 | 18 | 19 | def test_bad_hash_mixed(box_class): 20 | objs = [{"n": Attr(i)} for i in range(100)] + [ 21 | {"n": Attr(0)} for _ in range(SIZE_THRESH + 1) 22 | ] 23 | fb = box_class(objs, ["n"]) 24 | assert len(fb[{"n": objs[1]["n"]}]) == 1 25 | assert len(fb[{"n": objs[0]["n"]}]) == SIZE_THRESH + 2 26 | -------------------------------------------------------------------------------- /test/test_multiple_operations.py: -------------------------------------------------------------------------------- 1 | def test_eq_and_greater(box_class): 2 | objs = [{"x": i} for i in range(10)] 3 | fb = box_class(objs, "x") 4 | assert fb[{"x": {"==": 1, ">": 0}}] == [objs[1]] 5 | 6 | 7 | def test_eq_and_in(box_class): 8 | objs = [{"x": i} for i in range(10)] 9 | fb = 
box_class(objs, "x") 10 | assert fb[{"x": {"eq": 1, "in": [1, 2, 3]}}] == [objs[1]] 11 | 12 | 13 | def test_greater_less_and_in(box_class): 14 | objs = [{"x": i} for i in range(10)] 15 | fb = box_class(objs, "x") 16 | assert len(fb[{"x": {"gt": 1, "lt": 5, "in": [1, 2, 3]}}]) == 2 17 | 18 | 19 | def test_gte_lte_in_and_eq(box_class): 20 | objs = [{"x": i} for i in range(10)] 21 | fb = box_class(objs, "x") 22 | assert len(fb[{"x": {"gte": 1, "lte": 5, "in": [1, 2, 3], "eq": 2}}]) == 1 23 | -------------------------------------------------------------------------------- /test/test_mutations.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from ducks.constants import ARRAY_SIZE_MAX 3 | from ducks.constants import SET_SIZE_MIN 4 | from ducks.constants import SIZE_THRESH 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "n_items", [SIZE_THRESH, ARRAY_SIZE_MAX + 1, SET_SIZE_MIN - 1, SET_SIZE_MIN + 1] 9 | ) 10 | def test_many_gets(box_class, n_items): 11 | """At one point there was a bug involving several sequential gets, let's make sure that can't come back.""" 12 | 13 | def f5(i): 14 | return i["n"] % 5 15 | 16 | data = [{"n": i} for i in range(n_items)] 17 | f = box_class(data, ["n", f5]) 18 | for _ in range(4): 19 | # just a lot of queries in every conceivable flavor 20 | assert len(f[{"n": {"in": [1, 2, 3, 4, 5]}, f5: {"in": [3, 4]}}]) == 2 21 | assert len(f[{"n": {"in": [1, 2]}, f5: {"in": [1, 2]}}]) == 2 22 | assert len(f[{"n": {"in": [1, 2, 3, 4, 5]}}]) == 5 23 | assert len(f[{"n": {"in": [1, 2, 3, 4, 5]}, f5: {"not in": [1, 2]}}]) == 3 24 | assert len(f[{"n": {"in": [6, 7, 8], "!=": 3}, f5: {"not in": [1, 2]}}]) == 1 25 | assert ( 26 | len(f[{"n": {"in": [6, 7, 8], "!=": -1000}, f5: {"not in": [3, 4]}}]) == 2 27 | ) 28 | assert ( 29 | len(f[{f5: {"==": 1, "in": [3, 4]}, "n": {"==": -1000, "!=": -1000}}]) == 0 30 | ) 31 | assert ( 32 | len( 33 | f[ 34 | { 35 | "n": {"in": [-1000, 3, 4, 5], "!=": -1000}, 36 | f5: 
{"not in": [3, 4]}, 37 | } 38 | ] 39 | ) 40 | == 1 41 | ) 42 | assert len(f[{}]) == n_items 43 | 44 | 45 | def test_mutated_return(box_class): 46 | """If the user modifies the returned array, none of our arrays change, right?""" 47 | data = [{"n": 0} for _ in range(5)] 48 | f = box_class(data, ["n"]) 49 | arr = f[{"n": 0}] 50 | assert len(arr) == 5 51 | assert all(a["n"] == 0 for a in arr) 52 | arr[0] = {"n": 1} 53 | arr2 = f[{"n": 0}] 54 | assert len(arr) == 5 55 | assert all(a["n"] == 0 for a in arr2) 56 | -------------------------------------------------------------------------------- /test/test_nones.py: -------------------------------------------------------------------------------- 1 | """ 2 | None is a value that cannot be compared with <, > etc. But we definitely need 3 | to support it as it's a common attribute value. 4 | These tests check that None is handled properly. 5 | """ 6 | import pytest 7 | from ducks import Dex 8 | from ducks.constants import ARRAY_SIZE_MAX 9 | from ducks.constants import SET_SIZE_MIN 10 | 11 | 12 | def test_none(box_class): 13 | objs = [{"ok": i} for i in range(10)] 14 | objs.append({"ok": None}) 15 | fb = box_class(objs, "ok") 16 | assert len(fb[{"ok": None}]) == 1 17 | 18 | 19 | @pytest.mark.parametrize( 20 | "n_none", [1, ARRAY_SIZE_MAX - 1, ARRAY_SIZE_MAX + 1, SET_SIZE_MIN] 21 | ) 22 | def test_add_remove_none(n_none): 23 | objs = [{"a": i} for i in range(10)] 24 | for i in range(n_none): 25 | objs.append({"a": None}) 26 | fb = Dex(objs, "a") 27 | assert len(fb[{"a": [1, 2, None]}]) == 2 + n_none 28 | assert len(fb[{"a": [None]}]) == n_none 29 | fb.remove(objs[0]) # {'a': 0} 30 | fb.remove(objs[-1]) # {'a': None} 31 | assert len(fb) == len(objs) - 2 32 | -------------------------------------------------------------------------------- /test/test_pickling.py: -------------------------------------------------------------------------------- 1 | from ducks import load 2 | from ducks import save 3 | 4 | 5 | def 
test_save_and_load(box_class, tmp_path): 6 | fn = tmp_path / "box.pkl" 7 | objs = [{"i": i} for i in range(10)] 8 | box = box_class(objs, "i") 9 | save(box, fn) 10 | box2 = load(fn) 11 | assert len(box2) == 10 12 | objs2 = list(box2) # objs get cloned as well 13 | assert box2[{"i": 3}] == [objs2[3]] 14 | assert box2[{"i": [6]}] == [objs2[6]] 15 | assert box2[{"i": {">": 8}}] == [objs2[9]] 16 | for obj in objs2: 17 | assert obj in box2 18 | -------------------------------------------------------------------------------- /test/test_range_queries.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from ducks.constants import SIZE_THRESH 3 | 4 | 5 | @pytest.mark.parametrize( 6 | "expr, result", 7 | [ 8 | ({">": 8}, [9]), 9 | ({">": 6}, [7, 8, 9]), 10 | ({"<": 1}, [0]), 11 | ({"<": 3}, [0, 1, 2]), 12 | ({">=": 9}, [9]), 13 | ({">=": 9, "<": 1}, []), 14 | ({">=": 5, "<": 6}, [5]), 15 | ({">=": 5, "<=": 5}, [5]), 16 | ({">": 6, "<=": 7}, [7]), 17 | ({">": 6, "<=": 8}, [7, 8]), 18 | ({">": 6, "<=": 6}, []), 19 | ({">=": 6, "<": 6}, []), 20 | ({">=": 999}, []), 21 | ({">": 999}, []), 22 | ({"<=": -1}, []), 23 | ({"<": -1}, []), 24 | ({">": 9.5}, []), 25 | ({">": 8.5, "<": 9.5}, [9]), 26 | ({">=": 7.5, "<=": 9.5}, [8, 9]), 27 | ({">": 4, "<": 3}, []), 28 | ({">=": 4, "<=": 3}, []), 29 | ({">=": 999, "<=": -1}, []), 30 | ({">": 999, "<": -1}, []), 31 | ({">": -100, "<": 100}, list(range(10))), 32 | ({">=": -100, "<=": 100}, list(range(10))), 33 | ], 34 | ) 35 | def test_get_range_expr(box_class, expr, result): 36 | objs = [{"a": i} for i in range(10)] + [{"a": None}] 37 | fb = box_class(objs, "a") 38 | assert list(sorted(o["a"] for o in fb[{"a": expr}])) == result 39 | 40 | 41 | @pytest.mark.parametrize( 42 | "expr", 43 | [ 44 | {">": 8}, 45 | {">": 6}, 46 | {"<": 1}, 47 | {"<": 3}, 48 | {">=": 9}, 49 | {">=": 9, "<": 1}, 50 | {">=": 5, "<": 6}, 51 | {">=": 5, "<=": 5}, 52 | {">": 6, "<=": 7}, 53 | {">": 6, "<=": 8}, 54 | 
{">": 6, "<=": 6}, 55 | {">=": 6, "<": 6}, 56 | {">=": 999}, 57 | {">": 999}, 58 | {"<=": -1}, 59 | {"<": -1}, 60 | {">": 9.5}, 61 | {">": 8.5, "<": 9.5}, 62 | {">=": 7.5, "<=": 9.5}, 63 | {">": 4, "<": 3}, 64 | {">=": 4, "<=": 3}, 65 | {">=": 999, "<=": -1}, 66 | {">": 999, "<": -1}, 67 | {">": -100, "<": 100}, 68 | {">=": -100, "<=": 100}, 69 | ], 70 | ) 71 | def test_get_big(box_class, expr): 72 | objs = [{"a": i % 10} for i in range(SIZE_THRESH * 11)] 73 | objs += [{"a": None} for _ in range(SIZE_THRESH + 1)] 74 | fb = box_class(objs, "a") 75 | found = fb[{"a": expr}] 76 | result = [o for o in objs if o["a"] is not None] 77 | for op, val in expr.items(): 78 | if op == ">": 79 | result = [o for o in result if o["a"] > val] 80 | if op == "<": 81 | result = [o for o in result if o["a"] < val] 82 | if op == ">=": 83 | result = [o for o in result if o["a"] >= val] 84 | if op == "<=": 85 | result = [o for o in result if o["a"] <= val] 86 | found = list(sorted(found, key=lambda o: o["a"])) 87 | result = list(sorted(result, key=lambda o: o["a"])) 88 | assert found == result 89 | -------------------------------------------------------------------------------- /test/test_stale_objects.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from ducks import Dex 3 | from ducks.constants import SIZE_THRESH 4 | 5 | from .conftest import AssertRaises 6 | from .conftest import Attr 7 | 8 | 9 | @pytest.mark.parametrize("n_items", [5, SIZE_THRESH + 1]) 10 | def test_get_stale_objects(box_class, n_items): 11 | objs = [{"z": Attr(1)} for _ in range(n_items)] 12 | f = box_class(objs, ["z"]) 13 | for o in objs: 14 | o["z"] = Attr(2) 15 | found = f[{"z": Attr(1)}] 16 | assert len(found) == n_items # still finds by their old value 17 | found = f[{"z": Attr(2)}] 18 | assert len(found) == 0 19 | 20 | 21 | @pytest.mark.parametrize("n_items", [1, SIZE_THRESH * 2 + 2]) 22 | def test_remove_stale_objects(n_items): 23 | objs = [{"z": 1} for 
_ in range(n_items)] 24 | f = Dex(objs, ["z"]) 25 | for o in objs: 26 | o["z"] = 2 27 | for o in objs: 28 | f.remove(o) 29 | assert len(f) == 0 30 | assert len(f._indexes["z"]) == 0 31 | 32 | 33 | @pytest.mark.parametrize("n_items", [1, 5, SIZE_THRESH * 2 + 2]) 34 | def test_remove_missing_object(n_items): 35 | objs = [{"z": Attr(1)} for _ in range(n_items)] 36 | f = Dex(objs, ["z"]) 37 | with AssertRaises(KeyError): 38 | f.remove(Attr(2)) 39 | 40 | 41 | def test_external_object_modification(box_class): 42 | """ 43 | What happens if the values are mutable, and someone mutates them externally? 44 | Answer: It gives an unexpected result. Attributes are stored by reference, so 45 | if the attribute is mutated externally, it will change inside the container as well. 46 | Luckily, this is rare; most attributes will be ints and strings which are immutable. 47 | Other python containers have the same problem -- you can break a frozenset if it has 48 | a mutable attribute as a key, for example. 49 | """ 50 | objs = [{"a": Attr(1)}] 51 | fb = box_class(objs, "a") 52 | assert len(fb[{"a": Attr(1)}]) == 1 53 | objs[0]["a"].n = 5000 54 | # external modification changed our results 55 | assert len(fb[{"a": Attr(1)}]) == 0 56 | assert len(fb[{"a": Attr(5000)}]) == 1 57 | -------------------------------------------------------------------------------- /test/test_wrong_type.py: -------------------------------------------------------------------------------- 1 | """ 2 | Now that we're using trees, all objects have to be comparable, including the query values. 3 | Try doing various bad things with types. 
4 | """ 5 | import pytest 6 | from ducks import Dex 7 | 8 | from .conftest import AssertRaises 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "expr, expected, raises", 13 | [ 14 | ("lol", 0, True), 15 | (["lol"], 0, True), 16 | ([1, "lol"], 1, True), 17 | ({"<": 3}, 3, False), 18 | ( 19 | {"<": "lol"}, 20 | 0, 21 | True, 22 | ), # todo implement frozen value based thing, then this will work 23 | ], 24 | ) 25 | def test_find_wrong_type(box_class, expr, expected, raises): 26 | if type(expr) is list: 27 | # you can't write {'in': ['lol']} in a parametrize 28 | # other keys work, but not 'in'. It looks like parametrize must 29 | # be calling eval() or something. Pretty annoying. 30 | expr = {"in": expr} 31 | objs = [{"x": i} for i in range(10)] 32 | fb = box_class(objs, "x") 33 | if raises: 34 | with AssertRaises(TypeError): 35 | fb[{"x": expr}] 36 | else: 37 | assert len(fb[{"x": expr}]) == expected 38 | 39 | 40 | def test_add_wrong_type(): 41 | objs = [{"x": i} for i in range(10)] 42 | fb = Dex(objs, "x") 43 | assert len(fb._indexes["x"].tree) == 10 44 | with AssertRaises(TypeError): 45 | fb.add({"x": "lol"}) 46 | -------------------------------------------------------------------------------- /tmp/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /tmp/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | # -- Path setup -------------------------------------------------------------- 7 | # If extensions (or modules to document with autodoc) are in another directory, 8 | # add these directories to sys.path here. If the directory is relative to the 9 | # documentation root, use os.path.abspath to make it absolute, like shown here. 10 | # 11 | # import os 12 | # import sys 13 | # sys.path.insert(0, os.path.abspath('.')) 14 | # -- Project information ----------------------------------------------------- 15 | 16 | project = "ducks" 17 | copyright = "2022, Theo Walker" 18 | author = "Theo Walker" 19 | 20 | # The full version, including alpha/beta/rc tags 21 | release = "1.0.0" 22 | 23 | 24 | # -- General configuration --------------------------------------------------- 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be 27 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 28 | # ones. 29 | extensions = [] 30 | 31 | # Add any paths that contain templates here, relative to this directory. 32 | templates_path = ["_templates"] 33 | 34 | # List of patterns, relative to source directory, that match files and 35 | # directories to ignore when looking for source files. 36 | # This pattern also affects html_static_path and html_extra_path. 
37 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 38 | 39 | 40 | # -- Options for HTML output ------------------------------------------------- 41 | 42 | # The theme to use for HTML and HTML Help pages. See the documentation for 43 | # a list of builtin themes. 44 | # 45 | html_theme = "alabaster" 46 | 47 | # Add any paths that contain custom static files (such as style sheets) here, 48 | # relative to this directory. They are copied after the builtin static files, 49 | # so a file named "default.css" will overwrite the builtin "default.css". 50 | html_static_path = ["_static"] 51 | -------------------------------------------------------------------------------- /tmp/index.rst: -------------------------------------------------------------------------------- 1 | .. ducks documentation master file, created by 2 | sphinx-quickstart on Fri Aug 26 11:10:55 2022. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to the ducks documentation! 7 | ================================= 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /tmp/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. 
Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | --------------------------------------------------------------------------------