├── .github └── workflows │ ├── ci.yaml │ └── publish.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── beavers ├── __init__.py ├── assets │ └── favicon.ico ├── dag.py ├── kafka.py ├── pandas_wrapper.py ├── perspective_wrapper.py ├── polars_wrapper.py ├── pyarrow_kafka.py ├── pyarrow_replay.py ├── pyarrow_wrapper.py ├── replay.py ├── table.html └── testing.py ├── docs ├── concepts │ ├── advanced.md │ ├── dag.md │ ├── kafka.md │ ├── pandas.md │ ├── perspective.md │ ├── polars.md │ ├── pyarrow.md │ └── replay.md ├── contributing.md ├── faq.md ├── index.md ├── install.md ├── reference │ ├── dag.md │ ├── kafka.md │ ├── pandas_wrapper.md │ ├── pyarrow_wrapper.md │ └── replay.md ├── requirements.in ├── requirements.txt └── static │ └── icons │ └── beavers │ ├── icon.png │ └── logo.svg ├── examples ├── __init__.py ├── advanced_concepts.py ├── dag_concepts.py ├── etfs.py ├── kafka_concepts.py ├── pandas_concepts.py ├── perspective_concepts.py ├── polars_concepts.py ├── pyarrow_concepts.py └── replay_concepts.py ├── mkdocs.yml ├── poetry.lock ├── pyproject.toml ├── scripts ├── README.md ├── kafka_test_bench.py └── perpective_test_bench.py ├── tests ├── __init__.py ├── conftest.py ├── test_dag.py ├── test_docs.py ├── test_etfs.py ├── test_kafka.py ├── test_pandas_wrapper.py ├── test_perpective_wrapper.py ├── test_polars_wrapper.py ├── test_pyarrow_kafka.py ├── test_pyarrow_replay.py ├── test_pyarrow_wrapper.py ├── test_replay.py └── test_util.py └── tox.ini /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: beavers CI 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: 15 | - "3.10" 16 | - "3.11" 17 | - "3.12" 18 | - "3.13" 19 | fail-fast: false 20 | steps: 21 | - name: Checkout sources 22 | uses: actions/checkout@v4 23 | 24 | - name: Setup Python 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | cache: "pip" 29 | 30 | - name: Install pip 31 | run: "python -m pip install --upgrade pip" 32 | - name: Install tox and poetry 33 | run: "python -m pip install tox tox-gh-actions poetry==2.1.1" 34 | - name: Install poetry plugin 35 | run: 'poetry self add "poetry-dynamic-versioning[plugin]"' 36 | 37 | - name: Run tox 38 | run: tox 39 | 40 | - name: Upload coverage to Codecov 41 | uses: codecov/codecov-action@v4 42 | if: "matrix.python-version == '3.10'" 43 | with: 44 | fail_ci_if_error: true 45 | token: ${{ secrets.CODECOV_TOKEN }} 46 | -------------------------------------------------------------------------------- /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | release: 5 | types: [ published ] 6 | branches: [ main ] 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build-and-publish: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - name: Checkout sources 15 | uses: actions/checkout@v3 16 | 17 | - name: Setup Python 18 | uses: actions/setup-python@v4 19 | with: 20 | python-version: "3.10" 21 | 22 | - name: Install poetry and dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | python -m pip install poetry==2.1.1 26 | poetry self add "poetry-dynamic-versioning[plugin]" 27 | 28 | - name: Configure poetry 29 | env: 30 | 
pypi_token: ${{ secrets.PyPI_TOKEN }} 31 | run: poetry config pypi-token.pypi $pypi_token 32 | 33 | - name: Build and publish 34 | run: poetry publish --build 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # Custom 163 | /.idea 164 | /.pytest_cache 165 | /.ruff_cache 166 | /venv 167 | *.csv 168 | coverrage.xml 169 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3.10 3 | repos: 4 | - repo: https://github.com/pycqa/pydocstyle 5 | rev: 6.3.0 6 | hooks: 7 | - id: pydocstyle 8 | files: ^beavers/(dag|replay|kafka|arrow).py 9 | additional_dependencies: 10 | - tomli 11 | 12 | - repo: https://github.com/pre-commit/pre-commit-hooks 13 | rev: v5.0.0 14 | hooks: 15 | - id: check-toml 16 | - id: check-yaml 17 | - id: end-of-file-fixer 18 | - id: mixed-line-ending 19 | - repo: https://github.com/charliermarsh/ruff-pre-commit 20 | rev: v0.11.12 21 | hooks: 22 | - id: ruff 23 | args: ['--fix'] 24 | - id: ruff-format 25 | - repo: https://github.com/PyCQA/bandit 26 | rev: 1.8.3 27 | hooks: 28 | - id: bandit 29 | additional_dependencies: 30 | - tomli 31 | args: 32 | - "--config=pyproject.toml" 33 | - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks 34 | rev: v2.14.0 35 | hooks: 36 | - id: pretty-format-toml 37 | files: "^.*.toml" 38 | args: 39 | - "--autofix" 40 | - repo: https://github.com/python-poetry/poetry 41 | rev: 2.1.3 42 | hooks: 43 | - id: poetry-check 44 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.11" 7 | 8 | mkdocs: 9 | configuration: mkdocs.yml 10 | 11 | python: 12 | install: 13 | - requirements: docs/requirements.txt 14 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) 6 | and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). 
7 | 8 | 9 | ## [v0.13.0](https://github.com/tradewelltech/beavers/releases/tag/v0.13.0) - 2025-02-13 10 | 11 | [Compare with v0.12.1](https://github.com/tradewelltech/beavers/compare/v0.12.1...v0.13.0) 12 | 13 | ### Added 14 | 15 | - Add missing badge (#72) ([5bf44e9](https://github.com/tradewelltech/beavers/commit/5bf44e982740651ccf1a168ce88b4376519181ee) by 0x26res). 16 | - Add polars support ([f30da87](https://github.com/tradewelltech/beavers/commit/f30da8779c2a683f2ec2d9607134658ac70d4afb) by aandres3). 17 | 18 | ## [v0.12.1](https://github.com/tradewelltech/beavers/releases/tag/v0.12.1) - 2025-02-03 19 | 20 | [Compare with v0.12.0](https://github.com/tradewelltech/beavers/compare/v0.12.0...v0.12.1) 21 | 22 | ## [v0.12.0](https://github.com/tradewelltech/beavers/releases/tag/v0.12.0) - 2024-11-25 23 | 24 | [Compare with v0.11.0](https://github.com/tradewelltech/beavers/compare/v0.11.0...v0.12.0) 25 | 26 | ### Added 27 | 28 | - Add changelog link ([b84d6e6](https://github.com/tradewelltech/beavers/commit/b84d6e6ef42c590379f9bdd16319b1ecb9978b52) by aandres3). 29 | 30 | 31 | ## [v0.11.0](https://github.com/tradewelltech/beavers/releases/tag/v0.11.0) - 2024-11-15 32 | 33 | [Compare with v0.10.0](https://github.com/tradewelltech/beavers/compare/v0.10.0...v0.11.0) 34 | 35 | ### Added 36 | 37 | - Add python 3.13 ([1984bb2](https://github.com/tradewelltech/beavers/commit/1984bb2c7b14126084d5497243418f8bc0123494) by aandres3). 38 | 39 | ### Fixed 40 | 41 | - Fix perspective html (#70) ([ebc090d](https://github.com/tradewelltech/beavers/commit/ebc090d5a9ac7bbf31384a826cf94326426386e0) by 0x26res). 42 | 43 | ## [v0.10.0](https://github.com/tradewelltech/beavers/releases/tag/v0.10.0) - 2024-11-11 44 | 45 | [Compare with v0.9.1](https://github.com/tradewelltech/beavers/compare/v0.9.1...v0.10.0) 46 | 47 | ### Added 48 | 49 | - Add latest version of everything ([f339a52](https://github.com/tradewelltech/beavers/commit/f339a52ac8046e72f64ba4f838259d90b0791a6d) by aandres3). 50 | 51 | ### Fixed 52 | 53 | - Fix wrong offset resolution (#65) ([610bad6](https://github.com/tradewelltech/beavers/commit/610bad6cdadb29014ddc098b79e2ca5df18f1c71) by 0x26res). 54 | 55 | 56 | ## [v0.9.1](https://github.com/tradewelltech/beavers/releases/tag/v0.9.1) - 2024-09-20 57 | 58 | [Compare with v0.9.0](https://github.com/tradewelltech/beavers/compare/v0.9.0...v0.9.1) 59 | 60 | ### Fixed 61 | 62 | - Fix following perspective update ([f06f375](https://github.com/tradewelltech/beavers/commit/f06f375028c99017231faf9f5ab78c3f7f4e028e) by aandres). 63 | 64 | ## [v0.9.0](https://github.com/tradewelltech/beavers/releases/tag/v0.9.0) - 2024-07-30 65 | 66 | [Compare with v0.8.0](https://github.com/tradewelltech/beavers/compare/v0.8.0...v0.9.0) 67 | 68 | ### Added 69 | 70 | - Add perspective tools ([07878be](https://github.com/tradewelltech/beavers/commit/07878bec527d6e2523345ca437e6a64b77c47182) by aandres). 71 | 72 | ## [v0.8.0](https://github.com/tradewelltech/beavers/releases/tag/v0.8.0) - 2024-07-01 73 | 74 | [Compare with v0.7.0](https://github.com/tradewelltech/beavers/compare/v0.7.0...v0.8.0) 75 | 76 | ### Added 77 | 78 | - Add constructor to mock consumer ([370d5d6](https://github.com/tradewelltech/beavers/commit/370d5d68eb60662a110026ab7844fc3d9c6bf59b) by aandres). 79 | - Add log message for resolved offsets ([0816ea3](https://github.com/tradewelltech/beavers/commit/0816ea3bde7ec0b667b3d6b62935ebc2d7228adf) by aandres). 
80 | 81 | ### Fixed 82 | 83 | - Fix offset resolution on end of topic ([ff76c35](https://github.com/tradewelltech/beavers/commit/ff76c3519d4ae36040cf138059952c9304bc1b3d) by aandres). 84 | 85 | ## [v0.7.0](https://github.com/tradewelltech/beavers/releases/tag/v0.7.0) - 2024-06-25 86 | 87 | [Compare with v0.6.0](https://github.com/tradewelltech/beavers/compare/v0.6.0...v0.7.0) 88 | 89 | ### Added 90 | 91 | - Add poll time metrics ([efa487a](https://github.com/tradewelltech/beavers/commit/efa487a3e86f7748c160413ccba749e277e1bc5e) by aandres). 92 | 93 | ## [v0.6.0](https://github.com/tradewelltech/beavers/releases/tag/v0.6.0) - 2024-06-24 94 | 95 | [Compare with v0.5.0](https://github.com/tradewelltech/beavers/compare/v0.5.0...v0.6.0) 96 | 97 | ### Added 98 | 99 | - Add some missing replay code (#56) ([9973baa](https://github.com/tradewelltech/beavers/commit/9973baa73fd781656938578f9f0cefe7a283a389) by 0x26res). 100 | - Add contributing and code of conduct guide, update deps (#55) ([3bd1147](https://github.com/tradewelltech/beavers/commit/3bd114724b5f2ac1095b00b8e90a55dd3a7333ab) by 0x26res). 101 | 102 | ### Fixed 103 | 104 | - fix: make group optional (#54) ([03d27af](https://github.com/tradewelltech/beavers/commit/03d27af029d95be874a0b6b5e5cbc625945b984b) by 0x26res). 105 | 106 | ### Changed 107 | 108 | - Change engine to dag, add talk to the doc ([cd57456](https://github.com/tradewelltech/beavers/commit/cd57456a271f99a81602f7d7d385f0caea84acd2) by aandres). 109 | 110 | ## [v0.5.0](https://github.com/tradewelltech/beavers/releases/tag/v0.5.0) - 2024-01-23 111 | 112 | [Compare with v0.4.0](https://github.com/tradewelltech/beavers/compare/v0.4.0...v0.5.0) 113 | 114 | ### Added 115 | 116 | - Add python 12 support (#53) ([344ff69](https://github.com/tradewelltech/beavers/commit/344ff69309d81780d9d08effc2fdfe3b1f8d9b22) by 0x26res). 117 | - Add prune ([4e5b06f](https://github.com/tradewelltech/beavers/commit/4e5b06f073c2e210f4cca8d67f096698c52c3fa9) by aandres). 118 | - Add kafka json to arrow support (#50) ([120c116](https://github.com/tradewelltech/beavers/commit/120c116d13ab46604d54088bb07d851ff5d3fd00) by 0x26res). 119 | 120 | 121 | ## [v0.4.0](https://github.com/tradewelltech/beavers/releases/tag/v0.4.0) - 2023-11-26 122 | 123 | [Compare with v0.3.1](https://github.com/tradewelltech/beavers/compare/v0.3.1...v0.4.0) 124 | 125 | ### Added 126 | 127 | - Add some arrow replay code ([d8026ec](https://github.com/tradewelltech/beavers/commit/d8026ecf744886b0bb7406814904adb3308ba0b9) by 0x26res). 128 | 129 | ## [v0.3.1](https://github.com/tradewelltech/beavers/releases/tag/v0.3.1) - 2023-10-26 130 | 131 | [Compare with v0.3.0](https://github.com/tradewelltech/beavers/compare/v0.3.0...v0.3.1) 132 | ### Added 133 | 134 | - Add pandas module (#47) ([ac81344](https://github.com/tradewelltech/beavers/commit/ac8134452c3a9636ea5a119e65db87df5a245271) by 0x26res). 135 | 136 | ## [v0.3.0](https://github.com/tradewelltech/beavers/releases/tag/v0.3.0) - 2023-09-29 137 | 138 | [Compare with v0.2.0](https://github.com/tradewelltech/beavers/compare/v0.2.0...v0.3.0) 139 | 140 | ### Added 141 | 142 | - Add faq, make kafka extra dep, update readme, use poetry in tox. (#44) ([de0ddf5](https://github.com/tradewelltech/beavers/commit/de0ddf5baa51fbf5a9b818364e8a2e589a2b0974) by 0x26res). 143 | - Add pyarrow module (#42) ([1117f37](https://github.com/tradewelltech/beavers/commit/1117f375b36a5eac1468c3a5888f1fdc6e9f1ba7) by 0x26res). 
144 | - Add developer page (#41) ([b717b62](https://github.com/tradewelltech/beavers/commit/b717b6224bf9e5fd585ff6b0bed77b3333ad2a68) by 0x26res). 145 | - Add logos ([7f6b1cf](https://github.com/tradewelltech/beavers/commit/7f6b1cfc09453927ede5e485c242311362b1e417) by aandres). 146 | 147 | ### Fixed 148 | 149 | - Fix logo (#45) ([f24f0dc](https://github.com/tradewelltech/beavers/commit/f24f0dcb8a911f193aa045da0b6a0f20a69fc64e) by 0x26res). 150 | - Fix tests ([cc52ae6](https://github.com/tradewelltech/beavers/commit/cc52ae6f454d6cf3afd98b6804fd750de5a2eab1) by aandres). 151 | 152 | ### Changed 153 | 154 | - change update docs deps (#40) ([04bf706](https://github.com/tradewelltech/beavers/commit/04bf706f9277285b9dac922bb0255402d095da6e) by 0x26res). 155 | 156 | ## [v0.2.0](https://github.com/tradewelltech/beavers/releases/tag/v0.2.0) - 2023-09-19 157 | 158 | [Compare with v0.1.0](https://github.com/tradewelltech/beavers/compare/v0.1.0...v0.2.0) 159 | 160 | ### Added 161 | 162 | - Add changelog ([7ee7685](https://github.com/tradewelltech/beavers/commit/7ee76853ff4186dc1b7c9449022511a6ad477fbe) by aandres). 163 | - Add empty factory ([ee07562](https://github.com/tradewelltech/beavers/commit/ee0756289d4ed79787e760de4441933afd1aa9d7) by aandres). 164 | - Add offset policies, fix committed ([99c1ad7](https://github.com/tradewelltech/beavers/commit/99c1ad76f6d49f4a641749bdea5ec60e73392507) by aandres). 165 | - Add logging ([c8449ab](https://github.com/tradewelltech/beavers/commit/c8449aba69d18ec070755e1efbd89f083b639289) by aandres). 166 | - Add test script ([077bfc2](https://github.com/tradewelltech/beavers/commit/077bfc278809676e048ba121119e1ec67a97bb5f) by aandres). 167 | - Add kafka doc ([806a471](https://github.com/tradewelltech/beavers/commit/806a47188fa4b2c7234f3059975668142fb3c49b) by aandres). 168 | 169 | ### Fixed 170 | 171 | - Fix test, fix coverage ([6f0e371](https://github.com/tradewelltech/beavers/commit/6f0e371916c2ba61147f61adfd5995c32fe63212) by aandres). 172 | - Fix covertage ([9db6eec](https://github.com/tradewelltech/beavers/commit/9db6eec070d4e7783bc6028f85ad468b0b26e7c8) by aandres). 173 | - Fix example ([39f4b44](https://github.com/tradewelltech/beavers/commit/39f4b44f48b2b5efe2761f762e7d85ee256df76d) by aandres). 174 | 175 | ## [v0.1.0](https://github.com/tradewelltech/beavers/releases/tag/v0.1.0) - 2023-08-24 176 | 177 | [Compare with v0.0.4](https://github.com/tradewelltech/beavers/compare/v0.0.4...v0.1.0) 178 | 179 | ## [v0.0.4](https://github.com/tradewelltech/beavers/releases/tag/v0.0.4) - 2023-08-22 180 | 181 | [Compare with v0.0.3](https://github.com/tradewelltech/beavers/compare/v0.0.3...v0.0.4) 182 | 183 | ### Added 184 | 185 | - Add dag metrics ([c46a4ee](https://github.com/tradewelltech/beavers/commit/c46a4eec655984c2525fe094942fd002deeb5645) by aandres). 186 | - Add missing assert ([86b924f](https://github.com/tradewelltech/beavers/commit/86b924f06d78cf3b3a8b98e8137275490b61f815) by aandres). 187 | - Add replay doc ([d5b9b43](https://github.com/tradewelltech/beavers/commit/d5b9b43bd3012e292ad86219c5fd304d3fb11198) by aandres). 188 | - Add repaly metrics ([ba274ef](https://github.com/tradewelltech/beavers/commit/ba274ef7d53cda1e380a7defbd5d4884cf018e4a) by aandres). 189 | - Add test ([8e87c6e](https://github.com/tradewelltech/beavers/commit/8e87c6e8a76b6dadcedf810b0373d12cba7f3309) by aandres). 190 | - Add install section ([520ced1](https://github.com/tradewelltech/beavers/commit/520ced1def5b7508507df6cd65339515680b41fe) by aandres). 
191 | 192 | ### Fixed 193 | 194 | - Fix equality check on nodes ([fa1a09f](https://github.com/tradewelltech/beavers/commit/fa1a09f300b2dd2c307a09f80b8ab37cfd949ea4) by aandres). 195 | - fix test ([85005d5](https://github.com/tradewelltech/beavers/commit/85005d5abcc82685396c39bcf1618aacf0b8ed75) by aandres). 196 | - Fix tox ([7bef814](https://github.com/tradewelltech/beavers/commit/7bef81471d21b405c5982ca19baf1b7ae345f930) by aandres). 197 | 198 | ### Removed 199 | 200 | - Remove dead code ([af932d4](https://github.com/tradewelltech/beavers/commit/af932d41ab86fde774dd77f67070ba98a9977df4) by aandres). 201 | 202 | ## [v0.0.3](https://github.com/tradewelltech/beavers/releases/tag/v0.0.3) - 2023-07-05 203 | 204 | [Compare with v0.0.2](https://github.com/tradewelltech/beavers/compare/v0.0.2...v0.0.3) 205 | 206 | ### Added 207 | 208 | - Add doc ([cb624c7](https://github.com/tradewelltech/beavers/commit/cb624c706920134d362430b0a094b0c722890e43) by aandres). 209 | - Add kafka ([92c37fb](https://github.com/tradewelltech/beavers/commit/92c37fba76b8c26943327834198a24505d0bea79) by aandres). 210 | 211 | ### Fixed 212 | 213 | - Fix kafka test coverage ([ecbc890](https://github.com/tradewelltech/beavers/commit/ecbc890f1adddaf236631e95ccf41ed6002430f3) by aandres). 214 | - Fix icon ([8887278](https://github.com/tradewelltech/beavers/commit/88872786071f882f23335c47721fd53a23771b2e) by aandres). 215 | 216 | ## [v0.0.2](https://github.com/tradewelltech/beavers/releases/tag/v0.0.2) - 2023-06-30 217 | 218 | [Compare with v0.0.1](https://github.com/tradewelltech/beavers/compare/v0.0.1...v0.0.2) 219 | 220 | ### Added 221 | 222 | - Add advanced concept ([3450d72](https://github.com/tradewelltech/beavers/commit/3450d728872962dff7101189d20a4e81a48d8e2e) by aandres). 223 | - Add concept page, rename stabilize ([9c0b9eb](https://github.com/tradewelltech/beavers/commit/9c0b9eba0bf0bd604e0195530bc25e2fb767509a) by aandres). 224 | - Add doc to main api ([4048ae7](https://github.com/tradewelltech/beavers/commit/4048ae7c29c56ffa789b3c1c4f7a3c53aba44a75) by aandres). 225 | - Add const test ([e1af0bd](https://github.com/tradewelltech/beavers/commit/e1af0bdf61d144e76c029421a274433f6967df4c) by aandres). 226 | - Add hook for pydoc ([ad10948](https://github.com/tradewelltech/beavers/commit/ad109481ff06ea4ae26acd3e1279fc056fd5ee54) by aandres). 227 | - Add replay ([c807bef](https://github.com/tradewelltech/beavers/commit/c807bef6354573124d410e13c85450d0cdacf681) by aandres). 228 | - Add ETF example ([e3c4c2e](https://github.com/tradewelltech/beavers/commit/e3c4c2e9f3423e814d47c1dc40e182c88f05c9ba) by aandres). 229 | 230 | ### Fixed 231 | 232 | - fix typos ([8df6f74](https://github.com/tradewelltech/beavers/commit/8df6f7412ae96a6cbe55b1941d6475d3754fc0de) by aandres). 233 | - Fix coverage ([fafaa9a](https://github.com/tradewelltech/beavers/commit/fafaa9a49c4c038094058ab8f99346c9e45e9dde) by aandres). 234 | - Fix test coverage ([41938e9](https://github.com/tradewelltech/beavers/commit/41938e9c0c558d56cb89144fb539d00cf85254cf) by aandres). 235 | - Fix ci ([9c46069](https://github.com/tradewelltech/beavers/commit/9c46069ce380cc59a5c53aa1743a9f369d7283bf) by aandres). 236 | 237 | ### Removed 238 | 239 | - Remove trailing blank space ([22195ca](https://github.com/tradewelltech/beavers/commit/22195ca075c77deef92f0a7ea00025f0f1a71561) by aandres). 
240 | 241 | ## [v0.0.1](https://github.com/tradewelltech/beavers/releases/tag/v0.0.1) - 2023-05-10 242 | 243 | [Compare with v0.0.1.rc](https://github.com/tradewelltech/beavers/compare/v0.0.1.rc...v0.0.1) 244 | 245 | ### Added 246 | 247 | - Add ci badge ([fdad06c](https://github.com/tradewelltech/beavers/commit/fdad06ca65ed1135d052c4e9e4a13e48b50cdabe) by aandres). 248 | - Add material ([31a46e4](https://github.com/tradewelltech/beavers/commit/31a46e4f5e39824736064fcc13d7fea600be5ac9) by aandres). 249 | - Add python doc requirements ([e1bcd00](https://github.com/tradewelltech/beavers/commit/e1bcd00aba018dba8e16d781b0f6ca9e783105c0) by aandres). 250 | - Add docs ([3c1e87a](https://github.com/tradewelltech/beavers/commit/3c1e87aa14d3d132189d7d5a3bbe66e6df0a57c5) by aandres). 251 | - Add coverage to deps ([ced0670](https://github.com/tradewelltech/beavers/commit/ced0670226f4ef43539efa75b1e6e455efda1df2) by aandres). 252 | 253 | ### Fixed 254 | 255 | - Fix branch ([9847cb9](https://github.com/tradewelltech/beavers/commit/9847cb9b4fd4d59c3060318805caeffbe8582cf7) by aandres). 256 | - Fix read the docs ([ecf5d25](https://github.com/tradewelltech/beavers/commit/ecf5d25cefe8be2c0448913d4f1ef100753a644a) by aandres). 257 | 258 | ### Removed 259 | 260 | - Remove duplicate and snyk ([b7e8539](https://github.com/tradewelltech/beavers/commit/b7e8539a682162de0fff1a9b6a5f55ca5f550da2) by aandres). 261 | 262 | ## [v0.0.1.rc](https://github.com/tradewelltech/beavers/releases/tag/v0.0.1.rc) - 2023-05-09 263 | 264 | [Compare with first commit](https://github.com/tradewelltech/beavers/compare/1cc83cb780e53ef55308100c655c321dcc945d3b...v0.0.1.rc) 265 | 266 | ### Added 267 | 268 | - add pre commit ([12d7ffa](https://github.com/tradewelltech/beavers/commit/12d7ffa203c8c88cbb68f683fc2d992960e170fe) by aandres). 269 | - Add engine code ([e2f0949](https://github.com/tradewelltech/beavers/commit/e2f0949dd5dc69692455c7564c5f6bcfd997754d) by aandres). 270 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | 2 | # Contributor Covenant Code of Conduct 3 | 4 | ## Our Pledge 5 | 6 | We as members, contributors, and leaders pledge to make participation in our 7 | community a harassment-free experience for everyone, regardless of age, body 8 | size, visible or invisible disability, ethnicity, sex characteristics, gender 9 | identity and expression, level of experience, education, socio-economic status, 10 | nationality, personal appearance, race, religion, or sexual identity 11 | and orientation. 12 | 13 | We pledge to act and interact in ways that contribute to an open, welcoming, 14 | diverse, inclusive, and healthy community. 
15 | 16 | ## Our Standards 17 | 18 | Examples of behavior that contributes to a positive environment for our 19 | community include: 20 | 21 | * Demonstrating empathy and kindness toward other people 22 | * Being respectful of differing opinions, viewpoints, and experiences 23 | * Giving and gracefully accepting constructive feedback 24 | * Accepting responsibility and apologizing to those affected by our mistakes, 25 | and learning from the experience 26 | * Focusing on what is best not just for us as individuals, but for the 27 | overall community 28 | 29 | Examples of unacceptable behavior include: 30 | 31 | * The use of sexualized language or imagery, and sexual attention or 32 | advances of any kind 33 | * Trolling, insulting or derogatory comments, and personal or political attacks 34 | * Public or private harassment 35 | * Publishing others' private information, such as a physical or email 36 | address, without their explicit permission 37 | * Other conduct which could reasonably be considered inappropriate in a 38 | professional setting 39 | 40 | ## Enforcement Responsibilities 41 | 42 | Community leaders are responsible for clarifying and enforcing our standards of 43 | acceptable behavior and will take appropriate and fair corrective action in 44 | response to any behavior that they deem inappropriate, threatening, offensive, 45 | or harmful. 46 | 47 | Community leaders have the right and responsibility to remove, edit, or reject 48 | comments, commits, code, wiki edits, issues, and other contributions that are 49 | not aligned to this Code of Conduct, and will communicate reasons for moderation 50 | decisions when appropriate. 51 | 52 | ## Scope 53 | 54 | This Code of Conduct applies within all community spaces, and also applies when 55 | an individual is officially representing the community in public spaces. 56 | Examples of representing our community include using an official email address, 57 | posting via an official social media account, or acting as an appointed 58 | representative at an online or offline event. 59 | 60 | ## Enforcement 61 | 62 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 63 | reported to the community leaders responsible for enforcement at 64 | [INSERT CONTACT METHOD]. 65 | All complaints will be reviewed and investigated promptly and fairly. 66 | 67 | All community leaders are obligated to respect the privacy and security of the 68 | reporter of any incident. 69 | 70 | ## Enforcement Guidelines 71 | 72 | Community leaders will follow these Community Impact Guidelines in determining 73 | the consequences for any action they deem in violation of this Code of Conduct: 74 | 75 | ### 1. Correction 76 | 77 | **Community Impact**: Use of inappropriate language or other behavior deemed 78 | unprofessional or unwelcome in the community. 79 | 80 | **Consequence**: A private, written warning from community leaders, providing 81 | clarity around the nature of the violation and an explanation of why the 82 | behavior was inappropriate. A public apology may be requested. 83 | 84 | ### 2. Warning 85 | 86 | **Community Impact**: A violation through a single incident or series 87 | of actions. 88 | 89 | **Consequence**: A warning with consequences for continued behavior. No 90 | interaction with the people involved, including unsolicited interaction with 91 | those enforcing the Code of Conduct, for a specified period of time. This 92 | includes avoiding interactions in community spaces as well as external channels 93 | like social media. 
Violating these terms may lead to a temporary or 94 | permanent ban. 95 | 96 | ### 3. Temporary Ban 97 | 98 | **Community Impact**: A serious violation of community standards, including 99 | sustained inappropriate behavior. 100 | 101 | **Consequence**: A temporary ban from any sort of interaction or public 102 | communication with the community for a specified period of time. No public or 103 | private interaction with the people involved, including unsolicited interaction 104 | with those enforcing the Code of Conduct, is allowed during this period. 105 | Violating these terms may lead to a permanent ban. 106 | 107 | ### 4. Permanent Ban 108 | 109 | **Community Impact**: Demonstrating a pattern of violation of community 110 | standards, including sustained inappropriate behavior, harassment of an 111 | individual, or aggression toward or disparagement of classes of individuals. 112 | 113 | **Consequence**: A permanent ban from any sort of public interaction within 114 | the community. 115 | 116 | ## Attribution 117 | 118 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 119 | version 2.0, available at 120 | [https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0]. 121 | 122 | Community Impact Guidelines were inspired by 123 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 124 | 125 | For answers to common questions about this code of conduct, see the FAQ at 126 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available 127 | at [https://www.contributor-covenant.org/translations][translations]. 128 | 129 | [homepage]: https://www.contributor-covenant.org 130 | [v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html 131 | [Mozilla CoC]: https://github.com/mozilla/diversity 132 | [FAQ]: https://www.contributor-covenant.org/faq 133 | [translations]: https://www.contributor-covenant.org/translations 134 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Beavers 2 | 3 | See the [contributing](https://beavers.readthedocs.io/en/latest/contributing/) section of the doc. 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![PyPI Version][pypi-image]][pypi-url] 3 | [![Python Version][versions-image]][versions-url] 4 | [![Github Stars][stars-image]][stars-url] 5 | [![codecov][codecov-image]][codecov-url] 6 | [![Build Status][build-image]][build-url] 7 | [![Documentation][doc-image]][doc-url] 8 | [![License][license-image]][license-url] 9 | [![Downloads][downloads-image]][downloads-url] 10 | [![Downloads][downloads-month-image]][downloads-month-url] 11 | [![Code style: black][codestyle-image]][codestyle-url] 12 | [![snyk][snyk-image]][snyk-url] 13 | 14 | trackgit-views 15 | 16 | 17 | ![Beavers Logo][5] 18 | 19 | # Beavers 20 | 21 | [Documentation][6] / [Installation][7] / [Repository][1] / [PyPI][8] 22 | 23 | [Beavers][1] is a python library for stream processing, optimized for analytics. 24 | 25 | It is used at [Tradewell Technologies][2], 26 | to calculate analytics and serve model predictions, 27 | for both realtime and batch jobs. 28 | 29 | ## Key Features 30 | 31 | - Works in **real time** (eg: reading from Kafka) and **replay mode** (eg: reading from Parquet files). 32 | - Optimized for analytics, using micro-batches (instead of processing records one by one). 33 | - Similar to [incremental][3], it updates nodes in a dag incrementally. 34 | - Taking inspiration from [kafka streams][4], there are two types of nodes in the dag: 35 | - **Stream**: ephemeral micro-batches of events (cleared after every cycle). 36 | - **State**: durable state derived from streams. 37 | - Clear separation between the business logic and the IO. 38 | So the same dag can be used in real time mode, replay mode or can be easily tested. 39 | - Functional interface: no inheritance or decorator required. 40 | - Support for complicated joins, not just "linear" data flow. 41 | 42 | ## Limitations 43 | 44 | - No concurrency support. 45 | To speed up calculation use libraries like pandas, pyarrow or polars. 46 | - No async code. 47 | To speed up IO use kafka driver native thread or parquet IO thread pool. 48 | - No support for persistent state. 49 | Instead of saving state, replay historic data from kafka to prime stateful nodes. 
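## Quick Sketch

A minimal sketch of the core dag API, using the same `source_stream`, `state` and `map` calls as the typed wrappers shipped in this repository (`beavers.pandas_wrapper`, `beavers.pyarrow_wrapper`, `beavers.polars_wrapper`). The plain-list source and the `WordCount` callable are illustrative only; feeding data into the source and running update cycles is handled by the Kafka (real time) and replay (batch) drivers described in the documentation.

```python
from beavers import Dag


class WordCount:
    """Illustrative stateful node: keeps a running count of words seen so far."""

    def __init__(self) -> None:
        self.counts: dict[str, int] = {}

    def __call__(self, words: list[str]) -> dict[str, int]:
        for word in words:
            self.counts[word] = self.counts.get(word, 0) + 1
        return self.counts


dag = Dag()

# Stream node: ephemeral micro-batches of words, cleared after every cycle
words = dag.source_stream(empty=[], name="words")

# State node: durable word counts, updated incrementally from the stream
counts = dag.state(WordCount()).map(words)
```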
50 | 51 | ## Talks 52 | 53 | - [Unified batch and stream processing in python | PyData Global 2023][9] 54 | 55 | [1]: https://github.com/tradewelltech/beavers 56 | [2]: https://www.tradewelltech.co/ 57 | [3]: https://github.com/janestreet/incremental 58 | [4]: https://www.confluent.io/blog/kafka-streams-tables-part-1-event-streaming/ 59 | [5]: https://raw.githubusercontent.com/tradewelltech/beavers/master/docs/static/icons/beavers/logo.svg 60 | [6]: https://beavers.readthedocs.io/en/latest/ 61 | [7]: https://beavers.readthedocs.io/en/latest/install/ 62 | [8]: https://pypi.org/project/beavers/ 63 | [9]: https://www.youtube.com/watch?v=8pUwsGA8SQM 64 | 65 | [pypi-image]: https://img.shields.io/pypi/v/beavers 66 | [pypi-url]: https://pypi.org/project/beavers/ 67 | [build-image]: https://github.com/tradewelltech/beavers/actions/workflows/ci.yaml/badge.svg 68 | [build-url]: https://github.com/tradewelltech/beavers/actions/workflows/ci.yaml 69 | [stars-image]: https://img.shields.io/github/stars/tradewelltech/beavers 70 | [stars-url]: https://github.com/tradewelltech/beavers 71 | [versions-image]: https://img.shields.io/pypi/pyversions/beavers 72 | [versions-url]: https://pypi.org/project/beavers/ 73 | [doc-image]: https://readthedocs.org/projects/beavers/badge/?version=latest 74 | [doc-url]: https://beavers.readthedocs.io/en/latest/?badge=latest 75 | [license-image]: http://img.shields.io/:license-Apache%202-blue.svg 76 | [license-url]: https://github.com/tradewelltech/beavers/blob/main/LICENSE 77 | [codecov-image]: https://codecov.io/gh/tradewelltech/beavers/branch/main/graph/badge.svg?token=GY6KL7NT1Q 78 | [codecov-url]: https://codecov.io/gh/tradewelltech/beavers 79 | [downloads-image]: https://pepy.tech/badge/beavers 80 | [downloads-url]: https://static.pepy.tech/badge/beavers 81 | [downloads-month-image]: https://pepy.tech/badge/beavers/month 82 | [downloads-month-url]: https://static.pepy.tech/badge/beavers/month 83 | [codestyle-image]: https://img.shields.io/badge/code%20style-black-000000.svg 84 | [codestyle-url]: https://github.com/ambv/black 85 | [snyk-image]: https://snyk.io/advisor/python/beavers/badge.svg 86 | [snyk-url]: https://snyk.io/advisor/python/beavers 87 | -------------------------------------------------------------------------------- /beavers/__init__.py: -------------------------------------------------------------------------------- 1 | from beavers.dag import Dag, Node, TimerManager 2 | 3 | __version__ = "0.0.0" 4 | __all__ = ["Dag", "Node", "TimerManager"] 5 | -------------------------------------------------------------------------------- /beavers/assets/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tradewelltech/beavers/ec9979086868589ab82b47ce55fa11cc31b32c16/beavers/assets/favicon.ico -------------------------------------------------------------------------------- /beavers/pandas_wrapper.py: -------------------------------------------------------------------------------- 1 | """Module for building dags using pandas.""" 2 | 3 | import dataclasses 4 | from typing import Callable, Optional, ParamSpec 5 | 6 | import pandas as pd 7 | 8 | from beavers import Dag, Node 9 | from beavers.dag import NodePrototype 10 | 11 | P = ParamSpec("P") 12 | 13 | 14 | def _empty_df(dtypes: pd.Series) -> pd.DataFrame: 15 | return pd.DataFrame(columns=dtypes.index).astype(dtypes) 16 | 17 | 18 | def _get_stream_dtypes(node: Node[pd.DataFrame]) -> pd.Series: 19 | empty = node._get_empty() 20 | if not 
isinstance(empty, pd.DataFrame): 21 | raise TypeError(f"Argument should be a {Node.__name__}[pd.DataFrame]") 22 | else: 23 | return empty.dtypes 24 | 25 | 26 | @dataclasses.dataclass() 27 | class _LastTracker: 28 | key_columns: list[str] 29 | current: pd.DataFrame 30 | 31 | def __call__(self, stream: pd.DataFrame): 32 | self.current = ( 33 | pd.concat([self.current, stream]) 34 | .groupby(self.key_columns, as_index=False) 35 | .tail(1) 36 | .reset_index(drop=True) 37 | ) 38 | 39 | return self.current 40 | 41 | 42 | @dataclasses.dataclass(frozen=True) 43 | class PandasWrapper: 44 | """Helper call for adding pandas Nodes to a Dag.""" 45 | 46 | _dag: Dag 47 | 48 | def source_df( 49 | self, dtypes: pd.Series, name: Optional[str] = None 50 | ) -> Node[pd.DataFrame]: 51 | empty = _empty_df(dtypes) 52 | return self._dag.source_stream(empty, name=name) 53 | 54 | def df_stream( 55 | self, function: Callable[P, pd.DataFrame], dtypes: pd.Series 56 | ) -> NodePrototype[pd.DataFrame]: 57 | return self._dag.stream(function, empty=_empty_df(dtypes)) 58 | 59 | def last_by_keys( 60 | self, stream: Node[pd.DataFrame], keys: list[str] 61 | ) -> Node[pd.DataFrame]: 62 | """Build a state of the latest row by keys.""" 63 | dtypes = _get_stream_dtypes(stream) 64 | for key in keys: 65 | assert key in dtypes, key 66 | return self._dag.state(_LastTracker(keys, _empty_df(dtypes))).map(stream) 67 | -------------------------------------------------------------------------------- /beavers/perspective_wrapper.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import pathlib 3 | from typing import Any, Literal, Optional, Sequence 4 | 5 | import perspective 6 | import pyarrow as pa 7 | import tornado 8 | from perspective.handlers.tornado import PerspectiveTornadoHandler 9 | 10 | from beavers import Dag, Node 11 | from beavers.kafka import KafkaDriver 12 | 13 | COMPARATORS = ( 14 | "==", 15 | "!=", 16 | ">", 17 | ">=", 18 | "<", 19 | "<=", 20 | "begins with", 21 | "contains", 22 | "ends with", 23 | "in", 24 | "not in", 25 | "is not null", 26 | "is null", 27 | ) 28 | 29 | _SOURCE_DIRECTORY = pathlib.Path(__file__).parent 30 | TABLE_PATH = str(_SOURCE_DIRECTORY / "table.html") 31 | ASSETS_DIRECTORY = str(_SOURCE_DIRECTORY / "assets") 32 | 33 | 34 | @dataclasses.dataclass(frozen=True) 35 | class PerspectiveTableDefinition: 36 | """ 37 | API table definition 38 | """ 39 | 40 | name: str 41 | index_column: str 42 | remove_column: Optional[str] = None 43 | sort: list[tuple[str, Literal["asc", "desc"]]] = dataclasses.field( 44 | default_factory=list 45 | ) 46 | filters: list[tuple[str, str, Any]] = dataclasses.field(default_factory=list) 47 | hidden_columns: Sequence[str] = () 48 | limit: Optional[int] = None 49 | 50 | def validate(self, schema: pa.Schema): 51 | assert self.index_column in schema.names, self.index_column 52 | if self.remove_column is not None: 53 | assert isinstance(self.remove_column, str) 54 | assert self.remove_column in schema.names, self.remove_column 55 | 56 | assert isinstance(self.sort, list) 57 | for column, order in self.sort: 58 | assert isinstance(column, str) 59 | assert column in schema.names 60 | assert order in ("asc", "desc") 61 | for column in self.hidden_columns: 62 | assert isinstance(column, str) 63 | assert column in schema.names 64 | for each_filter in self.filters: 65 | assert len(each_filter) in (2, 3) 66 | assert isinstance(each_filter[0], str), each_filter 67 | assert each_filter[1] in COMPARATORS 68 | 69 | 70 | 
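# Illustrative usage (hypothetical column names), sketching how a table
# definition is typically declared before being handed to
# PerspectiveDagWrapper.to_perspective along with a pyarrow table node:
#
#     definition = PerspectiveTableDefinition(
#         name="orders",
#         index_column="order_id",
#         sort=[("order_time", "desc")],
#         hidden_columns=("internal_id",),
#     )
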
@dataclasses.dataclass(frozen=True) 71 | class _TableConfig: 72 | """ 73 | Internal perspective table config, which is passed to the html template 74 | """ 75 | 76 | name: str 77 | index: str 78 | columns: list[str] 79 | sort: Sequence[tuple[str, Literal["asc", "desc"]]] 80 | filters: Sequence[tuple[str, str, Any]] 81 | 82 | @staticmethod 83 | def from_definition(definition: PerspectiveTableDefinition, schema: pa.Schema): 84 | return _TableConfig( 85 | name=definition.name, 86 | index=definition.index_column, 87 | columns=[f for f in schema.names if f not in definition.hidden_columns], 88 | sort=[] if definition.sort is None else definition.sort, 89 | filters=definition.filters, 90 | ) 91 | 92 | 93 | class TableRequestHandler(tornado.web.RequestHandler): 94 | """Renders the table.html template, using the provided configurations""" 95 | 96 | _tables: Optional[dict[str, _TableConfig]] = None 97 | _default_table: Optional[str] = None 98 | 99 | def initialize(self, table_configs: list[_TableConfig]) -> None: 100 | self._tables = { 101 | table_config.name: table_config for table_config in table_configs 102 | } 103 | self._default_table = table_configs[0].name 104 | 105 | async def get(self, path: str) -> None: 106 | table_name = path or self._default_table 107 | table_config = self._tables[table_name] 108 | 109 | await self.render( 110 | TABLE_PATH, 111 | table_config=table_config, 112 | perspective_version=perspective.__version__, 113 | ) 114 | 115 | 116 | def _table_to_bytes(table: pa.Table) -> bytes: 117 | """Serialize a table as bytes, to pass it to a perspective table""" 118 | with pa.BufferOutputStream() as sink: 119 | with pa.ipc.new_stream(sink, table.schema) as writer: 120 | for batch in table.to_batches(): 121 | writer.write_batch(batch) 122 | return sink.getvalue().to_pybytes() 123 | 124 | 125 | @dataclasses.dataclass(frozen=True) 126 | class _UpdateRunner: 127 | kafka_driver: KafkaDriver 128 | 129 | def __call__(self): 130 | self.kafka_driver.run_cycle(0.0) 131 | 132 | 133 | @dataclasses.dataclass() 134 | class _PerspectiveNode: 135 | table_definition: PerspectiveTableDefinition 136 | schema: pa.Schema 137 | table: perspective.Table | None = None 138 | 139 | def __call__(self, table: pa.Table) -> None: 140 | """Pass the arrow data to perspective""" 141 | self.table.update(_table_to_bytes(table)) 142 | 143 | def get_table_config(self) -> _TableConfig: 144 | return _TableConfig.from_definition(self.table_definition, self.schema) 145 | 146 | 147 | @dataclasses.dataclass(frozen=True) 148 | class PerspectiveDagWrapper: 149 | """Helper for adding perspective Nodes to a Dag.""" 150 | 151 | _dag: Dag 152 | 153 | def to_perspective( 154 | self, 155 | node: Node, 156 | table_definition: PerspectiveTableDefinition, 157 | schema: Optional[pa.Schema] = None, 158 | ) -> None: 159 | """Add a source stream of type `pa.Table`.""" 160 | if schema is None: 161 | assert node._is_stream(), "Must provide a schema for state nodes" 162 | empty = node._empty_factory() 163 | assert isinstance(empty, pa.Table), "Only pyarrow.Table nodes supported" 164 | schema = empty.schema 165 | table_definition.validate(schema) 166 | self._dag.state( 167 | _PerspectiveNode( 168 | table_definition, 169 | schema, 170 | table=None, 171 | ) 172 | ).map(node) 173 | 174 | 175 | DATA_TYPES = [ 176 | (pa.types.is_integer, "integer"), 177 | (pa.types.is_floating, "float"), 178 | (pa.types.is_boolean, "boolean"), 179 | (pa.types.is_date, "date"), 180 | (pa.types.is_string, "string"), 181 | (pa.types.is_timestamp, "datetime"), 182 | 
] 183 | 184 | 185 | def to_perspective_type(data_type: pa.DataType) -> Any: 186 | for predicate, perspective_type in DATA_TYPES: 187 | if predicate(data_type): 188 | return perspective_type 189 | raise TypeError(f"Unsupported type: {data_type}") 190 | 191 | 192 | def to_perspective_schema(schema: pa.Schema) -> dict[str, Any]: 193 | return {f.name: to_perspective_type(f.type) for f in schema} 194 | 195 | 196 | def perspective_thread( 197 | perspective_server: perspective.Server, 198 | kafka_driver: KafkaDriver, 199 | nodes: list[_PerspectiveNode], 200 | ): 201 | local_client = perspective_server.new_local_client() 202 | for node in nodes: 203 | assert node.table is None 204 | node.table = local_client.table( 205 | to_perspective_schema(node.schema), 206 | name=node.table_definition.name, 207 | index=node.table_definition.index_column, 208 | ) 209 | 210 | callback = tornado.ioloop.PeriodicCallback( 211 | callback=_UpdateRunner(kafka_driver), callback_time=1_000 212 | ) 213 | callback.start() 214 | 215 | 216 | def run_web_application( 217 | kafka_driver: KafkaDriver, 218 | assets_directory: str = ASSETS_DIRECTORY, 219 | port: int = 8082, 220 | ) -> None: 221 | server = perspective.Server() 222 | 223 | nodes: list[_PerspectiveNode] = [] 224 | for node in kafka_driver._dag._nodes: 225 | if isinstance(node._function, _PerspectiveNode): 226 | nodes.append(node._function) 227 | assert len(nodes) > 0, "No perspective table nodes" 228 | assert len({n.table_definition.name for n in nodes}) == len(nodes), ( 229 | "Duplicate table name" 230 | ) 231 | 232 | web_app = tornado.web.Application( 233 | [ 234 | ( 235 | r"/websocket", 236 | PerspectiveTornadoHandler, 237 | {"perspective_server": server}, 238 | ), 239 | ( 240 | r"/assets/(.*)", 241 | tornado.web.StaticFileHandler, 242 | {"path": assets_directory, "default_filename": None}, 243 | ), 244 | ( 245 | r"/([a-z0-9_]*)", 246 | TableRequestHandler, 247 | {"table_configs": [node.get_table_config() for node in nodes]}, 248 | ), 249 | ], 250 | serve_traceback=True, 251 | ) 252 | web_app.listen(port) 253 | loop = tornado.ioloop.IOLoop.current() 254 | loop.call_later(0, perspective_thread, server, kafka_driver, nodes) 255 | loop.start() 256 | -------------------------------------------------------------------------------- /beavers/polars_wrapper.py: -------------------------------------------------------------------------------- 1 | """Module for building dags using polars.""" 2 | 3 | import dataclasses 4 | from operator import itemgetter 5 | from typing import Callable, Optional, ParamSpec, Iterable, Any 6 | 7 | import polars as pl 8 | from polars._typing import IntoExprColumn 9 | 10 | from beavers.dag import Dag, Node, NodePrototype 11 | 12 | P = ParamSpec("P") 13 | 14 | 15 | @dataclasses.dataclass() 16 | class _LastByKey: 17 | key_columns: tuple[str, ...] 
18 | current: pl.DataFrame 19 | 20 | def __call__(self, stream: pl.DataFrame) -> pl.DataFrame: 21 | self.current = ( 22 | pl.concat([self.current, stream]) 23 | .group_by(self.key_columns, maintain_order=True) 24 | .last() 25 | .select(self.current.columns) 26 | ) 27 | return self.current 28 | 29 | 30 | def _get_stream_schema(node: Node[pl.DataFrame]) -> pl.Schema: 31 | empty = node._get_empty() 32 | if not isinstance(empty, pl.DataFrame): 33 | raise TypeError(f"Argument should be a {Node.__name__}[pl.DataFrame]") 34 | else: 35 | return empty.schema 36 | 37 | 38 | def _get_stream_dtype(node: Node[pl.Series]) -> pl.DataType: 39 | empty = node._get_empty() 40 | if not isinstance(empty, pl.Series): 41 | raise TypeError(f"Argument should be a {Node.__name__}[pl.Series]") 42 | else: 43 | return empty.dtype 44 | 45 | 46 | @dataclasses.dataclass(frozen=True) 47 | class _TableFilter: 48 | predicate: tuple[IntoExprColumn | Iterable[IntoExprColumn], ...] 49 | constraints: dict[str, Any] 50 | 51 | def __call__(self, table: pl.DataFrame) -> pl.DataFrame: 52 | return table.filter(*self.predicate, **self.constraints) 53 | 54 | 55 | @dataclasses.dataclass(frozen=True) 56 | class PolarsDagWrapper: 57 | """Helper for adding polars Nodes to a Dag.""" 58 | 59 | _dag: Dag 60 | 61 | def source_table( 62 | self, schema: pl.Schema, name: Optional[str] = None 63 | ) -> Node[pl.DataFrame]: 64 | """Add a source stream of type `pl.DataFrame`.""" 65 | 66 | return self._dag.source_stream(empty=schema.to_frame(), name=name) 67 | 68 | def table_stream( 69 | self, function: Callable[P, pl.DataFrame], schema: pl.Schema 70 | ) -> NodePrototype[pl.DataFrame]: 71 | """Add a stream node of output type `pl.DataFrame`""" 72 | return self._dag.stream(function, empty=schema.to_frame()) 73 | 74 | def filter_stream( 75 | self, 76 | stream: Node[pl.DataFrame], 77 | *predicates: IntoExprColumn | Iterable[IntoExprColumn], 78 | **constraints: Any, 79 | ) -> Node[pl.DataFrame]: 80 | """Filter a stream Node of type `pl.DataFrame`.""" 81 | schema = _get_stream_schema(stream) 82 | return self._dag.stream( 83 | _TableFilter(tuple(predicates), dict(constraints)), 84 | empty=schema.to_frame(), 85 | ).map(stream) 86 | 87 | def last_by_keys( 88 | self, stream: Node[pl.DataFrame], keys: list[str] 89 | ) -> Node[pl.DataFrame]: 90 | """Build a state of the latest row by keys.""" 91 | schema = _get_stream_schema(stream) 92 | for key in keys: 93 | assert isinstance(key, str), "Keys must be strings" 94 | return self._dag.state(_LastByKey(tuple(keys), schema.to_frame())).map(stream) 95 | 96 | def concat_series(self, *streams: Node[pl.Series]) -> Node[pl.Series]: 97 | if len(streams) == 0: 98 | raise ValueError("Must pass at least one series") 99 | series_type = None 100 | for stream in streams: 101 | each_type = _get_stream_dtype(stream) 102 | if series_type is None: 103 | series_type = each_type 104 | elif series_type != each_type: 105 | raise TypeError(f"Series type mismatch {series_type} vs {each_type}") 106 | 107 | empty = pl.Series(dtype=series_type) 108 | return self._dag.stream(lambda *x: pl.concat(x), empty=empty).map(*streams) 109 | 110 | def get_series(self, stream: Node[pl.DataFrame], name: str) -> Node[pl.Series]: 111 | empty = _get_stream_schema(stream).to_frame()[name] 112 | return self._dag.stream(itemgetter(name), empty=empty).map(stream) 113 | -------------------------------------------------------------------------------- /beavers/pyarrow_kafka.py: -------------------------------------------------------------------------------- 1 | 
import dataclasses 2 | import io 3 | import json 4 | 5 | import confluent_kafka 6 | import pyarrow as pa 7 | import pyarrow.json 8 | 9 | from beavers.kafka import ( 10 | KafkaMessageDeserializer, 11 | KafkaMessageSerializer, 12 | KafkaProducerMessage, 13 | ) 14 | 15 | 16 | @dataclasses.dataclass(frozen=True) 17 | class JsonDeserializer(KafkaMessageDeserializer[pa.Table]): 18 | schema: pa.Schema 19 | 20 | def __call__(self, messages: confluent_kafka.Message) -> pa.Table: 21 | if messages: 22 | with io.BytesIO() as buffer: 23 | for message in messages: 24 | buffer.write(message.value()) 25 | buffer.write(b"\n") 26 | buffer.seek(0) 27 | return pyarrow.json.read_json( 28 | buffer, 29 | parse_options=pyarrow.json.ParseOptions( 30 | explicit_schema=self.schema 31 | ), 32 | ) 33 | else: 34 | return self.schema.empty_table() 35 | 36 | 37 | @dataclasses.dataclass(frozen=True) 38 | class JsonSerializer(KafkaMessageSerializer[pa.Table]): 39 | topic: str 40 | 41 | def __call__(self, table: pa.Table): 42 | return [ 43 | KafkaProducerMessage( 44 | self.topic, 45 | key=None, 46 | value=json.dumps(message, default=str).encode("utf-8"), 47 | ) 48 | for message in table.to_pylist() 49 | ] 50 | -------------------------------------------------------------------------------- /beavers/pyarrow_replay.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from typing import Callable 3 | 4 | import pandas as pd 5 | import pyarrow as pa 6 | 7 | from beavers.dag import UTC_MAX 8 | from beavers.replay import DataSink, DataSource 9 | 10 | 11 | class ArrowTableDataSource(DataSource[pa.Table]): 12 | def __init__( 13 | self, table: pa.Table, timestamp_extractor: Callable[[pa.Table], pa.Array] 14 | ): 15 | assert callable(timestamp_extractor) 16 | self._table = table 17 | self._empty_table = table.schema.empty_table() 18 | self._timestamp_column = timestamp_extractor(table).to_pandas( 19 | date_as_object=False 20 | ) 21 | assert self._timestamp_column.is_monotonic_increasing, ( 22 | "Timestamp column should be monotonic increasing" 23 | ) 24 | self._index = 0 25 | 26 | def read_to(self, timestamp: pd.Timestamp) -> pa.Table: 27 | new_index = self._timestamp_column.searchsorted(timestamp, side="right") 28 | if new_index > self._index: 29 | from_index = self._index 30 | self._index = new_index 31 | return self._table.slice(from_index, new_index - from_index) 32 | else: 33 | results = self._empty_table 34 | return results 35 | 36 | def get_next(self) -> pd.Timestamp: 37 | if self._index >= len(self._table): 38 | return UTC_MAX 39 | else: 40 | return self._timestamp_column.iloc[self._index] 41 | 42 | 43 | @dataclasses.dataclass 44 | class ArrowTableDataSink(DataSink[pa.Table]): 45 | saver: Callable[[pa.Table], None] 46 | chunks: list[pa.Table] = dataclasses.field(default_factory=list) 47 | 48 | def append(self, timestamp: pd.Timestamp, data: pa.Table): 49 | self.chunks.append(data) 50 | 51 | def close(self): 52 | if self.chunks: 53 | results = pa.concat_tables(self.chunks) 54 | self.saver(results) 55 | -------------------------------------------------------------------------------- /beavers/pyarrow_wrapper.py: -------------------------------------------------------------------------------- 1 | """Module for building dags using pyarrow.""" 2 | 3 | import dataclasses 4 | from typing import Callable, Iterable, Optional, ParamSpec, Sequence 5 | 6 | import numpy as np 7 | import pyarrow as pa 8 | 9 | from beavers.dag import Dag, Node, NodePrototype, _check_function 10 | 11 | 
P = ParamSpec("P") 12 | 13 | 14 | @dataclasses.dataclass(frozen=True) 15 | class _TableFiler: 16 | predicate: Callable[[pa.Table, ...], pa.Array] 17 | 18 | def __call__(self, table: pa.Table, *args, **kwargs) -> pa.Table: 19 | return table.filter(self.predicate(table, *args, **kwargs)) 20 | 21 | 22 | def _get_last_by(table: pa.Table, keys: Sequence[str]) -> pa.Table: 23 | return table.take( 24 | table.select(keys) 25 | .append_column("_beavers_index", pa.array(np.arange(len(table)))) 26 | .group_by(keys) 27 | .aggregate([("_beavers_index", "max")])["_beavers_index_max"] 28 | .sort() 29 | ) 30 | 31 | 32 | def _concat_arrow_arrays( 33 | arrow_arrays: Sequence[pa.ChunkedArray], 34 | ) -> [pa.Array | pa.ChunkedArray]: 35 | arrays: list[pa.Array] = [] 36 | for arrow_array in arrow_arrays: 37 | if isinstance(arrow_array, pa.ChunkedArray): 38 | arrays.extend(arrow_array.iterchunks()) 39 | elif isinstance(arrow_array, pa.Array): 40 | arrays.append(arrow_array) 41 | else: 42 | raise TypeError(arrow_array) 43 | 44 | return pa.chunked_array(arrays) 45 | 46 | 47 | def _check_column(column: str, schema: pa.Schema): 48 | if not isinstance(column, str): 49 | raise TypeError(column) 50 | elif column not in schema.names: 51 | raise TypeError(f"field {column} no in schema: {schema.names}") 52 | 53 | 54 | def _check_array(node: Node[pa.Array | pa.ChunkedArray]) -> pa.DataType: 55 | empty = node._get_empty() 56 | if not isinstance(empty, (pa.Array, pa.ChunkedArray)): 57 | raise TypeError(f"Argument should be a {Node.__name__}[pa.Array]") 58 | else: 59 | return empty.type 60 | 61 | 62 | def _check_columns(columns: list[str], schema: pa.Schema) -> list[str]: 63 | if not isinstance(columns, Iterable): 64 | raise TypeError(columns) 65 | for column in columns: 66 | if not isinstance(column, str): 67 | raise TypeError(column) 68 | elif column not in schema.names: 69 | raise TypeError(f"field {column} no in schema: {schema.names}") 70 | return list(columns) 71 | 72 | 73 | def _get_stream_schema(node: Node[pa.Table]) -> pa.Schema: 74 | empty = node._get_empty() 75 | if not isinstance(empty, pa.Table): 76 | raise TypeError(f"Argument should be a {Node.__name__}[pa.Table]") 77 | else: 78 | return empty.schema 79 | 80 | 81 | @dataclasses.dataclass() 82 | class _LastByKey: 83 | key_columns: tuple[str, ...] 
84 | current: pa.Table 85 | 86 | def __call__(self, stream: pa.Table) -> pa.Table: 87 | self.current = _get_last_by( 88 | pa.concat_tables([self.current, stream]), self.key_columns 89 | ) 90 | return self.current 91 | 92 | 93 | @dataclasses.dataclass(frozen=True) 94 | class ArrowDagWrapper: 95 | """Helper for adding pyarrow Nodes to a Dag.""" 96 | 97 | _dag: Dag 98 | 99 | def source_table( 100 | self, schema: pa.Schema, name: Optional[str] = None 101 | ) -> Node[pa.Table]: 102 | """Add a source stream of type `pa.Table`.""" 103 | return self._dag.source_stream(empty=schema.empty_table(), name=name) 104 | 105 | def table_stream( 106 | self, function: Callable[P, pa.Table], schema: pa.Schema 107 | ) -> NodePrototype[pa.Table]: 108 | """Add a stream node of output type `pa.Table`""" 109 | return self._dag.stream(function, empty=schema.empty_table()) 110 | 111 | def filter_stream( 112 | self, 113 | predicate: Callable[[pa.Table, ...], pa.Array], 114 | stream: Node[pa.Table], 115 | *args: Node, 116 | **kwargs: Node, 117 | ) -> Node[pa.Table]: 118 | """Filter a stream Node of type `pa.Table`.""" 119 | function = _TableFiler(predicate) 120 | schema = _get_stream_schema(stream) 121 | _check_function(function) 122 | return self._dag.stream(function, empty=schema.empty_table()).map( 123 | stream, *args, **kwargs 124 | ) 125 | 126 | def last_by_keys( 127 | self, stream: Node[pa.Table], keys: Sequence[str] 128 | ) -> Node[pa.Table]: 129 | """Build a state of the latest row by keys.""" 130 | schema = _get_stream_schema(stream) 131 | keys = _check_columns(keys, schema) 132 | return self._dag.state(_LastByKey(keys, schema.empty_table())).map(stream) 133 | 134 | def get_column(self, stream: Node[pa.Table], key: str) -> Node[pa.ChunkedArray]: 135 | """Return a column from a stream node of type pa.Table.""" 136 | schema = _get_stream_schema(stream) 137 | _check_column(key, schema) 138 | field = schema.field(key) 139 | empty = pa.chunked_array([pa.array([], field.type)]) 140 | return self._dag.stream(lambda x: x[key], empty=empty).map(stream) 141 | 142 | def concat_arrays( 143 | self, *streams: Node[pa.Array | pa.ChunkedArray] 144 | ) -> Node[pa.ChunkedArray]: 145 | if len(streams) == 0: 146 | raise ValueError("Must pass at least one array") 147 | array_type = None 148 | for stream in streams: 149 | each_type = _check_array(stream) 150 | if array_type is None: 151 | array_type = each_type 152 | elif array_type != each_type: 153 | raise TypeError(f"Array type mismatch {array_type} vs {each_type}") 154 | 155 | empty = pa.chunked_array([pa.array([], array_type)]) 156 | return self._dag.stream(lambda *x: _concat_arrow_arrays(x), empty=empty).map( 157 | *streams 158 | ) 159 | -------------------------------------------------------------------------------- /beavers/replay.py: -------------------------------------------------------------------------------- 1 | """Module for replaying historical data.""" 2 | 3 | import abc 4 | import collections.abc 5 | import dataclasses 6 | import logging 7 | import time 8 | from typing import Callable, Generic, Iterator, Optional, Protocol, TypeVar 9 | 10 | import pandas as pd 11 | 12 | from beavers.dag import UTC_MAX, Dag, Node 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | T = TypeVar("T") 17 | 18 | 19 | @dataclasses.dataclass(frozen=True) 20 | class ReplayContext: 21 | """ 22 | Stores the information about a replay. 23 | 24 | Attributes 25 | ---------- 26 | start: pd.Timestamp 27 | Start of the replay 28 | end: pd.Timestamp 29 | End of the replay. 
30 | This is exclusive, the replay will stop 1ns before 31 | frequency: 32 | How often should the replay run 33 | 34 | """ 35 | 36 | start: pd.Timestamp 37 | end: pd.Timestamp 38 | frequency: pd.Timedelta 39 | 40 | def __post_init__(self): 41 | """Check arguments are valid.""" 42 | assert self.start.tzname() == "UTC" 43 | assert self.end.tzname() == "UTC" 44 | 45 | 46 | class DataSource(Protocol[T]): 47 | """Interface for replaying historical data from a file or database.""" 48 | 49 | def read_to(self, timestamp: pd.Timestamp) -> T: 50 | """ 51 | Read from the data source, all the way to the provided timestamp (inclusive). 52 | 53 | This function is stateful and must remember the previous timestamp 54 | for which data was read. 55 | 56 | Parameters 57 | ---------- 58 | timestamp 59 | End of the time interval for which data is required (inclusive) 60 | 61 | Returns 62 | ------- 63 | data 64 | The data for the interval (or empty if no data is found) 65 | 66 | """ 67 | 68 | def get_next(self) -> pd.Timestamp: 69 | """ 70 | Return the next timestamp for which there is data. 71 | 72 | If no data is available this should return `UTC_MAX` 73 | 74 | 75 | Returns 76 | ------- 77 | timestamp: pd.Timestamp 78 | Timestamp of the next available data point (or `UTC_MAX` if no more data 79 | is available) 80 | 81 | """ 82 | 83 | 84 | class DataSink(Protocol[T]): 85 | """Interface for saving the results of a replay to a file or database.""" 86 | 87 | def append(self, timestamp: pd.Timestamp, data: T): 88 | """ 89 | Append data for the current cycle. 90 | 91 | Parameters 92 | ---------- 93 | timestamp: 94 | End of the time interval for which data was replayed (inclusive) 95 | data: 96 | The generated data 97 | 98 | """ 99 | 100 | def close(self): 101 | """Flush the data and clean up resources.""" 102 | 103 | 104 | class DataSourceProvider(Protocol[T]): 105 | """Interface for the provision of `DataSource`.""" 106 | 107 | def __call__(self, replay_context: ReplayContext) -> DataSource[T]: 108 | """ 109 | Create a `DataSource` for the given replay_context. 110 | 111 | Parameters 112 | ---------- 113 | replay_context: 114 | Information about the replay that's about to run 115 | 116 | Returns 117 | ------- 118 | DataSource[T]: 119 | Source for the replay 120 | 121 | """ 122 | 123 | 124 | class DataSinkProvider(Protocol[T]): 125 | """Interface for the provision of `DataSink`.""" 126 | 127 | @abc.abstractmethod 128 | def __call__(self, replay_context: ReplayContext) -> DataSink[T]: 129 | """ 130 | Create a `DataSink` for the given replay_context. 
131 | 132 | Parameters 133 | ---------- 134 | replay_context: 135 | Information about the replay that's about to run 136 | 137 | Returns 138 | ------- 139 | DataSink[T]: 140 | Sink for the replay 141 | 142 | """ 143 | 144 | 145 | @dataclasses.dataclass(frozen=True) 146 | class _ReplaySource(Generic[T]): 147 | """Internal class used to store `DataSource` at runtime.""" 148 | 149 | name: str 150 | node: Node[T] 151 | data_source: DataSource[T] 152 | 153 | 154 | @dataclasses.dataclass(frozen=True) 155 | class _ReplaySink(Generic[T]): 156 | """Internal class used to store `DataSink` at runtime.""" 157 | 158 | name: str 159 | nodes: list[Node[T]] 160 | data_sink: DataSink[T] 161 | 162 | 163 | @dataclasses.dataclass(frozen=True) 164 | class ReplayCycleMetrics: 165 | """Metrics for each replay cycle.""" 166 | 167 | timestamp: pd.Timestamp 168 | cycle_id: int 169 | source_records: int 170 | sink_records: int 171 | cycle_time_ns: int 172 | warp_ratio: float 173 | 174 | 175 | @dataclasses.dataclass 176 | class ReplayDriver: 177 | """ 178 | Orchestrate the replay of data for dag. 179 | 180 | This will: 181 | 182 | - create the relevant `DataSource`s 183 | - create the relevant `DataSink`s 184 | - stream the data from the sources 185 | - inject the input data in the dag source nodes 186 | - execute the dag 187 | - collect the output data and pass it to the sink 188 | - close the sink at the end of the run 189 | 190 | Notes 191 | ----- 192 | Do not call the constructor directly, use `create` instead 193 | 194 | """ 195 | 196 | dag: Dag 197 | replay_context: ReplayContext 198 | sources: list[_ReplaySource] 199 | sinks: list[_ReplaySink] 200 | current_time: pd.Timestamp 201 | 202 | @staticmethod 203 | def create( 204 | dag: Dag, 205 | replay_context: ReplayContext, 206 | data_source_providers: dict[str, DataSourceProvider], 207 | data_sink_providers: dict[str, DataSinkProvider], 208 | ) -> "ReplayDriver": 209 | return ReplayDriver( 210 | dag, 211 | replay_context, 212 | _create_sources(dag, replay_context, data_source_providers), 213 | _create_sinks(dag, replay_context, data_sink_providers), 214 | current_time=replay_context.start, 215 | ) 216 | 217 | def run(self): 218 | while not self.is_done(): 219 | self.run_cycle() 220 | for sink in self.sinks: 221 | sink.data_sink.close() 222 | 223 | def is_done(self) -> bool: 224 | return self.current_time > self.replay_context.end 225 | 226 | def run_cycle(self) -> Optional[ReplayCycleMetrics]: 227 | st = time.time_ns() 228 | source_records, next_timestamp = self.read_sources() 229 | if source_records or self.dag.get_next_timer() <= self.current_time: 230 | timestamp = min(self.current_time, self.replay_context.end) 231 | self.dag.execute(timestamp) 232 | sink_records = self.flush_sinks() 233 | et = time.time_ns() 234 | warp_ratio = self.replay_context.frequency.value / (et - st) 235 | metrics = ReplayCycleMetrics( 236 | timestamp=timestamp, 237 | cycle_id=self.dag.get_cycle_id(), 238 | source_records=source_records, 239 | sink_records=sink_records, 240 | cycle_time_ns=et - st, 241 | warp_ratio=warp_ratio, 242 | ) 243 | logger.info( 244 | f"Running cycle={metrics.cycle_id} " 245 | f"timestamp={metrics.timestamp} " 246 | f"source_records={metrics.source_records} " 247 | f"sink_records={metrics.sink_records} " 248 | f"warp={warp_ratio:.1f}" 249 | ) 250 | else: 251 | metrics = None 252 | 253 | self.current_time = max( 254 | next_timestamp, self.current_time + self.replay_context.frequency 255 | ).ceil(self.replay_context.frequency) 256 | return metrics 257 | 258 | 
def read_sources(self) -> tuple[int, pd.Timestamp]: 259 | records = 0 260 | next_timestamp = self.replay_context.end 261 | for replay_source in self.sources: 262 | source_data = replay_source.data_source.read_to(self.current_time) 263 | next_timestamp = min(next_timestamp, replay_source.data_source.get_next()) 264 | if len(source_data) > 0: 265 | replay_source.node.set_stream(source_data) 266 | records += len(source_data) 267 | return records, next_timestamp 268 | 269 | def flush_sinks(self) -> int: 270 | records = 0 271 | for sink in self.sinks: 272 | for node in sink.nodes: 273 | if node.get_cycle_id() == self.dag.get_cycle_id(): 274 | sink_value = node.get_sink_value() 275 | records += ( 276 | len(sink_value) 277 | if isinstance(sink_value, collections.abc.Sized) 278 | else 1 279 | ) 280 | sink.data_sink.append(self.current_time, node.get_sink_value()) 281 | return records 282 | 283 | 284 | def _create_sources( 285 | dag: Dag, 286 | replay_context: ReplayContext, 287 | data_source_providers: dict[str, DataSourceProvider], 288 | ) -> list[_ReplaySource]: 289 | source_nodes = dag.get_sources() 290 | nodes_names = sorted(source_nodes.keys()) 291 | source_names = sorted(data_source_providers.keys()) 292 | if nodes_names != source_names: 293 | raise ValueError( 294 | "Source node and DataSource names don't match: " 295 | f"{nodes_names} vs {source_names}" 296 | ) 297 | return [ 298 | _ReplaySource( 299 | name, source_nodes[name], data_source_providers[name](replay_context) 300 | ) 301 | for name in data_source_providers.keys() 302 | ] 303 | 304 | 305 | def _create_sinks( 306 | dag: Dag, 307 | replay_context: ReplayContext, 308 | data_sink_providers: dict[str, DataSinkProvider], 309 | ) -> list[_ReplaySink]: 310 | sink_nodes = dag.get_sinks() 311 | nodes_names = sorted(sink_nodes.keys()) 312 | sink_names = sorted(data_sink_providers.keys()) 313 | if nodes_names != sink_names: 314 | raise ValueError( 315 | f"Sink node and DataSink names don't match: {nodes_names} vs {sink_names}" 316 | ) 317 | return [ 318 | _ReplaySink(name, sink_nodes[name], data_sink_providers[name](replay_context)) 319 | for name in data_sink_providers.keys() 320 | ] 321 | 322 | 323 | class IteratorDataSourceAdapter(DataSource[T]): 324 | """ 325 | Adapter between an iterator of `DataSource` and a DataSource. 
326 | 327 | This can be used to stitch together various `DataSource` for incremental date range 328 | """ 329 | 330 | def __init__( 331 | self, 332 | sources: Iterator[DataSource[T]], 333 | empty: T, 334 | concatenator: Callable[[T, T], T], 335 | ): 336 | self._sources = sources 337 | self._empty = empty 338 | self._concatenator = concatenator 339 | self._current = self._next() 340 | 341 | def read_to(self, timestamp: pd.Timestamp) -> T: 342 | if self._current is None: 343 | return self._empty 344 | else: 345 | this_batch = self._current.read_to(timestamp) 346 | while self._current is not None and self._current.get_next() == UTC_MAX: 347 | self._current = self._next() 348 | next_batch = ( 349 | self._empty 350 | if self._current is None 351 | else self._current.read_to(timestamp) 352 | ) 353 | if next_batch and this_batch: 354 | this_batch = self._concatenator(this_batch, next_batch) 355 | elif next_batch: 356 | this_batch = next_batch 357 | 358 | return this_batch 359 | 360 | def get_next(self) -> pd.Timestamp: 361 | if self._current is None: 362 | return UTC_MAX 363 | else: 364 | return self._current.get_next() 365 | 366 | def _next(self) -> Optional[DataSource]: 367 | try: 368 | return next(self._sources) 369 | except StopIteration: 370 | return None 371 | 372 | 373 | class NoOpDataSink(DataSink): 374 | """DataSink that does nothing.""" 375 | 376 | def append(self, timestamp: pd.Timestamp, data: T): 377 | pass 378 | 379 | def close(self): 380 | pass 381 | 382 | 383 | class NoOpDataSinkProvider: 384 | """DataSinkProvider that provides a NoOpDataSink.""" 385 | 386 | def __call__(self, context: ReplayContext) -> DataSink[T]: 387 | return NoOpDataSink() 388 | -------------------------------------------------------------------------------- /beavers/table.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | {{table_config.name}} Beavers 5 | 6 | 7 | 11 | 12 | 13 | 14 | 19 | 20 | 32 | 33 | 34 | 35 | 36 | 37 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /beavers/testing.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional, Sequence, TypeVar 2 | 3 | import pandas as pd 4 | 5 | from beavers.dag import Dag 6 | 7 | T = TypeVar("T") 8 | 9 | 10 | class DagTestBench: 11 | def __init__(self, dag: Dag): 12 | self.dag = dag 13 | for output_name, output_sinks in self.dag.get_sinks().items(): 14 | assert len(output_sinks) == 1, output_name 15 | 16 | def set_source( 17 | self, 18 | source_name: str, 19 | source_data: Any, 20 | ) -> "DagTestBench": 21 | source = self.dag.get_sources()[source_name] 22 | source.set_stream(source_data) 23 | return self 24 | 25 | def execute(self, now: Optional[pd.Timestamp] = None) -> "DagTestBench": 26 | self.dag.execute(now) 27 | return self 28 | 29 | def assert_sink_list( 30 | self, 31 | sink_name: str, 32 | expected_messages: Sequence[T], 33 | ) -> "DagTestBench": 34 | sinks = self.dag.get_sinks()[sink_name] 35 | assert len(sinks) == 1 36 | cycle_id = sinks[0].get_cycle_id() 37 | assert cycle_id == self.dag.get_cycle_id() 38 | actual_messages = sinks[0].get_sink_value() 39 | assert len(actual_messages) == len(expected_messages), ( 40 | f"Sink {sink_name} value size mismatch" 41 | ) 42 | for actual_message, expected_message in zip(actual_messages, expected_messages): 43 | assert actual_message == expected_message 44 | return self 45 | 46 | def assert_sink_not_updated(self, sink_name: str) -> "DagTestBench": 
47 | sinks = self.dag.get_sinks()[sink_name] 48 | assert len(sinks) == 1 49 | cycle_id = sinks[0].get_cycle_id() 50 | assert cycle_id < self.dag.get_cycle_id(), ( 51 | f"Sink {sink_name} got updated this cycle" 52 | ) 53 | return self 54 | -------------------------------------------------------------------------------- /docs/concepts/advanced.md: -------------------------------------------------------------------------------- 1 | # Advanced 2 | 3 | This section discuss advanced features that control how updates propagate in the DAG. 4 | 5 | ## How updates propagate in the DAG 6 | 7 | - Nodes are notified if any of their input node was updated during the current execution cycle 8 | ```python 9 | --8<-- "examples/advanced_concepts.py:propagate_any" 10 | ``` 11 | - You can check if a node updated by looking at its `cycle_id` 12 | ```python 13 | --8<-- "examples/advanced_concepts.py:propagate_cycle_id" 14 | ``` 15 | - If several inputs of a node get updated during the same cycle, the node will be executed once (and not once per input) 16 | ```python 17 | --8<-- "examples/advanced_concepts.py:propagate_both" 18 | ``` 19 | - Stream nodes (and sources) are not considered updated if their output is empty 20 | ```python 21 | --8<-- "examples/advanced_concepts.py:propagate_empty" 22 | ``` 23 | 24 | 25 | ## Now node 26 | 27 | Beavers can be used in both `live` and `replay` mode. 28 | In `replay` mode, the wall clock isn't relevant. 29 | To access the current time of the replay, you should use the now node: 30 | 31 | ```python 32 | --8<-- "examples/advanced_concepts.py:now_node" 33 | ``` 34 | 35 | The now node is shared for the whole DAG. 36 | Its value gets updated silently. 37 | 38 | ## TimerManager 39 | 40 | To be notified when time passes, nodes can subscribe to a `TimerManager` node. 41 | 42 | ```python 43 | --8<-- "examples/advanced_concepts.py:timer_manager" 44 | ``` 45 | 46 | ## Silent updates 47 | 48 | Some node may update too often, or their updates may not be relevant to other nodes. 49 | In this case it's possible to silence them: 50 | 51 | ```python 52 | --8<-- "examples/advanced_concepts.py:silence" 53 | ``` 54 | 55 | `silence` returns a new silenced node (rather than modify the existing node) 56 | 57 | ## Value Cutoff 58 | 59 | By default, state nodes will update everytime they are notified. 60 | The framework doesn't check that their value has changed. 61 | 62 | You can add a cutoff, to prevent updates when the value hasn't changed: 63 | 64 | ```python 65 | --8<-- "examples/advanced_concepts.py:cutoff" 66 | ``` 67 | 68 | You can also provide a custom comparator to allow some tolerance when deciding if a value has changed: 69 | 70 | ```python 71 | --8<-- "examples/advanced_concepts.py:cutoff_custom" 72 | ``` 73 | -------------------------------------------------------------------------------- /docs/concepts/dag.md: -------------------------------------------------------------------------------- 1 | 2 | # DAG 3 | 4 | At its core, `beavers` executes a Directed Acyclic Graph (DAG), where each node is a python function. 5 | This section discusses the different type of nodes in the DAG. 6 | 7 | ## Stream Source 8 | 9 | A stream source is a node whose value can be set externally. 10 | 11 | When `Dag.execute` is called, the updated value is propagated in the DAG 12 | 13 | ```python 14 | --8<-- "examples/dag_concepts.py:source_stream" 15 | ``` 16 | 17 | If the DAG is executed again, the value of the source stream will be reset to its empty value. 
18 | 19 | ```python 20 | --8<-- "examples/dag_concepts.py:source_stream_again" 21 | ``` 22 | 23 | The default empty value is set to `[]`, but it can be customized: 24 | 25 | ```python 26 | --8<-- "examples/dag_concepts.py:source_stream_empty" 27 | ``` 28 | 29 | A source stream can be given a name, so they can be retrieved (and their value set): 30 | 31 | ```python 32 | --8<-- "examples/dag_concepts.py:source_stream_name" 33 | ``` 34 | 35 | ## Stream Node 36 | 37 | A stream node uses the output of other nodes to calculate its updated value. 38 | 39 | ```python 40 | --8<-- "examples/dag_concepts.py:stream_node" 41 | ``` 42 | 43 | If the DAG is executed again, the value of the stream node will be reset to its empty value. 44 | 45 | ```python 46 | --8<-- "examples/dag_concepts.py:stream_node_again" 47 | ``` 48 | 49 | The default empty value is set to `[]`, but it can be customized: 50 | ```python 51 | --8<-- "examples/dag_concepts.py:stream_node_empty" 52 | ``` 53 | 54 | The function provided to the node can be any callable, like a lambda: 55 | ```python 56 | --8<-- "examples/dag_concepts.py:stream_node_lambda" 57 | ``` 58 | 59 | Or a class defining `__call__`: 60 | ```python 61 | --8<-- "examples/dag_concepts.py:stream_node_callable" 62 | ``` 63 | 64 | ## State Node 65 | 66 | A state node retains its value from one DAG execution to the next, even if it didn't update: 67 | ```python 68 | --8<-- "examples/dag_concepts.py:state_node" 69 | ``` 70 | 71 | Because they retain their value when they are not updated, state nodes don't require an empty value 72 | 73 | ## Const Node 74 | 75 | A const node is a node whose value doesn't change. 76 | ```python 77 | --8<-- "examples/dag_concepts.py:const_node" 78 | ``` 79 | 80 | Const nodes behave like state nodes (their value isn't reset when they don't update). 81 | 82 | ## Connecting Nodes (aka `map`) 83 | 84 | Nodes are connected by calling the `map` function. 85 | Any stream or state node can be connected to state nodes, stream nodes or const nodes. 86 | 87 | > :warning: The `map` function doesn't execute the underlying node. 88 | > Instead it adds a node to the DAG 89 | 90 | The map function can use positional arguments: 91 | 92 | ```python 93 | --8<-- "examples/dag_concepts.py:map_positional" 94 | ``` 95 | Or key word arguments: 96 | 97 | ```python 98 | --8<-- "examples/dag_concepts.py:map_key_word" 99 | ``` 100 | 101 | ## State vs Stream 102 | 103 | Stream Nodes: 104 | 105 | - need their return type to implement `collections.abc.Sized` 106 | - need an empty value to be specfied (which default to `[]`) 107 | - have their value reset to empty when they don't update 108 | - are not considered updated if they return empty 109 | 110 | State Nodes: 111 | 112 | - Can return any type 113 | - don't require an empty value 114 | - retain their value on cycle they don't update 115 | - are always considered updated if they are called 116 | -------------------------------------------------------------------------------- /docs/concepts/kafka.md: -------------------------------------------------------------------------------- 1 | # Live with Kafka 2 | 3 | This section explains how to run a beavers application in real time using kafka. 
4 | 5 | ## Count Word Example 6 | 7 | Starting with a simple "count word" dag with one source going to one sink: 8 | 9 | ```python 10 | --8<-- "examples/kafka_concepts.py:dag" 11 | ``` 12 | 13 | This dag has got a source node called `words` and a sink node called `counts`. 14 | 15 | ## Defining Kafka Source 16 | 17 | We will be receiving data from kafka, on a topic called `words`. 18 | 19 | First we need to define how we deserialize messages coming from kafka: 20 | 21 | ```python 22 | --8<-- "examples/kafka_concepts.py:deserializer" 23 | ``` 24 | 25 | Then, we put together the `SourceTopic` with its: 26 | 27 | - topic (`words`) 28 | - deserializer (`deserialize_messages`) 29 | - replay policy (`from_latest`) 30 | 31 | ```python 32 | --8<-- "examples/kafka_concepts.py:kafka_source" 33 | ``` 34 | 35 | There are multiple kafka replay policies available; see the API doc for the full list. 36 | 37 | ## Defining Kafka Sink 38 | 39 | We will be sending the results to the `counts` topic. 40 | The key will be the word. The value will be the latest count. 41 | 42 | First we need to define a serializer, which converts each count to a `KafkaProducerMessage`. 43 | 44 | ```python 45 | --8<-- "examples/kafka_concepts.py:serializer" 46 | ``` 47 | 48 | The serializer is responsible for providing the topic for each outgoing message. 49 | 50 | ## Putting it together with KafkaDriver 51 | 52 | The `KafkaDriver` takes care of creating the kafka producer and consumer, and passing messages through: 53 | 54 | ```python 55 | --8<-- "examples/kafka_concepts.py:kafka_driver" 56 | ``` 57 | 58 | ## Beavers Kafka Features 59 | 60 | - One consumer: There is only one consumer (rather than one consumer for each topic) 61 | - One producer: There is only one producer (rather than one producer for each topic) 62 | - When polling messages, beavers tries to read all available messages, up to a limit of `batch_size=5000` (which is configurable in the KafkaDriver) 63 | - When replaying past data, beavers orchestrates topics/partitions so data is replayed in order, across topics, based on each message's timestamp. 64 | - When replaying past data, some newer messages have to be held. 65 | To avoid memory issues, the number of held messages is capped at `batch_size*5`. 66 | Once the number of held messages gets too high, partitions that are ahead of the watermark are paused. 67 | These partitions are un-paused once the application catches up. 68 | 69 | 70 | ## Beavers Kafka Limitations 71 | 72 | - One beavers application consumes every partition for requested topics (no load balancing/scaling) 73 | -------------------------------------------------------------------------------- /docs/concepts/pandas.md: -------------------------------------------------------------------------------- 1 | # Pandas integration 2 | 3 | This section explains how to use beavers with pandas. 4 | 5 | ## ETF value calculation example 6 | 7 | In this example we want to calculate the value of ETFs. 8 | If you are not familiar with ETFs, think about them as just a basket of shares. 
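In other words, the value of an ETF is just the quantity-weighted sum of its constituents' prices. As a rough standalone sketch, outside of beavers and with illustrative names (the column names match the tables below):

```python
import pandas as pd


def etf_value(price: pd.DataFrame, etf_composition: pd.DataFrame) -> pd.DataFrame:
    # Illustrative helper (not part of beavers): attach the latest price to each
    # ETF constituent, then sum quantity * price per ETF
    merged = etf_composition.merge(price, on="ticker", how="left")
    merged["value"] = merged["quantity"] * merged["price"]
    return merged.groupby("etf", as_index=False)[["value"]].sum()
```

The rest of this page builds this calculation step by step, and then wires it into a dag.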
9 | 10 | Starting with a table of individual share prices: 11 | ```python 12 | --8<-- "examples/pandas_concepts.py:business_logic_price" 13 | ``` 14 | 15 | | ticker | price | 16 | |:---------|--------:| 17 | | AAPL | 174.79 | 18 | | GOOGL | 130.25 | 19 | | MSFT | 317.01 | 20 | | F | 12.43 | 21 | | GM | 35.28 | 22 | 23 | And another table containing the composition of each ETF: 24 | ```python 25 | --8<-- "examples/pandas_concepts.py:business_logic_composition" 26 | ``` 27 | 28 | | etf | ticker | quantity | 29 | |:------|:---------|-----------:| 30 | | TECH | AAPL | 2.0 | 31 | | TECH | GOOGL | 2.0 | 32 | | TECH | MSFT | 1.0 | 33 | | CARS | F | 3.0 | 34 | | CARS | GM | 1.0 | 35 | 36 | In a few lines of `pandas` we can derive the value of each ETF: 37 | ```python 38 | --8<-- "examples/pandas_concepts.py:business_logic_calculation" 39 | ``` 40 | 41 | | etf | value | 42 | |:-----|--------:| 43 | | TECH | 927.09 | 44 | | CARS | 72.57 | 45 | 46 | ## ETF value calculation DAG 47 | 48 | Once the business logic of the calculation is written and tested, it can be added into a Dag. 49 | We'll be using the Dag `pd` helper which makes it easier to deal with `pandas` tables in beavers. 50 | 51 | First we define two source streams, made of `pandas.DataFrame`: 52 | ```python 53 | --8<-- "examples/pandas_concepts.py:dag_source" 54 | ``` 55 | 56 | Then we keep track of the latest value for each source stream: 57 | ```python 58 | --8<-- "examples/pandas_concepts.py:dag_state" 59 | ``` 60 | 61 | Lastly we put together the share prices and ETF composition: 62 | ```python 63 | --8<-- "examples/pandas_concepts.py:dag_calculation" 64 | ``` 65 | 66 | And that's it: 67 | 68 | ```python 69 | --8<-- "examples/pandas_concepts.py:dag_test" 70 | ``` 71 | -------------------------------------------------------------------------------- /docs/concepts/perspective.md: -------------------------------------------------------------------------------- 1 | # Perspective Integration 2 | 3 | This section explains how to build a live web dashboard with [Perspective](https://github.com/finos/perspective) and Beavers. 4 | 5 | In Beavers, you can connect any node of type `pyarrow.Table` to a perspective table. 6 | All you need to do is call `dag.psp.to_perspective`, and provide a `PerspectiveTableDefinition`. 7 | 8 | 9 | ## Key Value Example 10 | 11 | We'll write a super simple key-value store application. 12 | It listens to a topic, and displays the value of kafka messages by key, with their timestamp. 13 | 14 | ## Install 15 | 16 | ```shell 17 | pip install beavers[pyarrow, perspective-python] 18 | ``` 19 | 20 | ## Defining the schema of incoming messages 21 | 22 | First we define a schema for the incoming "key value" messages: 23 | 24 | - a timestamp, in millis 25 | - a key (string) 26 | - a value (string) 27 | 28 | ```python 29 | --8<-- "examples/perspective_concepts.py:schema" 30 | ``` 31 | 32 | ## Convert kafka messages to arrow Table 33 | 34 | Then we write a function that converts kafka messages to an apache arrow table of "key value" messages: 35 | 36 | ```python 37 | --8<-- "examples/perspective_concepts.py:converter" 38 | ``` 39 | 40 | 41 | ## Create a dag 42 | 43 | We create a super simple dag. 44 | It has a source, called `key_value`, which is a table of "key value" messages. 45 | The source is plugged into a perspective table, called... 
`key_value`, whose index is the `key` column 46 | 47 | ```python 48 | --8<-- "examples/perspective_concepts.py:dag" 49 | ``` 50 | 51 | ## Run the dashboard 52 | 53 | Lastly, we put everything together in an application 54 | ```python 55 | --8<-- "examples/perspective_concepts.py:run" 56 | ``` 57 | 58 | You should be able to see it in http://localhost:8082/key_value 59 | -------------------------------------------------------------------------------- /docs/concepts/polars.md: -------------------------------------------------------------------------------- 1 | # Polars integration 2 | 3 | This section explains how to use beavers with polars. 4 | 5 | ## ETF value calculation example 6 | 7 | In this example we want to calculate the value of ETFs. 8 | 9 | Starting with a data frame of individual share prices: 10 | ```python 11 | --8<-- "examples/polars_concepts.py:business_logic_price" 12 | ``` 13 | 14 | | ticker | price | 15 | |:---------|--------:| 16 | | AAPL | 174.79 | 17 | | GOOGL | 130.25 | 18 | | MSFT | 317.01 | 19 | | F | 12.43 | 20 | | GM | 35.28 | 21 | 22 | And another data frame containing the composition of each ETF: 23 | ```python 24 | --8<-- "examples/polars_concepts.py:business_logic_composition" 25 | ``` 26 | 27 | | etf | ticker | quantity | 28 | |:------|:---------|-----------:| 29 | | TECH | AAPL | 2.0 | 30 | | TECH | GOOGL | 2.0 | 31 | | TECH | MSFT | 1.0 | 32 | | CARS | F | 3.0 | 33 | | CARS | GM | 1.0 | 34 | 35 | In a few line of `polars` we can derive the value of each ETF: 36 | ```python 37 | --8<-- "examples/polars_concepts.py:business_logic_calculation" 38 | ``` 39 | 40 | | etf | value | 41 | |:-----|--------:| 42 | | TECH | 927.09 | 43 | | CARS | 72.57 | 44 | 45 | ## ETF value calculation DAG 46 | 47 | Once the business logic of the calculation is writen and tested it can be added into a Dag. 48 | We'll be using the Dag `pl` helper which makes it easier to deal with `polars` data frame in beavers. 49 | 50 | First we define two source streams, made of `polars.DataFrame`: 51 | ```python 52 | --8<-- "examples/polars_concepts.py:dag_source" 53 | ``` 54 | 55 | Then we keep track of the latest value for each source stream: 56 | ```python 57 | --8<-- "examples/polars_concepts.py:dag_state" 58 | ``` 59 | 60 | Lastly we put together the share prices and ETF composition: 61 | ```python 62 | --8<-- "examples/polars_concepts.py:dag_calculation" 63 | ``` 64 | 65 | And that's it: 66 | 67 | ```python 68 | --8<-- "examples/polars_concepts.py:dag_test" 69 | ``` 70 | 71 | 72 | ## Taming updates 73 | 74 | This simple dag does the job of calculating the ETF value in real time. 75 | But there is one issue. 76 | The value of every ETF would update every time either `price` or `etf_composition` update. 77 | Even if the updates comes on a ticker that is not relevant to the ETFs we are tracking. 78 | 79 | In the example below, when the price of GameStop updates, we recalculate the value of every ETF. 80 | Even though their value hasn't changed: 81 | ```python 82 | --8<-- "examples/polars_concepts.py:spurious_update" 83 | ``` 84 | 85 | To tame updates we need to identify which ETF needs updating. 
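The idea, expressed as a standalone `polars` sketch with illustrative names (not part of beavers): an ETF is affected by a price update if it holds at least one of the updated tickers.

```python
import polars as pl


def affected_etfs(etf_composition: pl.DataFrame, price_update: pl.DataFrame) -> pl.Series:
    # Illustrative helper (not part of beavers): ETFs holding at least one ticker
    # whose price just updated
    return (
        etf_composition.join(price_update.select("ticker"), on="ticker", how="semi")
        .get_column("etf")
        .unique()
    )
```

The snippets below implement this same logic as dag nodes.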
86 | 87 | ETF values can update because their composition has changed: 88 | ```python 89 | --8<-- "examples/polars_concepts.py:updated_because_of_composition" 90 | ``` 91 | 92 | Or because one of their component has updated: 93 | ```python 94 | --8<-- "examples/polars_concepts.py:updated_because_of_price" 95 | ``` 96 | 97 | We can then put it back together and only calculate updates for relevant ETFs: 98 | ```python 99 | --8<-- "examples/polars_concepts.py:update_all" 100 | ``` 101 | 102 | 103 | And see that only the value "TECH" ETF updates when a tech stock update: 104 | ```python 105 | --8<-- "examples/polars_concepts.py:update_all_test" 106 | ``` 107 | 108 | | etf | value | 109 | |:------|--------:| 110 | | TECH | 927.13 | 111 | -------------------------------------------------------------------------------- /docs/concepts/pyarrow.md: -------------------------------------------------------------------------------- 1 | # Pyarrow integration 2 | 3 | This section explains how to use beavers with pyarrow. 4 | 5 | ## ETF value calculation example 6 | 7 | In this example we want to calculate the value of ETFs. 8 | If you are not familiar with ETFs, think about them as just a basket of shares. 9 | 10 | Starting with a table of individual share prices: 11 | ```python 12 | --8<-- "examples/pyarrow_concepts.py:business_logic_price" 13 | ``` 14 | 15 | | ticker | price | 16 | |:---------|--------:| 17 | | AAPL | 174.79 | 18 | | GOOGL | 130.25 | 19 | | MSFT | 317.01 | 20 | | F | 12.43 | 21 | | GM | 35.28 | 22 | 23 | And another table containing the composition of each ETF: 24 | ```python 25 | --8<-- "examples/pyarrow_concepts.py:business_logic_composition" 26 | ``` 27 | 28 | | etf | ticker | quantity | 29 | |:------|:---------|-----------:| 30 | | TECH | AAPL | 2.0 | 31 | | TECH | GOOGL | 2.0 | 32 | | TECH | MSFT | 1.0 | 33 | | CARS | F | 3.0 | 34 | | CARS | GM | 1.0 | 35 | 36 | In a few line of `pyarrow` we can derive the value of each ETF: 37 | ```python 38 | --8<-- "examples/pyarrow_concepts.py:business_logic_calculation" 39 | ``` 40 | 41 | | etf | value | 42 | |:-----|--------:| 43 | | TECH | 927.09 | 44 | | CARS | 72.57 | 45 | 46 | ## ETF value calculation DAG 47 | 48 | Once the business logic of the calculation is writen and tested it can be added into a Dag. 49 | We'll be using the Dag `pa` helper which makes it easier to deal with `pyarrow` table in beavers. 50 | 51 | First we define two source streams, made of `pyarrow.Table`: 52 | ```python 53 | --8<-- "examples/pyarrow_concepts.py:dag_source" 54 | ``` 55 | 56 | Then we keep track of the latest value for each source stream: 57 | ```python 58 | --8<-- "examples/pyarrow_concepts.py:dag_state" 59 | ``` 60 | 61 | Lastly we put together the share prices and ETF composition: 62 | ```python 63 | --8<-- "examples/pyarrow_concepts.py:dag_calculation" 64 | ``` 65 | 66 | And that's it: 67 | 68 | ```python 69 | --8<-- "examples/pyarrow_concepts.py:dag_test" 70 | ``` 71 | 72 | 73 | ## Taming updates 74 | 75 | This simple dag does the job of calculating the ETF value in real time. 76 | But there is one issue. 77 | The value of every ETF would update every time either `price` or `etf_composition` update. 78 | Even if the updates comes on a ticker that is not relevant to the ETFs we are tracking. 79 | 80 | In the example below, when the price of GameStop updates, we recalculate the value of every ETF. 
81 | Even though their value hasn't changed: 82 | ```python 83 | --8<-- "examples/pyarrow_concepts.py:spurious_update" 84 | ``` 85 | 86 | To tame updates we need to identify which ETF needs updating. 87 | 88 | ETF values can update because their composition has changed: 89 | ```python 90 | --8<-- "examples/pyarrow_concepts.py:updated_because_of_composition" 91 | ``` 92 | 93 | Or because one of their component has updated: 94 | ```python 95 | --8<-- "examples/pyarrow_concepts.py:updated_because_of_price" 96 | ``` 97 | 98 | We can then put it back together and only calculate updates for relevant ETFs: 99 | ```python 100 | --8<-- "examples/pyarrow_concepts.py:update_all" 101 | ``` 102 | 103 | 104 | And see that only the value "TECH" ETF updates when a tech stock update: 105 | ```python 106 | --8<-- "examples/pyarrow_concepts.py:update_all_test" 107 | ``` 108 | 109 | | etf | value | 110 | |:------|--------:| 111 | | TECH | 927.13 | 112 | -------------------------------------------------------------------------------- /docs/concepts/replay.md: -------------------------------------------------------------------------------- 1 | # Replay 2 | 3 | This section explains how to run a beavers application using historical data, typically stored in files or databases. 4 | 5 | ## Manual Replay 6 | 7 | Starting with a simple dag with one source going to one sink: 8 | 9 | ```python 10 | --8<-- "examples/replay_concepts.py:simple_dag" 11 | ``` 12 | 13 | Assuming your data has got this shape: 14 | ```python 15 | --8<-- "examples/replay_concepts.py:simple_data_class" 16 | ``` 17 | 18 | You could replay the data manually your self and run the dag for regular interval: 19 | ```python 20 | --8<-- "examples/replay_concepts.py:manual_replay" 21 | ``` 22 | 23 | But this requires a lot of boilerplate code and becomes cumbersome very quickly. 24 | 25 | ## Replay Framework 26 | 27 | The replay framework uses a few key abstraction in order to define how the data is loaded and injected in the dag. 28 | 29 | ### `DataSource` 30 | 31 | A `DataSource` provides a way of streaming data. 32 | ```python 33 | --8<-- "examples/replay_concepts.py:data_source" 34 | ``` 35 | 36 | By convention, `DataSource`s: 37 | 38 | - return `UTC_MAX` when there is no more data 39 | - are stateful and need to remember what has already been read. 40 | 41 | ### `ReplayContext` 42 | 43 | The `ReplayContext` contains timing information: 44 | ```python 45 | --8<-- "examples/replay_concepts.py:replay_context" 46 | ``` 47 | 48 | :warning: By convention all timestamps are UTC 49 | 50 | 51 | ### `DataSourceProvider` 52 | 53 | A `DataSourceProvider` provides a way of creating `DataSource`. 54 | 55 | For example, if the data is stored in a csv file: 56 | 57 | ```csv 58 | timestamp,message 59 | 2023-01-01 01:00:00+00:00,Hello 60 | 2023-01-01 01:01:00+00:00,How are you 61 | ``` 62 | 63 | Provided with the `ReplayContext`, our `DataSourceProvider` will load the and return a `DataSource` 64 | 65 | ```python 66 | --8<-- "examples/replay_concepts.py:data_source_provider" 67 | ``` 68 | 69 | 70 | ### `DataSink` 71 | 72 | A `DataSink` provides a way of capturing the output of nodes and saving the data: 73 | 74 | 75 | ```python 76 | --8<-- "examples/replay_concepts.py:data_sink" 77 | ``` 78 | 79 | ### `DataSinkProvider` 80 | 81 | A `DataSinkProvider` provides a way of creating `DataSink`. 
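For reference, the simplest implementation ships with beavers itself: `NoOpDataSinkProvider`, reproduced here (slightly simplified) from `beavers.replay`, just discards the output and shows the minimal shape of the protocol:

```python
from beavers.replay import DataSink, NoOpDataSink, ReplayContext


class NoOpDataSinkProvider:
    """DataSinkProvider that provides a NoOpDataSink."""

    def __call__(self, context: ReplayContext) -> DataSink:
        return NoOpDataSink()
```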
82 | 83 | In this example we save the data to csv: 84 | 85 | 86 | ```python 87 | --8<-- "examples/replay_concepts.py:data_sink_provider" 88 | ``` 89 | 90 | 91 | ### `ReplayDriver` 92 | 93 | The replay driver is responsible for putting the dag, context, sources and sinks together, and orchestrate the replay. 94 | 95 | ```python 96 | --8<-- "examples/replay_concepts.py:replay_driver" 97 | ``` 98 | 99 | 100 | ## Reading Files Partitioned By Time 101 | 102 | Assuming: 103 | 104 | - you want to replay a dag for a long period of time. 105 | - all that historic data doesn't fit into time 106 | - the data is partitioned by time period. For example one file per day, `input_2023-01-01.csv`. 107 | 108 | It's then possible, with the `IteratorDataSourceAdapter` to load each file one by one as they are needed. 109 | 110 | In this example, csv files are stored under . We need to provide: 111 | 112 | - a generator that will yield a `DataSource` for each file, in order 113 | - a way to concatenate the output of 2 `DataSource`. In this case we'll use `+` to merge two lists 114 | - an empty value for the case there is no more data, or we reach the last file. 115 | 116 | ```python 117 | --8<-- "examples/replay_concepts.py:iterator_data_source_adapter" 118 | ``` 119 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Welcome! We're happy to have you here. Thank you in advance for your contribution to Beavers. 4 | 5 | ## Development environment set up 6 | 7 | ```shell 8 | python3 -m venv --clear venv 9 | source venv/bin/activate 10 | poetry self add "poetry-dynamic-versioning[plugin]" 11 | poetry install 12 | pre-commit install 13 | ``` 14 | 15 | ## Testing 16 | 17 | To run tests fast: 18 | 19 | ```shell 20 | pytest -n auto tests 21 | ``` 22 | 23 | To Get coverage: 24 | 25 | ```shell 26 | coverage run --branch --rcfile=./pyproject.toml --include "./beavers/*" -m pytest tests 27 | coverage report --show-missing 28 | ``` 29 | 30 | ## Generating the change log 31 | 32 | We use [git-change-log](https://pawamoy.github.io/git-changelog/usage/) to generate our CHANGELOG.md 33 | 34 | Please follow the [basic convention](https://pawamoy.github.io/git-changelog/usage/#basic-convention) for commit 35 | message. 36 | 37 | To update the change log, run: 38 | 39 | ```shell 40 | git-changelog -io CHANGELOG.md 41 | ``` 42 | 43 | ## New Release 44 | 45 | For new release, first prepare the change log, push and merge it. 46 | 47 | ```shell 48 | git-changelog --bump=auto -io CHANGELOG.md 49 | ``` 50 | 51 | Then tag and push: 52 | 53 | ```shell 54 | git tag vX.X.X 55 | git push origin vX.X.X 56 | ``` 57 | 58 | Lastly on github, go to tags and create a release. 59 | The CI will deploy to pypi automatically from then. 
60 | 61 | ## Testing the documentation 62 | 63 | ```shell 64 | mkdocs serve --livereload --watch=./ 65 | ``` 66 | 67 | ## Updating dependencies 68 | 69 | - For the repo `poetry update` 70 | - For the doc: `(cd docs/; pip-compile ./requirements.in > ./requirements.txt)` 71 | - For pre-commit: `pre-commit autoupdate` 72 | 73 | ## Resources 74 | 75 | The repo set up is inspired by this [guide](https://mathspp.com/blog/how-to-create-a-python-package-in-2022) 76 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## Why is it called beavers? 4 | 5 | Beavers are very clever animals that builds dams to regulate the flow of rivers. 6 | Likewise, the beavers library builds a dam around your data to regulate how it is processed by your applications. 7 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ![Beavers Logo][5] 2 | 3 | # Beavers 4 | 5 | [Documentation][6] / [Installation][7] / [Repository][1] / [PyPI][8] 6 | 7 | [Beavers][1] is a python library for stream processing, optimized for analytics. 8 | 9 | It is used at [Tradewell Technologies][2], 10 | to calculate analytics and serve model predictions, 11 | for both realtime and batch jobs. 12 | 13 | ## Key Features 14 | 15 | - Works in **real time** (eg: reading from Kafka) and **replay mode** (eg: reading from Parquet files). 16 | - Optimized for analytics, using micro-batches (instead of processing records one by one). 17 | - Similar to [incremental][3], it updates nodes in a dag incrementally. 18 | - Taking inspiration from [kafka streams][4], there are two types of nodes in the dag: 19 | - **Stream**: ephemeral micro-batches of events (cleared after every cycle). 20 | - **State**: durable state derived from streams. 21 | - Clear separation between the business logic and the IO. 22 | So the same dag can be used in real time mode, replay mode or can be easily tested. 23 | - Functional interface: no inheritance or decorator required. 24 | - Support for complicated joins, not just "linear" data flow. 25 | 26 | ## Limitations 27 | 28 | - No concurrency support. 29 | To speed up calculation use libraries like pandas, pyarrow or polars. 30 | - No async code. 31 | To speed up IO use kafka driver native thread or parquet IO thread pool. 32 | - No support for persistent state. 33 | Instead of saving state, replay historic data from kafka to prime stateful nodes. 
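## Quick Example

A minimal dag, adapted from the concept examples elsewhere in this documentation:

```python
from beavers import Dag

dag = Dag()

# Two ephemeral stream sources and a stream node concatenating them
left = dag.source_stream()
right = dag.source_stream()
both = dag.stream(lambda x, y: x + y).map(left, right)

left.set_stream([1, 2, 3])
right.set_stream([4, 5, 6])
dag.execute()
assert both.get_value() == [1, 2, 3, 4, 5, 6]
```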
34 | 35 | ## Talks 36 | 37 | - [Unified batch and stream processing in python | PyData Global 2023][9] 38 | 39 | [1]: https://github.com/tradewelltech/beavers 40 | [2]: https://www.tradewelltech.co/ 41 | [3]: https://github.com/janestreet/incremental 42 | [4]: https://www.confluent.io/blog/kafka-streams-tables-part-1-event-streaming/ 43 | [5]: https://raw.githubusercontent.com/tradewelltech/beavers/master/docs/static/icons/beavers/logo.svg 44 | [6]: https://beavers.readthedocs.io/en/latest/ 45 | [7]: https://beavers.readthedocs.io/en/latest/install/ 46 | [8]: https://pypi.org/project/beavers/ 47 | [9]: https://www.youtube.com/watch?v=8pUwsGA8SQM 48 | -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | # Install 2 | 3 | ## Basic Install 4 | 5 | ```sh 6 | pip install beavers 7 | ``` 8 | 9 | ## Extras 10 | 11 | To install with extras like Arrow, Kafka or Perspective: 12 | 13 | ```sh 14 | pip install beavers[pyarrow, confluent_kafka, perspective-python] 15 | ``` 16 | -------------------------------------------------------------------------------- /docs/reference/dag.md: -------------------------------------------------------------------------------- 1 | ::: beavers.dag 2 | options: 3 | heading_level: 2 4 | show_source: false 5 | -------------------------------------------------------------------------------- /docs/reference/kafka.md: -------------------------------------------------------------------------------- 1 | ::: beavers.kafka 2 | options: 3 | heading_level: 2 4 | show_source: false 5 | -------------------------------------------------------------------------------- /docs/reference/pandas_wrapper.md: -------------------------------------------------------------------------------- 1 | ::: beavers.pandas_wrapper 2 | options: 3 | heading_level: 2 4 | show_source: false 5 | -------------------------------------------------------------------------------- /docs/reference/pyarrow_wrapper.md: -------------------------------------------------------------------------------- 1 | ::: beavers.pyarrow_wrapper 2 | options: 3 | heading_level: 2 4 | show_source: false 5 | -------------------------------------------------------------------------------- /docs/reference/replay.md: -------------------------------------------------------------------------------- 1 | ::: beavers.replay 2 | options: 3 | heading_level: 2 4 | show_source: false 5 | -------------------------------------------------------------------------------- /docs/requirements.in: -------------------------------------------------------------------------------- 1 | markdown-include 2 | mkdocs 3 | mkdocs-material 4 | mkdocs-material-extensions 5 | mkdocstrings[python] 6 | pymdown-extensions 7 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.10 3 | # by the following command: 4 | # 5 | # pip-compile ./requirements.in 6 | # 7 | babel==2.17.0 8 | # via mkdocs-material 9 | backrefs==5.8 10 | # via mkdocs-material 11 | certifi==2025.4.26 12 | # via requests 13 | charset-normalizer==3.4.2 14 | # via requests 15 | click==8.2.1 16 | # via mkdocs 17 | colorama==0.4.6 18 | # via 19 | # griffe 20 | # mkdocs-material 21 | ghp-import==2.1.0 22 | # via mkdocs 23 | griffe==1.7.3 24 | # via mkdocstrings-python 25 | idna==3.10 
26 | # via requests 27 | jinja2==3.1.6 28 | # via 29 | # mkdocs 30 | # mkdocs-material 31 | # mkdocstrings 32 | markdown==3.8 33 | # via 34 | # markdown-include 35 | # mkdocs 36 | # mkdocs-autorefs 37 | # mkdocs-material 38 | # mkdocstrings 39 | # pymdown-extensions 40 | markdown-include==0.8.1 41 | # via -r ./requirements.in 42 | markupsafe==3.0.2 43 | # via 44 | # jinja2 45 | # mkdocs 46 | # mkdocs-autorefs 47 | # mkdocstrings 48 | mergedeep==1.3.4 49 | # via 50 | # mkdocs 51 | # mkdocs-get-deps 52 | mkdocs==1.6.1 53 | # via 54 | # -r ./requirements.in 55 | # mkdocs-autorefs 56 | # mkdocs-material 57 | # mkdocstrings 58 | mkdocs-autorefs==1.4.2 59 | # via 60 | # mkdocstrings 61 | # mkdocstrings-python 62 | mkdocs-get-deps==0.2.0 63 | # via mkdocs 64 | mkdocs-material==9.6.14 65 | # via -r ./requirements.in 66 | mkdocs-material-extensions==1.3.1 67 | # via 68 | # -r ./requirements.in 69 | # mkdocs-material 70 | mkdocstrings[python]==0.29.1 71 | # via 72 | # -r ./requirements.in 73 | # mkdocstrings-python 74 | mkdocstrings-python==1.16.12 75 | # via mkdocstrings 76 | packaging==25.0 77 | # via mkdocs 78 | paginate==0.5.7 79 | # via mkdocs-material 80 | pathspec==0.12.1 81 | # via mkdocs 82 | platformdirs==4.3.8 83 | # via mkdocs-get-deps 84 | pygments==2.19.1 85 | # via mkdocs-material 86 | pymdown-extensions==10.15 87 | # via 88 | # -r ./requirements.in 89 | # mkdocs-material 90 | # mkdocstrings 91 | python-dateutil==2.9.0.post0 92 | # via ghp-import 93 | pyyaml==6.0.2 94 | # via 95 | # mkdocs 96 | # mkdocs-get-deps 97 | # pymdown-extensions 98 | # pyyaml-env-tag 99 | pyyaml-env-tag==1.1 100 | # via mkdocs 101 | requests==2.32.3 102 | # via mkdocs-material 103 | six==1.17.0 104 | # via python-dateutil 105 | typing-extensions==4.14.0 106 | # via mkdocstrings-python 107 | urllib3==2.4.0 108 | # via requests 109 | watchdog==6.0.0 110 | # via mkdocs 111 | -------------------------------------------------------------------------------- /docs/static/icons/beavers/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tradewelltech/beavers/ec9979086868589ab82b47ce55fa11cc31b32c16/docs/static/icons/beavers/icon.png -------------------------------------------------------------------------------- /docs/static/icons/beavers/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tradewelltech/beavers/ec9979086868589ab82b47ce55fa11cc31b32c16/examples/__init__.py -------------------------------------------------------------------------------- /examples/advanced_concepts.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: E402 2 | # isort: skip_file 3 | import pandas as pd 4 | 5 | from beavers import Dag 6 | 7 | dag = Dag() 8 | 9 | # --8<-- [start:propagate_any] 10 | source_1 = dag.source_stream() 11 | source_2 = dag.source_stream() 12 | node = dag.stream(lambda x, y: x + y).map(source_1, source_2) 13 | 14 | source_1.set_stream([1, 2, 3]) 15 | dag.execute() 16 | assert node.get_value() == [1, 2, 3] # source_1 updated 17 | 18 | source_2.set_stream([4, 5, 6]) 19 | dag.execute() 20 | assert node.get_value() == [4, 5, 
6] # source_2 updated 21 | 22 | dag.execute() 23 | assert node.get_value() == [] # no updates, reset to empty 24 | # --8<-- [end:propagate_any] 25 | 26 | # --8<-- [start:propagate_cycle_id] 27 | source_1.set_stream([1, 2, 3]) 28 | dag.execute() 29 | assert node.get_value() == [1, 2, 3] 30 | assert node.get_cycle_id() == dag.get_cycle_id() 31 | 32 | dag.execute() 33 | assert node.get_value() == [] 34 | assert node.get_cycle_id() == dag.get_cycle_id() - 1 35 | # --8<-- [end:propagate_cycle_id] 36 | 37 | 38 | # --8<-- [start:propagate_both] 39 | source_1.set_stream([1, 2, 3]) 40 | source_2.set_stream([4, 5, 6]) 41 | dag.execute() 42 | assert node.get_value() == [1, 2, 3, 4, 5, 6] 43 | assert node.get_cycle_id() == dag.get_cycle_id() 44 | # --8<-- [end:propagate_both] 45 | 46 | 47 | # --8<-- [start:propagate_empty] 48 | def even_only(values: list[int]) -> list[int]: 49 | return [v for v in values if (v % 2) == 0] 50 | 51 | 52 | even = dag.stream(even_only).map(source_1) 53 | 54 | source_1.set_stream([1, 2, 3]) 55 | dag.execute() 56 | assert even.get_value() == [2] 57 | assert even.get_cycle_id() == dag.get_cycle_id() 58 | 59 | source_1.set_stream([1, 3]) 60 | dag.execute() 61 | assert even.get_value() == [] 62 | assert even.get_cycle_id() == dag.get_cycle_id() - 1 63 | # --8<-- [end:propagate_empty] 64 | 65 | 66 | # --8<-- [start:now_node] 67 | def get_delay(timestamps: list[pd.Timestamp], now: pd.Timestamp) -> list[pd.Timedelta]: 68 | return [now - timestamp for timestamp in timestamps] 69 | 70 | 71 | timestamp_stream = dag.source_stream() 72 | delay = dag.stream(get_delay).map(timestamp_stream, dag.now()) 73 | 74 | timestamp_stream.set_stream( 75 | [ 76 | pd.to_datetime("2022-01-01", utc=True), 77 | pd.to_datetime("2022-01-02", utc=True), 78 | pd.to_datetime("2022-01-03", utc=True), 79 | ] 80 | ) 81 | dag.execute(timestamp=pd.to_datetime("2022-01-04", utc=True)) 82 | assert delay.get_value() == [ 83 | pd.to_timedelta("3d"), 84 | pd.to_timedelta("2d"), 85 | pd.to_timedelta("1d"), 86 | ] 87 | 88 | # --8<-- [end:now_node] 89 | 90 | # --8<-- [start:timer_manager] 91 | from beavers import TimerManager 92 | 93 | 94 | def get_year(now: pd.Timestamp, timer_manager: TimerManager): 95 | if not timer_manager.has_next_timer(): 96 | timer_manager.set_next_timer( 97 | pd.Timestamp(year=now.year + 1, day=1, month=1, tzinfo=now.tzinfo) 98 | ) 99 | 100 | return now.year 101 | 102 | 103 | year = dag.state(get_year).map(dag.now(), dag.timer_manager()) 104 | 105 | dag.execute(pd.to_datetime("2022-01-01", utc=True)) 106 | assert year.get_value() == 2022 107 | assert year.get_cycle_id() == dag.get_cycle_id() 108 | 109 | dag.execute(pd.to_datetime("2022-01-02", utc=True)) 110 | assert year.get_value() == 2022 111 | assert year.get_cycle_id() == dag.get_cycle_id() - 1 112 | 113 | dag.execute(pd.to_datetime("2023-01-02", utc=True)) 114 | assert year.get_value() == 2023 115 | assert year.get_cycle_id() == dag.get_cycle_id() 116 | # --8<-- [end:timer_manager] 117 | 118 | 119 | # --8<-- [start:silence] 120 | source_1 = dag.source_stream() 121 | source_1_silence = dag.silence(source_1) 122 | source_2 = dag.source_stream() 123 | 124 | both = dag.stream(lambda x, y: x + y).map(source_1_silence, source_2) 125 | 126 | source_1.set_stream([1, 2, 3]) 127 | source_2.set_stream([4, 5, 6]) 128 | dag.execute() 129 | assert both.get_value() == [1, 2, 3, 4, 5, 6] 130 | assert both.get_cycle_id() == dag.get_cycle_id() 131 | 132 | source_1.set_stream([1, 2, 3]) 133 | dag.execute() 134 | assert both.get_value() == [] 135 | assert ( 
136 | both.get_cycle_id() == dag.get_cycle_id() - 1 137 | ) # No update because source_1 is silent 138 | 139 | # --8<-- [end:silence] 140 | 141 | 142 | # --8<-- [start:cutoff] 143 | class GetMax: 144 | def __init__(self): 145 | self._max = 0.0 146 | 147 | def __call__(self, values: list[float]) -> float: 148 | self._max = max(self._max, *values) 149 | return self._max 150 | 151 | 152 | source = dag.source_stream() 153 | get_max = dag.state(GetMax()).map(source) 154 | get_max_cutoff = dag.cutoff(get_max) 155 | 156 | source.set_stream([1.0, 2.0]) 157 | dag.execute() 158 | assert get_max.get_value() == 2.0 159 | assert get_max.get_cycle_id() == dag.get_cycle_id() 160 | assert get_max_cutoff.get_cycle_id() == dag.get_cycle_id() 161 | 162 | source.set_stream([1.0]) 163 | dag.execute() 164 | assert get_max.get_value() == 2.0 165 | assert get_max.get_cycle_id() == dag.get_cycle_id() 166 | assert get_max_cutoff.get_cycle_id() == dag.get_cycle_id() - 1 167 | 168 | source.set_stream([3.0]) 169 | dag.execute() 170 | assert get_max.get_value() == 3.0 171 | assert get_max.get_cycle_id() == dag.get_cycle_id() 172 | assert get_max_cutoff.get_cycle_id() == dag.get_cycle_id() 173 | # --8<-- [end:cutoff] 174 | 175 | # --8<-- [start:cutoff_custom] 176 | get_max_cutoff_custom = dag.cutoff(get_max, lambda x, y: abs(x - y) < 0.1) 177 | 178 | source.set_stream([4.0]) 179 | dag.execute() 180 | assert get_max.get_value() == 4.0 181 | assert get_max.get_cycle_id() == dag.get_cycle_id() 182 | assert get_max_cutoff_custom.get_cycle_id() == dag.get_cycle_id() 183 | 184 | 185 | source.set_stream([4.05]) 186 | dag.execute() 187 | assert get_max.get_value() == 4.05 188 | assert get_max.get_cycle_id() == dag.get_cycle_id() 189 | assert get_max_cutoff_custom.get_value() == 4.0 190 | assert get_max_cutoff_custom.get_cycle_id() == dag.get_cycle_id() - 1 191 | 192 | 193 | source.set_stream([4.11]) 194 | dag.execute() 195 | assert get_max.get_value() == 4.11 196 | assert get_max.get_cycle_id() == dag.get_cycle_id() 197 | assert get_max_cutoff_custom.get_value() == 4.11 198 | assert get_max_cutoff_custom.get_cycle_id() == dag.get_cycle_id() 199 | # --8<-- [end:cutoff_custom] 200 | -------------------------------------------------------------------------------- /examples/dag_concepts.py: -------------------------------------------------------------------------------- 1 | # isort: skip_file 2 | 3 | # --8<-- [start:source_stream] 4 | from beavers import Dag 5 | 6 | dag = Dag() 7 | 8 | source_stream = dag.source_stream() 9 | 10 | source_stream.set_stream([1, 2, 3]) 11 | dag.execute() 12 | assert source_stream.get_value() == [1, 2, 3] 13 | # --8<-- [end:source_stream] 14 | 15 | 16 | # --8<-- [start:source_stream_again] 17 | dag.execute() 18 | assert source_stream.get_value() == [] 19 | # --8<-- [end:source_stream_again] 20 | 21 | # --8<-- [start:source_stream_name] 22 | my_source_stream = dag.source_stream(name="my_source") 23 | dag.get_sources()["my_source"].set_stream([4, 5, 6]) 24 | dag.execute() 25 | assert my_source_stream.get_value() == [4, 5, 6] 26 | # --8<-- [end:source_stream_name] 27 | 28 | # --8<-- [start:source_stream_empty] 29 | dict_source_stream = dag.source_stream(empty_factory=dict) 30 | dict_source_stream.set_stream({"hello": "world"}) 31 | dag.execute() 32 | assert dict_source_stream.get_value() == {"hello": "world"} 33 | dag.execute() 34 | assert dict_source_stream.get_value() == {} 35 | # --8<-- [end:source_stream_empty] 36 | 37 | 38 | # --8<-- [start:stream_node] 39 | def multiply_by_2(values: list[int]) -> 
list[int]: 40 | return [v * 2 for v in values] 41 | 42 | 43 | stream_node = dag.stream(multiply_by_2).map(source_stream) 44 | 45 | source_stream.set_stream([1, 2, 3]) 46 | dag.execute() 47 | assert stream_node.get_value() == [2, 4, 6] 48 | # --8<-- [end:stream_node] 49 | 50 | 51 | # --8<-- [start:stream_node_again] 52 | dag.execute() 53 | assert stream_node.get_value() == [] 54 | # --8<-- [end:stream_node_again] 55 | 56 | 57 | # --8<-- [start:stream_node_empty] 58 | set_stream_node = dag.stream(set, empty_factory=set).map(source_stream) 59 | source_stream.set_stream([1, 2, 3, 1, 2, 3]) 60 | dag.execute() 61 | assert set_stream_node.get_value() == {1, 2, 3} 62 | dag.execute() 63 | assert set_stream_node.get_value() == set() 64 | # --8<-- [end:stream_node_empty] 65 | 66 | 67 | # --8<-- [start:stream_node_lambda] 68 | lambda_stream_node = dag.stream(lambda x: x[:-1]).map(source_stream) 69 | source_stream.set_stream([1, 2, 3]) 70 | dag.execute() 71 | assert lambda_stream_node.get_value() == [1, 2] 72 | # --8<-- [end:stream_node_lambda] 73 | 74 | 75 | # --8<-- [start:stream_node_callable] 76 | class MultiplyBy: 77 | def __init__(self, by: int): 78 | self.by = by 79 | 80 | def __call__(self, values: list[int]) -> list[int]: 81 | return [v * self.by for v in values] 82 | 83 | 84 | callable_stream_node = dag.stream(MultiplyBy(3)).map(source_stream) 85 | source_stream.set_stream([1, 2, 3]) 86 | dag.execute() 87 | assert callable_stream_node.get_value() == [3, 6, 9] 88 | # --8<-- [end:stream_node_callable] 89 | 90 | 91 | # --8<-- [start:state_node] 92 | class Accumulator: 93 | def __init__(self): 94 | self._count = 0 95 | 96 | def __call__(self, values: list[int]) -> int: 97 | self._count += sum(values) 98 | return self._count 99 | 100 | 101 | state_node = dag.state(Accumulator()).map(source_stream) 102 | source_stream.set_stream([1, 2, 3]) 103 | dag.execute() 104 | assert state_node.get_value() == 6 105 | dag.execute() 106 | assert state_node.get_value() == 6 107 | # --8<-- [end:state_node] 108 | 109 | 110 | # --8<-- [start:const_node] 111 | const_node = dag.const(2) 112 | assert const_node.get_value() == 2 113 | # --8<-- [end:const_node] 114 | 115 | 116 | # --8<-- [start:map_positional] 117 | to_append = dag.const([3]) 118 | positional_stream = dag.stream(lambda x, y: x + y).map(source_stream, to_append) 119 | source_stream.set_stream([1, 2]) 120 | dag.execute() 121 | assert positional_stream.get_value() == [1, 2, 3] 122 | # --8<-- [end:map_positional] 123 | 124 | 125 | # --8<-- [start:map_key_word] 126 | key_word = dag.stream(lambda x, y: x + y).map(x=source_stream, y=to_append) 127 | # --8<-- [end:map_key_word] 128 | -------------------------------------------------------------------------------- /examples/etfs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example of ETF nav (Net Asset Value) calculation 3 | """ 4 | 5 | import dataclasses 6 | import random 7 | from operator import attrgetter 8 | from typing import Callable, Generic, Optional, TypeVar 9 | 10 | import numpy as np 11 | import pandas as pd 12 | 13 | from beavers import Dag 14 | 15 | K = TypeVar("K") 16 | V = TypeVar("V") 17 | 18 | 19 | @dataclasses.dataclass(frozen=True) 20 | class PriceRecord: 21 | timestamp: pd.Timestamp 22 | ticker: str 23 | price: Optional[float] 24 | 25 | 26 | @dataclasses.dataclass(frozen=True) 27 | class EtfComposition: 28 | timestamp: pd.Timestamp 29 | ticker: str 30 | weights: dict[str, float] 31 | 32 | 33 | class GetLatest(Generic[K, V]): 34 | def 
__init__(self, key_extractor: Callable[[V], K]): 35 | self._key_extractor = key_extractor 36 | self._latest = {} 37 | 38 | def __call__(self, updates: list[V]) -> dict[str, V]: 39 | for update in updates: 40 | self._latest[self._key_extractor(update)] = update 41 | return self._latest 42 | 43 | 44 | class GetUnique(Generic[K, V]): 45 | def __init__(self, key_extractor: Callable[[V], K]): 46 | self._key_extractor = key_extractor 47 | 48 | def __call__(self, updates: list[V]) -> list[str]: 49 | return sorted(list({self._key_extractor(update) for update in updates})) 50 | 51 | 52 | def create_day_test_prices(date: pd.Timestamp) -> list[PriceRecord]: 53 | end = date + pd.offsets.Day() 54 | return sorted( 55 | [ 56 | PriceRecord( 57 | timestamp=pd.Timestamp( 58 | np.random.randint(date.value, end.value), unit="ns" 59 | ), 60 | ticker=random.choice(["AAPL", "GOOGL", "MSFT"]), # nosec B311 61 | price=random.random(), # nosec B311 62 | ) 63 | for _ in range(random.randint(0, 1000)) # nosec B311 64 | ], 65 | key=lambda x: x.timestamp, 66 | ) 67 | 68 | 69 | def calculate_nav( 70 | composition: EtfComposition, prices: dict[str, PriceRecord] 71 | ) -> PriceRecord: 72 | timestamp = composition.timestamp 73 | quotient = 0.0 74 | dividend = 0.0 75 | error = False 76 | for ticker, weight in composition.weights.items(): 77 | try: 78 | price = prices[ticker] 79 | except KeyError: 80 | error = True 81 | else: 82 | quotient += price.price * weight 83 | dividend += weight 84 | timestamp = max(timestamp, price.timestamp) 85 | 86 | return PriceRecord( 87 | timestamp, 88 | composition.ticker, 89 | None if dividend == 0.0 or error else quotient / dividend, 90 | ) 91 | 92 | 93 | def calculate_navs( 94 | updated_tickers: set[str], 95 | etf_compositions: dict[str, EtfComposition], 96 | prices: dict[str, PriceRecord], 97 | ) -> list[PriceRecord]: 98 | return [ 99 | calculate_nav(etf_composition, prices) 100 | for etf_composition in etf_compositions.values() 101 | if ( 102 | etf_composition.ticker in updated_tickers 103 | or (updated_tickers & etf_composition.weights.keys()) 104 | ) 105 | ] 106 | 107 | 108 | def get_updated_tickers( 109 | updated_prices: list[PriceRecord], 110 | updated_etf_compositions: list[EtfComposition], 111 | ) -> set[str]: 112 | return set(p.ticker for p in updated_prices) | set( 113 | e.ticker for e in updated_etf_compositions 114 | ) 115 | 116 | 117 | def create_dag() -> Dag: 118 | dag = Dag() 119 | price_stream = dag.source_stream([], name="price") 120 | etf_composition_stream = dag.source_stream([], name="etf_composition") 121 | price_latest = dag.state(GetLatest(attrgetter("ticker"))).map(price_stream) 122 | etf_composition_latest = dag.state(GetLatest(attrgetter("ticker"))).map( 123 | etf_composition_stream 124 | ) 125 | 126 | updated_tickers = dag.stream(get_updated_tickers, set()).map( 127 | price_stream, etf_composition_stream 128 | ) 129 | updated_navs = dag.stream(calculate_navs, []).map( 130 | updated_tickers, etf_composition_latest, price_latest 131 | ) 132 | dag.sink("etf_price", updated_navs) 133 | return dag 134 | -------------------------------------------------------------------------------- /examples/kafka_concepts.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: E402 2 | # isort: skip_file 3 | 4 | 5 | import confluent_kafka 6 | import pandas as pd 7 | 8 | # --8<-- [start:dag] 9 | from beavers import Dag 10 | 11 | 12 | class CountWords: 13 | state = {} 14 | 15 | def __call__(self, new_words: list[str]) -> dict[str, int]: 16 
| for word in new_words: 17 | self.state[word] = self.state.get(word, 0) + 1 18 | return self.state 19 | 20 | 21 | def update_stream( 22 | state: dict[str, int], updated_words: list[str] 23 | ) -> list[tuple[str, int]]: 24 | return [(word, state[word]) for word in set(updated_words)] 25 | 26 | 27 | dag = Dag() 28 | word_source = dag.source_stream(name="words") 29 | count_state = dag.state(CountWords()).map(word_source) 30 | count_stream = dag.stream(update_stream, []).map(count_state, word_source) 31 | dag.sink("counts", count_stream) 32 | # --8<-- [end:dag] 33 | 34 | 35 | # --8<-- [start:deserializer] 36 | def deserialize_messages(messages: list[confluent_kafka.Message]) -> list[str]: 37 | return [message.value() for message in messages] 38 | 39 | 40 | # --8<-- [end:deserializer] 41 | 42 | # --8<-- [start:kafka_source] 43 | from beavers.kafka import SourceTopic, KafkaDriver 44 | 45 | source_topic = SourceTopic.from_start_of_day( 46 | "words", deserialize_messages, pd.to_timedelta("15min"), "UTC" 47 | ) 48 | # --8<-- [end:kafka_source] 49 | 50 | 51 | # --8<-- [start:serializer] 52 | from beavers.kafka import KafkaProducerMessage 53 | 54 | 55 | def serialize_counts(values: list[tuple[str, int]]) -> list[KafkaProducerMessage]: 56 | return [ 57 | KafkaProducerMessage( 58 | topic="counts", 59 | key=word, 60 | value=str(count), 61 | ) 62 | for word, count in values 63 | ] 64 | 65 | 66 | # --8<-- [end:serializer] 67 | 68 | 69 | # --8<-- [start:kafka_driver] 70 | kafka_driver = KafkaDriver.create( 71 | dag=dag, 72 | consumer_config={ 73 | "group.id": "beavers", 74 | "bootstrap.servers": "localhost:9092", 75 | }, 76 | producer_config={"bootstrap.servers": "localhost:9092"}, 77 | source_topics={"words": source_topic}, 78 | sink_topics={"counts": serialize_counts}, 79 | ) 80 | while True: 81 | kafka_driver.run_cycle() 82 | # --8<-- [end:kafka_driver] 83 | 84 | 85 | # Note: you can test it with the following commands 86 | # kafka-topics --create --topic words --bootstrap-server=localhost:9092 87 | # kafka-console-producer --topic words --bootstrap-server=localhost:9092 88 | # kafka-console-consumer --topic=counts --bootstrap-server=localhost:9092 \ 89 | # --property print.key=true 90 | -------------------------------------------------------------------------------- /examples/pandas_concepts.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: E402 2 | # isort: skip_file 3 | 4 | # --8<-- [start:business_logic_price] 5 | import pandas as pd 6 | 7 | price_table = pd.DataFrame.from_records( 8 | [ 9 | {"ticker": "AAPL", "price": 174.79}, 10 | {"ticker": "GOOGL", "price": 130.25}, 11 | {"ticker": "MSFT", "price": 317.01}, 12 | {"ticker": "F", "price": 12.43}, 13 | {"ticker": "GM", "price": 35.28}, 14 | ], 15 | ) 16 | 17 | price_dtypes = price_table.dtypes 18 | 19 | # --8<-- [end:business_logic_price] 20 | 21 | # print(price_table.to_pandas().to_markdown(index=False)) 22 | 23 | # --8<-- [start:business_logic_composition] 24 | etf_composition_table = pd.DataFrame.from_records( 25 | [ 26 | {"etf": "TECH", "ticker": "AAPL", "quantity": 2.0}, 27 | {"etf": "TECH", "ticker": "GOOGL", "quantity": 2.0}, 28 | {"etf": "TECH", "ticker": "MSFT", "quantity": 1.0}, 29 | {"etf": "CARS", "ticker": "F", "quantity": 3.0}, 30 | {"etf": "CARS", "ticker": "GM", "quantity": 2.0}, 31 | ], 32 | ) 33 | 34 | etf_composition_dtypes = etf_composition_table.dtypes 35 | # --8<-- [end:business_logic_composition] 36 | 37 | # print(etf_composition_table.to_pandas().to_markdown(index=False, 
ffmt=".1f")) 38 | 39 | 40 | # --8<-- [start:business_logic_calculation] 41 | def calculate_etf_value( 42 | etf_composition: pd.DataFrame, price: pd.DataFrame 43 | ) -> pd.DataFrame: 44 | return ( 45 | etf_composition.merge(price, left_on="ticker", right_on="ticker", how="left") 46 | .assign(values=lambda x: x["price"] * x["quantity"]) 47 | .groupby("etf") 48 | .aggregate([("value", "sum")]) 49 | ) 50 | 51 | 52 | etf_value_table = calculate_etf_value( 53 | etf_composition=etf_composition_table, price=price_table 54 | ) 55 | # --8<-- [end:business_logic_calculation] 56 | 57 | 58 | # print(etf_value_table.to_pandas().to_markdown(index=False, floatfmt=".2f")) 59 | 60 | # --8<-- [start:dag_source] 61 | from beavers import Dag 62 | 63 | dag = Dag() 64 | price_source = dag.pd.source_df(dtypes=price_dtypes, name="price") 65 | etf_composition_source = dag.pd.source_df( 66 | dtypes=etf_composition_dtypes, name="etf_composition" 67 | ) 68 | # --8<-- [end:dag_source] 69 | 70 | # --8<-- [start:dag_state] 71 | price_state = dag.pd.last_by_keys(price_source, ["ticker"]) 72 | etf_composition_state = dag.pd.last_by_keys( 73 | etf_composition_source, 74 | ["etf", "ticker"], 75 | ) 76 | # --8<-- [end:dag_state] 77 | 78 | 79 | # --8<-- [start:dag_calculation] 80 | etf_value_state = dag.state(calculate_etf_value).map( 81 | etf_composition_state, 82 | price_state, 83 | ) 84 | # --8<-- [end:dag_calculation] 85 | 86 | 87 | # --8<-- [start:dag_test] 88 | price_source.set_stream(price_table) 89 | etf_composition_source.set_stream(etf_composition_table) 90 | dag.execute() 91 | pd.testing.assert_frame_equal(etf_value_state.get_value(), etf_value_table) 92 | # --8<-- [end:dag_test] 93 | -------------------------------------------------------------------------------- /examples/perspective_concepts.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: E402 2 | # isort: skip_file 3 | 4 | from typing import Sequence, Optional 5 | 6 | # --8<-- [start:schema] 7 | import pyarrow as pa 8 | 9 | 10 | KEY_VALUE_SCHEMA = pa.schema( 11 | [ 12 | pa.field("timestamp", pa.timestamp("ms", "UTC")), 13 | pa.field("topic", pa.string()), 14 | pa.field("partition", pa.int32()), 15 | pa.field("offset", pa.int64()), 16 | pa.field("key", pa.string()), 17 | pa.field("value", pa.string()), 18 | ] 19 | ) 20 | # --8<-- [end:schema] 21 | 22 | # --8<-- [start:converter] 23 | import confluent_kafka 24 | 25 | 26 | def kafka_messages_to_pyarrow( 27 | messages: Sequence[confluent_kafka.Message], 28 | ) -> pa.Table: 29 | return pa.table( 30 | [ 31 | [m.timestamp()[1] for m in messages], 32 | [m.topic() for m in messages], 33 | [m.partition() for m in messages], 34 | [m.offset() for m in messages], 35 | [None if m.key() is None else m.key().decode("utf-8") for m in messages], 36 | [ 37 | None if m.value() is None else m.value().decode("utf-8") 38 | for m in messages 39 | ], 40 | ], 41 | schema=KEY_VALUE_SCHEMA, 42 | ) 43 | 44 | 45 | # --8<-- [end:converter] 46 | 47 | # --8<-- [start:dag] 48 | from beavers import Dag 49 | from beavers.perspective_wrapper import PerspectiveTableDefinition 50 | 51 | 52 | def create_test_dag() -> Dag: 53 | dag = Dag() 54 | stream = dag.pa.source_table( 55 | name="key_value", 56 | schema=KEY_VALUE_SCHEMA, 57 | ) 58 | dag.psp.to_perspective( 59 | stream, 60 | PerspectiveTableDefinition( 61 | name="key_value", 62 | index_column="key", 63 | ), 64 | ) 65 | return dag 66 | 67 | 68 | # --8<-- [end:dag] 69 | 70 | # --8<-- [start:run] 71 | from beavers.kafka import KafkaDriver, 
SourceTopic 72 | from beavers.perspective_wrapper import run_web_application 73 | 74 | 75 | def run_dashboard( 76 | topic: str = "key-value", 77 | port: int = 8082, 78 | consumer_config: Optional[dict] = None, 79 | ): 80 | if consumer_config is None: 81 | consumer_config = {"bootstrap.servers": "localhost:9092", "group.id": "beavers"} 82 | 83 | dag = create_test_dag() 84 | 85 | kafka_driver = KafkaDriver.create( 86 | dag=dag, 87 | producer_config={}, 88 | consumer_config=consumer_config, 89 | source_topics={ 90 | "key_value": SourceTopic.from_earliest(topic, kafka_messages_to_pyarrow) 91 | }, 92 | sink_topics={}, 93 | ) 94 | 95 | run_web_application(kafka_driver=kafka_driver, port=port) 96 | 97 | 98 | # --8<-- [end:run] 99 | -------------------------------------------------------------------------------- /examples/polars_concepts.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: E402 2 | # isort: skip_file 3 | 4 | import polars.testing 5 | 6 | 7 | # --8<-- [start:business_logic_price] 8 | import polars as pl 9 | 10 | PRICE_SCHEMA = pl.Schema( 11 | [ 12 | ("ticker", pl.String()), 13 | ("price", pl.Float64()), 14 | ] 15 | ) 16 | 17 | price_table = pl.DataFrame( 18 | [ 19 | {"ticker": "AAPL", "price": 174.79}, 20 | {"ticker": "GOOGL", "price": 130.25}, 21 | {"ticker": "MSFT", "price": 317.01}, 22 | {"ticker": "F", "price": 12.43}, 23 | {"ticker": "GM", "price": 35.28}, 24 | ], 25 | schema=PRICE_SCHEMA, 26 | ) 27 | # --8<-- [end:business_logic_price] 28 | 29 | # print(price_table.to_pandas().to_markdown(index=False)) 30 | 31 | # --8<-- [start:business_logic_composition] 32 | ETF_COMPOSITION_SCHEMA = pl.Schema( 33 | [ 34 | ("etf", pl.String()), 35 | ("ticker", pl.String()), 36 | ("quantity", pl.Float64()), 37 | ] 38 | ) 39 | 40 | 41 | etf_composition_table = pl.DataFrame( 42 | [ 43 | {"etf": "TECH", "ticker": "AAPL", "quantity": 2.0}, 44 | {"etf": "TECH", "ticker": "GOOGL", "quantity": 2.0}, 45 | {"etf": "TECH", "ticker": "MSFT", "quantity": 1.0}, 46 | {"etf": "CARS", "ticker": "F", "quantity": 3.0}, 47 | {"etf": "CARS", "ticker": "GM", "quantity": 2.0}, 48 | ], 49 | schema=ETF_COMPOSITION_SCHEMA, 50 | ) 51 | # --8<-- [end:business_logic_composition] 52 | 53 | # print(etf_composition_table.to_pandas().to_markdown(index=False, floatfmt=".1f")) 54 | 55 | 56 | # --8<-- [start:business_logic_calculation] 57 | ETF_VALUE_SCHEMA = pl.Schema( 58 | [ 59 | ("etf", pl.String()), 60 | ("value", pl.Float64()), 61 | ] 62 | ) 63 | 64 | 65 | def calculate_etf_value( 66 | etf_composition: pl.DataFrame, price: pl.DataFrame 67 | ) -> pl.DataFrame: 68 | return ( 69 | etf_composition.join(price, on=["ticker"]) 70 | .select(pl.col("etf"), (pl.col("price") * pl.col("quantity")).alias("value")) 71 | .group_by("etf", maintain_order=True) 72 | .agg(pl.col("value").sum()) 73 | .cast(ETF_VALUE_SCHEMA) 74 | ) 75 | 76 | 77 | etf_value_table = calculate_etf_value( 78 | etf_composition=etf_composition_table, price=price_table 79 | ) 80 | # --8<-- [end:business_logic_calculation] 81 | 82 | 83 | # print(etf_value_table.to_pandas().to_markdown(index=False, floatfmt=".2f")) 84 | 85 | # --8<-- [start:dag_source] 86 | from beavers import Dag 87 | 88 | dag = Dag() 89 | price_source = dag.pl.source_table(schema=PRICE_SCHEMA, name="price") 90 | etf_composition_source = dag.pl.source_table( 91 | schema=ETF_COMPOSITION_SCHEMA, name="etf_composition" 92 | ) 93 | # --8<-- [end:dag_source] 94 | 95 | # --8<-- [start:dag_state] 96 | price_state = dag.pl.last_by_keys(price_source, 
["ticker"]) 97 | etf_composition_state = dag.pl.last_by_keys( 98 | etf_composition_source, 99 | ["etf", "ticker"], 100 | ) 101 | # --8<-- [end:dag_state] 102 | 103 | 104 | # --8<-- [start:dag_calculation] 105 | etf_value_state = dag.state(calculate_etf_value).map( 106 | etf_composition_state, 107 | price_state, 108 | ) 109 | # --8<-- [end:dag_calculation] 110 | 111 | 112 | # --8<-- [start:dag_test] 113 | price_source.set_stream(price_table) 114 | etf_composition_source.set_stream(etf_composition_table) 115 | dag.execute() 116 | polars.testing.assert_frame_equal(etf_value_state.get_value(), etf_value_table) 117 | # --8<-- [end:dag_test] 118 | 119 | 120 | # --8<-- [start:spurious_update] 121 | new_price_updates = pl.DataFrame( 122 | [{"ticker": "GME", "price": 123.0}], 123 | PRICE_SCHEMA, 124 | ) 125 | price_source.set_stream(new_price_updates) 126 | dag.execute() 127 | assert len(etf_value_state.get_value()) == 2 128 | assert etf_value_state.get_cycle_id() == dag.get_cycle_id() 129 | # --8<-- [end:spurious_update] 130 | 131 | # --8<-- [start:updated_because_of_composition] 132 | updated_because_of_composition = dag.pl.get_series( 133 | etf_composition_source, 134 | "etf", 135 | ) 136 | # --8<-- [end:updated_because_of_composition] 137 | 138 | 139 | # --8<-- [start:updated_because_of_price] 140 | def get_etf_to_update_because_of_price( 141 | etf_composition_state: pl.DataFrame, price_update: pl.DataFrame 142 | ) -> pl.Series: 143 | updated_tickers = price_update["ticker"].unique() 144 | return etf_composition_state.filter(pl.col("ticker").is_in(updated_tickers))[ 145 | "etf" 146 | ].unique() 147 | 148 | 149 | updated_because_of_price = dag.stream( 150 | get_etf_to_update_because_of_price, empty=pl.Series(name="etf", dtype=pl.String()) 151 | ).map(etf_composition_state, price_source) 152 | # --8<-- [end:updated_because_of_price] 153 | 154 | # --8<-- [start:update_all] 155 | stale_etfs = dag.pl.concat_series( 156 | updated_because_of_price, updated_because_of_composition 157 | ) 158 | 159 | 160 | def get_composition_for_etfs( 161 | etf_composition_state: pl.DataFrame, 162 | etfs: pl.Series, 163 | ) -> pl.DataFrame: 164 | return etf_composition_state.filter(pl.col("etf").is_in(etfs)) 165 | 166 | 167 | stale_etf_compositions = dag.pl.table_stream( 168 | get_composition_for_etfs, ETF_COMPOSITION_SCHEMA 169 | ).map(etf_composition_state, stale_etfs) 170 | 171 | updated_etf = dag.pl.table_stream(calculate_etf_value, ETF_VALUE_SCHEMA).map( 172 | stale_etf_compositions, price_state 173 | ) 174 | # --8<-- [end:update_all] 175 | 176 | # --8<-- [start:update_all_test] 177 | price_source.set_stream( 178 | pl.DataFrame( 179 | [{"ticker": "MSFT", "price": 317.05}], 180 | schema=PRICE_SCHEMA, 181 | ) 182 | ) 183 | dag.execute() 184 | assert len(updated_etf.get_value()) == 1 185 | # --8<-- [end:update_all_test] 186 | 187 | # print(updated_etf.get_value().to_pandas().to_markdown(index=False)) 188 | -------------------------------------------------------------------------------- /examples/pyarrow_concepts.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: E402 2 | # isort: skip_file 3 | 4 | # --8<-- [start:business_logic_price] 5 | import pyarrow as pa 6 | 7 | PRICE_SCHEMA = pa.schema( 8 | [ 9 | pa.field("ticker", pa.string()), 10 | pa.field("price", pa.float64()), 11 | ] 12 | ) 13 | 14 | price_table = pa.Table.from_pylist( 15 | [ 16 | {"ticker": "AAPL", "price": 174.79}, 17 | {"ticker": "GOOGL", "price": 130.25}, 18 | {"ticker": "MSFT", "price": 317.01}, 19 
| {"ticker": "F", "price": 12.43}, 20 | {"ticker": "GM", "price": 35.28}, 21 | ], 22 | schema=PRICE_SCHEMA, 23 | ) 24 | # --8<-- [end:business_logic_price] 25 | 26 | # print(price_table.to_pandas().to_markdown(index=False)) 27 | 28 | # --8<-- [start:business_logic_composition] 29 | ETF_COMPOSITION_SCHEMA = pa.schema( 30 | [ 31 | pa.field("etf", pa.string()), 32 | pa.field("ticker", pa.string()), 33 | pa.field("quantity", pa.float64()), 34 | ] 35 | ) 36 | 37 | 38 | etf_composition_table = pa.Table.from_pylist( 39 | [ 40 | {"etf": "TECH", "ticker": "AAPL", "quantity": 2.0}, 41 | {"etf": "TECH", "ticker": "GOOGL", "quantity": 2.0}, 42 | {"etf": "TECH", "ticker": "MSFT", "quantity": 1.0}, 43 | {"etf": "CARS", "ticker": "F", "quantity": 3.0}, 44 | {"etf": "CARS", "ticker": "GM", "quantity": 2.0}, 45 | ], 46 | schema=ETF_COMPOSITION_SCHEMA, 47 | ) 48 | # --8<-- [end:business_logic_composition] 49 | 50 | # print(etf_composition_table.to_pandas().to_markdown(index=False, floatfmt=".1f")) 51 | 52 | 53 | # --8<-- [start:business_logic_calculation] 54 | import pyarrow.compute as pc 55 | 56 | ETF_VALUE_SCHEMA = pa.schema( 57 | [ 58 | pa.field("etf", pa.string()), 59 | pa.field("value", pa.float64()), 60 | ] 61 | ) 62 | 63 | 64 | def calculate_etf_value(etf_composition: pa.Table, price: pa.Table) -> pa.Table: 65 | positions_with_prices = etf_composition.join(price, keys=["ticker"]) 66 | values = pc.multiply( 67 | positions_with_prices["price"], positions_with_prices["quantity"] 68 | ) 69 | positions_with_prices = positions_with_prices.append_column("value", values) 70 | return ( 71 | positions_with_prices.group_by("etf") 72 | .aggregate([("value", "sum")]) 73 | .rename_columns(ETF_VALUE_SCHEMA.names) 74 | ) 75 | 76 | 77 | etf_value_table = calculate_etf_value( 78 | etf_composition=etf_composition_table, price=price_table 79 | ) 80 | # --8<-- [end:business_logic_calculation] 81 | 82 | 83 | # print(etf_value_table.to_pandas().to_markdown(index=False, floatfmt=".2f")) 84 | 85 | # --8<-- [start:dag_source] 86 | from beavers import Dag 87 | 88 | dag = Dag() 89 | price_source = dag.pa.source_table(schema=PRICE_SCHEMA, name="price") 90 | etf_composition_source = dag.pa.source_table( 91 | schema=ETF_COMPOSITION_SCHEMA, name="etf_composition" 92 | ) 93 | # --8<-- [end:dag_source] 94 | 95 | # --8<-- [start:dag_state] 96 | price_state = dag.pa.last_by_keys(price_source, ["ticker"]) 97 | etf_composition_state = dag.pa.last_by_keys( 98 | etf_composition_source, 99 | ["etf", "ticker"], 100 | ) 101 | # --8<-- [end:dag_state] 102 | 103 | 104 | # --8<-- [start:dag_calculation] 105 | etf_value_state = dag.state(calculate_etf_value).map( 106 | etf_composition_state, 107 | price_state, 108 | ) 109 | # --8<-- [end:dag_calculation] 110 | 111 | 112 | # --8<-- [start:dag_test] 113 | price_source.set_stream(price_table) 114 | etf_composition_source.set_stream(etf_composition_table) 115 | dag.execute() 116 | assert etf_value_state.get_value() == etf_value_table 117 | # --8<-- [end:dag_test] 118 | 119 | 120 | # --8<-- [start:spurious_update] 121 | new_price_updates = pa.Table.from_pylist( 122 | [{"ticker": "GME", "price": 123.0}], 123 | PRICE_SCHEMA, 124 | ) 125 | price_source.set_stream(new_price_updates) 126 | dag.execute() 127 | assert len(etf_value_state.get_value()) == 2 128 | assert etf_value_state.get_cycle_id() == dag.get_cycle_id() 129 | # --8<-- [end:spurious_update] 130 | 131 | # --8<-- [start:updated_because_of_composition] 132 | updated_because_of_composition = dag.pa.get_column( 133 | etf_composition_source, 134 | 
"etf", 135 | ) 136 | # --8<-- [end:updated_because_of_composition] 137 | 138 | 139 | # --8<-- [start:updated_because_of_price] 140 | def get_etf_to_update_because_of_price( 141 | etf_composition_state: pa.Table, price_update: pa.Table 142 | ) -> pa.Array: 143 | updated_tickers = pc.unique(price_update["ticker"]) 144 | return pc.unique( 145 | etf_composition_state.filter( 146 | pc.is_in(etf_composition_state["ticker"], updated_tickers) 147 | )["etf"] 148 | ) 149 | 150 | 151 | updated_because_of_price = dag.stream( 152 | get_etf_to_update_because_of_price, pa.array([], pa.string()) 153 | ).map(etf_composition_state, price_source) 154 | # --8<-- [end:updated_because_of_price] 155 | 156 | # --8<-- [start:update_all] 157 | stale_etfs = dag.pa.concat_arrays( 158 | updated_because_of_price, updated_because_of_composition 159 | ) 160 | 161 | 162 | def get_composition_for_etfs( 163 | etf_composition_state: pa.Table, etfs: pa.Array 164 | ) -> pa.Table: 165 | return etf_composition_state.filter( 166 | pc.is_in( 167 | etf_composition_state["etf"], 168 | etfs, 169 | ) 170 | ) 171 | 172 | 173 | stale_etf_compositions = dag.pa.table_stream( 174 | get_composition_for_etfs, ETF_COMPOSITION_SCHEMA 175 | ).map(etf_composition_state, stale_etfs) 176 | 177 | updated_etf = dag.pa.table_stream(calculate_etf_value, ETF_VALUE_SCHEMA).map( 178 | stale_etf_compositions, price_state 179 | ) 180 | # --8<-- [end:update_all] 181 | 182 | # --8<-- [start:update_all_test] 183 | price_source.set_stream( 184 | pa.Table.from_pylist( 185 | [{"ticker": "MSFT", "price": 317.05}], 186 | PRICE_SCHEMA, 187 | ) 188 | ) 189 | dag.execute() 190 | assert len(updated_etf.get_value()) == 1 191 | # --8<-- [end:update_all_test] 192 | 193 | # print(updated_etf.get_value().to_pandas().to_markdown(index=False)) 194 | -------------------------------------------------------------------------------- /examples/replay_concepts.py: -------------------------------------------------------------------------------- 1 | # isort: skip_file 2 | # ruff: noqa: E402 3 | import operator 4 | 5 | import beavers 6 | 7 | 8 | # --8<-- [start:simple_dag] 9 | dag = beavers.Dag() 10 | my_source = dag.source_stream(name="my_source") 11 | my_sink = dag.sink("my_sink", my_source) 12 | # --8<-- [end:simple_dag] 13 | 14 | # --8<-- [start:simple_data_class] 15 | import dataclasses 16 | import pandas as pd 17 | 18 | 19 | @dataclasses.dataclass(frozen=True) 20 | class Message: 21 | timestamp: pd.Timestamp 22 | message: str 23 | 24 | 25 | # --8<-- [end:simple_data_class] 26 | 27 | # --8<-- [start:manual_replay] 28 | my_source.set_stream( 29 | [ 30 | Message(pd.Timestamp("2023-01-01T00:00:00Z"), "hello"), 31 | Message(pd.Timestamp("2023-01-01T00:00:30Z"), "How are you"), 32 | ] 33 | ) 34 | dag.execute(pd.Timestamp("2023-01-01T00:01:00Z")) 35 | assert my_sink.get_sink_value() == [ 36 | Message(pd.Timestamp("2023-01-01T00:00:00Z"), "hello"), 37 | Message(pd.Timestamp("2023-01-01T00:00:30Z"), "How are you"), 38 | ] 39 | # --8<-- [end:manual_replay] 40 | 41 | 42 | # --8<-- [start:data_source] 43 | import beavers.replay 44 | 45 | 46 | @dataclasses.dataclass(frozen=True) 47 | class MessageDataSource: 48 | messages: list[Message] 49 | 50 | def read_to(self, timestamp: pd.Timestamp) -> list[Message]: 51 | results = [] 52 | while self.messages and self.messages[0].timestamp <= timestamp: 53 | results.append(self.messages.pop(0)) 54 | return results 55 | 56 | def get_next(self) -> pd.Timestamp: 57 | if self.messages: 58 | return self.messages[0].timestamp 59 | else: 60 | return 
beavers.replay.UTC_MAX 61 | 62 | 63 | # --8<-- [end:data_source] 64 | 65 | 66 | # --8<-- [start:replay_context] 67 | from beavers.replay import ReplayContext 68 | 69 | replay_context = ReplayContext( 70 | start=pd.to_datetime("2023-01-01T00:00:00Z"), 71 | end=pd.to_datetime("2023-01-02T00:00:00Z"), 72 | frequency=pd.to_timedelta("1h"), 73 | ) 74 | # --8<-- [end:replay_context] 75 | 76 | 77 | # --8<-- [start:data_source_provider] 78 | @dataclasses.dataclass(frozen=True) 79 | class CsvDataSourceProvider: 80 | file_name: str 81 | 82 | def __call__( 83 | self, replay_context: ReplayContext 84 | ) -> beavers.replay.DataSource[list[Message]]: 85 | df = pd.read_csv(self.file_name, parse_dates=["timestamp"]) 86 | messages = [Message(*row) for _, row in df.iterrows()] 87 | messages.sort(key=lambda x: x.timestamp) 88 | return MessageDataSource(messages) 89 | 90 | 91 | # --8<-- [end:data_source_provider] 92 | 93 | 94 | # --8<-- [start:data_sink] 95 | @dataclasses.dataclass(frozen=True) 96 | class CsvDataSink: 97 | destination: str 98 | data: list[Message] = dataclasses.field(default_factory=list) 99 | 100 | def append(self, timestamp: pd.Timestamp, data: list[Message]): 101 | self.data.extend(data) 102 | 103 | def close(self): 104 | pd.DataFrame([dataclasses.asdict(value) for value in self.data]).to_csv( 105 | self.destination, index=False 106 | ) 107 | 108 | 109 | # --8<-- [end:data_sink] 110 | 111 | 112 | # --8<-- [start:data_sink_provider] 113 | @dataclasses.dataclass(frozen=True) 114 | class CsvDataSinkProvider: 115 | destination: str 116 | 117 | def __call__(self, replay_context: ReplayContext) -> CsvDataSink: 118 | return CsvDataSink(self.destination) 119 | 120 | 121 | # --8<-- [end:data_sink_provider] 122 | 123 | 124 | # This is just to print the csv file: 125 | file = "data.csv" 126 | df = pd.DataFrame( 127 | { 128 | "timestamp": [ 129 | pd.Timestamp("2023-01-01T01:00:00Z"), 130 | pd.Timestamp("2023-01-01T01:01:00Z"), 131 | ], 132 | "message": ["Hello", "How are you"], 133 | } 134 | ) 135 | df.to_csv("input.csv", index=False) 136 | 137 | df_after = pd.read_csv("input.csv", parse_dates=["timestamp"]) 138 | pd.testing.assert_frame_equal(df, df_after) 139 | 140 | messages = [Message(*row) for _, row in df_after.iterrows()] 141 | 142 | df2 = pd.DataFrame( 143 | { 144 | "timestamp": [ 145 | pd.Timestamp("2023-01-02T01:00:00Z"), 146 | pd.Timestamp("2023-01-02T01:01:00Z"), 147 | ], 148 | "message": ["I'm fine", "Thanks"], 149 | } 150 | ) 151 | df.to_csv("input_2023-01-01.csv", index=False) 152 | df2.to_csv("input_2023-01-02.csv", index=False) 153 | df2[:0].to_csv("input_2023-01-03.csv", index=False) 154 | 155 | 156 | # --8<-- [start:replay_driver] 157 | from beavers.replay import ReplayDriver 158 | 159 | replay_driver = beavers.replay.ReplayDriver.create( 160 | dag=dag, 161 | replay_context=replay_context, 162 | data_source_providers={"my_source": CsvDataSourceProvider("input.csv")}, 163 | data_sink_providers={"my_sink": CsvDataSinkProvider("output.csv")}, 164 | ) 165 | replay_driver.run() 166 | # --8<-- [end:replay_driver] 167 | 168 | 169 | # --8<-- [start:iterator_data_source_adapter] 170 | from beavers.replay import IteratorDataSourceAdapter 171 | 172 | 173 | @dataclasses.dataclass(frozen=True) 174 | class PartitionedCsvDataSourceProvider: 175 | source_format: str 176 | 177 | def __call__(self, replay_context: ReplayContext): 178 | file_names = [ 179 | self.source_format.format(date=date) 180 | for date in pd.date_range(replay_context.start, replay_context.end) 181 | ] 182 | generator = 
(self._load_one_file(file_name) for file_name in file_names) 183 | return IteratorDataSourceAdapter( 184 | sources=generator, 185 | empty=[], 186 | concatenator=operator.add, 187 | ) 188 | 189 | def _load_one_file(self, file_name: str) -> MessageDataSource: 190 | return MessageDataSource( 191 | [ 192 | Message(*row) 193 | for _, row in pd.read_csv( 194 | file_name, parse_dates=["timestamp"] 195 | ).iterrows() 196 | ] 197 | ) 198 | 199 | 200 | source_provider = PartitionedCsvDataSourceProvider("input_{date:%Y-%m-%d}.csv") 201 | # --8<-- [end:iterator_data_source_adapter] 202 | 203 | # --8<-- [start:iterator_data_source_adapter_run] 204 | ReplayDriver.create( 205 | dag=dag, 206 | replay_context=ReplayContext( 207 | start=pd.to_datetime("2023-01-01T00:00:00Z"), 208 | end=pd.to_datetime("2023-01-03T00:00:00Z"), 209 | frequency=pd.to_timedelta("1h"), 210 | ), 211 | data_source_providers={ 212 | "my_source": PartitionedCsvDataSourceProvider("input_{date:%Y-%m-%d}.csv") 213 | }, 214 | data_sink_providers={"my_sink": CsvDataSinkProvider("output.csv")}, 215 | ).run() 216 | 217 | # --8<-- [end:iterator_data_source_adapter_run] 218 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Beavers 2 | site_url: https://beavers.readthedocs.io/en/latest/ 3 | repo_url: https://github.com/tradewelltech/beavers 4 | theme: 5 | name: material 6 | features: 7 | - navigation.tabs 8 | - navigation.tabs.sticky 9 | - content.code.annotate 10 | - content.tabs.link 11 | - content.code.copy 12 | - header.autohide 13 | - navigation.indexes 14 | - navigation.instant 15 | - navigation.tracking 16 | - search.highlight 17 | - search.share 18 | - search.suggest 19 | palette: 20 | scheme: slate 21 | accent: green 22 | logo: static/icons/beavers/logo.svg 23 | favicon: static/icons/beavers/icon.png 24 | 25 | plugins: 26 | - search 27 | - mkdocstrings: 28 | default_handler: python 29 | handlers: 30 | python: 31 | options: 32 | show_source: false 33 | 34 | markdown_extensions: 35 | - def_list 36 | - pymdownx.inlinehilite 37 | - pymdownx.superfences 38 | - pymdownx.snippets: 39 | - pymdownx.emoji 40 | - pymdownx.highlight 41 | - attr_list 42 | - md_in_html 43 | extra: 44 | project_name: "beavers" 45 | 46 | 47 | nav: 48 | - Home: 49 | - index.md 50 | - Concepts: 51 | - concepts/dag.md 52 | - concepts/advanced.md 53 | - concepts/replay.md 54 | - concepts/kafka.md 55 | - concepts/pandas.md 56 | - concepts/pyarrow.md 57 | - concepts/polars.md 58 | - concepts/perspective.md 59 | - API Reference: 60 | - reference/dag.md 61 | - reference/replay.md 62 | - reference/kafka.md 63 | - reference/pandas_wrapper.md 64 | - reference/pyarrow_wrapper.md 65 | - install.md 66 | - contributing.md 67 | - faq.md 68 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | build-backend = "poetry_dynamic_versioning.backend" 3 | requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning"] 4 | 5 | [project] 6 | authors = [ 7 | {name = "Tradewell Tech", email = "engineering@tradewelltech.co"} 8 | ] 9 | classifiers = [ 10 | "Development Status :: 5 - Production/Stable", 11 | "License :: OSI Approved :: Apache Software License", 12 | "Natural Language :: English", 13 | "Programming Language :: Python :: 3.10", 14 | "Programming Language :: Python :: 3.11", 15 | "Programming 
Language :: Python :: 3.12", 16 | "Programming Language :: Python :: 3.13" 17 | ] 18 | dependencies = [ 19 | "confluent_kafka>=2.1.1", 20 | "pandas", 21 | "perspective-python>=3.0.0", 22 | "polars", 23 | "pyarrow", 24 | "tornado" 25 | ] 26 | description = "Python stream processing" 27 | documentation = "https://beavers.readthedocs.io/en/latest/" 28 | keywords = ["apache-arrow", "streaming", "data"] 29 | license = "Apache-2.0" 30 | maintainers = [ 31 | {name = "0x26res", email = "0x26res@gmail.com"} 32 | ] 33 | name = "beavers" 34 | packages = [ 35 | {include = "beavers"} 36 | ] 37 | readme = "README.md" 38 | repository = "https://github.com/tradewelltech/beavers" 39 | requires-python = ">=3.10,<4" 40 | version = "0.0.0" 41 | 42 | [project.optional-dependencies] 43 | confluent-kafka = ["confluent-kafka"] 44 | perspective-python = ["perspective-python", "tornado"] 45 | polars = ["polars"] 46 | pyarrow = ["pyarrow"] 47 | 48 | [project.urls] 49 | "Bug Tracker" = "https://github.com/tradewelltech/beavers/issues" 50 | "Changelog" = "https://github.com/tradewelltech/beavers/blob/main/CHANGELOG.md" 51 | 52 | [tool.bandit] 53 | skips = ["B101", "B311"] 54 | 55 | [tool.black] 56 | exclude = "venv/|tox/" 57 | target-version = ["py310"] 58 | 59 | [tool.coverage.report] 60 | # https://github.com/nedbat/coveragepy/issues/831#issuecomment-517778185 61 | exclude_lines = ["if TYPE_CHECKING:"] 62 | 63 | [tool.coverage.run] 64 | omit = [ 65 | # This is hard to test, and the API is about to change a lot 66 | "*/beavers/perspective_wrapper.py" 67 | ] 68 | 69 | [tool.poetry.group.dev.dependencies] 70 | black = ">=22.10.0" 71 | click = ">=8.1.7" 72 | coverage = ">=6.5.0" 73 | flake8 = ">=5.0.4" 74 | git-changelog = ">=2.2.0" 75 | isort = ">=5.10.1" 76 | mock = "*" 77 | pip-tools = ">=6.12.1" 78 | pre-commit = ">=2.20.0" 79 | pylint = ">=2.15.0" 80 | pytest = ">=7.2.0" 81 | pytest-asyncio = "*" 82 | tabulate = "*" 83 | 84 | [tool.poetry.group.docs] 85 | optional = true 86 | 87 | [tool.poetry.group.docs.dependencies] 88 | markdown-include = "*" 89 | mkdocs = ">=1.5.3" 90 | mkdocs-material = ">=9.3.2" 91 | mkdocs-material-extensions = "*" 92 | mkdocstrings = {version = ">=0.21.2", extras = ["python"]} 93 | pymdown-extensions = "*" 94 | tornado = "*" 95 | 96 | [tool.poetry-dynamic-versioning] 97 | enable = true 98 | 99 | [tool.poetry-dynamic-versioning.substitution] 100 | files = ["*/__init__.py"] 101 | folders = [{path = "beavers"}] 102 | 103 | [tool.pydocstyle] 104 | ignore = ["D102", "D107", "D203", "D212"] 105 | 106 | [tool.pytest.ini_options] 107 | asyncio_default_fixture_loop_scope = "function" 108 | asyncio_mode = "auto" 109 | 110 | [tool.ruff] 111 | line-length = 88 112 | 113 | [tool.ruff.lint.isort] 114 | known-first-party = ["beavers", "tradewell_proto"] 115 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | # Scripts 2 | 3 | These scripts are helpful for testing Beavers with simple real-time applications. 4 | 5 | ## Set up 6 | 7 | Use kafka-kraft in Docker for Kafka: 8 | 9 | ```shell 10 | docker run --name=simple_kafka -p 9092:9092 -d bashj79/kafka-kraft 11 | ``` 12 | 13 | ## `kafka_test_bench` 14 | 15 | Tests a simple application with Kafka, making sure it replays in order. 16 | The "timestamp" of the output messages should be in order across topics when replaying. 
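Each record published to the `both` topic is the JSON-serialised metadata of one input message, as built by `kafka_messages_to_json` and `kafka_message_serializer` in `scripts/kafka_test_bench.py`. An illustrative output value (offsets and timestamps made up) looks like:

```json
{"topic": "left", "partition": 0, "offset": 0, "timestamp": "2024-01-01 00:00:00.123000+00:00", "key": null, "value": "hello"}
```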
17 | 18 | 19 | ### Create Topics 20 | 21 | ```shell 22 | docker exec -it simple_kafka /opt/kafka/bin/kafka-topics.sh --bootstrap-server=localhost:9092 --create --topic=left --partitions=1 --replication-factor=1 23 | docker exec -it simple_kafka /opt/kafka/bin/kafka-topics.sh --bootstrap-server=localhost:9092 --create --topic=right --partitions=1 --replication-factor=1 24 | docker exec -it simple_kafka /opt/kafka/bin/kafka-topics.sh --bootstrap-server=localhost:9092 --create --topic=both --partitions=1 --replication-factor=1 25 | ``` 26 | 27 | ### Run the Beavers job 28 | 29 | ```shell 30 | python -m scripts.kafka_test_bench --batch-size=2 31 | ``` 32 | 33 | ### Publish data 34 | 35 | ```shell 36 | docker exec -it simple_kafka /opt/kafka/bin/kafka-console-producer.sh --bootstrap-server=localhost:9092 --topic=left 37 | docker exec -it simple_kafka /opt/kafka/bin/kafka-console-producer.sh --bootstrap-server=localhost:9092 --topic=right 38 | ``` 39 | 40 | ### See output data 41 | 42 | ```shell 43 | docker exec -it simple_kafka /opt/kafka/bin/kafka-console-consumer.sh \ 44 | --bootstrap-server=localhost:9092 \ 45 | --topic=both \ 46 | --property print.key=true \ 47 | --from-beginning 48 | ``` 49 | 50 | ## `perpective_test_bench.py` 51 | 52 | ### Create the topic 53 | 54 | ```shell 55 | docker exec -it simple_kafka /opt/kafka/bin/kafka-topics.sh --bootstrap-server=localhost:9092 --create --topic=key-value --partitions=1 --replication-factor=1 56 | ``` 57 | 58 | ### Publish data 59 | 60 | ```shell 61 | docker exec -it simple_kafka /opt/kafka/bin/kafka-console-producer.sh \ 62 | --topic=key-value \ 63 | --bootstrap-server=localhost:9092 \ 64 | --property parse.key=true \ 65 | --property key.separator=, 66 | ``` 67 | -------------------------------------------------------------------------------- /scripts/kafka_test_bench.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import json 3 | import logging 4 | from operator import itemgetter 5 | from typing import Any, Callable, Sequence 6 | 7 | import click 8 | import confluent_kafka 9 | import pandas as pd 10 | 11 | from beavers import Dag 12 | from beavers.kafka import KafkaDriver, KafkaProducerMessage, SourceTopic 13 | 14 | 15 | def create_test_dag() -> "Dag": 16 | dag = Dag() 17 | left_stream = dag.source_stream(name="left") 18 | right_stream = dag.source_stream(name="right") 19 | both_stream = dag.stream( 20 | lambda left, right: sorted(left + right, key=itemgetter("timestamp")) 21 | ).map(left_stream, right_stream) 22 | dag.sink("both", both_stream) 23 | return dag 24 | 25 | 26 | def kafka_messages_to_json( 27 | messages: Sequence[confluent_kafka.Message], 28 | ) -> list[dict[str, Any]]: 29 | return [ 30 | { 31 | "topic": message.topic(), 32 | "partition": message.partition(), 33 | "offset": message.offset(), 34 | "timestamp": str( 35 | pd.to_datetime(message.timestamp()[1], unit="ms", utc=True) 36 | ), 37 | "key": message.key().decode("utf-8") if message.key() else None, 38 | "value": message.value().decode("utf-8"), 39 | } 40 | for message in messages 41 | ] 42 | 43 | 44 | def kafka_message_serializer( 45 | payloads: list[dict[str, Any]], topic: str 46 | ) -> list[KafkaProducerMessage]: 47 | return [ 48 | KafkaProducerMessage(topic, key=None, value=json.dumps(payload)) 49 | for payload in payloads 50 | ] 51 | 52 | 53 | SOURCE_TOPIC_CREATORS: dict[str, Callable[[str], SourceTopic]] = { 54 | "latest": functools.partial( 55 | SourceTopic.from_latest, 
message_deserializer=kafka_messages_to_json 56 | ), 57 | "earliest": functools.partial( 58 | SourceTopic.from_earliest, message_deserializer=kafka_messages_to_json 59 | ), 60 | "15min": functools.partial( 61 | SourceTopic.from_relative_time, 62 | message_deserializer=kafka_messages_to_json, 63 | relative_time=pd.to_timedelta("15min"), 64 | ), 65 | "start-of-day": functools.partial( 66 | SourceTopic.from_start_of_day, 67 | message_deserializer=kafka_messages_to_json, 68 | start_of_day_time=pd.to_timedelta("00:00:00"), 69 | start_of_day_timezone="UTC", 70 | ), 71 | "absolute-time": functools.partial( 72 | SourceTopic.from_absolute_time, 73 | message_deserializer=kafka_messages_to_json, 74 | absolute_time=pd.Timestamp.utcnow().normalize(), 75 | ), 76 | "committed": functools.partial( 77 | SourceTopic.from_committed, 78 | message_deserializer=kafka_messages_to_json, 79 | ), 80 | } 81 | 82 | 83 | @click.command() 84 | @click.option("--left-topic", type=click.STRING, default="left") 85 | @click.option( 86 | "--left-offset", type=click.Choice(SOURCE_TOPIC_CREATORS.keys()), default="earliest" 87 | ) 88 | @click.option("--right-topic", type=click.STRING, default="right") 89 | @click.option( 90 | "--right-offset", 91 | type=click.Choice(SOURCE_TOPIC_CREATORS.keys()), 92 | default="earliest", 93 | ) 94 | @click.option("--both-topic", type=click.STRING, default="both") 95 | @click.option( 96 | "--consumer-config", 97 | type=json.loads, 98 | default='{"bootstrap.servers": "localhost:9092", "group.id": "beavers"}', 99 | ) 100 | @click.option( 101 | "--producer-config", 102 | type=json.loads, 103 | default='{"bootstrap.servers": "localhost:9092"}', 104 | ) 105 | @click.option("--batch-size", type=click.INT, default="2") 106 | def kafka_test_bench( 107 | left_topic: str, 108 | left_offset: str, 109 | right_topic: str, 110 | right_offset: str, 111 | both_topic: str, 112 | consumer_config: dict, 113 | producer_config: dict, 114 | batch_size: int, 115 | ): 116 | logging.basicConfig( 117 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 118 | level=logging.DEBUG, 119 | ) 120 | 121 | dag = create_test_dag() 122 | 123 | driver = KafkaDriver.create( 124 | dag=dag, 125 | producer_config=producer_config, 126 | consumer_config=consumer_config, 127 | source_topics={ 128 | "left": SOURCE_TOPIC_CREATORS[left_offset](left_topic), 129 | "right": SOURCE_TOPIC_CREATORS[right_offset](right_topic), 130 | }, 131 | sink_topics={ 132 | "both": functools.partial(kafka_message_serializer, topic=both_topic) 133 | }, 134 | batch_size=batch_size, 135 | ) 136 | while True: 137 | driver.run_cycle() 138 | 139 | 140 | if __name__ == "__main__": 141 | kafka_test_bench() 142 | -------------------------------------------------------------------------------- /scripts/perpective_test_bench.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import click 4 | 5 | from examples.perspective_concepts import run_dashboard 6 | 7 | 8 | @click.command() 9 | @click.option("--topic", type=click.STRING, default="key-value") 10 | @click.option("--port", type=click.INT, default=8082) 11 | @click.option( 12 | "--consumer-config", 13 | type=json.loads, 14 | default='{"bootstrap.servers": "localhost:9092", "group.id": "beavers"}', 15 | ) 16 | def perspective_test_bench( 17 | topic: str, 18 | port: int, 19 | consumer_config: dict, 20 | ): 21 | run_dashboard(topic=topic, port=port, consumer_config=consumer_config) 22 | 23 | 24 | if __name__ == "__main__": 25 | perspective_test_bench() 26 
| -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | from _pytest.assertion import register_assert_rewrite 2 | 3 | register_assert_rewrite("beavers.testing") 4 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tradewelltech/beavers/ec9979086868589ab82b47ce55fa11cc31b32c16/tests/conftest.py -------------------------------------------------------------------------------- /tests/test_docs.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | 4 | def test_readme_and_index_same(): 5 | """Check the README matches the doc home page""" 6 | root = Path(__file__).parent.parent 7 | readme = root / "README.md" 8 | index = root / "docs" / "index.md" 9 | 10 | with readme.open() as fp: 11 | readme_content = fp.read() 12 | 13 | with index.open() as fp: 14 | # Skip first and last line 15 | index_content = "".join(fp.readlines()[1:-1]) 16 | 17 | assert index_content in readme_content 18 | -------------------------------------------------------------------------------- /tests/test_etfs.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from beavers.testing import DagTestBench 4 | from examples import etfs 5 | 6 | 7 | def test_run_dag(): 8 | dag = etfs.create_dag() 9 | bench = DagTestBench(dag) 10 | 11 | # Price and ETF come in: 12 | timestamp_0 = pd.to_datetime("2023-06-10 12:00:00+0000") 13 | ( 14 | bench.set_source( 15 | "price", 16 | [ 17 | etfs.PriceRecord(timestamp_0, "AAPL", 180.0), 18 | etfs.PriceRecord(timestamp_0, "GOOG", 120.0), 19 | ], 20 | ) 21 | .set_source( 22 | "etf_composition", 23 | [etfs.EtfComposition(timestamp_0, "TECH", {"AAPL": 1.0, "GOOG": 1.5})], 24 | ) 25 | .execute(timestamp_0) 26 | .assert_sink_list("etf_price", [etfs.PriceRecord(timestamp_0, "TECH", 144.0)]) 27 | ) 28 | 29 | # AAPL price update: 30 | timestamp_1 = timestamp_0 + pd.to_timedelta("1s") 31 | ( 32 | bench.set_source( 33 | "price", 34 | [ 35 | etfs.PriceRecord(timestamp_1, "AAPL", 200.0), 36 | ], 37 | ) 38 | .execute(timestamp_1) 39 | .assert_sink_list("etf_price", [etfs.PriceRecord(timestamp_1, "TECH", 152.0)]) 40 | ) 41 | 42 | # Unrelated price updates: 43 | timestamp_2 = timestamp_0 + pd.to_timedelta("2s") 44 | ( 45 | bench.set_source( 46 | "price", 47 | [ 48 | etfs.PriceRecord(timestamp_2, "MSFT", 330.0), 49 | ], 50 | ) 51 | .execute(timestamp_2) 52 | .assert_sink_not_updated("etf_price") 53 | ) 54 | 55 | # New ETF comes in 56 | timestamp_3 = timestamp_0 + pd.to_timedelta("4s") 57 | ( 58 | bench.set_source( 59 | "etf_composition", 60 | [etfs.EtfComposition(timestamp_3, "SOFT", {"MSFT": 0.5, "GOOG": 1.0})], 61 | ) 62 | .execute(timestamp_3) 63 | .assert_sink_list("etf_price", [etfs.PriceRecord(timestamp_3, "SOFT", 190.0)]) 64 | ) 65 | 66 | # ETF extends with missing price: 67 | timestamp_4 = timestamp_0 + pd.to_timedelta("4s") 68 | ( 69 | bench.set_source( 70 | "etf_composition", 71 | [ 72 | etfs.EtfComposition( 73 | timestamp_4, "SOFT", {"MSFT": 0.5, "GOOG": 1.0, "ORCL": 0.5} 74 | ) 75 | ], 76 | ) 77 | .execute(timestamp_4) 78 | .assert_sink_list("etf_price", [etfs.PriceRecord(timestamp_4, "SOFT", None)]) 79 | ) 80 | 
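# Note on the pattern above: DagTestBench (from beavers.testing) drives the dag one
# cycle at a time: set_source(...) stages updates for a named source, execute(timestamp)
# runs a single cycle, and the assert_sink_* helpers check whether, and with what values,
# each named sink updated during that cycle (e.g. the unrelated MSFT price update above
# produces no new ETF price).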
-------------------------------------------------------------------------------- /tests/test_pandas_wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | from beavers import Dag 6 | from beavers.pandas_wrapper import _empty_df, _get_stream_dtypes, _LastTracker 7 | 8 | DTYPES = pd.Series( 9 | { 10 | "col1": np.int64, 11 | "col2": np.object_, 12 | } 13 | ) 14 | DF = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]}) 15 | DF_UPDATE = pd.DataFrame({"col1": [1, 2, 2], "col2": ["e", "f", "g"]}) 16 | 17 | 18 | def test_dtypes(): 19 | df = _empty_df(dtypes=DTYPES) 20 | pd.testing.assert_series_equal(df.dtypes, DTYPES) 21 | 22 | 23 | def test_source_df(): 24 | dag = Dag() 25 | source = dag.pd.source_df(dtypes=DTYPES) 26 | 27 | dag.execute() 28 | pd.testing.assert_series_equal(source.get_value().dtypes, DTYPES) 29 | 30 | source.set_stream(DF) 31 | dag.execute() 32 | pd.testing.assert_frame_equal(source.get_value(), DF) 33 | 34 | 35 | def test_table_stream(): 36 | dag = Dag() 37 | source = dag.pd.source_df(dtypes=DTYPES) 38 | stream = dag.pd.df_stream(lambda x: x[x["col1"] > 1], DTYPES).map(source) 39 | 40 | dag.execute() 41 | pd.testing.assert_frame_equal(stream.get_value(), _empty_df(DTYPES)) 42 | 43 | source.set_stream(DF) 44 | dag.execute() 45 | pd.testing.assert_frame_equal(stream.get_value(), DF[lambda x: x["col1"] > 1]) 46 | 47 | 48 | def test_get_stream_dtypes(): 49 | dag = Dag() 50 | source = dag.pd.source_df(dtypes=DTYPES) 51 | pd.testing.assert_series_equal(_get_stream_dtypes(source), DTYPES) 52 | 53 | state = dag.state(lambda: "foo").map() 54 | with pytest.raises(TypeError, match=r"Argument should be a stream Node"): 55 | pd.testing.assert_series_equal(_get_stream_dtypes(state), DTYPES) 56 | 57 | list_node = dag.source_stream() 58 | with pytest.raises(TypeError, match=r"Argument should be a Node\[pd.DataFrame\]"): 59 | pd.testing.assert_series_equal(_get_stream_dtypes(list_node), DTYPES) 60 | 61 | 62 | def test_latest_tracker(): 63 | tracker = _LastTracker(["col1"], _empty_df(DTYPES)) 64 | pd.testing.assert_frame_equal(tracker(_empty_df(DTYPES)), _empty_df(DTYPES)) 65 | pd.testing.assert_frame_equal(tracker(DF), DF) 66 | pd.testing.assert_frame_equal(tracker(DF), DF) 67 | 68 | pd.testing.assert_frame_equal( 69 | tracker(DF_UPDATE), pd.DataFrame({"col1": [3, 1, 2], "col2": ["c", "e", "g"]}) 70 | ) 71 | 72 | 73 | def test_last_by_keys(): 74 | dag = Dag() 75 | source = dag.pd.source_df(dtypes=DTYPES) 76 | latest = dag.pd.last_by_keys(source, ["col1"]) 77 | 78 | dag.execute() 79 | pd.testing.assert_frame_equal(latest.get_value(), _empty_df(DTYPES)) 80 | 81 | source.set_stream(DF) 82 | dag.execute() 83 | pd.testing.assert_frame_equal(latest.get_value(), DF) 84 | 85 | source.set_stream(DF) 86 | dag.execute() 87 | pd.testing.assert_frame_equal(latest.get_value(), DF) 88 | -------------------------------------------------------------------------------- /tests/test_perpective_wrapper.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | 3 | import perspective 4 | import pyarrow as pa 5 | import pytest 6 | from mock import mock 7 | from perspective import Server 8 | from tornado.testing import AsyncHTTPTestCase 9 | from tornado.web import Application 10 | 11 | from beavers import Dag 12 | from beavers.perspective_wrapper import ( 13 | DATA_TYPES, 14 | PerspectiveTableDefinition, 15 | TableRequestHandler, 16 
| _PerspectiveNode, 17 | _table_to_bytes, 18 | _TableConfig, 19 | _UpdateRunner, 20 | perspective_thread, 21 | ) 22 | 23 | PERSPECTIVE_TABLE_SCHEMA = pa.schema( 24 | [ 25 | pa.field("index", pa.string()), 26 | pa.field("remove", pa.string()), 27 | ] 28 | ) 29 | PERSPECTIVE_TABLE_DEFINITION = config = PerspectiveTableDefinition( 30 | name="name", 31 | index_column="index", 32 | remove_column="remove", 33 | ) 34 | 35 | 36 | def test_config_validate(): 37 | definition = PERSPECTIVE_TABLE_DEFINITION 38 | 39 | with pytest.raises(AssertionError, match="index"): 40 | definition.validate(pa.schema([])) 41 | 42 | with pytest.raises(AssertionError, match="remove"): 43 | definition.validate(pa.schema([pa.field("index", pa.string())])) 44 | 45 | definition.validate(PERSPECTIVE_TABLE_SCHEMA) 46 | 47 | 48 | def test_to_table_config(): 49 | assert _TableConfig.from_definition( 50 | PERSPECTIVE_TABLE_DEFINITION, PERSPECTIVE_TABLE_SCHEMA 51 | ) == _TableConfig( 52 | name="name", index="index", columns=["index", "remove"], sort=[], filters=[] 53 | ) 54 | 55 | 56 | def test_table_to_bytes(): 57 | results = _table_to_bytes(PERSPECTIVE_TABLE_SCHEMA.empty_table()) 58 | assert isinstance(results, bytes) 59 | assert len(results) > 100 60 | 61 | 62 | def test_update_runner(): 63 | mock = MagicMock() 64 | 65 | runner = _UpdateRunner(mock) 66 | runner() 67 | assert mock.run_cycle.called 68 | 69 | 70 | def test_add_node(): 71 | dag = Dag() 72 | source = dag.pa.source_table(schema=PERSPECTIVE_TABLE_SCHEMA) 73 | state = dag.state(lambda x: x).map(source) 74 | assert dag.psp.to_perspective(source, PERSPECTIVE_TABLE_DEFINITION) is None 75 | 76 | with pytest.raises(AssertionError, match="Must provide a schema for state nodes"): 77 | dag.psp.to_perspective(state, PERSPECTIVE_TABLE_DEFINITION) 78 | 79 | dag.psp.to_perspective( 80 | state, PERSPECTIVE_TABLE_DEFINITION, schema=PERSPECTIVE_TABLE_SCHEMA 81 | ) 82 | 83 | for node in dag._nodes: 84 | if isinstance(node._function, _PerspectiveNode): 85 | assert node._function.table is None 86 | node._function.table = MagicMock() 87 | 88 | dag.execute() 89 | 90 | nodes = [ 91 | n._function for n in dag._nodes if isinstance(n._function, _PerspectiveNode) 92 | ] 93 | assert len(nodes) == 2 94 | assert nodes[0].get_table_config() == _TableConfig( 95 | name="name", index="index", columns=["index", "remove"], sort=[], filters=[] 96 | ) 97 | 98 | 99 | class FakeLoop: 100 | @staticmethod 101 | def current(): 102 | return FakeLoop() 103 | 104 | def add_callback(self): 105 | pass 106 | 107 | def time(self): 108 | return 0 109 | 110 | def add_timeout(self, *args, **kwargs): 111 | pass 112 | 113 | def start(self): 114 | pass 115 | 116 | 117 | @mock.patch("tornado.ioloop.IOLoop", FakeLoop) 118 | def test_perspective_thread(): 119 | manager = Server() 120 | 121 | perspective_thread(manager, MagicMock(), []) 122 | 123 | 124 | class TestHandler(AsyncHTTPTestCase): 125 | def get_app(self): 126 | table_configs = [ 127 | _TableConfig( 128 | "table1", index="col_1", columns=["col_1", "col_2"], sort=(), filters=() 129 | ) 130 | ] 131 | return Application( 132 | [ 133 | ( 134 | r"/([a-z0-9_]*)", 135 | TableRequestHandler, 136 | {"table_configs": table_configs}, 137 | ), 138 | ] 139 | ) 140 | 141 | def test_table(self): 142 | response = self.fetch("/") 143 | assert response.code == 200 144 | assert b'["col_1", "col_2"]' in response.body 145 | 146 | 147 | def test_schema(): 148 | server = perspective.Server() 149 | client = server.new_local_client() 150 | 151 | client.table({str(i): v[1] for i, v in 
enumerate(DATA_TYPES)}) 152 | -------------------------------------------------------------------------------- /tests/test_polars_wrapper.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | import polars.testing 3 | import pytest 4 | 5 | from beavers import Dag 6 | from beavers.polars_wrapper import _get_stream_schema, _get_stream_dtype 7 | 8 | SIMPLE_SCHEMA = pl.Schema( 9 | [ 10 | ("col1", pl.Int32()), 11 | ("col2", pl.Utf8()), 12 | ] 13 | ) 14 | EMPTY_FRAME = pl.DataFrame(schema=SIMPLE_SCHEMA) 15 | SIMPLE_FRAME = pl.DataFrame([[1, 2, 3], ["a", "b", "c"]], schema=SIMPLE_SCHEMA) 16 | SIMPLE_FRAME_2 = table = pl.DataFrame([[1, 2], ["d", "e"]], schema=SIMPLE_SCHEMA) 17 | 18 | 19 | def test_source_stream(): 20 | dag = Dag() 21 | 22 | node = dag.pl.source_table(schema=SIMPLE_SCHEMA) 23 | polars.testing.assert_frame_equal( 24 | node._empty_factory(), pl.DataFrame(schema=SIMPLE_SCHEMA) 25 | ) 26 | 27 | node.set_stream(SIMPLE_FRAME) 28 | dag.execute() 29 | polars.testing.assert_frame_equal(node.get_value(), SIMPLE_FRAME) 30 | 31 | dag.execute() 32 | polars.testing.assert_frame_equal( 33 | node.get_value(), pl.DataFrame(schema=SIMPLE_SCHEMA) 34 | ) 35 | 36 | 37 | def test_table_stream(): 38 | dag = Dag() 39 | 40 | schema = pl.Schema([("col1", pl.Int32())]) 41 | source = dag.pl.source_table(SIMPLE_SCHEMA) 42 | node = dag.pl.table_stream(lambda x: x.select(["col1"]), schema).map(source) 43 | 44 | dag.execute() 45 | polars.testing.assert_frame_equal(node.get_value(), pl.DataFrame(schema=schema)) 46 | 47 | source.set_stream(SIMPLE_FRAME) 48 | dag.execute() 49 | polars.testing.assert_frame_equal(node.get_value(), SIMPLE_FRAME.select(["col1"])) 50 | 51 | 52 | def test_filter_stream(): 53 | dag = Dag() 54 | 55 | source = dag.pl.source_table(SIMPLE_SCHEMA) 56 | filtered = dag.pl.filter_stream(source, pl.col("col1") > 1, pl.col("col2") == "a") 57 | 58 | dag.execute() 59 | polars.testing.assert_frame_equal( 60 | filtered.get_value(), pl.DataFrame(schema=SIMPLE_SCHEMA) 61 | ) 62 | 63 | source.set_stream(SIMPLE_FRAME) 64 | dag.execute() 65 | polars.testing.assert_frame_equal( 66 | filtered.get_value(), 67 | SIMPLE_FRAME.filter(pl.col("col1") > 1, pl.col("col2") == "a"), 68 | ) 69 | 70 | 71 | def test_get_stream_schema(): 72 | dag = Dag() 73 | 74 | polars_source = dag.pl.source_table(SIMPLE_SCHEMA) 75 | assert _get_stream_schema(polars_source) == SIMPLE_SCHEMA 76 | 77 | list_source = dag.source_stream(empty=[], name="source1") 78 | with pytest.raises(TypeError, match=r"Argument should be a Node\[pl\.DataFrame\]"): 79 | _get_stream_schema(list_source) 80 | 81 | 82 | def test_last_by(): 83 | dag = Dag() 84 | 85 | source = dag.pl.source_table(SIMPLE_SCHEMA) 86 | last_by = dag.pl.last_by_keys(source, ["col1"]) 87 | 88 | dag.execute() 89 | polars.testing.assert_frame_equal( 90 | last_by.get_value(), pl.DataFrame(schema=SIMPLE_SCHEMA) 91 | ) 92 | 93 | source.set_stream(SIMPLE_FRAME) 94 | dag.execute() 95 | polars.testing.assert_frame_equal(last_by.get_value(), SIMPLE_FRAME) 96 | 97 | source.set_stream(SIMPLE_FRAME_2) 98 | dag.execute() 99 | assert str(last_by.get_value()) == str( 100 | pl.DataFrame([[1, 2, 3], ["d", "e", "c"]], schema=SIMPLE_SCHEMA) 101 | ) 102 | 103 | 104 | def test_last_by_order_of_column(): 105 | dag = Dag() 106 | 107 | source = dag.pl.source_table(SIMPLE_SCHEMA) 108 | last_by = dag.pl.last_by_keys(source, ["col2"]) 109 | 110 | dag.execute() 111 | polars.testing.assert_frame_equal( 112 | last_by.get_value(), 
pl.DataFrame(schema=SIMPLE_SCHEMA) 113 | ) 114 | 115 | source.set_stream(SIMPLE_FRAME) 116 | dag.execute() 117 | polars.testing.assert_frame_equal(last_by.get_value(), SIMPLE_FRAME) 118 | 119 | 120 | def test_last_by_bad_keys(): 121 | dag = Dag() 122 | source = dag.pl.source_table(SIMPLE_SCHEMA) 123 | with pytest.raises(AssertionError, match="Keys must be strings"): 124 | dag.pl.last_by_keys(source, [1]) 125 | 126 | 127 | def test_concat_series(): 128 | dag = Dag() 129 | left_source = dag.pl.source_table(SIMPLE_SCHEMA) 130 | left = dag.pl.get_series(left_source, "col1") 131 | right_source = dag.pl.source_table(SIMPLE_SCHEMA) 132 | right = dag.pl.get_series(right_source, "col1") 133 | 134 | both = dag.pl.concat_series(left, right) 135 | 136 | dag.execute() 137 | polars.testing.assert_series_equal( 138 | both.get_value(), pl.Series(dtype=pl.Int32(), name="col1") 139 | ) 140 | 141 | left_source.set_stream(SIMPLE_FRAME) 142 | dag.execute() 143 | polars.testing.assert_series_equal( 144 | both.get_value(), pl.Series(values=[1, 2, 3], dtype=pl.Int32(), name="col1") 145 | ) 146 | 147 | left_source.set_stream(SIMPLE_FRAME) 148 | right_source.set_stream(SIMPLE_FRAME_2) 149 | dag.execute() 150 | polars.testing.assert_series_equal( 151 | both.get_value(), 152 | pl.Series(values=[1, 2, 3, 1, 2], dtype=pl.Int32(), name="col1"), 153 | ) 154 | 155 | right_source.set_stream(SIMPLE_FRAME_2) 156 | dag.execute() 157 | polars.testing.assert_series_equal( 158 | both.get_value(), 159 | pl.Series(values=[1, 2], dtype=pl.Int32(), name="col1"), 160 | ) 161 | 162 | 163 | def test_concat_series_bad_no_series(): 164 | dag = Dag() 165 | with pytest.raises(ValueError, match="Must pass at least one series"): 166 | dag.pl.concat_series() 167 | 168 | 169 | def test_concat_series_bad_mismatching_series(): 170 | dag = Dag() 171 | source = dag.pl.source_table(SIMPLE_SCHEMA) 172 | left = dag.pl.get_series(source, "col1") 173 | right = dag.pl.get_series(source, "col2") 174 | with pytest.raises(TypeError, match="Series type mismatch Int32 vs String"): 175 | dag.pl.concat_series(left, right) 176 | 177 | 178 | def test_get_series(): 179 | dag = Dag() 180 | left_source = dag.pl.source_table(SIMPLE_SCHEMA) 181 | left_series = dag.pl.get_series(left_source, "col1") 182 | 183 | dag.execute() 184 | polars.testing.assert_series_equal(left_series.get_value(), EMPTY_FRAME["col1"]) 185 | 186 | left_source.set_stream(SIMPLE_FRAME) 187 | dag.execute() 188 | polars.testing.assert_series_equal(left_series.get_value(), SIMPLE_FRAME["col1"]) 189 | 190 | dag.execute() 191 | polars.testing.assert_series_equal(left_series.get_value(), EMPTY_FRAME["col1"]) 192 | 193 | 194 | def test_get_stream_dtype_bad(): 195 | with pytest.raises(TypeError, match=r"Argument should be a Node\[pl\.Series\]"): 196 | _get_stream_dtype(Dag().source_stream()) 197 | -------------------------------------------------------------------------------- /tests/test_pyarrow_kafka.py: -------------------------------------------------------------------------------- 1 | from beavers.pyarrow_kafka import JsonDeserializer, JsonSerializer 2 | from tests.test_kafka import mock_kafka_message 3 | from tests.test_util import TEST_TABLE 4 | 5 | 6 | def test_json_deserializer_empty(): 7 | deserializer = JsonDeserializer(TEST_TABLE.schema) 8 | assert deserializer([]) == TEST_TABLE.schema.empty_table() 9 | 10 | 11 | def test_end_to_end(): 12 | deserializer = JsonDeserializer(TEST_TABLE.schema) 13 | serializer = JsonSerializer("topic-1") 14 | out_messages = serializer(TEST_TABLE) 15 | 
in_messages = [ 16 | mock_kafka_message(topic=m.topic, value=m.value) for m in out_messages 17 | ] 18 | assert deserializer(in_messages) == TEST_TABLE 19 | -------------------------------------------------------------------------------- /tests/test_pyarrow_replay.py: -------------------------------------------------------------------------------- 1 | from operator import itemgetter 2 | 3 | import pandas as pd 4 | import pyarrow as pa 5 | import pyarrow.csv 6 | import pytest 7 | 8 | from beavers.dag import UTC_MAX 9 | from beavers.pyarrow_replay import ArrowTableDataSink, ArrowTableDataSource 10 | from tests.test_util import TEST_TABLE 11 | 12 | 13 | def test_arrow_table_data_source(): 14 | source = ArrowTableDataSource(TEST_TABLE, itemgetter("timestamp")) 15 | 16 | assert source.get_next() == pd.to_datetime("2023-01-01T00:00:00Z") 17 | assert source.read_to(pd.to_datetime("2023-01-01T00:00:00Z")) == TEST_TABLE[:1] 18 | assert source.read_to(pd.to_datetime("2023-01-01T00:00:00Z")) == TEST_TABLE[:0] 19 | assert source.get_next() == pd.to_datetime("2023-01-02T00:00:00Z") 20 | assert source.read_to(pd.to_datetime("2023-01-02T00:00:00Z")) == TEST_TABLE[1:] 21 | assert source.get_next() == UTC_MAX 22 | assert source.read_to(UTC_MAX) == TEST_TABLE[:0] 23 | 24 | 25 | def test_arrow_table_data_source_ooo(): 26 | with pytest.raises( 27 | AssertionError, match="Timestamp column should be monotonic increasing" 28 | ): 29 | ArrowTableDataSource( 30 | pa.table( 31 | { 32 | "timestamp": [ 33 | pd.to_datetime("2023-01-02T00:00:00Z"), 34 | pd.to_datetime("2023-01-01T00:00:00Z"), 35 | ], 36 | "value": [1, 2], 37 | } 38 | ), 39 | itemgetter("timestamp"), 40 | ) 41 | 42 | 43 | def test_arrow_table_data_sink(tmpdir): 44 | file = tmpdir / "file.csv" 45 | sink = ArrowTableDataSink(lambda table: pyarrow.csv.write_csv(table, file)) 46 | 47 | sink.close() 48 | assert not file.exists() 49 | 50 | sink.append(UTC_MAX, TEST_TABLE) 51 | sink.close() 52 | assert file.exists() 53 | -------------------------------------------------------------------------------- /tests/test_pyarrow_wrapper.py: -------------------------------------------------------------------------------- 1 | import pyarrow as pa 2 | import pyarrow.compute as pc 3 | import pytest 4 | 5 | from beavers import Dag 6 | from beavers.pyarrow_wrapper import _concat_arrow_arrays, _get_last_by, _LastByKey 7 | 8 | SIMPLE_SCHEMA = pa.schema( 9 | [ 10 | pa.field("col1", pa.int32()), 11 | pa.field("col2", pa.string()), 12 | pa.field("col3", pa.timestamp("ns", "UTC")), 13 | ] 14 | ) 15 | SIMPLE_TABLE = pa.table([[1, 2, 3], ["a", "b", "c"], [0, 0, 0]], schema=SIMPLE_SCHEMA) 16 | SIMPLE_TABLE_2 = table = pa.table([[1, 2], ["d", "e"], [0, 0]], schema=SIMPLE_SCHEMA) 17 | 18 | 19 | def test_source_stream(): 20 | dag = Dag() 21 | 22 | node = dag.pa.source_table(schema=SIMPLE_SCHEMA) 23 | assert node._empty_factory() == SIMPLE_SCHEMA.empty_table() 24 | 25 | node.set_stream(SIMPLE_TABLE) 26 | dag.execute() 27 | assert node.get_value() == SIMPLE_TABLE 28 | 29 | dag.execute() 30 | assert node.get_value() == SIMPLE_SCHEMA.empty_table() 31 | 32 | 33 | def test_source_stream_name(): 34 | dag = Dag() 35 | 36 | node = dag.pa.source_table(schema=SIMPLE_SCHEMA, name="source_1") 37 | assert dag.get_sources() == {"source_1": node} 38 | 39 | 40 | def test_table_stream(): 41 | dag = Dag() 42 | 43 | source = dag.pa.source_table(SIMPLE_SCHEMA) 44 | node = dag.pa.table_stream( 45 | lambda x: x.select(["col1"]), 46 | pa.schema([pa.field("col1", pa.int32())]), 47 | ).map(source) 48 | 49 | 
source.set_stream(SIMPLE_TABLE) 50 | dag.execute() 51 | assert node.get_value() == SIMPLE_TABLE.select(["col1"]) 52 | 53 | 54 | def test_filter_stream(): 55 | dag = Dag() 56 | 57 | source = dag.pa.source_table(SIMPLE_SCHEMA) 58 | node = dag.pa.filter_stream( 59 | lambda x, y: pc.equal(x["col1"], y), source, dag.const(1) 60 | ) 61 | SIMPLE_SCHEMA.empty_table() 62 | source.set_stream(SIMPLE_TABLE) 63 | dag.execute() 64 | assert node.get_value() == SIMPLE_TABLE[0:1] 65 | 66 | dag.execute() 67 | assert node.get_value() == SIMPLE_SCHEMA.empty_table() 68 | 69 | 70 | def _predicate(table: pa.Table) -> pa.Array: 71 | return pc.equal(table["col1"], 1) 72 | 73 | 74 | def test_filter_stream_bad_arguments(): 75 | dag = Dag() 76 | 77 | state_node = dag.state(lambda: "HELLO").map() 78 | with pytest.raises(TypeError, match=r"Argument should be a stream Node"): 79 | dag.pa.filter_stream(_predicate, state_node) 80 | 81 | list_stream_node = dag.source_stream() 82 | with pytest.raises(TypeError, match=r"Argument should be a Node\[pa\.Table\]"): 83 | dag.pa.filter_stream(_predicate, list_stream_node) 84 | 85 | 86 | def test_learn_expression_type(): 87 | field = pc.field("col1") 88 | assert isinstance(field, pc.Expression) 89 | greater_with_pc = pc.greater(field, 2) 90 | assert SIMPLE_TABLE.filter(greater_with_pc) == SIMPLE_TABLE[-1:] 91 | greater_with_python = field > 2 92 | assert SIMPLE_TABLE.filter(greater_with_python) == SIMPLE_TABLE[-1:] 93 | with pytest.raises(TypeError): 94 | pc.min(SIMPLE_TABLE, field) 95 | 96 | 97 | def test_group_by_last(): 98 | with pytest.raises( 99 | pa.ArrowNotImplementedError, 100 | match="Using ordered aggregator" 101 | " in multiple threaded execution is not supported", 102 | ): 103 | SIMPLE_TABLE.group_by("col1").aggregate([("col2", "last")]) 104 | 105 | 106 | def test_get_latest(): 107 | table = pa.table( 108 | [[1, 2, 3, 1, 2], ["a", "b", "c", "d", "e"], [0] * 5], schema=SIMPLE_SCHEMA 109 | ) 110 | assert _get_last_by(table, ["col1"]) == table[2:] 111 | assert _get_last_by(table, ["col1", "col2"]) == table 112 | 113 | 114 | def test_get_last_by_batches(): 115 | table = pa.concat_tables([SIMPLE_TABLE, SIMPLE_TABLE]) 116 | assert _get_last_by(table, ["col1"]) == SIMPLE_TABLE 117 | 118 | 119 | def test_get_last_by_all_columns(): 120 | table = pa.concat_tables([SIMPLE_TABLE, SIMPLE_TABLE]) 121 | assert _get_last_by(table, ["col1", "col2"]) == SIMPLE_TABLE 122 | 123 | 124 | def test_latest_tracker(): 125 | tracker = _LastByKey(["col1"], SIMPLE_SCHEMA.empty_table()) 126 | 127 | assert tracker(SIMPLE_SCHEMA.empty_table()) == SIMPLE_SCHEMA.empty_table() 128 | assert tracker(SIMPLE_TABLE) == SIMPLE_TABLE 129 | assert tracker(SIMPLE_TABLE_2) == pa.table( 130 | [[3, 1, 2], ["c", "d", "e"], [0] * 3], schema=SIMPLE_SCHEMA 131 | ) 132 | 133 | 134 | def test_last_by_keys(): 135 | dag = Dag() 136 | source = dag.pa.source_table(SIMPLE_SCHEMA) 137 | latest = dag.pa.last_by_keys(source, ["col1"]) 138 | 139 | dag.execute() 140 | assert latest.get_value() == SIMPLE_SCHEMA.empty_table() 141 | 142 | source.set_stream(SIMPLE_TABLE) 143 | dag.execute() 144 | assert latest.get_value() == SIMPLE_TABLE 145 | 146 | dag.execute() 147 | assert latest.get_value() == SIMPLE_TABLE 148 | 149 | source.set_stream(SIMPLE_TABLE_2) 150 | dag.execute() 151 | assert latest.get_value() == pa.table( 152 | [[3, 1, 2], ["c", "d", "e"], [0] * 3], schema=SIMPLE_SCHEMA 153 | ) 154 | 155 | 156 | def test_last_by_keys_bad(): 157 | dag = Dag() 158 | 159 | with pytest.raises( 160 | AttributeError, match=r"'str' object has no 
attribute '_get_empty'" 161 | ): 162 | dag.pa.last_by_keys("Not a node", ["col1"]) 163 | with pytest.raises(TypeError, match=r"Argument should be a Node\[pa.Table\]"): 164 | dag.pa.last_by_keys(dag.source_stream(), ["col1"]) 165 | with pytest.raises(TypeError, match=r"Argument should be a stream Node"): 166 | dag.pa.last_by_keys(dag.state(lambda: None).map(), ["col1"]) 167 | 168 | source = dag.pa.source_table(SIMPLE_SCHEMA) 169 | 170 | with pytest.raises(TypeError, match="123"): 171 | dag.pa.last_by_keys(source, 123) 172 | with pytest.raises(TypeError, match="123"): 173 | dag.pa.last_by_keys(source, [123]) 174 | with pytest.raises( 175 | TypeError, match=r"field colz no in schema: \['col1', 'col2', 'col3'\]" 176 | ): 177 | dag.pa.last_by_keys(source, ["colz"]) 178 | 179 | 180 | def test_get_column(): 181 | dag = Dag() 182 | source = dag.pa.source_table(SIMPLE_SCHEMA) 183 | array = dag.pa.get_column(source, "col1") 184 | 185 | dag.execute() 186 | assert array.get_value() == pa.chunked_array([pa.array([], pa.int32())]) 187 | 188 | source.set_stream(SIMPLE_TABLE) 189 | dag.execute() 190 | assert array.get_value() == SIMPLE_TABLE["col1"] 191 | 192 | dag.execute() 193 | assert array.get_value() == pa.chunked_array([pa.array([], pa.int32())]) 194 | 195 | source.set_stream(SIMPLE_TABLE_2) 196 | dag.execute() 197 | assert array.get_value() == SIMPLE_TABLE_2["col1"] 198 | 199 | 200 | def test_get_column_bad(): 201 | dag = Dag() 202 | 203 | with pytest.raises( 204 | AttributeError, match=r"'str' object has no attribute '_get_empty'" 205 | ): 206 | dag.pa.get_column("Not a node", "col1") 207 | with pytest.raises(TypeError, match=r"Argument should be a Node\[pa.Table\]"): 208 | dag.pa.get_column(dag.source_stream(), "col1") 209 | with pytest.raises(TypeError, match=r"Argument should be a stream Node"): 210 | dag.pa.get_column(dag.state(lambda: None).map(), "col1") 211 | 212 | source = dag.pa.source_table(SIMPLE_SCHEMA) 213 | 214 | with pytest.raises(TypeError, match="123"): 215 | dag.pa.get_column(source, 123) 216 | with pytest.raises( 217 | TypeError, match=r"field colz no in schema: \['col1', 'col2', 'col3'\]" 218 | ): 219 | dag.pa.get_column(source, "colz") 220 | 221 | 222 | def test_concat_arrays_ok(): 223 | dag = Dag() 224 | left = dag.source_stream(empty=pa.array([], pa.string())) 225 | right = dag.source_stream(empty=pa.array([], pa.string())) 226 | both = dag.pa.concat_arrays(left, right) 227 | 228 | dag.execute() 229 | assert both.get_value() == pa.chunked_array([], pa.string()) 230 | 231 | left.set_stream(pa.array(["a", "b"])) 232 | right.set_stream(pa.array(["c"])) 233 | dag.execute() 234 | assert both.get_value() == pa.chunked_array(["a", "b", "c"], pa.string()) 235 | 236 | dag.execute() 237 | assert both.get_value() == pa.chunked_array([], pa.string()) 238 | 239 | 240 | def test_concat_arrays_bad(): 241 | dag = Dag() 242 | 243 | with pytest.raises(ValueError, match=r"Must pass at least one array"): 244 | dag.pa.concat_arrays() 245 | with pytest.raises(TypeError, match=r"Argument should be a stream Node"): 246 | dag.pa.concat_arrays(dag.state(lambda: None).map()) 247 | with pytest.raises(TypeError, match=r"Argument should be a Node\[pa\.Array\]"): 248 | dag.pa.concat_arrays(dag.source_stream()) 249 | with pytest.raises(TypeError, match=r"Array type mismatch string vs int32"): 250 | dag.pa.concat_arrays( 251 | dag.source_stream(empty=pa.array([], pa.string())), 252 | dag.source_stream(empty=pa.array([], pa.int32())), 253 | ) 254 | 255 | 256 | def test_concat_arrow_arrays_mixed(): 257 | 
assert _concat_arrow_arrays( 258 | [ 259 | pa.array([], pa.string()), 260 | pa.chunked_array(pa.array([], pa.string())), 261 | ] 262 | ) == pa.chunked_array([], pa.string()) 263 | 264 | 265 | def test_concat_arrow_arrays_bad(): 266 | with pytest.raises(TypeError, match="123"): 267 | _concat_arrow_arrays([123]) 268 | -------------------------------------------------------------------------------- /tests/test_replay.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from operator import attrgetter 3 | 4 | import pandas as pd 5 | import pytest 6 | 7 | from beavers.dag import UTC_MAX, Dag 8 | from beavers.replay import ( 9 | DataSource, 10 | IteratorDataSourceAdapter, 11 | NoOpDataSinkProvider, 12 | ReplayContext, 13 | ReplayDriver, 14 | T, 15 | _create_sinks, 16 | _create_sources, 17 | ) 18 | from tests.test_util import ListDataSink, ListDataSource 19 | 20 | 21 | @dataclasses.dataclass(frozen=True) 22 | class Word: 23 | timestamp: pd.Timestamp 24 | value: str 25 | 26 | 27 | @pytest.fixture 28 | def replay_context() -> ReplayContext: 29 | return ReplayContext( 30 | pd.to_datetime("2023-01-01", utc=True), 31 | pd.to_datetime("2023-01-02", utc=True), 32 | pd.to_timedelta("1min"), 33 | ) 34 | 35 | 36 | def create_data_source(context: ReplayContext): 37 | return ListDataSource( 38 | [Word(context.start + pd.Timedelta(minutes=i), "hello") for i in range(10)], 39 | attrgetter("timestamp"), 40 | ) 41 | 42 | 43 | def test_create_sources_mismatch(replay_context: ReplayContext): 44 | with pytest.raises( 45 | ValueError, 46 | match=r"Source node and DataSource names don't match: \[\] vs \['words'\]", 47 | ): 48 | _create_sources(Dag(), replay_context, {"words": create_data_source}) 49 | 50 | 51 | def test_create_sources_match(replay_context: ReplayContext): 52 | dag = Dag() 53 | node = dag.source_stream(empty=[], name="words") 54 | 55 | results = _create_sources(dag, replay_context, {"words": create_data_source}) 56 | assert len(results) == 1 57 | assert results[0].name == "words" 58 | assert results[0].node == node 59 | assert isinstance(results[0].data_source, ListDataSource) 60 | 61 | 62 | def test_create_sinks_mismatch(replay_context: ReplayContext): 63 | sink = ListDataSink() 64 | with pytest.raises( 65 | ValueError, 66 | match=r"Sink node and DataSink names don't match: \[\] vs \['words'\]", 67 | ): 68 | _create_sinks(Dag(), replay_context, {"words": lambda _: sink}) 69 | 70 | 71 | def test_create_sinks_match(replay_context: ReplayContext): 72 | sink = ListDataSink() 73 | dag = Dag() 74 | source_node = dag.source_stream(empty=[], name="words") 75 | sink_node = dag.sink("words", source_node) 76 | results = _create_sinks(dag, replay_context, {"words": lambda _: sink}) 77 | assert len(results) == 1 78 | assert results[0].name == "words" 79 | assert results[0].nodes == [sink_node] 80 | assert results[0].data_sink is sink 81 | 82 | 83 | def test_pass_through_replay(replay_context: ReplayContext): 84 | source = create_data_source(replay_context) 85 | sink = ListDataSink() 86 | dag = Dag() 87 | source_node = dag.source_stream(empty=[], name="words") 88 | dag.sink("words", source_node) 89 | 90 | driver = ReplayDriver.create( 91 | dag, 92 | replay_context, 93 | {"words": lambda _: source}, 94 | {"words": lambda _: sink}, 95 | ) 96 | driver.run() 97 | assert sink._data == source._data 98 | 99 | 100 | def test_no_op_through_replay(replay_context: ReplayContext): 101 | """ 102 | Test a corner case of the driver were a sink did not update during a 
cycle 103 | """ 104 | sink = ListDataSink() 105 | dag = Dag() 106 | dag.source_stream(empty=[], name="words_1") 107 | source_2 = dag.source_stream(empty=[], name="words_2") 108 | dag.sink("words", source_2) 109 | 110 | driver = ReplayDriver.create( 111 | dag, 112 | replay_context, 113 | { 114 | "words_1": create_data_source, 115 | "words_2": lambda _: ListDataSource([], attrgetter("timestamp")), 116 | }, 117 | {"words": lambda _: sink}, 118 | ) 119 | driver.run() 120 | assert sink._data == [] 121 | 122 | 123 | def create_data_groups() -> list[list[Word]]: 124 | timestamp = pd.to_datetime("2022-01-01", utc=True) 125 | return [ 126 | [ 127 | Word(timestamp + pd.Timedelta(minutes=0), "hello"), 128 | Word(timestamp + pd.Timedelta(minutes=1), "world"), 129 | ], 130 | [ 131 | Word(timestamp + pd.Timedelta(minutes=2), "hello"), 132 | Word(timestamp + pd.Timedelta(minutes=2), "world"), 133 | ], 134 | [ 135 | Word(timestamp + pd.Timedelta(minutes=3), "hello"), 136 | Word(timestamp + pd.Timedelta(minutes=3), "world"), 137 | Word(timestamp + pd.Timedelta(minutes=3), "world"), 138 | Word(timestamp + pd.Timedelta(minutes=4), "world"), 139 | ], 140 | [], 141 | [ 142 | Word(timestamp + pd.Timedelta(minutes=5), "hello"), 143 | Word(timestamp + pd.Timedelta(minutes=5), "world"), 144 | ], 145 | ] 146 | 147 | 148 | def create_adapter(data_groups: list[list[Word]]) -> DataSource[list[Word]]: 149 | return IteratorDataSourceAdapter( 150 | ( 151 | ListDataSource(data_group, attrgetter("timestamp")) 152 | for data_group in data_groups 153 | ), 154 | [], 155 | lambda left, right: left + right, 156 | ) 157 | 158 | 159 | def test_iterator_data_source_adapter_run_all(): 160 | data_groups = create_data_groups() 161 | adapter = create_adapter(data_groups) 162 | assert adapter.read_to(UTC_MAX) == [ 163 | word for data_group in data_groups for word in data_group 164 | ] 165 | assert adapter.read_to(UTC_MAX) == [] 166 | 167 | 168 | def test_iterator_data_source_adapter_run_one_by_one(): 169 | timestamp = pd.to_datetime("2022-01-01", utc=True) 170 | data_groups = create_data_groups() 171 | adapter = create_adapter(data_groups) 172 | assert adapter.get_next() == timestamp 173 | assert adapter.read_to(timestamp) == [data_groups[0][0]] 174 | assert adapter.read_to(timestamp) == [] 175 | assert adapter.read_to(timestamp + pd.Timedelta(minutes=1)) == [data_groups[0][1]] 176 | assert adapter.read_to(timestamp + pd.Timedelta(minutes=1)) == [] 177 | assert ( 178 | adapter.read_to(timestamp + pd.Timedelta(minutes=3)) 179 | == data_groups[1] + data_groups[2][:-1] 180 | ) 181 | assert adapter.read_to(timestamp + pd.Timedelta(minutes=4)) == data_groups[2][-1:] 182 | assert adapter.read_to(timestamp + pd.Timedelta(minutes=5)) == data_groups[4] 183 | assert adapter.read_to(timestamp + pd.Timedelta(minutes=6)) == [] 184 | assert adapter.read_to(UTC_MAX) == [] 185 | 186 | 187 | def test_iterator_data_source_empty(): 188 | adapter = create_adapter([]) 189 | assert adapter.get_next() == UTC_MAX 190 | assert adapter.read_to(UTC_MAX) == [] 191 | assert adapter.get_next() == UTC_MAX 192 | assert adapter.read_to(UTC_MAX) == [] 193 | 194 | 195 | def test_iterator_data_source_all_empty(): 196 | adapter = create_adapter([[], []]) 197 | assert adapter.get_next() == UTC_MAX 198 | assert adapter.read_to(UTC_MAX) == [] 199 | assert adapter.get_next() == UTC_MAX 200 | assert adapter.read_to(UTC_MAX) == [] 201 | 202 | 203 | class CornerCaseTester(DataSource[list[Word]]): 204 | def __init__(self, timestamp: pd.Timestamp): 205 | self._timestamp = 
timestamp 206 | self._read = False 207 | 208 | def read_to(self, timestamp: pd.Timestamp) -> list[T]: 209 | self._read = True 210 | return [] 211 | 212 | def get_next(self) -> pd.Timestamp: 213 | if self._read: 214 | return UTC_MAX 215 | else: 216 | return self._timestamp 217 | 218 | 219 | def test_iterator_data_source_cutoff(): 220 | """ 221 | Test a tricky corner case were the underlying DataSource of 222 | IteratorDataSourceAdapter doesn't behave as expected. 223 | """ 224 | timestamp = pd.to_datetime("2022-01-01", utc=True) 225 | adapter = IteratorDataSourceAdapter( 226 | ( 227 | source 228 | for source in [ 229 | CornerCaseTester(timestamp + pd.Timedelta(minutes=1)), 230 | ListDataSource( 231 | [Word(timestamp + pd.Timedelta(minutes=2), "hello")], 232 | attrgetter("timestamp"), 233 | ), 234 | ] 235 | ), 236 | [], 237 | lambda left, right: left + right, 238 | ) 239 | 240 | assert adapter.read_to(UTC_MAX) == [ 241 | Word( 242 | timestamp=pd.Timestamp("2022-01-01 00:02:00+0000", tz="UTC"), value="hello" 243 | ) 244 | ] 245 | 246 | 247 | def test_replay_read_sources(): 248 | source = ListDataSource( 249 | [ 250 | Word(pd.to_datetime("2023-01-01 00:01:00Z"), "1"), 251 | Word(pd.to_datetime("2023-01-01 00:02:00Z"), "2"), 252 | Word(pd.to_datetime("2023-01-01 12:01:00Z"), "3"), 253 | Word(pd.to_datetime("2023-01-01 12:04:00Z"), "4"), 254 | ], 255 | attrgetter("timestamp"), 256 | ) 257 | 258 | dag = Dag() 259 | dag.source_stream([], name="hello") 260 | driver = ReplayDriver.create( 261 | dag=dag, 262 | replay_context=ReplayContext( 263 | pd.to_datetime("2023-01-01", utc=True), 264 | pd.to_datetime("2023-01-02", utc=True) - pd.to_timedelta("1ns"), 265 | pd.to_timedelta("12h"), 266 | ), 267 | data_source_providers={"hello": lambda x: source}, 268 | data_sink_providers={}, 269 | ) 270 | 271 | records, timestamp = driver.read_sources() 272 | assert timestamp == pd.to_datetime("2023-01-01 00:01:00Z", utc=True) 273 | assert records == 0 274 | 275 | 276 | def test_replay_run_cycle(): 277 | source = ListDataSource( 278 | [ 279 | Word(pd.to_datetime("2023-01-01 00:01:00Z"), "1"), 280 | Word(pd.to_datetime("2023-01-01 00:02:00Z"), "2"), 281 | Word(pd.to_datetime("2023-01-01 12:01:00Z"), "3"), 282 | Word(pd.to_datetime("2023-01-01 12:04:00Z"), "4"), 283 | ], 284 | attrgetter("timestamp"), 285 | ) 286 | 287 | dag = Dag() 288 | dag.source_stream([], name="hello") 289 | driver = ReplayDriver.create( 290 | dag=dag, 291 | replay_context=ReplayContext( 292 | pd.to_datetime("2023-01-01", utc=True), 293 | pd.to_datetime("2023-01-02", utc=True) - pd.to_timedelta("1ns"), 294 | pd.to_timedelta("12h"), 295 | ), 296 | data_source_providers={"hello": lambda x: source}, 297 | data_sink_providers={}, 298 | ) 299 | 300 | metrics = driver.run_cycle() 301 | assert metrics is None 302 | assert driver.current_time == pd.to_datetime("2023-01-01 12:00:00Z") 303 | 304 | metrics = driver.run_cycle() 305 | assert metrics.timestamp == pd.to_datetime("2023-01-01 12:00:00Z") 306 | assert metrics.source_records == 2 307 | assert metrics.sink_records == 0 308 | assert metrics.cycle_time_ns > 0 309 | assert metrics.warp_ratio > 0.0 310 | assert driver.current_time == pd.to_datetime("2023-01-02 00:00:00Z") 311 | 312 | metrics = driver.run_cycle() 313 | assert metrics.timestamp == pd.to_datetime("2023-01-01 23:59:59.999999999Z") 314 | assert metrics.source_records == 2 315 | assert metrics.sink_records == 0 316 | assert metrics.cycle_time_ns > 0 317 | assert metrics.warp_ratio > 0.0 318 | assert driver.current_time == 
pd.to_datetime("2023-01-02 12:00:00Z") 319 | assert driver.is_done() 320 | 321 | 322 | def test_no_op(): 323 | provider = NoOpDataSinkProvider() 324 | data_sink = provider(ReplayContext(UTC_MAX, UTC_MAX, pd.to_timedelta("1s"))) 325 | data_sink.append(UTC_MAX, None) 326 | data_sink.close() 327 | -------------------------------------------------------------------------------- /tests/test_util.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import dataclasses 3 | import random 4 | from typing import Callable, Dict, Generic, TypeVar 5 | 6 | import pandas as pd 7 | import pyarrow as pa 8 | 9 | from beavers.dag import UTC_MAX, Dag, TimerManager 10 | from beavers.replay import DataSink, DataSource 11 | 12 | T = TypeVar("T") 13 | 14 | TEST_TABLE = pa.table( 15 | { 16 | "timestamp": [ 17 | pd.to_datetime("2023-01-01T00:00:00Z"), 18 | pd.to_datetime("2023-01-02T00:00:00Z"), 19 | ], 20 | "value": [1, 2], 21 | } 22 | ) 23 | 24 | 25 | class GetLatest(Generic[T]): 26 | def __init__(self, default: T): 27 | self._value = default 28 | 29 | def __call__(self, values: list[T]) -> T: 30 | if values: 31 | self._value = values[-1] 32 | return self._value 33 | 34 | 35 | def add(left, right): 36 | return left + right 37 | 38 | 39 | def add_with_noise(left, right): 40 | return left + right + random.randint(0, 1000) # nosec 41 | 42 | 43 | def add_no_42(left, right): 44 | results = add(left, right) 45 | if results == 42: 46 | raise ValueError(f"{left} + {right} == 42") 47 | else: 48 | return results 49 | 50 | 51 | class AddOther: 52 | def __init__(self, other): 53 | self._other = other 54 | 55 | def set_other(self, other): 56 | self._other = other 57 | 58 | def __call__(self, value): 59 | return self._other + value 60 | 61 | 62 | def select(key, **values): 63 | return values[key] 64 | 65 | 66 | class WordCount: 67 | def __init__(self): 68 | self._counts = collections.defaultdict(lambda: 0) 69 | 70 | def __call__(self, words: list[str]) -> dict[str, int]: 71 | for word in words: 72 | self._counts[word] += 1 73 | 74 | return self._counts 75 | 76 | 77 | def join_counts(**kwargs: Dict[str, int]) -> pd.DataFrame: 78 | return pd.concat( 79 | [pd.Series(value, name=key) for key, value in kwargs.items()], axis=1 80 | ).fillna(0) 81 | 82 | 83 | @dataclasses.dataclass(frozen=True) 84 | class TimerEntry: 85 | timestamp: pd.Timestamp 86 | values: list[int] 87 | 88 | 89 | class SetATimer: 90 | def __init__(self): 91 | self._entry = None 92 | 93 | def __call__( 94 | self, entries: list[TimerEntry], now: pd.Timestamp, timer_manager: TimerManager 95 | ) -> list[int]: 96 | if entries: 97 | self._entry = entries[-1] 98 | timer_manager.set_next_timer(self._entry.timestamp) 99 | if self._entry is not None and now >= self._entry.timestamp: 100 | results = self._entry.values 101 | self._entry = None 102 | return results 103 | else: 104 | return [] 105 | 106 | 107 | def create_word_count_dag() -> tuple[Dag, WordCount]: 108 | dag = Dag() 109 | messages_stream = dag.source_stream([], name="messages") 110 | word_count = WordCount() 111 | state = dag.state(word_count).map(messages_stream) 112 | changed_key = dag.stream(lambda x: sorted(set(x)), []).map(messages_stream) 113 | records = dag.stream(lambda x, y: {v: y[v] for v in x}, {}).map(changed_key, state) 114 | dag.sink("results", records) 115 | return dag, word_count 116 | 117 | 118 | class ListDataSource(DataSource[list[T]]): 119 | def __init__(self, data: list[T], extractor: Callable[[T], pd.Timestamp]): 120 | self._data = 
data 121 | self._extractor = extractor 122 | self._position = 0 123 | 124 | def read_to(self, timestamp: pd.Timestamp) -> list[T]: 125 | results = [] 126 | while ( 127 | self._position < len(self._data) 128 | and self._extractor(self._data[self._position]) <= timestamp 129 | ): 130 | results.append(self._data[self._position]) 131 | self._position += 1 132 | return results 133 | 134 | def get_next(self) -> pd.Timestamp: 135 | if self._position >= len(self._data): 136 | return UTC_MAX 137 | else: 138 | return self._extractor(self._data[self._position]) 139 | 140 | 141 | class ListDataSink(DataSink[list[T]]): 142 | def __init__(self): 143 | self._data = [] 144 | 145 | def append(self, timestamp: pd.Timestamp, data: list[T]): 146 | self._data.extend(data) 147 | 148 | def close(self): 149 | pass 150 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | isolated_build = True 3 | envlist = 4 | py310, 5 | py311, 6 | py312, 7 | py313, 8 | linting, 9 | 10 | [testenv] 11 | allowlist_externals = poetry 12 | commands_pre = 13 | poetry install --no-root --sync --extras pyarrow --extras confluent_kafka --extras polars 14 | changedir = {envtmpdir} 15 | commands = 16 | poetry run coverage run --source=beavers --rcfile={toxinidir}/pyproject.toml --branch -m pytest {toxinidir}/tests 17 | poetry run python {toxinidir}/examples/advanced_concepts.py 18 | poetry run python {toxinidir}/examples/dag_concepts.py 19 | poetry run python {toxinidir}/examples/etfs.py 20 | poetry run python {toxinidir}/examples/pandas_concepts.py 21 | poetry run python {toxinidir}/examples/polars_concepts.py 22 | poetry run python {toxinidir}/examples/pyarrow_concepts.py 23 | poetry run python {toxinidir}/examples/replay_concepts.py 24 | poetry run coverage report --rcfile={toxinidir}/pyproject.toml -m --fail-under 95 25 | poetry run coverage xml --rcfile={toxinidir}/pyproject.toml -o {toxinidir}/coverage.xml 26 | 27 | [testenv:linting] 28 | deps = pre-commit 29 | commands = pre-commit run --all-files --show-diff-on-failure 30 | 31 | [gh-actions] 32 | python = 33 | 3.10: py310, linting 34 | 3.11: py311 35 | 3.12: py312 36 | 3.13: py313 37 | --------------------------------------------------------------------------------