├── .github
    └── workflows
    │   ├── ci.yaml
    │   └── publish.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── beavers
    ├── __init__.py
    ├── assets
    │   └── favicon.ico
    ├── dag.py
    ├── kafka.py
    ├── pandas_wrapper.py
    ├── perspective_wrapper.py
    ├── polars_wrapper.py
    ├── pyarrow_kafka.py
    ├── pyarrow_replay.py
    ├── pyarrow_wrapper.py
    ├── replay.py
    ├── table.html
    └── testing.py
├── docs
    ├── concepts
    │   ├── advanced.md
    │   ├── dag.md
    │   ├── kafka.md
    │   ├── pandas.md
    │   ├── perspective.md
    │   ├── polars.md
    │   ├── pyarrow.md
    │   └── replay.md
    ├── contributing.md
    ├── faq.md
    ├── index.md
    ├── install.md
    ├── reference
    │   ├── dag.md
    │   ├── kafka.md
    │   ├── pandas_wrapper.md
    │   ├── pyarrow_wrapper.md
    │   └── replay.md
    ├── requirements.in
    ├── requirements.txt
    └── static
    │   └── icons
    │       └── beavers
    │           ├── icon.png
    │           └── logo.svg
├── examples
    ├── __init__.py
    ├── advanced_concepts.py
    ├── dag_concepts.py
    ├── etfs.py
    ├── kafka_concepts.py
    ├── pandas_concepts.py
    ├── perspective_concepts.py
    ├── polars_concepts.py
    ├── pyarrow_concepts.py
    └── replay_concepts.py
├── mkdocs.yml
├── poetry.lock
├── pyproject.toml
├── scripts
    ├── README.md
    ├── kafka_test_bench.py
    └── perpective_test_bench.py
├── tests
    ├── __init__.py
    ├── conftest.py
    ├── test_dag.py
    ├── test_docs.py
    ├── test_etfs.py
    ├── test_kafka.py
    ├── test_pandas_wrapper.py
    ├── test_perpective_wrapper.py
    ├── test_polars_wrapper.py
    ├── test_pyarrow_kafka.py
    ├── test_pyarrow_replay.py
    ├── test_pyarrow_wrapper.py
    ├── test_replay.py
    └── test_util.py
└── tox.ini


/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
 1 | name: beavers CI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ main ]
 6 |   pull_request:
 7 |     branches: [ main ]
 8 | 
 9 | jobs:
10 |   build:
11 |     runs-on: ubuntu-latest
12 |     strategy:
13 |       matrix:
14 |         python-version:
15 |           - "3.10"
16 |           - "3.11"
17 |           - "3.12"
18 |           - "3.13"
19 |       fail-fast: false
20 |     steps:
21 |       - name: Checkout sources
22 |         uses: actions/checkout@v4
23 | 
24 |       - name: Setup Python
25 |         uses: actions/setup-python@v5
26 |         with:
27 |           python-version: ${{ matrix.python-version }}
28 |           cache: "pip"
29 | 
30 |       - name: Install pip
31 |         run: "python -m pip install --upgrade pip"
32 |       - name: Install tox and poetry
33 |         run: "python -m pip install tox tox-gh-actions poetry==2.1.1"
34 |       - name: Install poetry plugin
35 |         run: 'poetry self add "poetry-dynamic-versioning[plugin]"'
36 | 
37 |       - name: Run tox
38 |         run: tox
39 | 
40 |       - name: Upload coverage to Codecov
41 |         uses: codecov/codecov-action@v4
42 |         if: "matrix.python-version == '3.10'"
43 |         with:
44 |           fail_ci_if_error: true
45 |           token: ${{ secrets.CODECOV_TOKEN }}
46 | 


--------------------------------------------------------------------------------
/.github/workflows/publish.yaml:
--------------------------------------------------------------------------------
 1 | name: Publish to PyPI
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [ published ]
 6 |     branches: [ main ]
 7 |   workflow_dispatch:
 8 | 
 9 | jobs:
10 |   build-and-publish:
11 |     runs-on: ubuntu-latest
12 | 
13 |     steps:
14 |       - name: Checkout sources
15 |         uses: actions/checkout@v3
16 | 
17 |       - name: Setup Python
18 |         uses: actions/setup-python@v4
19 |         with:
20 |           python-version: "3.10"
21 | 
22 |       - name: Install poetry and dependencies
23 |         run: |
24 |           python -m pip install --upgrade pip
25 |           python -m pip install poetry==2.1.1
26 |           poetry self add "poetry-dynamic-versioning[plugin]"
27 | 
28 |       - name: Configure poetry
29 |         env:
30 |           pypi_token: ${{ secrets.PyPI_TOKEN }}
31 |         run: poetry config pypi-token.pypi $pypi_token
32 | 
33 |       - name: Build and publish
34 |         run: poetry publish --build
35 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 
162 | # Custom
163 | /.idea
164 | /.pytest_cache
165 | /.ruff_cache
166 | /venv
167 | *.csv
168 | coverrage.xml
169 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | default_language_version:
 2 |   python: python3.10
 3 | repos:
 4 |   - repo: https://github.com/pycqa/pydocstyle
 5 |     rev: 6.3.0
 6 |     hooks:
 7 |       - id: pydocstyle
 8 |         files: ^beavers/(dag|replay|kafka|arrow).py
 9 |         additional_dependencies:
10 |           - tomli
11 | 
12 |   - repo: https://github.com/pre-commit/pre-commit-hooks
13 |     rev: v5.0.0
14 |     hooks:
15 |       - id: check-toml
16 |       - id: check-yaml
17 |       - id: end-of-file-fixer
18 |       - id: mixed-line-ending
19 |   - repo: https://github.com/charliermarsh/ruff-pre-commit
20 |     rev: v0.11.12
21 |     hooks:
22 |       - id: ruff
23 |         args: ['--fix']
24 |       - id: ruff-format
25 |   - repo: https://github.com/PyCQA/bandit
26 |     rev: 1.8.3
27 |     hooks:
28 |       - id: bandit
29 |         additional_dependencies:
30 |           - tomli
31 |         args:
32 |           - "--config=pyproject.toml"
33 |   - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks
34 |     rev: v2.14.0
35 |     hooks:
36 |       - id: pretty-format-toml
37 |         files: "^.*.toml"
38 |         args:
39 |           - "--autofix"
40 |   - repo: https://github.com/python-poetry/poetry
41 |     rev: 2.1.3
42 |     hooks:
43 |       - id: poetry-check
44 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | build:
 4 |   os: ubuntu-22.04
 5 |   tools:
 6 |     python: "3.11"
 7 | 
 8 | mkdocs:
 9 |   configuration: mkdocs.yml
10 | 
11 | python:
12 |    install:
13 |    - requirements: docs/requirements.txt
14 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # Changelog
  2 | 
  3 | All notable changes to this project will be documented in this file.
  4 | 
  5 | The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
  6 | and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
  7 | 
  8 | <!-- insertion marker -->
  9 | ## [v0.13.0](https://github.com/tradewelltech/beavers/releases/tag/v0.13.0) - 2025-02-13
 10 | 
 11 | <small>[Compare with v0.12.1](https://github.com/tradewelltech/beavers/compare/v0.12.1...v0.13.0)</small>
 12 | 
 13 | ### Added
 14 | 
 15 | - Add missing badge (#72) ([5bf44e9](https://github.com/tradewelltech/beavers/commit/5bf44e982740651ccf1a168ce88b4376519181ee) by 0x26res).
 16 | - Add polars support ([f30da87](https://github.com/tradewelltech/beavers/commit/f30da8779c2a683f2ec2d9607134658ac70d4afb) by aandres3).
 17 | 
 18 | ## [v0.12.1](https://github.com/tradewelltech/beavers/releases/tag/v0.12.1) - 2025-02-03
 19 | 
 20 | <small>[Compare with v0.12.0](https://github.com/tradewelltech/beavers/compare/v0.12.0...v0.12.1)</small>
 21 | 
 22 | ## [v0.12.0](https://github.com/tradewelltech/beavers/releases/tag/v0.12.0) - 2024-11-25
 23 | 
 24 | <small>[Compare with v0.11.0](https://github.com/tradewelltech/beavers/compare/v0.11.0...v0.12.0)</small>
 25 | 
 26 | ### Added
 27 | 
 28 | - Add changelog link ([b84d6e6](https://github.com/tradewelltech/beavers/commit/b84d6e6ef42c590379f9bdd16319b1ecb9978b52) by aandres3).
 29 | 
 30 | 
 31 | ## [v0.11.0](https://github.com/tradewelltech/beavers/releases/tag/v0.11.0) - 2024-11-15
 32 | 
 33 | <small>[Compare with v0.10.0](https://github.com/tradewelltech/beavers/compare/v0.10.0...v0.11.0)</small>
 34 | 
 35 | ### Added
 36 | 
 37 | - Add python 3.13 ([1984bb2](https://github.com/tradewelltech/beavers/commit/1984bb2c7b14126084d5497243418f8bc0123494) by aandres3).
 38 | 
 39 | ### Fixed
 40 | 
 41 | - Fix perspective html (#70) ([ebc090d](https://github.com/tradewelltech/beavers/commit/ebc090d5a9ac7bbf31384a826cf94326426386e0) by 0x26res).
 42 | 
 43 | ## [v0.10.0](https://github.com/tradewelltech/beavers/releases/tag/v0.10.0) - 2024-11-11
 44 | 
 45 | <small>[Compare with v0.9.1](https://github.com/tradewelltech/beavers/compare/v0.9.1...v0.10.0)</small>
 46 | 
 47 | ### Added
 48 | 
 49 | - Add latest version of everything ([f339a52](https://github.com/tradewelltech/beavers/commit/f339a52ac8046e72f64ba4f838259d90b0791a6d) by aandres3).
 50 | 
 51 | ### Fixed
 52 | 
 53 | - Fix wrong offset resolution (#65) ([610bad6](https://github.com/tradewelltech/beavers/commit/610bad6cdadb29014ddc098b79e2ca5df18f1c71) by 0x26res).
 54 | 
 55 | 
 56 | ## [v0.9.1](https://github.com/tradewelltech/beavers/releases/tag/v0.9.1) - 2024-09-20
 57 | 
 58 | <small>[Compare with v0.9.0](https://github.com/tradewelltech/beavers/compare/v0.9.0...v0.9.1)</small>
 59 | 
 60 | ### Fixed
 61 | 
 62 | - Fix following perspective update ([f06f375](https://github.com/tradewelltech/beavers/commit/f06f375028c99017231faf9f5ab78c3f7f4e028e) by aandres).
 63 | 
 64 | ## [v0.9.0](https://github.com/tradewelltech/beavers/releases/tag/v0.9.0) - 2024-07-30
 65 | 
 66 | <small>[Compare with v0.8.0](https://github.com/tradewelltech/beavers/compare/v0.8.0...v0.9.0)</small>
 67 | 
 68 | ### Added
 69 | 
 70 | - Add perspective tools ([07878be](https://github.com/tradewelltech/beavers/commit/07878bec527d6e2523345ca437e6a64b77c47182) by aandres).
 71 | 
 72 | ## [v0.8.0](https://github.com/tradewelltech/beavers/releases/tag/v0.8.0) - 2024-07-01
 73 | 
 74 | <small>[Compare with v0.7.0](https://github.com/tradewelltech/beavers/compare/v0.7.0...v0.8.0)</small>
 75 | 
 76 | ### Added
 77 | 
 78 | - Add constructor to mock consumer ([370d5d6](https://github.com/tradewelltech/beavers/commit/370d5d68eb60662a110026ab7844fc3d9c6bf59b) by aandres).
 79 | - Add log message for resolved offsets ([0816ea3](https://github.com/tradewelltech/beavers/commit/0816ea3bde7ec0b667b3d6b62935ebc2d7228adf) by aandres).
 80 | 
 81 | ### Fixed
 82 | 
 83 | - Fix offset resolution on end of topic ([ff76c35](https://github.com/tradewelltech/beavers/commit/ff76c3519d4ae36040cf138059952c9304bc1b3d) by aandres).
 84 | 
 85 | ## [v0.7.0](https://github.com/tradewelltech/beavers/releases/tag/v0.7.0) - 2024-06-25
 86 | 
 87 | <small>[Compare with v0.6.0](https://github.com/tradewelltech/beavers/compare/v0.6.0...v0.7.0)</small>
 88 | 
 89 | ### Added
 90 | 
 91 | - Add poll time metrics ([efa487a](https://github.com/tradewelltech/beavers/commit/efa487a3e86f7748c160413ccba749e277e1bc5e) by aandres).
 92 | 
 93 | ## [v0.6.0](https://github.com/tradewelltech/beavers/releases/tag/v0.6.0) - 2024-06-24
 94 | 
 95 | <small>[Compare with v0.5.0](https://github.com/tradewelltech/beavers/compare/v0.5.0...v0.6.0)</small>
 96 | 
 97 | ### Added
 98 | 
 99 | - Add some missing replay code (#56) ([9973baa](https://github.com/tradewelltech/beavers/commit/9973baa73fd781656938578f9f0cefe7a283a389) by 0x26res).
100 | - Add contributing and code of conduct guide, update deps (#55) ([3bd1147](https://github.com/tradewelltech/beavers/commit/3bd114724b5f2ac1095b00b8e90a55dd3a7333ab) by 0x26res).
101 | 
102 | ### Fixed
103 | 
104 | - fix: make group optional (#54) ([03d27af](https://github.com/tradewelltech/beavers/commit/03d27af029d95be874a0b6b5e5cbc625945b984b) by 0x26res).
105 | 
106 | ### Changed
107 | 
108 | - Change engine to dag, add talk to the doc ([cd57456](https://github.com/tradewelltech/beavers/commit/cd57456a271f99a81602f7d7d385f0caea84acd2) by aandres).
109 | 
110 | ## [v0.5.0](https://github.com/tradewelltech/beavers/releases/tag/v0.5.0) - 2024-01-23
111 | 
112 | <small>[Compare with v0.4.0](https://github.com/tradewelltech/beavers/compare/v0.4.0...v0.5.0)</small>
113 | 
114 | ### Added
115 | 
116 | - Add python 12 support (#53) ([344ff69](https://github.com/tradewelltech/beavers/commit/344ff69309d81780d9d08effc2fdfe3b1f8d9b22) by 0x26res).
117 | - Add prune ([4e5b06f](https://github.com/tradewelltech/beavers/commit/4e5b06f073c2e210f4cca8d67f096698c52c3fa9) by aandres).
118 | - Add kafka json to arrow support (#50) ([120c116](https://github.com/tradewelltech/beavers/commit/120c116d13ab46604d54088bb07d851ff5d3fd00) by 0x26res).
119 | 
120 | 
121 | ## [v0.4.0](https://github.com/tradewelltech/beavers/releases/tag/v0.4.0) - 2023-11-26
122 | 
123 | <small>[Compare with v0.3.1](https://github.com/tradewelltech/beavers/compare/v0.3.1...v0.4.0)</small>
124 | 
125 | ### Added
126 | 
127 | - Add some arrow replay code ([d8026ec](https://github.com/tradewelltech/beavers/commit/d8026ecf744886b0bb7406814904adb3308ba0b9) by 0x26res).
128 | 
129 | ## [v0.3.1](https://github.com/tradewelltech/beavers/releases/tag/v0.3.1) - 2023-10-26
130 | 
131 | <small>[Compare with v0.3.0](https://github.com/tradewelltech/beavers/compare/v0.3.0...v0.3.1)</small>
132 | ### Added
133 | 
134 | - Add pandas module (#47) ([ac81344](https://github.com/tradewelltech/beavers/commit/ac8134452c3a9636ea5a119e65db87df5a245271) by 0x26res).
135 | 
136 | ## [v0.3.0](https://github.com/tradewelltech/beavers/releases/tag/v0.3.0) - 2023-09-29
137 | 
138 | <small>[Compare with v0.2.0](https://github.com/tradewelltech/beavers/compare/v0.2.0...v0.3.0)</small>
139 | 
140 | ### Added
141 | 
142 | - Add faq, make kafka extra dep, update readme, use poetry in tox. (#44) ([de0ddf5](https://github.com/tradewelltech/beavers/commit/de0ddf5baa51fbf5a9b818364e8a2e589a2b0974) by 0x26res).
143 | - Add pyarrow module (#42) ([1117f37](https://github.com/tradewelltech/beavers/commit/1117f375b36a5eac1468c3a5888f1fdc6e9f1ba7) by 0x26res).
144 | - Add developer page (#41) ([b717b62](https://github.com/tradewelltech/beavers/commit/b717b6224bf9e5fd585ff6b0bed77b3333ad2a68) by 0x26res).
145 | - Add logos ([7f6b1cf](https://github.com/tradewelltech/beavers/commit/7f6b1cfc09453927ede5e485c242311362b1e417) by aandres).
146 | 
147 | ### Fixed
148 | 
149 | - Fix logo (#45) ([f24f0dc](https://github.com/tradewelltech/beavers/commit/f24f0dcb8a911f193aa045da0b6a0f20a69fc64e) by 0x26res).
150 | - Fix tests ([cc52ae6](https://github.com/tradewelltech/beavers/commit/cc52ae6f454d6cf3afd98b6804fd750de5a2eab1) by aandres).
151 | 
152 | ### Changed
153 | 
154 | - change update docs deps (#40) ([04bf706](https://github.com/tradewelltech/beavers/commit/04bf706f9277285b9dac922bb0255402d095da6e) by 0x26res).
155 | 
156 | ## [v0.2.0](https://github.com/tradewelltech/beavers/releases/tag/v0.2.0) - 2023-09-19
157 | 
158 | <small>[Compare with v0.1.0](https://github.com/tradewelltech/beavers/compare/v0.1.0...v0.2.0)</small>
159 | 
160 | ### Added
161 | 
162 | - Add changelog ([7ee7685](https://github.com/tradewelltech/beavers/commit/7ee76853ff4186dc1b7c9449022511a6ad477fbe) by aandres).
163 | - Add empty factory ([ee07562](https://github.com/tradewelltech/beavers/commit/ee0756289d4ed79787e760de4441933afd1aa9d7) by aandres).
164 | - Add offset policies, fix committed ([99c1ad7](https://github.com/tradewelltech/beavers/commit/99c1ad76f6d49f4a641749bdea5ec60e73392507) by aandres).
165 | - Add logging ([c8449ab](https://github.com/tradewelltech/beavers/commit/c8449aba69d18ec070755e1efbd89f083b639289) by aandres).
166 | - Add test script ([077bfc2](https://github.com/tradewelltech/beavers/commit/077bfc278809676e048ba121119e1ec67a97bb5f) by aandres).
167 | - Add kafka doc ([806a471](https://github.com/tradewelltech/beavers/commit/806a47188fa4b2c7234f3059975668142fb3c49b) by aandres).
168 | 
169 | ### Fixed
170 | 
171 | - Fix test, fix coverage ([6f0e371](https://github.com/tradewelltech/beavers/commit/6f0e371916c2ba61147f61adfd5995c32fe63212) by aandres).
172 | - Fix covertage ([9db6eec](https://github.com/tradewelltech/beavers/commit/9db6eec070d4e7783bc6028f85ad468b0b26e7c8) by aandres).
173 | - Fix example ([39f4b44](https://github.com/tradewelltech/beavers/commit/39f4b44f48b2b5efe2761f762e7d85ee256df76d) by aandres).
174 | 
175 | ## [v0.1.0](https://github.com/tradewelltech/beavers/releases/tag/v0.1.0) - 2023-08-24
176 | 
177 | <small>[Compare with v0.0.4](https://github.com/tradewelltech/beavers/compare/v0.0.4...v0.1.0)</small>
178 | 
179 | ## [v0.0.4](https://github.com/tradewelltech/beavers/releases/tag/v0.0.4) - 2023-08-22
180 | 
181 | <small>[Compare with v0.0.3](https://github.com/tradewelltech/beavers/compare/v0.0.3...v0.0.4)</small>
182 | 
183 | ### Added
184 | 
185 | - Add dag metrics ([c46a4ee](https://github.com/tradewelltech/beavers/commit/c46a4eec655984c2525fe094942fd002deeb5645) by aandres).
186 | - Add missing assert ([86b924f](https://github.com/tradewelltech/beavers/commit/86b924f06d78cf3b3a8b98e8137275490b61f815) by aandres).
187 | - Add replay doc ([d5b9b43](https://github.com/tradewelltech/beavers/commit/d5b9b43bd3012e292ad86219c5fd304d3fb11198) by aandres).
188 | - Add repaly metrics ([ba274ef](https://github.com/tradewelltech/beavers/commit/ba274ef7d53cda1e380a7defbd5d4884cf018e4a) by aandres).
189 | - Add test ([8e87c6e](https://github.com/tradewelltech/beavers/commit/8e87c6e8a76b6dadcedf810b0373d12cba7f3309) by aandres).
190 | - Add install section ([520ced1](https://github.com/tradewelltech/beavers/commit/520ced1def5b7508507df6cd65339515680b41fe) by aandres).
191 | 
192 | ### Fixed
193 | 
194 | - Fix equality check on nodes ([fa1a09f](https://github.com/tradewelltech/beavers/commit/fa1a09f300b2dd2c307a09f80b8ab37cfd949ea4) by aandres).
195 | - fix test ([85005d5](https://github.com/tradewelltech/beavers/commit/85005d5abcc82685396c39bcf1618aacf0b8ed75) by aandres).
196 | - Fix tox ([7bef814](https://github.com/tradewelltech/beavers/commit/7bef81471d21b405c5982ca19baf1b7ae345f930) by aandres).
197 | 
198 | ### Removed
199 | 
200 | - Remove dead code ([af932d4](https://github.com/tradewelltech/beavers/commit/af932d41ab86fde774dd77f67070ba98a9977df4) by aandres).
201 | 
202 | ## [v0.0.3](https://github.com/tradewelltech/beavers/releases/tag/v0.0.3) - 2023-07-05
203 | 
204 | <small>[Compare with v0.0.2](https://github.com/tradewelltech/beavers/compare/v0.0.2...v0.0.3)</small>
205 | 
206 | ### Added
207 | 
208 | - Add doc ([cb624c7](https://github.com/tradewelltech/beavers/commit/cb624c706920134d362430b0a094b0c722890e43) by aandres).
209 | - Add kafka ([92c37fb](https://github.com/tradewelltech/beavers/commit/92c37fba76b8c26943327834198a24505d0bea79) by aandres).
210 | 
211 | ### Fixed
212 | 
213 | - Fix kafka test coverage ([ecbc890](https://github.com/tradewelltech/beavers/commit/ecbc890f1adddaf236631e95ccf41ed6002430f3) by aandres).
214 | - Fix icon ([8887278](https://github.com/tradewelltech/beavers/commit/88872786071f882f23335c47721fd53a23771b2e) by aandres).
215 | 
216 | ## [v0.0.2](https://github.com/tradewelltech/beavers/releases/tag/v0.0.2) - 2023-06-30
217 | 
218 | <small>[Compare with v0.0.1](https://github.com/tradewelltech/beavers/compare/v0.0.1...v0.0.2)</small>
219 | 
220 | ### Added
221 | 
222 | - Add advanced concept ([3450d72](https://github.com/tradewelltech/beavers/commit/3450d728872962dff7101189d20a4e81a48d8e2e) by aandres).
223 | - Add concept page, rename stabilize ([9c0b9eb](https://github.com/tradewelltech/beavers/commit/9c0b9eba0bf0bd604e0195530bc25e2fb767509a) by aandres).
224 | - Add doc to main api ([4048ae7](https://github.com/tradewelltech/beavers/commit/4048ae7c29c56ffa789b3c1c4f7a3c53aba44a75) by aandres).
225 | - Add const test ([e1af0bd](https://github.com/tradewelltech/beavers/commit/e1af0bdf61d144e76c029421a274433f6967df4c) by aandres).
226 | - Add hook for pydoc ([ad10948](https://github.com/tradewelltech/beavers/commit/ad109481ff06ea4ae26acd3e1279fc056fd5ee54) by aandres).
227 | - Add replay ([c807bef](https://github.com/tradewelltech/beavers/commit/c807bef6354573124d410e13c85450d0cdacf681) by aandres).
228 | - Add ETF example ([e3c4c2e](https://github.com/tradewelltech/beavers/commit/e3c4c2e9f3423e814d47c1dc40e182c88f05c9ba) by aandres).
229 | 
230 | ### Fixed
231 | 
232 | - fix typos ([8df6f74](https://github.com/tradewelltech/beavers/commit/8df6f7412ae96a6cbe55b1941d6475d3754fc0de) by aandres).
233 | - Fix coverage ([fafaa9a](https://github.com/tradewelltech/beavers/commit/fafaa9a49c4c038094058ab8f99346c9e45e9dde) by aandres).
234 | - Fix test coverage ([41938e9](https://github.com/tradewelltech/beavers/commit/41938e9c0c558d56cb89144fb539d00cf85254cf) by aandres).
235 | - Fix ci ([9c46069](https://github.com/tradewelltech/beavers/commit/9c46069ce380cc59a5c53aa1743a9f369d7283bf) by aandres).
236 | 
237 | ### Removed
238 | 
239 | - Remove trailing blank space ([22195ca](https://github.com/tradewelltech/beavers/commit/22195ca075c77deef92f0a7ea00025f0f1a71561) by aandres).
240 | 
241 | ## [v0.0.1](https://github.com/tradewelltech/beavers/releases/tag/v0.0.1) - 2023-05-10
242 | 
243 | <small>[Compare with v0.0.1.rc](https://github.com/tradewelltech/beavers/compare/v0.0.1.rc...v0.0.1)</small>
244 | 
245 | ### Added
246 | 
247 | - Add ci badge ([fdad06c](https://github.com/tradewelltech/beavers/commit/fdad06ca65ed1135d052c4e9e4a13e48b50cdabe) by aandres).
248 | - Add material ([31a46e4](https://github.com/tradewelltech/beavers/commit/31a46e4f5e39824736064fcc13d7fea600be5ac9) by aandres).
249 | - Add python doc requirements ([e1bcd00](https://github.com/tradewelltech/beavers/commit/e1bcd00aba018dba8e16d781b0f6ca9e783105c0) by aandres).
250 | - Add docs ([3c1e87a](https://github.com/tradewelltech/beavers/commit/3c1e87aa14d3d132189d7d5a3bbe66e6df0a57c5) by aandres).
251 | - Add coverage to deps ([ced0670](https://github.com/tradewelltech/beavers/commit/ced0670226f4ef43539efa75b1e6e455efda1df2) by aandres).
252 | 
253 | ### Fixed
254 | 
255 | - Fix branch ([9847cb9](https://github.com/tradewelltech/beavers/commit/9847cb9b4fd4d59c3060318805caeffbe8582cf7) by aandres).
256 | - Fix read the docs ([ecf5d25](https://github.com/tradewelltech/beavers/commit/ecf5d25cefe8be2c0448913d4f1ef100753a644a) by aandres).
257 | 
258 | ### Removed
259 | 
260 | - Remove duplicate and snyk ([b7e8539](https://github.com/tradewelltech/beavers/commit/b7e8539a682162de0fff1a9b6a5f55ca5f550da2) by aandres).
261 | 
262 | ## [v0.0.1.rc](https://github.com/tradewelltech/beavers/releases/tag/v0.0.1.rc) - 2023-05-09
263 | 
264 | <small>[Compare with first commit](https://github.com/tradewelltech/beavers/compare/1cc83cb780e53ef55308100c655c321dcc945d3b...v0.0.1.rc)</small>
265 | 
266 | ### Added
267 | 
268 | - add pre commit ([12d7ffa](https://github.com/tradewelltech/beavers/commit/12d7ffa203c8c88cbb68f683fc2d992960e170fe) by aandres).
269 | - Add engine code ([e2f0949](https://github.com/tradewelltech/beavers/commit/e2f0949dd5dc69692455c7564c5f6bcfd997754d) by aandres).
270 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # Contributor Covenant Code of Conduct
  3 | 
  4 | ## Our Pledge
  5 | 
  6 | We as members, contributors, and leaders pledge to make participation in our
  7 | community a harassment-free experience for everyone, regardless of age, body
  8 | size, visible or invisible disability, ethnicity, sex characteristics, gender
  9 | identity and expression, level of experience, education, socio-economic status,
 10 | nationality, personal appearance, race, religion, or sexual identity
 11 | and orientation.
 12 | 
 13 | We pledge to act and interact in ways that contribute to an open, welcoming,
 14 | diverse, inclusive, and healthy community.
 15 | 
 16 | ## Our Standards
 17 | 
 18 | Examples of behavior that contributes to a positive environment for our
 19 | community include:
 20 | 
 21 | * Demonstrating empathy and kindness toward other people
 22 | * Being respectful of differing opinions, viewpoints, and experiences
 23 | * Giving and gracefully accepting constructive feedback
 24 | * Accepting responsibility and apologizing to those affected by our mistakes,
 25 |   and learning from the experience
 26 | * Focusing on what is best not just for us as individuals, but for the
 27 |   overall community
 28 | 
 29 | Examples of unacceptable behavior include:
 30 | 
 31 | * The use of sexualized language or imagery, and sexual attention or
 32 |   advances of any kind
 33 | * Trolling, insulting or derogatory comments, and personal or political attacks
 34 | * Public or private harassment
 35 | * Publishing others' private information, such as a physical or email
 36 |   address, without their explicit permission
 37 | * Other conduct which could reasonably be considered inappropriate in a
 38 |   professional setting
 39 | 
 40 | ## Enforcement Responsibilities
 41 | 
 42 | Community leaders are responsible for clarifying and enforcing our standards of
 43 | acceptable behavior and will take appropriate and fair corrective action in
 44 | response to any behavior that they deem inappropriate, threatening, offensive,
 45 | or harmful.
 46 | 
 47 | Community leaders have the right and responsibility to remove, edit, or reject
 48 | comments, commits, code, wiki edits, issues, and other contributions that are
 49 | not aligned to this Code of Conduct, and will communicate reasons for moderation
 50 | decisions when appropriate.
 51 | 
 52 | ## Scope
 53 | 
 54 | This Code of Conduct applies within all community spaces, and also applies when
 55 | an individual is officially representing the community in public spaces.
 56 | Examples of representing our community include using an official email address,
 57 | posting via an official social media account, or acting as an appointed
 58 | representative at an online or offline event.
 59 | 
 60 | ## Enforcement
 61 | 
 62 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
 63 | reported to the community leaders responsible for enforcement at
 64 | [INSERT CONTACT METHOD].
 65 | All complaints will be reviewed and investigated promptly and fairly.
 66 | 
 67 | All community leaders are obligated to respect the privacy and security of the
 68 | reporter of any incident.
 69 | 
 70 | ## Enforcement Guidelines
 71 | 
 72 | Community leaders will follow these Community Impact Guidelines in determining
 73 | the consequences for any action they deem in violation of this Code of Conduct:
 74 | 
 75 | ### 1. Correction
 76 | 
 77 | **Community Impact**: Use of inappropriate language or other behavior deemed
 78 | unprofessional or unwelcome in the community.
 79 | 
 80 | **Consequence**: A private, written warning from community leaders, providing
 81 | clarity around the nature of the violation and an explanation of why the
 82 | behavior was inappropriate. A public apology may be requested.
 83 | 
 84 | ### 2. Warning
 85 | 
 86 | **Community Impact**: A violation through a single incident or series
 87 | of actions.
 88 | 
 89 | **Consequence**: A warning with consequences for continued behavior. No
 90 | interaction with the people involved, including unsolicited interaction with
 91 | those enforcing the Code of Conduct, for a specified period of time. This
 92 | includes avoiding interactions in community spaces as well as external channels
 93 | like social media. Violating these terms may lead to a temporary or
 94 | permanent ban.
 95 | 
 96 | ### 3. Temporary Ban
 97 | 
 98 | **Community Impact**: A serious violation of community standards, including
 99 | sustained inappropriate behavior.
100 | 
101 | **Consequence**: A temporary ban from any sort of interaction or public
102 | communication with the community for a specified period of time. No public or
103 | private interaction with the people involved, including unsolicited interaction
104 | with those enforcing the Code of Conduct, is allowed during this period.
105 | Violating these terms may lead to a permanent ban.
106 | 
107 | ### 4. Permanent Ban
108 | 
109 | **Community Impact**: Demonstrating a pattern of violation of community
110 | standards, including sustained inappropriate behavior,  harassment of an
111 | individual, or aggression toward or disparagement of classes of individuals.
112 | 
113 | **Consequence**: A permanent ban from any sort of public interaction within
114 | the community.
115 | 
116 | ## Attribution
117 | 
118 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
119 | version 2.0, available at
120 | [https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0].
121 | 
122 | Community Impact Guidelines were inspired by 
123 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
124 | 
125 | For answers to common questions about this code of conduct, see the FAQ at
126 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available 
127 | at [https://www.contributor-covenant.org/translations][translations].
128 | 
129 | [homepage]: https://www.contributor-covenant.org
130 | [v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html
131 | [Mozilla CoC]: https://github.com/mozilla/diversity
132 | [FAQ]: https://www.contributor-covenant.org/faq
133 | [translations]: https://www.contributor-covenant.org/translations
134 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to Beavers
2 | 
3 | See the [contributing](https://beavers.readthedocs.io/en/latest/contributing/) section of the doc.
4 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | [![PyPI Version][pypi-image]][pypi-url]
 3 | [![Python Version][versions-image]][versions-url]
 4 | [![Github Stars][stars-image]][stars-url]
 5 | [![codecov][codecov-image]][codecov-url]
 6 | [![Build Status][build-image]][build-url]
 7 | [![Documentation][doc-image]][doc-url]
 8 | [![License][license-image]][license-url]
 9 | [![Downloads][downloads-image]][downloads-url]
10 | [![Downloads][downloads-month-image]][downloads-month-url]
11 | [![Code style: black][codestyle-image]][codestyle-url]
12 | [![snyk][snyk-image]][snyk-url]
13 | <a href="https://trackgit.com">
14 | <img src="https://us-central1-trackgit-analytics.cloudfunctions.net/token/ping/m7c1fpbnueo78pkcetcm" alt="trackgit-views" />
15 | </a>
16 | 
17 | ![Beavers Logo][5]
18 | 
19 | # Beavers
20 | 
21 | [Documentation][6] / [Installation][7] / [Repository][1] / [PyPI][8]
22 | 
23 | [Beavers][1] is a python library for stream processing, optimized for analytics.
24 | 
25 | It is used at [Tradewell Technologies][2],
26 | to calculate analytics and serve model predictions,
27 | for both realtime and batch jobs.
28 | 
29 | ## Key Features
30 | 
31 | - Works in **real time** (eg: reading from Kafka) and **replay mode** (eg: reading from Parquet files).
32 | - Optimized for analytics, using micro-batches (instead of processing records one by one).
33 | - Similar to [incremental][3], it updates nodes in a dag incrementally.
34 | - Taking inspiration from [kafka streams][4], there are two types of nodes in the dag:
35 |     - **Stream**: ephemeral micro-batches of events (cleared after every cycle).
36 |     - **State**: durable state derived from streams.
37 | - Clear separation between the business logic and the IO.
38 |   So the same dag can be used in real time mode, replay mode or can be easily tested.
39 | - Functional interface: no inheritance or decorator required.
40 | - Support for complicated joins, not just "linear" data flow.
41 | 
42 | ## Limitations
43 | 
44 | - No concurrency support.
45 |   To speed up calculation use libraries like pandas, pyarrow or polars.
46 | - No async code.
47 |   To speed up IO use kafka driver native thread or parquet IO thread pool.
48 | - No support for persistent state.
49 |   Instead of saving state, replay historic data from kafka to prime stateful nodes.
50 | 
51 | ## Talks
52 | 
53 | - [Unified batch and stream processing in python | PyData Global 2023][9]
54 | 
55 | [1]: https://github.com/tradewelltech/beavers
56 | [2]: https://www.tradewelltech.co/
57 | [3]: https://github.com/janestreet/incremental
58 | [4]: https://www.confluent.io/blog/kafka-streams-tables-part-1-event-streaming/
59 | [5]: https://raw.githubusercontent.com/tradewelltech/beavers/master/docs/static/icons/beavers/logo.svg
60 | [6]: https://beavers.readthedocs.io/en/latest/
61 | [7]: https://beavers.readthedocs.io/en/latest/install/
62 | [8]: https://pypi.org/project/beavers/
63 | [9]: https://www.youtube.com/watch?v=8pUwsGA8SQM
64 | 
65 | [pypi-image]: https://img.shields.io/pypi/v/beavers
66 | [pypi-url]: https://pypi.org/project/beavers/
67 | [build-image]: https://github.com/tradewelltech/beavers/actions/workflows/ci.yaml/badge.svg
68 | [build-url]: https://github.com/tradewelltech/beavers/actions/workflows/ci.yaml
69 | [stars-image]: https://img.shields.io/github/stars/tradewelltech/beavers
70 | [stars-url]: https://github.com/tradewelltech/beavers
71 | [versions-image]: https://img.shields.io/pypi/pyversions/beavers
72 | [versions-url]: https://pypi.org/project/beavers/
73 | [doc-image]: https://readthedocs.org/projects/beavers/badge/?version=latest
74 | [doc-url]: https://beavers.readthedocs.io/en/latest/?badge=latest
75 | [license-image]: http://img.shields.io/:license-Apache%202-blue.svg
76 | [license-url]: https://github.com/tradewelltech/beavers/blob/main/LICENSE
77 | [codecov-image]: https://codecov.io/gh/tradewelltech/beavers/branch/main/graph/badge.svg?token=GY6KL7NT1Q
78 | [codecov-url]: https://codecov.io/gh/tradewelltech/beavers
79 | [downloads-image]: https://pepy.tech/badge/beavers
80 | [downloads-url]: https://static.pepy.tech/badge/beavers
81 | [downloads-month-image]: https://pepy.tech/badge/beavers/month
82 | [downloads-month-url]: https://static.pepy.tech/badge/beavers/month
83 | [codestyle-image]: https://img.shields.io/badge/code%20style-black-000000.svg
84 | [codestyle-url]: https://github.com/ambv/black
85 | [snyk-image]: https://snyk.io/advisor/python/beavers/badge.svg
86 | [snyk-url]: https://snyk.io/advisor/python/beavers
87 | 


--------------------------------------------------------------------------------
/beavers/__init__.py:
--------------------------------------------------------------------------------
1 | from beavers.dag import Dag, Node, TimerManager
2 | 
3 | __version__ = "0.0.0"
4 | __all__ = ["Dag", "Node", "TimerManager"]
5 | 


--------------------------------------------------------------------------------
/beavers/assets/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tradewelltech/beavers/ec9979086868589ab82b47ce55fa11cc31b32c16/beavers/assets/favicon.ico


--------------------------------------------------------------------------------
/beavers/pandas_wrapper.py:
--------------------------------------------------------------------------------
 1 | """Module for building dags using pandas."""
 2 | 
 3 | import dataclasses
 4 | from typing import Callable, Optional, ParamSpec
 5 | 
 6 | import pandas as pd
 7 | 
 8 | from beavers import Dag, Node
 9 | from beavers.dag import NodePrototype
10 | 
11 | P = ParamSpec("P")
12 | 
13 | 
14 | def _empty_df(dtypes: pd.Series) -> pd.DataFrame:
15 |     return pd.DataFrame(columns=dtypes.index).astype(dtypes)
16 | 
17 | 
18 | def _get_stream_dtypes(node: Node[pd.DataFrame]) -> pd.Series:
19 |     empty = node._get_empty()
20 |     if not isinstance(empty, pd.DataFrame):
21 |         raise TypeError(f"Argument should be a {Node.__name__}[pd.DataFrame]")
22 |     else:
23 |         return empty.dtypes
24 | 
25 | 
26 | @dataclasses.dataclass()
27 | class _LastTracker:
28 |     key_columns: list[str]
29 |     current: pd.DataFrame
30 | 
31 |     def __call__(self, stream: pd.DataFrame):
32 |         self.current = (
33 |             pd.concat([self.current, stream])
34 |             .groupby(self.key_columns, as_index=False)
35 |             .tail(1)
36 |             .reset_index(drop=True)
37 |         )
38 | 
39 |         return self.current
40 | 
41 | 
42 | @dataclasses.dataclass(frozen=True)
43 | class PandasWrapper:
44 |     """Helper call for adding pandas Nodes to a Dag."""
45 | 
46 |     _dag: Dag
47 | 
48 |     def source_df(
49 |         self, dtypes: pd.Series, name: Optional[str] = None
50 |     ) -> Node[pd.DataFrame]:
51 |         empty = _empty_df(dtypes)
52 |         return self._dag.source_stream(empty, name=name)
53 | 
54 |     def df_stream(
55 |         self, function: Callable[P, pd.DataFrame], dtypes: pd.Series
56 |     ) -> NodePrototype[pd.DataFrame]:
57 |         return self._dag.stream(function, empty=_empty_df(dtypes))
58 | 
59 |     def last_by_keys(
60 |         self, stream: Node[pd.DataFrame], keys: list[str]
61 |     ) -> Node[pd.DataFrame]:
62 |         """Build a state of the latest row by keys."""
63 |         dtypes = _get_stream_dtypes(stream)
64 |         for key in keys:
65 |             assert key in dtypes, key
66 |         return self._dag.state(_LastTracker(keys, _empty_df(dtypes))).map(stream)
67 | 


--------------------------------------------------------------------------------
/beavers/perspective_wrapper.py:
--------------------------------------------------------------------------------
  1 | import dataclasses
  2 | import pathlib
  3 | from typing import Any, Literal, Optional, Sequence
  4 | 
  5 | import perspective
  6 | import pyarrow as pa
  7 | import tornado
  8 | from perspective.handlers.tornado import PerspectiveTornadoHandler
  9 | 
 10 | from beavers import Dag, Node
 11 | from beavers.kafka import KafkaDriver
 12 | 
 13 | COMPARATORS = (
 14 |     "==",
 15 |     "!=",
 16 |     ">",
 17 |     ">=",
 18 |     "<",
 19 |     "<=",
 20 |     "begins with",
 21 |     "contains",
 22 |     "ends with",
 23 |     "in",
 24 |     "not in",
 25 |     "is not null",
 26 |     "is null",
 27 | )
 28 | 
 29 | _SOURCE_DIRECTORY = pathlib.Path(__file__).parent
 30 | TABLE_PATH = str(_SOURCE_DIRECTORY / "table.html")
 31 | ASSETS_DIRECTORY = str(_SOURCE_DIRECTORY / "assets")
 32 | 
 33 | 
 34 | @dataclasses.dataclass(frozen=True)
 35 | class PerspectiveTableDefinition:
 36 |     """
 37 |     API table definition
 38 |     """
 39 | 
 40 |     name: str
 41 |     index_column: str
 42 |     remove_column: Optional[str] = None
 43 |     sort: list[tuple[str, Literal["asc", "desc"]]] = dataclasses.field(
 44 |         default_factory=list
 45 |     )
 46 |     filters: list[tuple[str, str, Any]] = dataclasses.field(default_factory=list)
 47 |     hidden_columns: Sequence[str] = ()
 48 |     limit: Optional[int] = None
 49 | 
 50 |     def validate(self, schema: pa.Schema):
 51 |         assert self.index_column in schema.names, self.index_column
 52 |         if self.remove_column is not None:
 53 |             assert isinstance(self.remove_column, str)
 54 |             assert self.remove_column in schema.names, self.remove_column
 55 | 
 56 |         assert isinstance(self.sort, list)
 57 |         for column, order in self.sort:
 58 |             assert isinstance(column, str)
 59 |             assert column in schema.names
 60 |             assert order in ("asc", "desc")
 61 |         for column in self.hidden_columns:
 62 |             assert isinstance(column, str)
 63 |             assert column in schema.names
 64 |         for each_filter in self.filters:
 65 |             assert len(each_filter) in (2, 3)
 66 |             assert isinstance(each_filter[0], str), each_filter
 67 |             assert each_filter[1] in COMPARATORS
 68 | 
 69 | 
 70 | @dataclasses.dataclass(frozen=True)
 71 | class _TableConfig:
 72 |     """
 73 |     Internal perspective table config, which is passed to the html template
 74 |     """
 75 | 
 76 |     name: str
 77 |     index: str
 78 |     columns: list[str]
 79 |     sort: Sequence[tuple[str, Literal["asc", "desc"]]]
 80 |     filters: Sequence[tuple[str, str, Any]]
 81 | 
 82 |     @staticmethod
 83 |     def from_definition(definition: PerspectiveTableDefinition, schema: pa.Schema):
 84 |         return _TableConfig(
 85 |             name=definition.name,
 86 |             index=definition.index_column,
 87 |             columns=[f for f in schema.names if f not in definition.hidden_columns],
 88 |             sort=[] if definition.sort is None else definition.sort,
 89 |             filters=definition.filters,
 90 |         )
 91 | 
 92 | 
 93 | class TableRequestHandler(tornado.web.RequestHandler):
 94 |     """Renders the table.html template, using the provided configurations"""
 95 | 
 96 |     _tables: Optional[dict[str, _TableConfig]] = None
 97 |     _default_table: Optional[str] = None
 98 | 
 99 |     def initialize(self, table_configs: list[_TableConfig]) -> None:
100 |         self._tables = {
101 |             table_config.name: table_config for table_config in table_configs
102 |         }
103 |         self._default_table = table_configs[0].name
104 | 
105 |     async def get(self, path: str) -> None:
106 |         table_name = path or self._default_table
107 |         table_config = self._tables[table_name]
108 | 
109 |         await self.render(
110 |             TABLE_PATH,
111 |             table_config=table_config,
112 |             perspective_version=perspective.__version__,
113 |         )
114 | 
115 | 
116 | def _table_to_bytes(table: pa.Table) -> bytes:
117 |     """Serialize a table as bytes, to pass it to a perspective table"""
118 |     with pa.BufferOutputStream() as sink:
119 |         with pa.ipc.new_stream(sink, table.schema) as writer:
120 |             for batch in table.to_batches():
121 |                 writer.write_batch(batch)
122 |         return sink.getvalue().to_pybytes()
123 | 
124 | 
125 | @dataclasses.dataclass(frozen=True)
126 | class _UpdateRunner:
127 |     kafka_driver: KafkaDriver
128 | 
129 |     def __call__(self):
130 |         self.kafka_driver.run_cycle(0.0)
131 | 
132 | 
133 | @dataclasses.dataclass()
134 | class _PerspectiveNode:
135 |     table_definition: PerspectiveTableDefinition
136 |     schema: pa.Schema
137 |     table: perspective.Table | None = None
138 | 
139 |     def __call__(self, table: pa.Table) -> None:
140 |         """Pass the arrow data to perspective"""
141 |         self.table.update(_table_to_bytes(table))
142 | 
143 |     def get_table_config(self) -> _TableConfig:
144 |         return _TableConfig.from_definition(self.table_definition, self.schema)
145 | 
146 | 
147 | @dataclasses.dataclass(frozen=True)
148 | class PerspectiveDagWrapper:
149 |     """Helper for adding perspective Nodes to a Dag."""
150 | 
151 |     _dag: Dag
152 | 
153 |     def to_perspective(
154 |         self,
155 |         node: Node,
156 |         table_definition: PerspectiveTableDefinition,
157 |         schema: Optional[pa.Schema] = None,
158 |     ) -> None:
159 |         """Add a source stream of type `pa.Table`."""
160 |         if schema is None:
161 |             assert node._is_stream(), "Must provide a schema for state nodes"
162 |             empty = node._empty_factory()
163 |             assert isinstance(empty, pa.Table), "Only pyarrow.Table nodes supported"
164 |             schema = empty.schema
165 |         table_definition.validate(schema)
166 |         self._dag.state(
167 |             _PerspectiveNode(
168 |                 table_definition,
169 |                 schema,
170 |                 table=None,
171 |             )
172 |         ).map(node)
173 | 
174 | 
175 | DATA_TYPES = [
176 |     (pa.types.is_integer, "integer"),
177 |     (pa.types.is_floating, "float"),
178 |     (pa.types.is_boolean, "boolean"),
179 |     (pa.types.is_date, "date"),
180 |     (pa.types.is_string, "string"),
181 |     (pa.types.is_timestamp, "datetime"),
182 | ]
183 | 
184 | 
185 | def to_perspective_type(data_type: pa.DataType) -> Any:
186 |     for predicate, perspective_type in DATA_TYPES:
187 |         if predicate(data_type):
188 |             return perspective_type
189 |     raise TypeError(f"Unsupported type: {data_type}")
190 | 
191 | 
192 | def to_perspective_schema(schema: pa.Schema) -> dict[str, Any]:
193 |     return {f.name: to_perspective_type(f.type) for f in schema}
194 | 
195 | 
196 | def perspective_thread(
197 |     perspective_server: perspective.Server,
198 |     kafka_driver: KafkaDriver,
199 |     nodes: list[_PerspectiveNode],
200 | ):
201 |     local_client = perspective_server.new_local_client()
202 |     for node in nodes:
203 |         assert node.table is None
204 |         node.table = local_client.table(
205 |             to_perspective_schema(node.schema),
206 |             name=node.table_definition.name,
207 |             index=node.table_definition.index_column,
208 |         )
209 | 
210 |     callback = tornado.ioloop.PeriodicCallback(
211 |         callback=_UpdateRunner(kafka_driver), callback_time=1_000
212 |     )
213 |     callback.start()
214 | 
215 | 
216 | def run_web_application(
217 |     kafka_driver: KafkaDriver,
218 |     assets_directory: str = ASSETS_DIRECTORY,
219 |     port: int = 8082,
220 | ) -> None:
221 |     server = perspective.Server()
222 | 
223 |     nodes: list[_PerspectiveNode] = []
224 |     for node in kafka_driver._dag._nodes:
225 |         if isinstance(node._function, _PerspectiveNode):
226 |             nodes.append(node._function)
227 |     assert len(nodes) > 0, "No perspective table nodes"
228 |     assert len({n.table_definition.name for n in nodes}) == len(nodes), (
229 |         "Duplicate table name"
230 |     )
231 | 
232 |     web_app = tornado.web.Application(
233 |         [
234 |             (
235 |                 r"/websocket",
236 |                 PerspectiveTornadoHandler,
237 |                 {"perspective_server": server},
238 |             ),
239 |             (
240 |                 r"/assets/(.*)",
241 |                 tornado.web.StaticFileHandler,
242 |                 {"path": assets_directory, "default_filename": None},
243 |             ),
244 |             (
245 |                 r"/([a-z0-9_]*)",
246 |                 TableRequestHandler,
247 |                 {"table_configs": [node.get_table_config() for node in nodes]},
248 |             ),
249 |         ],
250 |         serve_traceback=True,
251 |     )
252 |     web_app.listen(port)
253 |     loop = tornado.ioloop.IOLoop.current()
254 |     loop.call_later(0, perspective_thread, server, kafka_driver, nodes)
255 |     loop.start()
256 | 


--------------------------------------------------------------------------------
/beavers/polars_wrapper.py:
--------------------------------------------------------------------------------
  1 | """Module for building dags using polars."""
  2 | 
  3 | import dataclasses
  4 | from operator import itemgetter
  5 | from typing import Callable, Optional, ParamSpec, Iterable, Any
  6 | 
  7 | import polars as pl
  8 | from polars._typing import IntoExprColumn
  9 | 
 10 | from beavers.dag import Dag, Node, NodePrototype
 11 | 
 12 | P = ParamSpec("P")
 13 | 
 14 | 
 15 | @dataclasses.dataclass()
 16 | class _LastByKey:
 17 |     key_columns: tuple[str, ...]
 18 |     current: pl.DataFrame
 19 | 
 20 |     def __call__(self, stream: pl.DataFrame) -> pl.DataFrame:
 21 |         self.current = (
 22 |             pl.concat([self.current, stream])
 23 |             .group_by(self.key_columns, maintain_order=True)
 24 |             .last()
 25 |             .select(self.current.columns)
 26 |         )
 27 |         return self.current
 28 | 
 29 | 
 30 | def _get_stream_schema(node: Node[pl.DataFrame]) -> pl.Schema:
 31 |     empty = node._get_empty()
 32 |     if not isinstance(empty, pl.DataFrame):
 33 |         raise TypeError(f"Argument should be a {Node.__name__}[pl.DataFrame]")
 34 |     else:
 35 |         return empty.schema
 36 | 
 37 | 
 38 | def _get_stream_dtype(node: Node[pl.Series]) -> pl.DataType:
 39 |     empty = node._get_empty()
 40 |     if not isinstance(empty, pl.Series):
 41 |         raise TypeError(f"Argument should be a {Node.__name__}[pl.Series]")
 42 |     else:
 43 |         return empty.dtype
 44 | 
 45 | 
 46 | @dataclasses.dataclass(frozen=True)
 47 | class _TableFilter:
 48 |     predicate: tuple[IntoExprColumn | Iterable[IntoExprColumn], ...]
 49 |     constraints: dict[str, Any]
 50 | 
 51 |     def __call__(self, table: pl.DataFrame) -> pl.DataFrame:
 52 |         return table.filter(*self.predicate, **self.constraints)
 53 | 
 54 | 
 55 | @dataclasses.dataclass(frozen=True)
 56 | class PolarsDagWrapper:
 57 |     """Helper for adding polars Nodes to a Dag."""
 58 | 
 59 |     _dag: Dag
 60 | 
 61 |     def source_table(
 62 |         self, schema: pl.Schema, name: Optional[str] = None
 63 |     ) -> Node[pl.DataFrame]:
 64 |         """Add a source stream of type `pl.DataFrame`."""
 65 | 
 66 |         return self._dag.source_stream(empty=schema.to_frame(), name=name)
 67 | 
 68 |     def table_stream(
 69 |         self, function: Callable[P, pl.DataFrame], schema: pl.Schema
 70 |     ) -> NodePrototype[pl.DataFrame]:
 71 |         """Add a stream node of output type `pl.DataFrame`"""
 72 |         return self._dag.stream(function, empty=schema.to_frame())
 73 | 
 74 |     def filter_stream(
 75 |         self,
 76 |         stream: Node[pl.DataFrame],
 77 |         *predicates: IntoExprColumn | Iterable[IntoExprColumn],
 78 |         **constraints: Any,
 79 |     ) -> Node[pl.DataFrame]:
 80 |         """Filter a stream Node of type `pl.DataFrame`."""
 81 |         schema = _get_stream_schema(stream)
 82 |         return self._dag.stream(
 83 |             _TableFilter(tuple(predicates), dict(constraints)),
 84 |             empty=schema.to_frame(),
 85 |         ).map(stream)
 86 | 
 87 |     def last_by_keys(
 88 |         self, stream: Node[pl.DataFrame], keys: list[str]
 89 |     ) -> Node[pl.DataFrame]:
 90 |         """Build a state of the latest row by keys."""
 91 |         schema = _get_stream_schema(stream)
 92 |         for key in keys:
 93 |             assert isinstance(key, str), "Keys must be strings"
 94 |         return self._dag.state(_LastByKey(tuple(keys), schema.to_frame())).map(stream)
 95 | 
 96 |     def concat_series(self, *streams: Node[pl.Series]) -> Node[pl.Series]:
 97 |         if len(streams) == 0:
 98 |             raise ValueError("Must pass at least one series")
 99 |         series_type = None
100 |         for stream in streams:
101 |             each_type = _get_stream_dtype(stream)
102 |             if series_type is None:
103 |                 series_type = each_type
104 |             elif series_type != each_type:
105 |                 raise TypeError(f"Series type mismatch {series_type} vs {each_type}")
106 | 
107 |         empty = pl.Series(dtype=series_type)
108 |         return self._dag.stream(lambda *x: pl.concat(x), empty=empty).map(*streams)
109 | 
110 |     def get_series(self, stream: Node[pl.DataFrame], name: str) -> Node[pl.Series]:
111 |         empty = _get_stream_schema(stream).to_frame()[name]
112 |         return self._dag.stream(itemgetter(name), empty=empty).map(stream)
113 | 


--------------------------------------------------------------------------------
/beavers/pyarrow_kafka.py:
--------------------------------------------------------------------------------
 1 | import dataclasses
 2 | import io
 3 | import json
 4 | 
 5 | import confluent_kafka
 6 | import pyarrow as pa
 7 | import pyarrow.json
 8 | 
 9 | from beavers.kafka import (
10 |     KafkaMessageDeserializer,
11 |     KafkaMessageSerializer,
12 |     KafkaProducerMessage,
13 | )
14 | 
15 | 
16 | @dataclasses.dataclass(frozen=True)
17 | class JsonDeserializer(KafkaMessageDeserializer[pa.Table]):
18 |     schema: pa.Schema
19 | 
20 |     def __call__(self, messages: confluent_kafka.Message) -> pa.Table:
21 |         if messages:
22 |             with io.BytesIO() as buffer:
23 |                 for message in messages:
24 |                     buffer.write(message.value())
25 |                     buffer.write(b"\n")
26 |                 buffer.seek(0)
27 |                 return pyarrow.json.read_json(
28 |                     buffer,
29 |                     parse_options=pyarrow.json.ParseOptions(
30 |                         explicit_schema=self.schema
31 |                     ),
32 |                 )
33 |         else:
34 |             return self.schema.empty_table()
35 | 
36 | 
37 | @dataclasses.dataclass(frozen=True)
38 | class JsonSerializer(KafkaMessageSerializer[pa.Table]):
39 |     topic: str
40 | 
41 |     def __call__(self, table: pa.Table):
42 |         return [
43 |             KafkaProducerMessage(
44 |                 self.topic,
45 |                 key=None,
46 |                 value=json.dumps(message, default=str).encode("utf-8"),
47 |             )
48 |             for message in table.to_pylist()
49 |         ]
50 | 


--------------------------------------------------------------------------------
/beavers/pyarrow_replay.py:
--------------------------------------------------------------------------------
 1 | import dataclasses
 2 | from typing import Callable
 3 | 
 4 | import pandas as pd
 5 | import pyarrow as pa
 6 | 
 7 | from beavers.dag import UTC_MAX
 8 | from beavers.replay import DataSink, DataSource
 9 | 
10 | 
11 | class ArrowTableDataSource(DataSource[pa.Table]):
12 |     def __init__(
13 |         self, table: pa.Table, timestamp_extractor: Callable[[pa.Table], pa.Array]
14 |     ):
15 |         assert callable(timestamp_extractor)
16 |         self._table = table
17 |         self._empty_table = table.schema.empty_table()
18 |         self._timestamp_column = timestamp_extractor(table).to_pandas(
19 |             date_as_object=False
20 |         )
21 |         assert self._timestamp_column.is_monotonic_increasing, (
22 |             "Timestamp column should be monotonic increasing"
23 |         )
24 |         self._index = 0
25 | 
26 |     def read_to(self, timestamp: pd.Timestamp) -> pa.Table:
27 |         new_index = self._timestamp_column.searchsorted(timestamp, side="right")
28 |         if new_index > self._index:
29 |             from_index = self._index
30 |             self._index = new_index
31 |             return self._table.slice(from_index, new_index - from_index)
32 |         else:
33 |             results = self._empty_table
34 |         return results
35 | 
36 |     def get_next(self) -> pd.Timestamp:
37 |         if self._index >= len(self._table):
38 |             return UTC_MAX
39 |         else:
40 |             return self._timestamp_column.iloc[self._index]
41 | 
42 | 
43 | @dataclasses.dataclass
44 | class ArrowTableDataSink(DataSink[pa.Table]):
45 |     saver: Callable[[pa.Table], None]
46 |     chunks: list[pa.Table] = dataclasses.field(default_factory=list)
47 | 
48 |     def append(self, timestamp: pd.Timestamp, data: pa.Table):
49 |         self.chunks.append(data)
50 | 
51 |     def close(self):
52 |         if self.chunks:
53 |             results = pa.concat_tables(self.chunks)
54 |             self.saver(results)
55 | 


--------------------------------------------------------------------------------
/beavers/pyarrow_wrapper.py:
--------------------------------------------------------------------------------
  1 | """Module for building dags using pyarrow."""
  2 | 
  3 | import dataclasses
  4 | from typing import Callable, Iterable, Optional, ParamSpec, Sequence
  5 | 
  6 | import numpy as np
  7 | import pyarrow as pa
  8 | 
  9 | from beavers.dag import Dag, Node, NodePrototype, _check_function
 10 | 
 11 | P = ParamSpec("P")
 12 | 
 13 | 
 14 | @dataclasses.dataclass(frozen=True)
 15 | class _TableFiler:
 16 |     predicate: Callable[[pa.Table, ...], pa.Array]
 17 | 
 18 |     def __call__(self, table: pa.Table, *args, **kwargs) -> pa.Table:
 19 |         return table.filter(self.predicate(table, *args, **kwargs))
 20 | 
 21 | 
 22 | def _get_last_by(table: pa.Table, keys: Sequence[str]) -> pa.Table:
 23 |     return table.take(
 24 |         table.select(keys)
 25 |         .append_column("_beavers_index", pa.array(np.arange(len(table))))
 26 |         .group_by(keys)
 27 |         .aggregate([("_beavers_index", "max")])["_beavers_index_max"]
 28 |         .sort()
 29 |     )
 30 | 
 31 | 
 32 | def _concat_arrow_arrays(
 33 |     arrow_arrays: Sequence[pa.ChunkedArray],
 34 | ) -> [pa.Array | pa.ChunkedArray]:
 35 |     arrays: list[pa.Array] = []
 36 |     for arrow_array in arrow_arrays:
 37 |         if isinstance(arrow_array, pa.ChunkedArray):
 38 |             arrays.extend(arrow_array.iterchunks())
 39 |         elif isinstance(arrow_array, pa.Array):
 40 |             arrays.append(arrow_array)
 41 |         else:
 42 |             raise TypeError(arrow_array)
 43 | 
 44 |     return pa.chunked_array(arrays)
 45 | 
 46 | 
 47 | def _check_column(column: str, schema: pa.Schema):
 48 |     if not isinstance(column, str):
 49 |         raise TypeError(column)
 50 |     elif column not in schema.names:
 51 |         raise TypeError(f"field {column} no in schema: {schema.names}")
 52 | 
 53 | 
 54 | def _check_array(node: Node[pa.Array | pa.ChunkedArray]) -> pa.DataType:
 55 |     empty = node._get_empty()
 56 |     if not isinstance(empty, (pa.Array, pa.ChunkedArray)):
 57 |         raise TypeError(f"Argument should be a {Node.__name__}[pa.Array]")
 58 |     else:
 59 |         return empty.type
 60 | 
 61 | 
 62 | def _check_columns(columns: list[str], schema: pa.Schema) -> list[str]:
 63 |     if not isinstance(columns, Iterable):
 64 |         raise TypeError(columns)
 65 |     for column in columns:
 66 |         if not isinstance(column, str):
 67 |             raise TypeError(column)
 68 |         elif column not in schema.names:
 69 |             raise TypeError(f"field {column} no in schema: {schema.names}")
 70 |     return list(columns)
 71 | 
 72 | 
 73 | def _get_stream_schema(node: Node[pa.Table]) -> pa.Schema:
 74 |     empty = node._get_empty()
 75 |     if not isinstance(empty, pa.Table):
 76 |         raise TypeError(f"Argument should be a {Node.__name__}[pa.Table]")
 77 |     else:
 78 |         return empty.schema
 79 | 
 80 | 
 81 | @dataclasses.dataclass()
 82 | class _LastByKey:
 83 |     key_columns: tuple[str, ...]
 84 |     current: pa.Table
 85 | 
 86 |     def __call__(self, stream: pa.Table) -> pa.Table:
 87 |         self.current = _get_last_by(
 88 |             pa.concat_tables([self.current, stream]), self.key_columns
 89 |         )
 90 |         return self.current
 91 | 
 92 | 
 93 | @dataclasses.dataclass(frozen=True)
 94 | class ArrowDagWrapper:
 95 |     """Helper for adding pyarrow Nodes to a Dag."""
 96 | 
 97 |     _dag: Dag
 98 | 
 99 |     def source_table(
100 |         self, schema: pa.Schema, name: Optional[str] = None
101 |     ) -> Node[pa.Table]:
102 |         """Add a source stream of type `pa.Table`."""
103 |         return self._dag.source_stream(empty=schema.empty_table(), name=name)
104 | 
105 |     def table_stream(
106 |         self, function: Callable[P, pa.Table], schema: pa.Schema
107 |     ) -> NodePrototype[pa.Table]:
108 |         """Add a stream node of output type `pa.Table`"""
109 |         return self._dag.stream(function, empty=schema.empty_table())
110 | 
111 |     def filter_stream(
112 |         self,
113 |         predicate: Callable[[pa.Table, ...], pa.Array],
114 |         stream: Node[pa.Table],
115 |         *args: Node,
116 |         **kwargs: Node,
117 |     ) -> Node[pa.Table]:
118 |         """Filter a stream Node of type `pa.Table`."""
119 |         function = _TableFiler(predicate)
120 |         schema = _get_stream_schema(stream)
121 |         _check_function(function)
122 |         return self._dag.stream(function, empty=schema.empty_table()).map(
123 |             stream, *args, **kwargs
124 |         )
125 | 
126 |     def last_by_keys(
127 |         self, stream: Node[pa.Table], keys: Sequence[str]
128 |     ) -> Node[pa.Table]:
129 |         """Build a state of the latest row by keys."""
130 |         schema = _get_stream_schema(stream)
131 |         keys = _check_columns(keys, schema)
132 |         return self._dag.state(_LastByKey(keys, schema.empty_table())).map(stream)
133 | 
134 |     def get_column(self, stream: Node[pa.Table], key: str) -> Node[pa.ChunkedArray]:
135 |         """Return a column from a stream node of type pa.Table."""
136 |         schema = _get_stream_schema(stream)
137 |         _check_column(key, schema)
138 |         field = schema.field(key)
139 |         empty = pa.chunked_array([pa.array([], field.type)])
140 |         return self._dag.stream(lambda x: x[key], empty=empty).map(stream)
141 | 
142 |     def concat_arrays(
143 |         self, *streams: Node[pa.Array | pa.ChunkedArray]
144 |     ) -> Node[pa.ChunkedArray]:
145 |         if len(streams) == 0:
146 |             raise ValueError("Must pass at least one array")
147 |         array_type = None
148 |         for stream in streams:
149 |             each_type = _check_array(stream)
150 |             if array_type is None:
151 |                 array_type = each_type
152 |             elif array_type != each_type:
153 |                 raise TypeError(f"Array type mismatch {array_type} vs {each_type}")
154 | 
155 |         empty = pa.chunked_array([pa.array([], array_type)])
156 |         return self._dag.stream(lambda *x: _concat_arrow_arrays(x), empty=empty).map(
157 |             *streams
158 |         )
159 | 


--------------------------------------------------------------------------------
/beavers/replay.py:
--------------------------------------------------------------------------------
  1 | """Module for replaying historical data."""
  2 | 
  3 | import abc
  4 | import collections.abc
  5 | import dataclasses
  6 | import logging
  7 | import time
  8 | from typing import Callable, Generic, Iterator, Optional, Protocol, TypeVar
  9 | 
 10 | import pandas as pd
 11 | 
 12 | from beavers.dag import UTC_MAX, Dag, Node
 13 | 
 14 | logger = logging.getLogger(__name__)
 15 | 
 16 | T = TypeVar("T")
 17 | 
 18 | 
 19 | @dataclasses.dataclass(frozen=True)
 20 | class ReplayContext:
 21 |     """
 22 |     Stores the information about a replay.
 23 | 
 24 |     Attributes
 25 |     ----------
 26 |     start: pd.Timestamp
 27 |         Start of the replay
 28 |     end: pd.Timestamp
 29 |         End of the replay.
 30 |         This is exclusive, the replay will stop 1ns before
 31 |     frequency:
 32 |         How often should the replay run
 33 | 
 34 |     """
 35 | 
 36 |     start: pd.Timestamp
 37 |     end: pd.Timestamp
 38 |     frequency: pd.Timedelta
 39 | 
 40 |     def __post_init__(self):
 41 |         """Check arguments are valid."""
 42 |         assert self.start.tzname() == "UTC"
 43 |         assert self.end.tzname() == "UTC"
 44 | 
 45 | 
 46 | class DataSource(Protocol[T]):
 47 |     """Interface for replaying historical data from a file or database."""
 48 | 
 49 |     def read_to(self, timestamp: pd.Timestamp) -> T:
 50 |         """
 51 |         Read from the data source, all the way to the provided timestamp (inclusive).
 52 | 
 53 |         This function is stateful and must remember the previous timestamp
 54 |          for which data was read.
 55 | 
 56 |         Parameters
 57 |         ----------
 58 |         timestamp
 59 |             End of the time interval for which data is required (inclusive)
 60 | 
 61 |         Returns
 62 |         -------
 63 |         data
 64 |             The data for the interval (or empty if no data is found)
 65 | 
 66 |         """
 67 | 
 68 |     def get_next(self) -> pd.Timestamp:
 69 |         """
 70 |         Return the next timestamp for which there is data.
 71 | 
 72 |         If no data is available this should return `UTC_MAX`
 73 | 
 74 | 
 75 |         Returns
 76 |         -------
 77 |         timestamp: pd.Timestamp
 78 |             Timestamp of the next available data point (or `UTC_MAX` if no more data
 79 |              is available)
 80 | 
 81 |         """
 82 | 
 83 | 
 84 | class DataSink(Protocol[T]):
 85 |     """Interface for saving the results of a replay to a file or database."""
 86 | 
 87 |     def append(self, timestamp: pd.Timestamp, data: T):
 88 |         """
 89 |         Append data for the current cycle.
 90 | 
 91 |         Parameters
 92 |         ----------
 93 |         timestamp:
 94 |             End of the time interval for which data was replayed (inclusive)
 95 |         data:
 96 |             The generated data
 97 | 
 98 |         """
 99 | 
100 |     def close(self):
101 |         """Flush the data and clean up resources."""
102 | 
103 | 
104 | class DataSourceProvider(Protocol[T]):
105 |     """Interface for the provision of `DataSource`."""
106 | 
107 |     def __call__(self, replay_context: ReplayContext) -> DataSource[T]:
108 |         """
109 |         Create a `DataSource` for the given replay_context.
110 | 
111 |         Parameters
112 |         ----------
113 |         replay_context:
114 |             Information about the replay that's about to run
115 | 
116 |         Returns
117 |         -------
118 |         DataSource[T]:
119 |             Source for the replay
120 | 
121 |         """
122 | 
123 | 
124 | class DataSinkProvider(Protocol[T]):
125 |     """Interface for the provision of `DataSink`."""
126 | 
127 |     @abc.abstractmethod
128 |     def __call__(self, replay_context: ReplayContext) -> DataSink[T]:
129 |         """
130 |         Create a `DataSink` for the given replay_context.
131 | 
132 |         Parameters
133 |         ----------
134 |         replay_context:
135 |             Information about the replay that's about to run
136 | 
137 |         Returns
138 |         -------
139 |         DataSink[T]:
140 |             Sink for the replay
141 | 
142 |         """
143 | 
144 | 
145 | @dataclasses.dataclass(frozen=True)
146 | class _ReplaySource(Generic[T]):
147 |     """Internal class used to store `DataSource` at runtime."""
148 | 
149 |     name: str
150 |     node: Node[T]
151 |     data_source: DataSource[T]
152 | 
153 | 
154 | @dataclasses.dataclass(frozen=True)
155 | class _ReplaySink(Generic[T]):
156 |     """Internal class used to store `DataSink` at runtime."""
157 | 
158 |     name: str
159 |     nodes: list[Node[T]]
160 |     data_sink: DataSink[T]
161 | 
162 | 
163 | @dataclasses.dataclass(frozen=True)
164 | class ReplayCycleMetrics:
165 |     """Metrics for each replay cycle."""
166 | 
167 |     timestamp: pd.Timestamp
168 |     cycle_id: int
169 |     source_records: int
170 |     sink_records: int
171 |     cycle_time_ns: int
172 |     warp_ratio: float
173 | 
174 | 
175 | @dataclasses.dataclass
176 | class ReplayDriver:
177 |     """
178 |     Orchestrate the replay of data for dag.
179 | 
180 |     This will:
181 | 
182 |     - create the relevant `DataSource`s
183 |     - create the relevant `DataSink`s
184 |     - stream the data from the sources
185 |     - inject the input data in the dag source nodes
186 |     - execute the dag
187 |     - collect the output data and pass it to the sink
188 |     - close the sink at the end of the run
189 | 
190 |     Notes
191 |     -----
192 |     Do not call the constructor directly, use `create` instead
193 | 
194 |     """
195 | 
196 |     dag: Dag
197 |     replay_context: ReplayContext
198 |     sources: list[_ReplaySource]
199 |     sinks: list[_ReplaySink]
200 |     current_time: pd.Timestamp
201 | 
202 |     @staticmethod
203 |     def create(
204 |         dag: Dag,
205 |         replay_context: ReplayContext,
206 |         data_source_providers: dict[str, DataSourceProvider],
207 |         data_sink_providers: dict[str, DataSinkProvider],
208 |     ) -> "ReplayDriver":
209 |         return ReplayDriver(
210 |             dag,
211 |             replay_context,
212 |             _create_sources(dag, replay_context, data_source_providers),
213 |             _create_sinks(dag, replay_context, data_sink_providers),
214 |             current_time=replay_context.start,
215 |         )
216 | 
217 |     def run(self):
218 |         while not self.is_done():
219 |             self.run_cycle()
220 |         for sink in self.sinks:
221 |             sink.data_sink.close()
222 | 
223 |     def is_done(self) -> bool:
224 |         return self.current_time > self.replay_context.end
225 | 
226 |     def run_cycle(self) -> Optional[ReplayCycleMetrics]:
227 |         st = time.time_ns()
228 |         source_records, next_timestamp = self.read_sources()
229 |         if source_records or self.dag.get_next_timer() <= self.current_time:
230 |             timestamp = min(self.current_time, self.replay_context.end)
231 |             self.dag.execute(timestamp)
232 |             sink_records = self.flush_sinks()
233 |             et = time.time_ns()
234 |             warp_ratio = self.replay_context.frequency.value / (et - st)
235 |             metrics = ReplayCycleMetrics(
236 |                 timestamp=timestamp,
237 |                 cycle_id=self.dag.get_cycle_id(),
238 |                 source_records=source_records,
239 |                 sink_records=sink_records,
240 |                 cycle_time_ns=et - st,
241 |                 warp_ratio=warp_ratio,
242 |             )
243 |             logger.info(
244 |                 f"Running cycle={metrics.cycle_id} "
245 |                 f"timestamp={metrics.timestamp} "
246 |                 f"source_records={metrics.source_records} "
247 |                 f"sink_records={metrics.sink_records} "
248 |                 f"warp={warp_ratio:.1f}"
249 |             )
250 |         else:
251 |             metrics = None
252 | 
253 |         self.current_time = max(
254 |             next_timestamp, self.current_time + self.replay_context.frequency
255 |         ).ceil(self.replay_context.frequency)
256 |         return metrics
257 | 
258 |     def read_sources(self) -> tuple[int, pd.Timestamp]:
259 |         records = 0
260 |         next_timestamp = self.replay_context.end
261 |         for replay_source in self.sources:
262 |             source_data = replay_source.data_source.read_to(self.current_time)
263 |             next_timestamp = min(next_timestamp, replay_source.data_source.get_next())
264 |             if len(source_data) > 0:
265 |                 replay_source.node.set_stream(source_data)
266 |                 records += len(source_data)
267 |         return records, next_timestamp
268 | 
269 |     def flush_sinks(self) -> int:
270 |         records = 0
271 |         for sink in self.sinks:
272 |             for node in sink.nodes:
273 |                 if node.get_cycle_id() == self.dag.get_cycle_id():
274 |                     sink_value = node.get_sink_value()
275 |                     records += (
276 |                         len(sink_value)
277 |                         if isinstance(sink_value, collections.abc.Sized)
278 |                         else 1
279 |                     )
280 |                     sink.data_sink.append(self.current_time, node.get_sink_value())
281 |         return records
282 | 
283 | 
284 | def _create_sources(
285 |     dag: Dag,
286 |     replay_context: ReplayContext,
287 |     data_source_providers: dict[str, DataSourceProvider],
288 | ) -> list[_ReplaySource]:
289 |     source_nodes = dag.get_sources()
290 |     nodes_names = sorted(source_nodes.keys())
291 |     source_names = sorted(data_source_providers.keys())
292 |     if nodes_names != source_names:
293 |         raise ValueError(
294 |             "Source node and DataSource names don't match: "
295 |             f"{nodes_names}  vs {source_names}"
296 |         )
297 |     return [
298 |         _ReplaySource(
299 |             name, source_nodes[name], data_source_providers[name](replay_context)
300 |         )
301 |         for name in data_source_providers.keys()
302 |     ]
303 | 
304 | 
305 | def _create_sinks(
306 |     dag: Dag,
307 |     replay_context: ReplayContext,
308 |     data_sink_providers: dict[str, DataSinkProvider],
309 | ) -> list[_ReplaySink]:
310 |     sink_nodes = dag.get_sinks()
311 |     nodes_names = sorted(sink_nodes.keys())
312 |     sink_names = sorted(data_sink_providers.keys())
313 |     if nodes_names != sink_names:
314 |         raise ValueError(
315 |             f"Sink node and DataSink names don't match: {nodes_names}  vs {sink_names}"
316 |         )
317 |     return [
318 |         _ReplaySink(name, sink_nodes[name], data_sink_providers[name](replay_context))
319 |         for name in data_sink_providers.keys()
320 |     ]
321 | 
322 | 
323 | class IteratorDataSourceAdapter(DataSource[T]):
324 |     """
325 |     Adapter between an iterator of `DataSource` and a DataSource.
326 | 
327 |     This can be used to stitch together various `DataSource` for incremental date range
328 |     """
329 | 
330 |     def __init__(
331 |         self,
332 |         sources: Iterator[DataSource[T]],
333 |         empty: T,
334 |         concatenator: Callable[[T, T], T],
335 |     ):
336 |         self._sources = sources
337 |         self._empty = empty
338 |         self._concatenator = concatenator
339 |         self._current = self._next()
340 | 
341 |     def read_to(self, timestamp: pd.Timestamp) -> T:
342 |         if self._current is None:
343 |             return self._empty
344 |         else:
345 |             this_batch = self._current.read_to(timestamp)
346 |             while self._current is not None and self._current.get_next() == UTC_MAX:
347 |                 self._current = self._next()
348 |                 next_batch = (
349 |                     self._empty
350 |                     if self._current is None
351 |                     else self._current.read_to(timestamp)
352 |                 )
353 |                 if next_batch and this_batch:
354 |                     this_batch = self._concatenator(this_batch, next_batch)
355 |                 elif next_batch:
356 |                     this_batch = next_batch
357 | 
358 |             return this_batch
359 | 
360 |     def get_next(self) -> pd.Timestamp:
361 |         if self._current is None:
362 |             return UTC_MAX
363 |         else:
364 |             return self._current.get_next()
365 | 
366 |     def _next(self) -> Optional[DataSource]:
367 |         try:
368 |             return next(self._sources)
369 |         except StopIteration:
370 |             return None
371 | 
372 | 
373 | class NoOpDataSink(DataSink):
374 |     """DataSink that does nothing."""
375 | 
376 |     def append(self, timestamp: pd.Timestamp, data: T):
377 |         pass
378 | 
379 |     def close(self):
380 |         pass
381 | 
382 | 
383 | class NoOpDataSinkProvider:
384 |     """DataSinkProvider that provides a NoOpDataSink."""
385 | 
386 |     def __call__(self, context: ReplayContext) -> DataSink[T]:
387 |         return NoOpDataSink()
388 | 


--------------------------------------------------------------------------------
/beavers/table.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |     <head>
 4 |         <title>{{table_config.name}} Beavers</title>
 5 |         <link rel="icon" type="image/x-icon" href="/assets/favicon.ico" />
 6 |         <link rel="shortcut icon" type="image/x-icon" href="/assets/favicon.ico" />
 7 |         <meta
 8 |             name="viewport"
 9 |             content="width=device-width, initial-scale=1, maximum-scale=1, minimum-scale=1, user-scalable=no"
10 |         />
11 |         <script type="module" src="https://cdn.jsdelivr.net/npm/@finos/perspective-viewer-datagrid@{{perspective_version}}/dist/cdn/perspective-viewer-datagrid.js"></script>
12 |         <script type="module" src="https://cdn.jsdelivr.net/npm/@finos/perspective-viewer-d3fc@{{perspective_version}}/dist/cdn/perspective-viewer-d3fc.js" ></script>
13 | 
14 |         <link
15 |                 rel="stylesheet"
16 |                 crossorigin="anonymous"
17 |                 href="https://cdn.jsdelivr.net/npm/@finos/perspective-viewer@{{perspective_version}}/dist/css/pro.css"
18 |         />
19 | 
20 |         <style>
21 |             body {
22 |                 height: 100vh;
23 |                 width: 100vw;
24 |                 font-family: 'Roboto Mono';
25 |                 overflow: scroll;
26 |             }
27 |             .perspective-table-class {
28 |                 position: relative;
29 |                 height: 100%;
30 |             }
31 |         </style>
32 |     </head>
33 | 
34 |     <body>
35 |         <perspective-viewer id="viewer" class="perspective-table-class"> </perspective-viewer>
36 | 
37 |         <script type="module">
38 |             import "https://cdn.jsdelivr.net/npm/@finos/perspective-viewer@{{perspective_version}}/dist/cdn/perspective-viewer.js";
39 |             import perspective from "https://cdn.jsdelivr.net/npm/@finos/perspective@{{perspective_version}}/dist/cdn/perspective.js";
40 | 
41 |             const viewer = document.getElementById("viewer");
42 | 
43 |             const websocket_uri = "ws://" + window.location.host + "/websocket";
44 |             const websocket = await perspective.websocket(websocket_uri);
45 |             const table = await websocket.open_table("{{table_config.name}}");
46 | 
47 |             viewer.load(table);
48 | 
49 |             let config = await viewer.save();
50 | 
51 |             config.name = "{{table_config.name}}";
52 |             config.title = "{{table_config.name}}";
53 |             config.columns = {% raw json_encode(table_config.columns) %};
54 |             config.sort = {% raw json_encode(table_config.sort) %};
55 |             config.filter = {% raw json_encode(table_config.filters) %};
56 | 
57 |             await viewer.restore(config);
58 | 
59 |         </script>
60 |     </body>
61 | </html>
62 | 


--------------------------------------------------------------------------------
/beavers/testing.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Optional, Sequence, TypeVar
 2 | 
 3 | import pandas as pd
 4 | 
 5 | from beavers.dag import Dag
 6 | 
 7 | T = TypeVar("T")
 8 | 
 9 | 
10 | class DagTestBench:
11 |     def __init__(self, dag: Dag):
12 |         self.dag = dag
13 |         for output_name, output_sinks in self.dag.get_sinks().items():
14 |             assert len(output_sinks) == 1, output_name
15 | 
16 |     def set_source(
17 |         self,
18 |         source_name: str,
19 |         source_data: Any,
20 |     ) -> "DagTestBench":
21 |         source = self.dag.get_sources()[source_name]
22 |         source.set_stream(source_data)
23 |         return self
24 | 
25 |     def execute(self, now: Optional[pd.Timestamp] = None) -> "DagTestBench":
26 |         self.dag.execute(now)
27 |         return self
28 | 
29 |     def assert_sink_list(
30 |         self,
31 |         sink_name: str,
32 |         expected_messages: Sequence[T],
33 |     ) -> "DagTestBench":
34 |         sinks = self.dag.get_sinks()[sink_name]
35 |         assert len(sinks) == 1
36 |         cycle_id = sinks[0].get_cycle_id()
37 |         assert cycle_id == self.dag.get_cycle_id()
38 |         actual_messages = sinks[0].get_sink_value()
39 |         assert len(actual_messages) == len(expected_messages), (
40 |             f"Sink {sink_name} value size mismatch"
41 |         )
42 |         for actual_message, expected_message in zip(actual_messages, expected_messages):
43 |             assert actual_message == expected_message
44 |         return self
45 | 
46 |     def assert_sink_not_updated(self, sink_name: str) -> "DagTestBench":
47 |         sinks = self.dag.get_sinks()[sink_name]
48 |         assert len(sinks) == 1
49 |         cycle_id = sinks[0].get_cycle_id()
50 |         assert cycle_id < self.dag.get_cycle_id(), (
51 |             f"Sink {sink_name} got updated this cycle"
52 |         )
53 |         return self
54 | 


--------------------------------------------------------------------------------
/docs/concepts/advanced.md:
--------------------------------------------------------------------------------
 1 | # Advanced 
 2 | 
 3 | This section discuss advanced features that control how updates propagate in the DAG.
 4 | 
 5 | ## How updates propagate in the DAG
 6 | 
 7 | - Nodes are notified if any of their input node was updated during the current execution cycle
 8 | ```python
 9 | --8<-- "examples/advanced_concepts.py:propagate_any"
10 | ```
11 | - You can check if a node updated by looking at its `cycle_id`
12 | ```python
13 | --8<-- "examples/advanced_concepts.py:propagate_cycle_id"
14 | ```
15 | - If several inputs of a node get updated during the same cycle, the node will be executed once (and not once per input)
16 | ```python
17 | --8<-- "examples/advanced_concepts.py:propagate_both"
18 | ```
19 | - Stream nodes (and sources) are not considered updated if their output is empty
20 | ```python
21 | --8<-- "examples/advanced_concepts.py:propagate_empty"
22 | ```
23 | 
24 | 
25 | ## Now node
26 | 
27 | Beavers can be used in both `live` and `replay` mode. 
28 | In `replay` mode, the wall clock isn't relevant. 
29 | To access the current time of the replay, you should use the now node:
30 | 
31 | ```python
32 | --8<-- "examples/advanced_concepts.py:now_node"
33 | ```
34 | 
35 | The now node is shared for the whole DAG.
36 | Its value gets updated silently. 
37 | 
38 | ## TimerManager
39 | 
40 | To be notified when time passes, nodes can subscribe to a `TimerManager` node.
41 | 
42 | ```python
43 | --8<-- "examples/advanced_concepts.py:timer_manager"
44 | ```
45 | 
46 | ## Silent updates
47 | 
48 | Some node may update too often, or their updates may not be relevant to other nodes.
49 | In this case it's possible to silence them:
50 | 
51 | ```python
52 | --8<-- "examples/advanced_concepts.py:silence"
53 | ```
54 | 
55 | `silence` returns a new silenced node (rather than modify the existing node)
56 | 
57 | ## Value Cutoff
58 | 
59 | By default, state nodes will update everytime they are notified. 
60 | The framework doesn't check that their value has changed.
61 | 
62 | You can add a cutoff, to prevent updates when the value hasn't changed:
63 | 
64 | ```python
65 | --8<-- "examples/advanced_concepts.py:cutoff"
66 | ```
67 | 
68 | You can also provide a custom comparator to allow some tolerance when deciding if a value has changed:
69 | 
70 | ```python
71 | --8<-- "examples/advanced_concepts.py:cutoff_custom"
72 | ```
73 | 


--------------------------------------------------------------------------------
/docs/concepts/dag.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # DAG
  3 | 
  4 | At its core, `beavers` executes a Directed Acyclic Graph (DAG), where each node is a python function.   
  5 | This section discusses the different type of nodes in the DAG.
  6 | 
  7 | ## Stream Source
  8 | 
  9 | A stream source is a node whose value can be set externally.
 10 | 
 11 | When `Dag.execute` is called, the updated value is propagated in the DAG
 12 | 
 13 | ```python
 14 | --8<-- "examples/dag_concepts.py:source_stream"
 15 | ```
 16 | 
 17 | If the DAG is executed again, the value of the source stream will be reset to its empty value.
 18 | 
 19 | ```python
 20 | --8<-- "examples/dag_concepts.py:source_stream_again"
 21 | ```
 22 | 
 23 | The default empty value is set to `[]`, but it can be customized:
 24 | 
 25 | ```python
 26 | --8<-- "examples/dag_concepts.py:source_stream_empty"
 27 | ```
 28 | 
 29 | A source stream can be given a name, so they can be retrieved (and their value set):
 30 | 
 31 | ```python
 32 | --8<-- "examples/dag_concepts.py:source_stream_name"
 33 | ```
 34 | 
 35 | ## Stream Node
 36 | 
 37 | A stream node uses the output of other nodes to calculate its updated value. 
 38 | 
 39 | ```python
 40 | --8<-- "examples/dag_concepts.py:stream_node"
 41 | ```
 42 | 
 43 | If the DAG is executed again, the value of the stream node will be reset to its empty value.
 44 | 
 45 | ```python
 46 | --8<-- "examples/dag_concepts.py:stream_node_again"
 47 | ```
 48 | 
 49 | The default empty value is set to `[]`, but it can be customized:
 50 | ```python
 51 | --8<-- "examples/dag_concepts.py:stream_node_empty"
 52 | ```
 53 | 
 54 | The function provided to the node can be any callable, like a lambda:
 55 | ```python
 56 | --8<-- "examples/dag_concepts.py:stream_node_lambda"
 57 | ```
 58 | 
 59 | Or a class defining `__call__`:
 60 | ```python
 61 | --8<-- "examples/dag_concepts.py:stream_node_callable"
 62 | ```
 63 | 
 64 | ## State Node
 65 | 
 66 | A state node retains its value from one DAG execution to the next, even if it didn't update:
 67 | ```python
 68 | --8<-- "examples/dag_concepts.py:state_node"
 69 | ```
 70 | 
 71 | Because they retain their value when they are not updated, state nodes don't require an empty value
 72 | 
 73 | ## Const Node
 74 | 
 75 | A const node is a node whose value doesn't change.
 76 | ```python
 77 | --8<-- "examples/dag_concepts.py:const_node"
 78 | ```
 79 | 
 80 | Const nodes behave like state nodes (their value isn't reset when they don't update).
 81 | 
 82 | ## Connecting Nodes (aka `map`)
 83 | 
 84 | Nodes are connected by calling the `map` function. 
 85 | Any stream or state node can be connected to state nodes, stream nodes or const nodes.
 86 | 
 87 | > :warning: The `map` function doesn't execute the underlying node. 
 88 | > Instead it adds a node to the DAG
 89 | 
 90 | The map function can use positional arguments:
 91 | 
 92 | ```python
 93 | --8<-- "examples/dag_concepts.py:map_positional"
 94 | ```
 95 | Or key word arguments:
 96 | 
 97 | ```python
 98 | --8<-- "examples/dag_concepts.py:map_key_word"
 99 | ```
100 | 
101 | ## State vs Stream
102 | 
103 | Stream Nodes:
104 | 
105 | - need their return type to implement `collections.abc.Sized`
106 | - need an empty value to be specfied (which default to `[]`)
107 | - have their value reset to empty when they don't update
108 | - are not considered updated if they return empty
109 | 
110 | State Nodes:
111 | 
112 | - Can return any type
113 | - don't require an empty value
114 | - retain their value on cycle they don't update
115 | - are always considered updated if they are called
116 | 


--------------------------------------------------------------------------------
/docs/concepts/kafka.md:
--------------------------------------------------------------------------------
 1 | # Live with Kafka
 2 | 
 3 | This section explains how to run a beavers application in real time using kafka.
 4 | 
 5 | ## Count Word Example
 6 | 
 7 | Starting with a simple "count word" dag with one source going to one sink:
 8 | 
 9 | ```python
10 | --8<-- "examples/kafka_concepts.py:dag"
11 | ```
12 | 
13 | This dag has got a source node called `words` and a sink node called `counts`
14 | 
15 | ## Defining Kafka Source
16 | 
17 | We will be receiving data from kafka, on a topic called `words`.
18 | 
19 | First we need to define how we deserialize messages coming from kafka:
20 | 
21 | ```python
22 | --8<-- "examples/kafka_concepts.py:deserializer"
23 | ```
24 | 
25 | Then, we put together the `SourceTopic` with its:
26 | 
27 | - topic (`words`)
28 | - deserializer (`deserialize_messages`)
29 | - replay policy (`from_latest`) 
30 | 
31 | ```python
32 | --8<-- "examples/kafka_concepts.py:kafka_source"
33 | ```
34 | 
35 | There are multiple kafka replay policy available, see the api doc for the full list.
36 | 
37 | ## Defining Kafka Sink
38 | 
39 | We will be sending the results to the `counts` topic. 
40 | The key will be the word.T The value will be the latest count.
41 | 
42 | First we need to define a serializer, which converts each count to a `KafkaProducerMessage`
43 | 
44 | ```python
45 | --8<-- "examples/kafka_concepts.py:serializer"
46 | ```
47 | 
48 | The serializer is responsible for providing the topic for each outgoing message. 
49 | 
50 | ## Putting it together with KafkaDriver
51 | 
52 | The `KafkaDriver` takes care of creating the kafka producer and consumer, and passing the message through:
53 | 
54 | ```python
55 | --8<-- "examples/kafka_concepts.py:kafka_driver"
56 | ```
57 | 
58 | ## Beavers Kafka Features
59 | 
60 | - One consumer: There is only one consumer (rather than one consumer for each topic)
61 | - One producer: There is only one producer (rather than one producer for each topic)
62 | - When polling messages, beavers tries to read all available messages, up to a limit of `batch_size=5000` (which is configurable in the KafkaDriver)
63 | - When replaying past data, beavers orchestrate topic/partition so data is replayed in order, across topics, based on each message timestamp.
64 | - When replaying past data, some newer messages have to be held. 
65 |   To avoid memory issue, the number of held messages is capped to `batch_size*5`.
66 |   Once the number of held messages get to high, partitions that are ahead of the watermark are paused.
67 |   These partitions are un-paused once the application catches up
68 | 
69 | 
70 | ## Beavers Kafka Limitations
71 | 
72 | - One beavers application consumes every partition for requested topics (no load balancing/scaling) 
73 | 


--------------------------------------------------------------------------------
/docs/concepts/pandas.md:
--------------------------------------------------------------------------------
 1 | # Pandas integration
 2 | 
 3 | This section explains how to use beavers with pandas.
 4 | 
 5 | ## ETF value calculation example
 6 | 
 7 | In this example we want to calculate the value of ETFs.
 8 | If you are not familiar with ETFs, think about them as just a basket of shares.
 9 | 
10 | Starting with a table of individual share prices:
11 | ```python
12 | --8<-- "examples/pandas_concepts.py:business_logic_price"
13 | ```
14 | 
15 | | ticker   |   price |
16 | |:---------|--------:|
17 | | AAPL     |  174.79 |
18 | | GOOGL    |  130.25 |
19 | | MSFT     |  317.01 |
20 | | F        |   12.43 |
21 | | GM       |   35.28 |
22 | 
23 | And another table containing the composition of each ETF:
24 | ```python
25 | --8<-- "examples/pandas_concepts.py:business_logic_composition"
26 | ```
27 | 
28 | | etf   | ticker   |   quantity |
29 | |:------|:---------|-----------:|
30 | | TECH  | AAPL     |        2.0 |
31 | | TECH  | GOOGL    |        2.0 |
32 | | TECH  | MSFT     |        1.0 |
33 | | CARS  | F        |        3.0 |
34 | | CARS  | GM       |        1.0 |
35 | 
36 | In a few line of `pandas` we can derive the value of each ETF:
37 | ```python
38 | --8<-- "examples/pandas_concepts.py:business_logic_calculation"
39 | ```
40 | 
41 | | etf  |   value |
42 | |:-----|--------:|
43 | | TECH |  927.09 |
44 | | CARS |   72.57 |
45 | 
46 | ## ETF value calculation DAG
47 | 
48 | Once the business logic of the calculation is writen and tested it can be added into a Dag.
49 | We'll be using the Dag `pd` helper which makes it easier to deal with `pandas` table in beavers.
50 | 
51 | First we define two source streams, made of `pandas.DataFrame`:
52 | ```python
53 | --8<-- "examples/pandas_concepts.py:dag_source"
54 | ```
55 | 
56 | Then we keep track of the latest value for each source stream:
57 | ```python
58 | --8<-- "examples/pandas_concepts.py:dag_state"
59 | ```
60 | 
61 | Lastly we put together the share prices and ETF composition:
62 | ```python
63 | --8<-- "examples/pandas_concepts.py:dag_calculation"
64 | ```
65 | 
66 | And that's it:
67 | 
68 | ```python
69 | --8<-- "examples/pandas_concepts.py:dag_test"
70 | ```
71 | 


--------------------------------------------------------------------------------
/docs/concepts/perspective.md:
--------------------------------------------------------------------------------
 1 | # Perspective Integration
 2 | 
 3 | This section explains how to build a live web dashboard with [Perspective](https://github.com/finos/perspective) and Beavers.
 4 | 
 5 | In Beavers, you can connect any node of type `pyarrow.Table` to a perspective table.
 6 | All you need to do is call `dag.psp.to_perspecive`, and provide a `PerspectiveTableDefinition`.
 7 | 
 8 | 
 9 | ## Key Value Example
10 | 
11 | We'll write a super simple key-value store application.
12 | It listens to a topic, and displays the value of kafka messages by key, with their timestamp
13 | 
14 | ## Install
15 | 
16 | ```shell
17 | pip install beavers[pyarrow, perpective-python]
18 | ```
19 | 
20 | ## Defining the schema of incoming message
21 | 
22 | First we define a schema for the incoming "key value" messages:
23 | 
24 | - a timestamp, in millis
25 | - a key (string)
26 | - a value (string)
27 | 
28 | ```python
29 | --8<-- "examples/perspective_concepts.py:schema"
30 | ```
31 | 
32 | ## Convert kafka messages to arrow Table
33 | 
34 | Then we write a function that converts kafka messages to an apache arrow table of "key value" messages:
35 | 
36 | ```python
37 | --8<-- "examples/perspective_concepts.py:converter"
38 | ```
39 | 
40 | 
41 | ## Create a dag
42 | 
43 | We create a super simple dag. 
44 | It has a source, called `key_value`, which is a table of "key value" messages.
45 | The source is plugged into a perspective table, called... `key_value`, whose index is the `key` column
46 | 
47 | ```python
48 | --8<-- "examples/perspective_concepts.py:dag"
49 | ```
50 | 
51 | ## Run the dashboard
52 | 
53 | Lastly, we put everything together in an application
54 | ```python
55 | --8<-- "examples/perspective_concepts.py:run"
56 | ```
57 | 
58 | You should be able to see it in http://localhost:8082/key_value
59 | 


--------------------------------------------------------------------------------
/docs/concepts/polars.md:
--------------------------------------------------------------------------------
  1 | # Polars integration
  2 | 
  3 | This section explains how to use beavers with polars.
  4 | 
  5 | ## ETF value calculation example
  6 | 
  7 | In this example we want to calculate the value of ETFs.
  8 | 
  9 | Starting with a data frame of individual share prices:
 10 | ```python
 11 | --8<-- "examples/polars_concepts.py:business_logic_price"
 12 | ```
 13 | 
 14 | | ticker   |   price |
 15 | |:---------|--------:|
 16 | | AAPL     |  174.79 |
 17 | | GOOGL    |  130.25 |
 18 | | MSFT     |  317.01 |
 19 | | F        |   12.43 |
 20 | | GM       |   35.28 |
 21 | 
 22 | And another data frame containing the composition of each ETF:
 23 | ```python
 24 | --8<-- "examples/polars_concepts.py:business_logic_composition"
 25 | ```
 26 | 
 27 | | etf   | ticker   |   quantity |
 28 | |:------|:---------|-----------:|
 29 | | TECH  | AAPL     |        2.0 |
 30 | | TECH  | GOOGL    |        2.0 |
 31 | | TECH  | MSFT     |        1.0 |
 32 | | CARS  | F        |        3.0 |
 33 | | CARS  | GM       |        1.0 |
 34 | 
 35 | In a few line of `polars` we can derive the value of each ETF:
 36 | ```python
 37 | --8<-- "examples/polars_concepts.py:business_logic_calculation"
 38 | ```
 39 | 
 40 | | etf  |   value |
 41 | |:-----|--------:|
 42 | | TECH |  927.09 |
 43 | | CARS |   72.57 |
 44 | 
 45 | ## ETF value calculation DAG
 46 | 
 47 | Once the business logic of the calculation is writen and tested it can be added into a Dag.
 48 | We'll be using the Dag `pl` helper which makes it easier to deal with `polars` data frame in beavers.
 49 | 
 50 | First we define two source streams, made of `polars.DataFrame`:
 51 | ```python
 52 | --8<-- "examples/polars_concepts.py:dag_source"
 53 | ```
 54 | 
 55 | Then we keep track of the latest value for each source stream:
 56 | ```python
 57 | --8<-- "examples/polars_concepts.py:dag_state"
 58 | ```
 59 | 
 60 | Lastly we put together the share prices and ETF composition:
 61 | ```python
 62 | --8<-- "examples/polars_concepts.py:dag_calculation"
 63 | ```
 64 | 
 65 | And that's it:
 66 | 
 67 | ```python
 68 | --8<-- "examples/polars_concepts.py:dag_test"
 69 | ```
 70 | 
 71 | 
 72 | ## Taming updates
 73 | 
 74 | This simple dag does the job of calculating the ETF value in real time.
 75 | But there is one issue.
 76 | The value of every ETF would update every time either `price` or `etf_composition` update.
 77 | Even if the updates comes on a ticker that is not relevant to the ETFs we are tracking. 
 78 | 
 79 | In the example below, when the price of GameStop updates, we recalculate the value of every ETF.
 80 | Even though their value hasn't changed:
 81 | ```python
 82 | --8<-- "examples/polars_concepts.py:spurious_update"
 83 | ```
 84 | 
 85 | To tame updates we need to identify which ETF needs updating.
 86 | 
 87 | ETF values can update because their composition has changed:
 88 | ```python
 89 | --8<-- "examples/polars_concepts.py:updated_because_of_composition"
 90 | ```
 91 | 
 92 | Or because one of their component has updated: 
 93 | ```python
 94 | --8<-- "examples/polars_concepts.py:updated_because_of_price"
 95 | ```
 96 | 
 97 | We can then put it back together and only calculate updates for relevant ETFs:
 98 | ```python
 99 | --8<-- "examples/polars_concepts.py:update_all"
100 | ```
101 | 
102 | 
103 | And see that only the value "TECH" ETF updates when a tech stock update:
104 | ```python
105 | --8<-- "examples/polars_concepts.py:update_all_test"
106 | ```
107 | 
108 | | etf   |   value |
109 | |:------|--------:|
110 | | TECH  |  927.13 |
111 | 


--------------------------------------------------------------------------------
/docs/concepts/pyarrow.md:
--------------------------------------------------------------------------------
  1 | # Pyarrow integration
  2 | 
  3 | This section explains how to use beavers with pyarrow.
  4 | 
  5 | ## ETF value calculation example
  6 | 
  7 | In this example we want to calculate the value of ETFs.
  8 | If you are not familiar with ETFs, think about them as just a basket of shares.
  9 | 
 10 | Starting with a table of individual share prices:
 11 | ```python
 12 | --8<-- "examples/pyarrow_concepts.py:business_logic_price"
 13 | ```
 14 | 
 15 | | ticker   |   price |
 16 | |:---------|--------:|
 17 | | AAPL     |  174.79 |
 18 | | GOOGL    |  130.25 |
 19 | | MSFT     |  317.01 |
 20 | | F        |   12.43 |
 21 | | GM       |   35.28 |
 22 | 
 23 | And another table containing the composition of each ETF:
 24 | ```python
 25 | --8<-- "examples/pyarrow_concepts.py:business_logic_composition"
 26 | ```
 27 | 
 28 | | etf   | ticker   |   quantity |
 29 | |:------|:---------|-----------:|
 30 | | TECH  | AAPL     |        2.0 |
 31 | | TECH  | GOOGL    |        2.0 |
 32 | | TECH  | MSFT     |        1.0 |
 33 | | CARS  | F        |        3.0 |
 34 | | CARS  | GM       |        1.0 |
 35 | 
 36 | In a few line of `pyarrow` we can derive the value of each ETF:
 37 | ```python
 38 | --8<-- "examples/pyarrow_concepts.py:business_logic_calculation"
 39 | ```
 40 | 
 41 | | etf  |   value |
 42 | |:-----|--------:|
 43 | | TECH |  927.09 |
 44 | | CARS |   72.57 |
 45 | 
 46 | ## ETF value calculation DAG
 47 | 
 48 | Once the business logic of the calculation is writen and tested it can be added into a Dag.
 49 | We'll be using the Dag `pa` helper which makes it easier to deal with `pyarrow` table in beavers.
 50 | 
 51 | First we define two source streams, made of `pyarrow.Table`:
 52 | ```python
 53 | --8<-- "examples/pyarrow_concepts.py:dag_source"
 54 | ```
 55 | 
 56 | Then we keep track of the latest value for each source stream:
 57 | ```python
 58 | --8<-- "examples/pyarrow_concepts.py:dag_state"
 59 | ```
 60 | 
 61 | Lastly we put together the share prices and ETF composition:
 62 | ```python
 63 | --8<-- "examples/pyarrow_concepts.py:dag_calculation"
 64 | ```
 65 | 
 66 | And that's it:
 67 | 
 68 | ```python
 69 | --8<-- "examples/pyarrow_concepts.py:dag_test"
 70 | ```
 71 | 
 72 | 
 73 | ## Taming updates
 74 | 
 75 | This simple dag does the job of calculating the ETF value in real time.
 76 | But there is one issue.
 77 | The value of every ETF would update every time either `price` or `etf_composition` update.
 78 | Even if the updates comes on a ticker that is not relevant to the ETFs we are tracking. 
 79 | 
 80 | In the example below, when the price of GameStop updates, we recalculate the value of every ETF.
 81 | Even though their value hasn't changed:
 82 | ```python
 83 | --8<-- "examples/pyarrow_concepts.py:spurious_update"
 84 | ```
 85 | 
 86 | To tame updates we need to identify which ETF needs updating.
 87 | 
 88 | ETF values can update because their composition has changed:
 89 | ```python
 90 | --8<-- "examples/pyarrow_concepts.py:updated_because_of_composition"
 91 | ```
 92 | 
 93 | Or because one of their component has updated: 
 94 | ```python
 95 | --8<-- "examples/pyarrow_concepts.py:updated_because_of_price"
 96 | ```
 97 | 
 98 | We can then put it back together and only calculate updates for relevant ETFs:
 99 | ```python
100 | --8<-- "examples/pyarrow_concepts.py:update_all"
101 | ```
102 | 
103 | 
104 | And see that only the value "TECH" ETF updates when a tech stock update:
105 | ```python
106 | --8<-- "examples/pyarrow_concepts.py:update_all_test"
107 | ```
108 | 
109 | | etf   |   value |
110 | |:------|--------:|
111 | | TECH  |  927.13 |
112 | 


--------------------------------------------------------------------------------
/docs/concepts/replay.md:
--------------------------------------------------------------------------------
  1 | # Replay
  2 | 
  3 | This section explains how to run a beavers application using historical data, typically stored in files or databases.
  4 | 
  5 | ## Manual Replay
  6 | 
  7 | Starting with a simple dag with one source going to one sink:
  8 | 
  9 | ```python
 10 | --8<-- "examples/replay_concepts.py:simple_dag"
 11 | ```
 12 | 
 13 | Assuming your data has got this shape:
 14 | ```python
 15 | --8<-- "examples/replay_concepts.py:simple_data_class"
 16 | ```
 17 | 
 18 | You could replay the data manually your self and run the dag for regular interval:
 19 | ```python
 20 | --8<-- "examples/replay_concepts.py:manual_replay"
 21 | ```
 22 | 
 23 | But this requires a lot of boilerplate code and becomes cumbersome very quickly. 
 24 | 
 25 | ## Replay Framework
 26 | 
 27 | The replay framework uses a few key abstraction in order to define how the data is loaded and injected in the dag.
 28 | 
 29 | ### `DataSource`
 30 | 
 31 | A `DataSource` provides a way of streaming data. 
 32 | ```python
 33 | --8<-- "examples/replay_concepts.py:data_source"
 34 | ```
 35 | 
 36 | By convention, `DataSource`s:
 37 | 
 38 | - return `UTC_MAX` when there is no more data
 39 | - are stateful and need to remember what has already been read. 
 40 | 
 41 | ### `ReplayContext`
 42 | 
 43 | The `ReplayContext` contains timing information:
 44 | ```python
 45 | --8<-- "examples/replay_concepts.py:replay_context"
 46 | ```
 47 | 
 48 | :warning: By convention all timestamps are UTC
 49 | 
 50 | 
 51 | ### `DataSourceProvider`
 52 | 
 53 | A `DataSourceProvider` provides a way of creating `DataSource`.
 54 | 
 55 | For example, if the data is stored in a csv file:
 56 | 
 57 | ```csv
 58 | timestamp,message
 59 | 2023-01-01 01:00:00+00:00,Hello
 60 | 2023-01-01 01:01:00+00:00,How are you
 61 | ```
 62 | 
 63 | Provided with the `ReplayContext`, our `DataSourceProvider` will load the and return a `DataSource`
 64 |  
 65 | ```python
 66 | --8<-- "examples/replay_concepts.py:data_source_provider"
 67 | ```
 68 | 
 69 | 
 70 | ### `DataSink`
 71 | 
 72 | A `DataSink` provides a way of capturing the output of nodes and saving the data:
 73 | 
 74 |  
 75 | ```python
 76 | --8<-- "examples/replay_concepts.py:data_sink"
 77 | ```
 78 | 
 79 | ### `DataSinkProvider`
 80 | 
 81 | A `DataSinkProvider` provides a way of creating `DataSink`.
 82 | 
 83 | In this example we save the data to csv:
 84 | 
 85 |  
 86 | ```python
 87 | --8<-- "examples/replay_concepts.py:data_sink_provider"
 88 | ```
 89 | 
 90 | 
 91 | ### `ReplayDriver`
 92 | 
 93 | The replay driver is responsible for putting the dag, context, sources and sinks together, and orchestrate the replay.
 94 | 
 95 | ```python
 96 | --8<-- "examples/replay_concepts.py:replay_driver"
 97 | ```
 98 | 
 99 | 
100 | ## Reading Files Partitioned By Time
101 | 
102 | Assuming:
103 | 
104 | - you want to replay a dag for a long period of time.
105 | - all that historic data doesn't fit into time
106 | - the data is partitioned by time period. For example one file per day, `input_2023-01-01.csv`.
107 | 
108 | It's then possible, with the `IteratorDataSourceAdapter` to load each file one by one as they are needed.
109 | 
110 | In this example, csv files are stored under . We need to provide:
111 | 
112 | - a generator that will yield a `DataSource` for each file, in order
113 | - a way to concatenate the output of 2 `DataSource`. In this case we'll use `+` to merge two lists
114 | - an empty value for the case there is no more data, or we reach the last file.
115 | 
116 | ```python
117 | --8<-- "examples/replay_concepts.py:iterator_data_source_adapter"
118 | ```
119 | 


--------------------------------------------------------------------------------
/docs/contributing.md:
--------------------------------------------------------------------------------
 1 | # Contributing
 2 | 
 3 | Welcome! We're happy to have you here. Thank you in advance for your contribution to Beavers.
 4 | 
 5 | ## Development environment set up
 6 | 
 7 | ```shell
 8 | python3 -m venv --clear venv
 9 | source venv/bin/activate
10 | poetry self add "poetry-dynamic-versioning[plugin]"
11 | poetry install
12 | pre-commit install
13 | ```
14 | 
15 | ## Testing
16 | 
17 | To run tests fast:
18 | 
19 | ```shell
20 | pytest -n auto tests
21 | ```
22 | 
23 | To Get coverage:
24 | 
25 | ```shell
26 | coverage run --branch --rcfile=./pyproject.toml --include "./beavers/*" -m pytest tests
27 | coverage report --show-missing
28 | ```
29 | 
30 | ## Generating the change log
31 | 
32 | We use [git-change-log](https://pawamoy.github.io/git-changelog/usage/) to generate our CHANGELOG.md
33 | 
34 | Please follow the [basic convention](https://pawamoy.github.io/git-changelog/usage/#basic-convention) for commit
35 | message.
36 | 
37 | To update the change log, run:
38 | 
39 | ```shell
40 | git-changelog -io CHANGELOG.md
41 | ```
42 | 
43 | ## New Release
44 | 
45 | For new release, first prepare the change log, push and merge it.
46 | 
47 | ```shell
48 | git-changelog --bump=auto -io CHANGELOG.md
49 | ```
50 | 
51 | Then tag and push:
52 | 
53 | ```shell
54 | git tag vX.X.X
55 | git push origin vX.X.X
56 | ```
57 | 
58 | Lastly on github, go to tags and create a release.
59 | The CI will deploy to pypi automatically from then.
60 | 
61 | ## Testing the documentation
62 | 
63 | ```shell
64 | mkdocs serve --livereload --watch=./
65 | ```
66 | 
67 | ## Updating dependencies
68 | 
69 | - For the repo `poetry update`
70 | - For the doc: `(cd docs/; pip-compile ./requirements.in  > ./requirements.txt)`
71 | - For pre-commit: `pre-commit autoupdate`
72 | 
73 | ## Resources
74 | 
75 | The repo set up is inspired by this [guide](https://mathspp.com/blog/how-to-create-a-python-package-in-2022)
76 | 


--------------------------------------------------------------------------------
/docs/faq.md:
--------------------------------------------------------------------------------
1 | # FAQ
2 | 
3 | ## Why is it called beavers?
4 | 
5 | Beavers are very clever animals that builds dams to regulate the flow of rivers. 
6 | Likewise, the beavers library builds a dam around your data to regulate how it is processed by your applications.
7 | 


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
 1 | ![Beavers Logo][5]
 2 | 
 3 | # Beavers
 4 | 
 5 | [Documentation][6] / [Installation][7] / [Repository][1] / [PyPI][8]
 6 | 
 7 | [Beavers][1] is a python library for stream processing, optimized for analytics.
 8 | 
 9 | It is used at [Tradewell Technologies][2],
10 | to calculate analytics and serve model predictions,
11 | for both realtime and batch jobs.
12 | 
13 | ## Key Features
14 | 
15 | - Works in **real time** (eg: reading from Kafka) and **replay mode** (eg: reading from Parquet files).
16 | - Optimized for analytics, using micro-batches (instead of processing records one by one).
17 | - Similar to [incremental][3], it updates nodes in a dag incrementally.
18 | - Taking inspiration from [kafka streams][4], there are two types of nodes in the dag:
19 |     - **Stream**: ephemeral micro-batches of events (cleared after every cycle).
20 |     - **State**: durable state derived from streams.
21 | - Clear separation between the business logic and the IO.
22 |   So the same dag can be used in real time mode, replay mode or can be easily tested.
23 | - Functional interface: no inheritance or decorator required.
24 | - Support for complicated joins, not just "linear" data flow.
25 | 
26 | ## Limitations
27 | 
28 | - No concurrency support.
29 |   To speed up calculation use libraries like pandas, pyarrow or polars.
30 | - No async code.
31 |   To speed up IO use kafka driver native thread or parquet IO thread pool.
32 | - No support for persistent state.
33 |   Instead of saving state, replay historic data from kafka to prime stateful nodes.
34 | 
35 | ## Talks
36 | 
37 | - [Unified batch and stream processing in python | PyData Global 2023][9]
38 | 
39 | [1]: https://github.com/tradewelltech/beavers
40 | [2]: https://www.tradewelltech.co/
41 | [3]: https://github.com/janestreet/incremental
42 | [4]: https://www.confluent.io/blog/kafka-streams-tables-part-1-event-streaming/
43 | [5]: https://raw.githubusercontent.com/tradewelltech/beavers/master/docs/static/icons/beavers/logo.svg
44 | [6]: https://beavers.readthedocs.io/en/latest/
45 | [7]: https://beavers.readthedocs.io/en/latest/install/
46 | [8]: https://pypi.org/project/beavers/
47 | [9]: https://www.youtube.com/watch?v=8pUwsGA8SQM
48 | 


--------------------------------------------------------------------------------
/docs/install.md:
--------------------------------------------------------------------------------
 1 | # Install
 2 | 
 3 | ## Basic Install
 4 | 
 5 | ```sh
 6 | pip install beavers
 7 | ```
 8 | 
 9 | ## Extras
10 | 
11 | To install with extras like Arrow, Kafka or Perspective:
12 | 
13 | ```sh
14 | pip install beavers[pyarrow, confluent_kafka, perspective-python]
15 | ```
16 | 


--------------------------------------------------------------------------------
/docs/reference/dag.md:
--------------------------------------------------------------------------------
1 | ::: beavers.dag
2 |     options:
3 |         heading_level: 2
4 |         show_source: false
5 | 


--------------------------------------------------------------------------------
/docs/reference/kafka.md:
--------------------------------------------------------------------------------
1 | ::: beavers.kafka
2 |     options:
3 |         heading_level: 2
4 |         show_source: false
5 | 


--------------------------------------------------------------------------------
/docs/reference/pandas_wrapper.md:
--------------------------------------------------------------------------------
1 | ::: beavers.pandas_wrapper
2 |     options:
3 |         heading_level: 2
4 |         show_source: false
5 | 


--------------------------------------------------------------------------------
/docs/reference/pyarrow_wrapper.md:
--------------------------------------------------------------------------------
1 | ::: beavers.pyarrow_wrapper
2 |     options:
3 |         heading_level: 2
4 |         show_source: false
5 | 


--------------------------------------------------------------------------------
/docs/reference/replay.md:
--------------------------------------------------------------------------------
1 | ::: beavers.replay
2 |     options:
3 |         heading_level: 2
4 |         show_source: false
5 | 


--------------------------------------------------------------------------------
/docs/requirements.in:
--------------------------------------------------------------------------------
1 | markdown-include
2 | mkdocs
3 | mkdocs-material
4 | mkdocs-material-extensions
5 | mkdocstrings[python]
6 | pymdown-extensions
7 | 


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
  1 | #
  2 | # This file is autogenerated by pip-compile with Python 3.10
  3 | # by the following command:
  4 | #
  5 | #    pip-compile ./requirements.in
  6 | #
  7 | babel==2.17.0
  8 |     # via mkdocs-material
  9 | backrefs==5.8
 10 |     # via mkdocs-material
 11 | certifi==2025.4.26
 12 |     # via requests
 13 | charset-normalizer==3.4.2
 14 |     # via requests
 15 | click==8.2.1
 16 |     # via mkdocs
 17 | colorama==0.4.6
 18 |     # via
 19 |     #   griffe
 20 |     #   mkdocs-material
 21 | ghp-import==2.1.0
 22 |     # via mkdocs
 23 | griffe==1.7.3
 24 |     # via mkdocstrings-python
 25 | idna==3.10
 26 |     # via requests
 27 | jinja2==3.1.6
 28 |     # via
 29 |     #   mkdocs
 30 |     #   mkdocs-material
 31 |     #   mkdocstrings
 32 | markdown==3.8
 33 |     # via
 34 |     #   markdown-include
 35 |     #   mkdocs
 36 |     #   mkdocs-autorefs
 37 |     #   mkdocs-material
 38 |     #   mkdocstrings
 39 |     #   pymdown-extensions
 40 | markdown-include==0.8.1
 41 |     # via -r ./requirements.in
 42 | markupsafe==3.0.2
 43 |     # via
 44 |     #   jinja2
 45 |     #   mkdocs
 46 |     #   mkdocs-autorefs
 47 |     #   mkdocstrings
 48 | mergedeep==1.3.4
 49 |     # via
 50 |     #   mkdocs
 51 |     #   mkdocs-get-deps
 52 | mkdocs==1.6.1
 53 |     # via
 54 |     #   -r ./requirements.in
 55 |     #   mkdocs-autorefs
 56 |     #   mkdocs-material
 57 |     #   mkdocstrings
 58 | mkdocs-autorefs==1.4.2
 59 |     # via
 60 |     #   mkdocstrings
 61 |     #   mkdocstrings-python
 62 | mkdocs-get-deps==0.2.0
 63 |     # via mkdocs
 64 | mkdocs-material==9.6.14
 65 |     # via -r ./requirements.in
 66 | mkdocs-material-extensions==1.3.1
 67 |     # via
 68 |     #   -r ./requirements.in
 69 |     #   mkdocs-material
 70 | mkdocstrings[python]==0.29.1
 71 |     # via
 72 |     #   -r ./requirements.in
 73 |     #   mkdocstrings-python
 74 | mkdocstrings-python==1.16.12
 75 |     # via mkdocstrings
 76 | packaging==25.0
 77 |     # via mkdocs
 78 | paginate==0.5.7
 79 |     # via mkdocs-material
 80 | pathspec==0.12.1
 81 |     # via mkdocs
 82 | platformdirs==4.3.8
 83 |     # via mkdocs-get-deps
 84 | pygments==2.19.1
 85 |     # via mkdocs-material
 86 | pymdown-extensions==10.15
 87 |     # via
 88 |     #   -r ./requirements.in
 89 |     #   mkdocs-material
 90 |     #   mkdocstrings
 91 | python-dateutil==2.9.0.post0
 92 |     # via ghp-import
 93 | pyyaml==6.0.2
 94 |     # via
 95 |     #   mkdocs
 96 |     #   mkdocs-get-deps
 97 |     #   pymdown-extensions
 98 |     #   pyyaml-env-tag
 99 | pyyaml-env-tag==1.1
100 |     # via mkdocs
101 | requests==2.32.3
102 |     # via mkdocs-material
103 | six==1.17.0
104 |     # via python-dateutil
105 | typing-extensions==4.14.0
106 |     # via mkdocstrings-python
107 | urllib3==2.4.0
108 |     # via requests
109 | watchdog==6.0.0
110 |     # via mkdocs
111 | 


--------------------------------------------------------------------------------
/docs/static/icons/beavers/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tradewelltech/beavers/ec9979086868589ab82b47ce55fa11cc31b32c16/docs/static/icons/beavers/icon.png


--------------------------------------------------------------------------------
/docs/static/icons/beavers/logo.svg:
--------------------------------------------------------------------------------
 1 | <svg width="383" height="373" viewBox="0 0 383 373" fill="none" xmlns="http://www.w3.org/2000/svg">
 2 | <rect x="41" y="139" width="300" height="145" fill="white"/>
 3 | <path d="M103 271V276H109C107.125 276 103 273.598 103 271Z" fill="#4A62FF"/>
 4 | <path d="M279 271V276H273C274.875 276 279 273.598 279 271Z" fill="#4A62FF"/>
 5 | <path d="M334.5 137L319.5 137C315.951 137 309.469 137 309.469 137L277.939 276H103.987L72.4933 137C72.4933 137 70.5485 137 67 137H47.5057C43.9571 137 41 137 41 137V287.401V293.315V309.848C41 324.835 54.381 337 70.8669 337H311.133C327.619 337 341 324.835 341 309.848V293.315V287.401V137C341 137 338.049 137 334.5 137Z" fill="#4A62FF"/>
 6 | <path d="M184 269V276H177C180.898 276 184 272.898 184 269Z" fill="#4A62FF"/>
 7 | <path d="M200 269V276H207C203.102 276 200 272.898 200 269Z" fill="#4A62FF"/>
 8 | <path d="M191.721 282C191.721 282 184.266 282 184 282V137C184 137 191.593 137 192.52 137C193.447 137 199.975 137 199.975 137L199.988 282C199.988 282 200.242 282 198.378 282H195.981H191.721Z" fill="#4A62FF"/>
 9 | <path d="M341 91.5589C341 95.1074 338.043 98.0646 334.494 98.0646H47.506C43.9574 98.0646 41.0003 95.1074 41.0003 91.5589V66.8669C40.9264 50.381 54.3073 37 70.8672 37H311.133C327.619 37 341 50.381 341 66.8669V91.5589Z" fill="#00A963"/>
10 | <rect x="41" y="93" width="300" height="68" fill="#00A963"/>
11 | <path d="M171 184.025C122 216.128 74.3333 200.284 41 175.016V160H341V175.016C307.667 200.284 262 216.128 213 184.025C179.667 158.757 204.333 158.757 171 184.025Z" fill="#00A963"/>
12 | <path d="M232.42 79C236.251 79 238.659 83.1319 236.77 86.4651L208.436 136.465C207.549 138.032 205.887 139 204.086 139H179.831C178.075 139 176.447 138.079 175.543 136.572L145.543 86.5725C143.544 83.2399 145.944 79 149.831 79H232.42Z" fill="#4A62FF"/>
13 | <path d="M68 90.1139C88.5675 87.0435 98.1592 81.7482 127.772 95.3433" stroke="#4A62FF" stroke-width="10" stroke-linecap="round"/>
14 | <path d="M68 90.1139C88.5675 87.0435 98.1592 81.7482 127.772 95.3433" stroke="#4A62FF" stroke-width="10" stroke-linecap="round"/>
15 | <path d="M74.0001 126.006C96.442 116.024 109.073 110.879 133.089 115.587" stroke="#4A62FF" stroke-width="10" stroke-linecap="round"/>
16 | <path d="M74.0001 126.006C96.442 116.024 109.073 110.879 133.089 115.587" stroke="#4A62FF" stroke-width="10" stroke-linecap="round"/>
17 | <path d="M90.9999 158.923C111.576 141.621 123.195 132.861 145.378 133.566" stroke="#4A62FF" stroke-width="10" stroke-linecap="round"/>
18 | <path d="M90.9999 158.923C111.576 141.621 123.195 132.861 145.378 133.566" stroke="#4A62FF" stroke-width="10" stroke-linecap="round"/>
19 | <path d="M314.378 90.228C293.811 87.1575 284.219 81.8622 254.607 95.4573" stroke="#4A62FF" stroke-width="10" stroke-linecap="round"/>
20 | <path d="M314.378 90.228C293.811 87.1575 284.219 81.8622 254.607 95.4573" stroke="#4A62FF" stroke-width="10" stroke-linecap="round"/>
21 | <path d="M308.378 126.12C285.936 116.137 273.306 110.993 249.29 115.701" stroke="#4A62FF" stroke-width="10" stroke-linecap="round"/>
22 | <path d="M308.378 126.12C285.936 116.137 273.306 110.993 249.29 115.701" stroke="#4A62FF" stroke-width="10" stroke-linecap="round"/>
23 | <path d="M293.378 159.294C272.803 141.992 261.183 133.231 239 133.937" stroke="#4A62FF" stroke-width="10" stroke-linecap="round"/>
24 | <path d="M293.378 159.294C272.803 141.992 261.183 133.231 239 133.937" stroke="#4A62FF" stroke-width="10" stroke-linecap="round"/>
25 | </svg>
26 | 


--------------------------------------------------------------------------------
/examples/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tradewelltech/beavers/ec9979086868589ab82b47ce55fa11cc31b32c16/examples/__init__.py


--------------------------------------------------------------------------------
/examples/advanced_concepts.py:
--------------------------------------------------------------------------------
  1 | # ruff: noqa: E402
  2 | # isort: skip_file
  3 | import pandas as pd
  4 | 
  5 | from beavers import Dag
  6 | 
  7 | dag = Dag()
  8 | 
  9 | # --8<-- [start:propagate_any]
 10 | source_1 = dag.source_stream()
 11 | source_2 = dag.source_stream()
 12 | node = dag.stream(lambda x, y: x + y).map(source_1, source_2)
 13 | 
 14 | source_1.set_stream([1, 2, 3])
 15 | dag.execute()
 16 | assert node.get_value() == [1, 2, 3]  # source_1 updated
 17 | 
 18 | source_2.set_stream([4, 5, 6])
 19 | dag.execute()
 20 | assert node.get_value() == [4, 5, 6]  # source_2 updated
 21 | 
 22 | dag.execute()
 23 | assert node.get_value() == []  # no updates, reset to empty
 24 | # --8<-- [end:propagate_any]
 25 | 
 26 | # --8<-- [start:propagate_cycle_id]
 27 | source_1.set_stream([1, 2, 3])
 28 | dag.execute()
 29 | assert node.get_value() == [1, 2, 3]
 30 | assert node.get_cycle_id() == dag.get_cycle_id()
 31 | 
 32 | dag.execute()
 33 | assert node.get_value() == []
 34 | assert node.get_cycle_id() == dag.get_cycle_id() - 1
 35 | # --8<-- [end:propagate_cycle_id]
 36 | 
 37 | 
 38 | # --8<-- [start:propagate_both]
 39 | source_1.set_stream([1, 2, 3])
 40 | source_2.set_stream([4, 5, 6])
 41 | dag.execute()
 42 | assert node.get_value() == [1, 2, 3, 4, 5, 6]
 43 | assert node.get_cycle_id() == dag.get_cycle_id()
 44 | # --8<-- [end:propagate_both]
 45 | 
 46 | 
 47 | # --8<-- [start:propagate_empty]
 48 | def even_only(values: list[int]) -> list[int]:
 49 |     return [v for v in values if (v % 2) == 0]
 50 | 
 51 | 
 52 | even = dag.stream(even_only).map(source_1)
 53 | 
 54 | source_1.set_stream([1, 2, 3])
 55 | dag.execute()
 56 | assert even.get_value() == [2]
 57 | assert even.get_cycle_id() == dag.get_cycle_id()
 58 | 
 59 | source_1.set_stream([1, 3])
 60 | dag.execute()
 61 | assert even.get_value() == []
 62 | assert even.get_cycle_id() == dag.get_cycle_id() - 1
 63 | # --8<-- [end:propagate_empty]
 64 | 
 65 | 
 66 | # --8<-- [start:now_node]
 67 | def get_delay(timestamps: list[pd.Timestamp], now: pd.Timestamp) -> list[pd.Timedelta]:
 68 |     return [now - timestamp for timestamp in timestamps]
 69 | 
 70 | 
 71 | timestamp_stream = dag.source_stream()
 72 | delay = dag.stream(get_delay).map(timestamp_stream, dag.now())
 73 | 
 74 | timestamp_stream.set_stream(
 75 |     [
 76 |         pd.to_datetime("2022-01-01", utc=True),
 77 |         pd.to_datetime("2022-01-02", utc=True),
 78 |         pd.to_datetime("2022-01-03", utc=True),
 79 |     ]
 80 | )
 81 | dag.execute(timestamp=pd.to_datetime("2022-01-04", utc=True))
 82 | assert delay.get_value() == [
 83 |     pd.to_timedelta("3d"),
 84 |     pd.to_timedelta("2d"),
 85 |     pd.to_timedelta("1d"),
 86 | ]
 87 | 
 88 | # --8<-- [end:now_node]
 89 | 
 90 | # --8<-- [start:timer_manager]
 91 | from beavers import TimerManager
 92 | 
 93 | 
 94 | def get_year(now: pd.Timestamp, timer_manager: TimerManager):
 95 |     if not timer_manager.has_next_timer():
 96 |         timer_manager.set_next_timer(
 97 |             pd.Timestamp(year=now.year + 1, day=1, month=1, tzinfo=now.tzinfo)
 98 |         )
 99 | 
100 |     return now.year
101 | 
102 | 
103 | year = dag.state(get_year).map(dag.now(), dag.timer_manager())
104 | 
105 | dag.execute(pd.to_datetime("2022-01-01", utc=True))
106 | assert year.get_value() == 2022
107 | assert year.get_cycle_id() == dag.get_cycle_id()
108 | 
109 | dag.execute(pd.to_datetime("2022-01-02", utc=True))
110 | assert year.get_value() == 2022
111 | assert year.get_cycle_id() == dag.get_cycle_id() - 1
112 | 
113 | dag.execute(pd.to_datetime("2023-01-02", utc=True))
114 | assert year.get_value() == 2023
115 | assert year.get_cycle_id() == dag.get_cycle_id()
116 | # --8<-- [end:timer_manager]
117 | 
118 | 
119 | # --8<-- [start:silence]
120 | source_1 = dag.source_stream()
121 | source_1_silence = dag.silence(source_1)
122 | source_2 = dag.source_stream()
123 | 
124 | both = dag.stream(lambda x, y: x + y).map(source_1_silence, source_2)
125 | 
126 | source_1.set_stream([1, 2, 3])
127 | source_2.set_stream([4, 5, 6])
128 | dag.execute()
129 | assert both.get_value() == [1, 2, 3, 4, 5, 6]
130 | assert both.get_cycle_id() == dag.get_cycle_id()
131 | 
132 | source_1.set_stream([1, 2, 3])
133 | dag.execute()
134 | assert both.get_value() == []
135 | assert (
136 |     both.get_cycle_id() == dag.get_cycle_id() - 1
137 | )  # No update because source_1 is silent
138 | 
139 | # --8<-- [end:silence]
140 | 
141 | 
142 | # --8<-- [start:cutoff]
143 | class GetMax:
144 |     def __init__(self):
145 |         self._max = 0.0
146 | 
147 |     def __call__(self, values: list[float]) -> float:
148 |         self._max = max(self._max, *values)
149 |         return self._max
150 | 
151 | 
152 | source = dag.source_stream()
153 | get_max = dag.state(GetMax()).map(source)
154 | get_max_cutoff = dag.cutoff(get_max)
155 | 
156 | source.set_stream([1.0, 2.0])
157 | dag.execute()
158 | assert get_max.get_value() == 2.0
159 | assert get_max.get_cycle_id() == dag.get_cycle_id()
160 | assert get_max_cutoff.get_cycle_id() == dag.get_cycle_id()
161 | 
162 | source.set_stream([1.0])
163 | dag.execute()
164 | assert get_max.get_value() == 2.0
165 | assert get_max.get_cycle_id() == dag.get_cycle_id()
166 | assert get_max_cutoff.get_cycle_id() == dag.get_cycle_id() - 1
167 | 
168 | source.set_stream([3.0])
169 | dag.execute()
170 | assert get_max.get_value() == 3.0
171 | assert get_max.get_cycle_id() == dag.get_cycle_id()
172 | assert get_max_cutoff.get_cycle_id() == dag.get_cycle_id()
173 | # --8<-- [end:cutoff]
174 | 
175 | # --8<-- [start:cutoff_custom]
176 | get_max_cutoff_custom = dag.cutoff(get_max, lambda x, y: abs(x - y) < 0.1)
177 | 
178 | source.set_stream([4.0])
179 | dag.execute()
180 | assert get_max.get_value() == 4.0
181 | assert get_max.get_cycle_id() == dag.get_cycle_id()
182 | assert get_max_cutoff_custom.get_cycle_id() == dag.get_cycle_id()
183 | 
184 | 
185 | source.set_stream([4.05])
186 | dag.execute()
187 | assert get_max.get_value() == 4.05
188 | assert get_max.get_cycle_id() == dag.get_cycle_id()
189 | assert get_max_cutoff_custom.get_value() == 4.0
190 | assert get_max_cutoff_custom.get_cycle_id() == dag.get_cycle_id() - 1
191 | 
192 | 
193 | source.set_stream([4.11])
194 | dag.execute()
195 | assert get_max.get_value() == 4.11
196 | assert get_max.get_cycle_id() == dag.get_cycle_id()
197 | assert get_max_cutoff_custom.get_value() == 4.11
198 | assert get_max_cutoff_custom.get_cycle_id() == dag.get_cycle_id()
199 | # --8<-- [end:cutoff_custom]
200 | 


--------------------------------------------------------------------------------
/examples/dag_concepts.py:
--------------------------------------------------------------------------------
  1 | # isort: skip_file
  2 | 
  3 | # --8<-- [start:source_stream]
  4 | from beavers import Dag
  5 | 
  6 | dag = Dag()
  7 | 
  8 | source_stream = dag.source_stream()
  9 | 
 10 | source_stream.set_stream([1, 2, 3])
 11 | dag.execute()
 12 | assert source_stream.get_value() == [1, 2, 3]
 13 | # --8<-- [end:source_stream]
 14 | 
 15 | 
 16 | # --8<-- [start:source_stream_again]
 17 | dag.execute()
 18 | assert source_stream.get_value() == []
 19 | # --8<-- [end:source_stream_again]
 20 | 
 21 | # --8<-- [start:source_stream_name]
 22 | my_source_stream = dag.source_stream(name="my_source")
 23 | dag.get_sources()["my_source"].set_stream([4, 5, 6])
 24 | dag.execute()
 25 | assert my_source_stream.get_value() == [4, 5, 6]
 26 | # --8<-- [end:source_stream_name]
 27 | 
 28 | # --8<-- [start:source_stream_empty]
 29 | dict_source_stream = dag.source_stream(empty_factory=dict)
 30 | dict_source_stream.set_stream({"hello": "world"})
 31 | dag.execute()
 32 | assert dict_source_stream.get_value() == {"hello": "world"}
 33 | dag.execute()
 34 | assert dict_source_stream.get_value() == {}
 35 | # --8<-- [end:source_stream_empty]
 36 | 
 37 | 
 38 | # --8<-- [start:stream_node]
 39 | def multiply_by_2(values: list[int]) -> list[int]:
 40 |     return [v * 2 for v in values]
 41 | 
 42 | 
 43 | stream_node = dag.stream(multiply_by_2).map(source_stream)
 44 | 
 45 | source_stream.set_stream([1, 2, 3])
 46 | dag.execute()
 47 | assert stream_node.get_value() == [2, 4, 6]
 48 | # --8<-- [end:stream_node]
 49 | 
 50 | 
 51 | # --8<-- [start:stream_node_again]
 52 | dag.execute()
 53 | assert stream_node.get_value() == []
 54 | # --8<-- [end:stream_node_again]
 55 | 
 56 | 
 57 | # --8<-- [start:stream_node_empty]
 58 | set_stream_node = dag.stream(set, empty_factory=set).map(source_stream)
 59 | source_stream.set_stream([1, 2, 3, 1, 2, 3])
 60 | dag.execute()
 61 | assert set_stream_node.get_value() == {1, 2, 3}
 62 | dag.execute()
 63 | assert set_stream_node.get_value() == set()
 64 | # --8<-- [end:stream_node_empty]
 65 | 
 66 | 
 67 | # --8<-- [start:stream_node_lambda]
 68 | lambda_stream_node = dag.stream(lambda x: x[:-1]).map(source_stream)
 69 | source_stream.set_stream([1, 2, 3])
 70 | dag.execute()
 71 | assert lambda_stream_node.get_value() == [1, 2]
 72 | # --8<-- [end:stream_node_lambda]
 73 | 
 74 | 
 75 | # --8<-- [start:stream_node_callable]
 76 | class MultiplyBy:
 77 |     def __init__(self, by: int):
 78 |         self.by = by
 79 | 
 80 |     def __call__(self, values: list[int]) -> list[int]:
 81 |         return [v * self.by for v in values]
 82 | 
 83 | 
 84 | callable_stream_node = dag.stream(MultiplyBy(3)).map(source_stream)
 85 | source_stream.set_stream([1, 2, 3])
 86 | dag.execute()
 87 | assert callable_stream_node.get_value() == [3, 6, 9]
 88 | # --8<-- [end:stream_node_callable]
 89 | 
 90 | 
 91 | # --8<-- [start:state_node]
 92 | class Accumulator:
 93 |     def __init__(self):
 94 |         self._count = 0
 95 | 
 96 |     def __call__(self, values: list[int]) -> int:
 97 |         self._count += sum(values)
 98 |         return self._count
 99 | 
100 | 
101 | state_node = dag.state(Accumulator()).map(source_stream)
102 | source_stream.set_stream([1, 2, 3])
103 | dag.execute()
104 | assert state_node.get_value() == 6
105 | dag.execute()
106 | assert state_node.get_value() == 6
107 | # --8<-- [end:state_node]
108 | 
109 | 
110 | # --8<-- [start:const_node]
111 | const_node = dag.const(2)
112 | assert const_node.get_value() == 2
113 | # --8<-- [end:const_node]
114 | 
115 | 
116 | # --8<-- [start:map_positional]
117 | to_append = dag.const([3])
118 | positional_stream = dag.stream(lambda x, y: x + y).map(source_stream, to_append)
119 | source_stream.set_stream([1, 2])
120 | dag.execute()
121 | assert positional_stream.get_value() == [1, 2, 3]
122 | # --8<-- [end:map_positional]
123 | 
124 | 
125 | # --8<-- [start:map_key_word]
126 | key_word = dag.stream(lambda x, y: x + y).map(x=source_stream, y=to_append)
127 | # --8<-- [end:map_key_word]
128 | 


--------------------------------------------------------------------------------
/examples/etfs.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Example of ETF nav (Net Asset Value) calculation
  3 | """
  4 | 
  5 | import dataclasses
  6 | import random
  7 | from operator import attrgetter
  8 | from typing import Callable, Generic, Optional, TypeVar
  9 | 
 10 | import numpy as np
 11 | import pandas as pd
 12 | 
 13 | from beavers import Dag
 14 | 
 15 | K = TypeVar("K")
 16 | V = TypeVar("V")
 17 | 
 18 | 
 19 | @dataclasses.dataclass(frozen=True)
 20 | class PriceRecord:
 21 |     timestamp: pd.Timestamp
 22 |     ticker: str
 23 |     price: Optional[float]
 24 | 
 25 | 
 26 | @dataclasses.dataclass(frozen=True)
 27 | class EtfComposition:
 28 |     timestamp: pd.Timestamp
 29 |     ticker: str
 30 |     weights: dict[str, float]
 31 | 
 32 | 
 33 | class GetLatest(Generic[K, V]):
 34 |     def __init__(self, key_extractor: Callable[[V], K]):
 35 |         self._key_extractor = key_extractor
 36 |         self._latest = {}
 37 | 
 38 |     def __call__(self, updates: list[V]) -> dict[str, V]:
 39 |         for update in updates:
 40 |             self._latest[self._key_extractor(update)] = update
 41 |         return self._latest
 42 | 
 43 | 
 44 | class GetUnique(Generic[K, V]):
 45 |     def __init__(self, key_extractor: Callable[[V], K]):
 46 |         self._key_extractor = key_extractor
 47 | 
 48 |     def __call__(self, updates: list[V]) -> list[str]:
 49 |         return sorted(list({self._key_extractor(update) for update in updates}))
 50 | 
 51 | 
 52 | def create_day_test_prices(date: pd.Timestamp) -> list[PriceRecord]:
 53 |     end = date + pd.offsets.Day()
 54 |     return sorted(
 55 |         [
 56 |             PriceRecord(
 57 |                 timestamp=pd.Timestamp(
 58 |                     np.random.randint(date.value, end.value), unit="ns"
 59 |                 ),
 60 |                 ticker=random.choice(["AAPL", "GOOGL", "MSFT"]),  # nosec B311
 61 |                 price=random.random(),  # nosec B311
 62 |             )
 63 |             for _ in range(random.randint(0, 1000))  # nosec B311
 64 |         ],
 65 |         key=lambda x: x.timestamp,
 66 |     )
 67 | 
 68 | 
 69 | def calculate_nav(
 70 |     composition: EtfComposition, prices: dict[str, PriceRecord]
 71 | ) -> PriceRecord:
 72 |     timestamp = composition.timestamp
 73 |     quotient = 0.0
 74 |     dividend = 0.0
 75 |     error = False
 76 |     for ticker, weight in composition.weights.items():
 77 |         try:
 78 |             price = prices[ticker]
 79 |         except KeyError:
 80 |             error = True
 81 |         else:
 82 |             quotient += price.price * weight
 83 |             dividend += weight
 84 |             timestamp = max(timestamp, price.timestamp)
 85 | 
 86 |     return PriceRecord(
 87 |         timestamp,
 88 |         composition.ticker,
 89 |         None if dividend == 0.0 or error else quotient / dividend,
 90 |     )
 91 | 
 92 | 
 93 | def calculate_navs(
 94 |     updated_tickers: set[str],
 95 |     etf_compositions: dict[str, EtfComposition],
 96 |     prices: dict[str, PriceRecord],
 97 | ) -> list[PriceRecord]:
 98 |     return [
 99 |         calculate_nav(etf_composition, prices)
100 |         for etf_composition in etf_compositions.values()
101 |         if (
102 |             etf_composition.ticker in updated_tickers
103 |             or (updated_tickers & etf_composition.weights.keys())
104 |         )
105 |     ]
106 | 
107 | 
108 | def get_updated_tickers(
109 |     updated_prices: list[PriceRecord],
110 |     updated_etf_compositions: list[EtfComposition],
111 | ) -> set[str]:
112 |     return set(p.ticker for p in updated_prices) | set(
113 |         e.ticker for e in updated_etf_compositions
114 |     )
115 | 
116 | 
117 | def create_dag() -> Dag:
118 |     dag = Dag()
119 |     price_stream = dag.source_stream([], name="price")
120 |     etf_composition_stream = dag.source_stream([], name="etf_composition")
121 |     price_latest = dag.state(GetLatest(attrgetter("ticker"))).map(price_stream)
122 |     etf_composition_latest = dag.state(GetLatest(attrgetter("ticker"))).map(
123 |         etf_composition_stream
124 |     )
125 | 
126 |     updated_tickers = dag.stream(get_updated_tickers, set()).map(
127 |         price_stream, etf_composition_stream
128 |     )
129 |     updated_navs = dag.stream(calculate_navs, []).map(
130 |         updated_tickers, etf_composition_latest, price_latest
131 |     )
132 |     dag.sink("etf_price", updated_navs)
133 |     return dag
134 | 


--------------------------------------------------------------------------------
/examples/kafka_concepts.py:
--------------------------------------------------------------------------------
 1 | # ruff: noqa: E402
 2 | # isort: skip_file
 3 | 
 4 | 
 5 | import confluent_kafka
 6 | import pandas as pd
 7 | 
 8 | # --8<-- [start:dag]
 9 | from beavers import Dag
10 | 
11 | 
12 | class CountWords:
13 |     state = {}
14 | 
15 |     def __call__(self, new_words: list[str]) -> dict[str, int]:
16 |         for word in new_words:
17 |             self.state[word] = self.state.get(word, 0) + 1
18 |         return self.state
19 | 
20 | 
21 | def update_stream(
22 |     state: dict[str, int], updated_words: list[str]
23 | ) -> list[tuple[str, int]]:
24 |     return [(word, state[word]) for word in set(updated_words)]
25 | 
26 | 
27 | dag = Dag()
28 | word_source = dag.source_stream(name="words")
29 | count_state = dag.state(CountWords()).map(word_source)
30 | count_stream = dag.stream(update_stream, []).map(count_state, word_source)
31 | dag.sink("counts", count_stream)
32 | # --8<-- [end:dag]
33 | 
34 | 
35 | # --8<-- [start:deserializer]
36 | def deserialize_messages(messages: list[confluent_kafka.Message]) -> list[str]:
37 |     return [message.value() for message in messages]
38 | 
39 | 
40 | # --8<-- [end:deserializer]
41 | 
42 | # --8<-- [start:kafka_source]
43 | from beavers.kafka import SourceTopic, KafkaDriver
44 | 
45 | source_topic = SourceTopic.from_start_of_day(
46 |     "words", deserialize_messages, pd.to_timedelta("15min"), "UTC"
47 | )
48 | # --8<-- [end:kafka_source]
49 | 
50 | 
51 | # --8<-- [start:serializer]
52 | from beavers.kafka import KafkaProducerMessage
53 | 
54 | 
55 | def serialize_counts(values: list[tuple[str, int]]) -> list[KafkaProducerMessage]:
56 |     return [
57 |         KafkaProducerMessage(
58 |             topic="counts",
59 |             key=word,
60 |             value=str(count),
61 |         )
62 |         for word, count in values
63 |     ]
64 | 
65 | 
66 | # --8<-- [end:serializer]
67 | 
68 | 
69 | # --8<-- [start:kafka_driver]
70 | kafka_driver = KafkaDriver.create(
71 |     dag=dag,
72 |     consumer_config={
73 |         "group.id": "beavers",
74 |         "bootstrap.servers": "localhost:9092",
75 |     },
76 |     producer_config={"bootstrap.servers": "localhost:9092"},
77 |     source_topics={"words": source_topic},
78 |     sink_topics={"counts": serialize_counts},
79 | )
80 | while True:
81 |     kafka_driver.run_cycle()
82 | # --8<-- [end:kafka_driver]
83 | 
84 | 
85 | # Note: you can test it with the following commands
86 | # kafka-topics --create --topic words --bootstrap-server=localhost:9092
87 | # kafka-console-producer --topic words --bootstrap-server=localhost:9092
88 | # kafka-console-consumer --topic=counts --bootstrap-server=localhost:9092 \
89 | #   --property print.key=true
90 | 


--------------------------------------------------------------------------------
/examples/pandas_concepts.py:
--------------------------------------------------------------------------------
 1 | # ruff: noqa: E402
 2 | # isort: skip_file
 3 | 
 4 | # --8<-- [start:business_logic_price]
 5 | import pandas as pd
 6 | 
 7 | price_table = pd.DataFrame.from_records(
 8 |     [
 9 |         {"ticker": "AAPL", "price": 174.79},
10 |         {"ticker": "GOOGL", "price": 130.25},
11 |         {"ticker": "MSFT", "price": 317.01},
12 |         {"ticker": "F", "price": 12.43},
13 |         {"ticker": "GM", "price": 35.28},
14 |     ],
15 | )
16 | 
17 | price_dtypes = price_table.dtypes
18 | 
19 | # --8<-- [end:business_logic_price]
20 | 
21 | # print(price_table.to_pandas().to_markdown(index=False))
22 | 
23 | # --8<-- [start:business_logic_composition]
24 | etf_composition_table = pd.DataFrame.from_records(
25 |     [
26 |         {"etf": "TECH", "ticker": "AAPL", "quantity": 2.0},
27 |         {"etf": "TECH", "ticker": "GOOGL", "quantity": 2.0},
28 |         {"etf": "TECH", "ticker": "MSFT", "quantity": 1.0},
29 |         {"etf": "CARS", "ticker": "F", "quantity": 3.0},
30 |         {"etf": "CARS", "ticker": "GM", "quantity": 2.0},
31 |     ],
32 | )
33 | 
34 | etf_composition_dtypes = etf_composition_table.dtypes
35 | # --8<-- [end:business_logic_composition]
36 | 
37 | # print(etf_composition_table.to_pandas().to_markdown(index=False, ffmt=".1f"))
38 | 
39 | 
40 | # --8<-- [start:business_logic_calculation]
41 | def calculate_etf_value(
42 |     etf_composition: pd.DataFrame, price: pd.DataFrame
43 | ) -> pd.DataFrame:
44 |     return (
45 |         etf_composition.merge(price, left_on="ticker", right_on="ticker", how="left")
46 |         .assign(values=lambda x: x["price"] * x["quantity"])
47 |         .groupby("etf")
48 |         .aggregate([("value", "sum")])
49 |     )
50 | 
51 | 
52 | etf_value_table = calculate_etf_value(
53 |     etf_composition=etf_composition_table, price=price_table
54 | )
55 | # --8<-- [end:business_logic_calculation]
56 | 
57 | 
58 | # print(etf_value_table.to_pandas().to_markdown(index=False, floatfmt=".2f"))
59 | 
60 | # --8<-- [start:dag_source]
61 | from beavers import Dag
62 | 
63 | dag = Dag()
64 | price_source = dag.pd.source_df(dtypes=price_dtypes, name="price")
65 | etf_composition_source = dag.pd.source_df(
66 |     dtypes=etf_composition_dtypes, name="etf_composition"
67 | )
68 | # --8<-- [end:dag_source]
69 | 
70 | # --8<-- [start:dag_state]
71 | price_state = dag.pd.last_by_keys(price_source, ["ticker"])
72 | etf_composition_state = dag.pd.last_by_keys(
73 |     etf_composition_source,
74 |     ["etf", "ticker"],
75 | )
76 | # --8<-- [end:dag_state]
77 | 
78 | 
79 | # --8<-- [start:dag_calculation]
80 | etf_value_state = dag.state(calculate_etf_value).map(
81 |     etf_composition_state,
82 |     price_state,
83 | )
84 | # --8<-- [end:dag_calculation]
85 | 
86 | 
87 | # --8<-- [start:dag_test]
88 | price_source.set_stream(price_table)
89 | etf_composition_source.set_stream(etf_composition_table)
90 | dag.execute()
91 | pd.testing.assert_frame_equal(etf_value_state.get_value(), etf_value_table)
92 | # --8<-- [end:dag_test]
93 | 


--------------------------------------------------------------------------------
/examples/perspective_concepts.py:
--------------------------------------------------------------------------------
 1 | # ruff: noqa: E402
 2 | # isort: skip_file
 3 | 
 4 | from typing import Sequence, Optional
 5 | 
 6 | # --8<-- [start:schema]
 7 | import pyarrow as pa
 8 | 
 9 | 
10 | KEY_VALUE_SCHEMA = pa.schema(
11 |     [
12 |         pa.field("timestamp", pa.timestamp("ms", "UTC")),
13 |         pa.field("topic", pa.string()),
14 |         pa.field("partition", pa.int32()),
15 |         pa.field("offset", pa.int64()),
16 |         pa.field("key", pa.string()),
17 |         pa.field("value", pa.string()),
18 |     ]
19 | )
20 | # --8<-- [end:schema]
21 | 
22 | # --8<-- [start:converter]
23 | import confluent_kafka
24 | 
25 | 
26 | def kafka_messages_to_pyarrow(
27 |     messages: Sequence[confluent_kafka.Message],
28 | ) -> pa.Table:
29 |     return pa.table(
30 |         [
31 |             [m.timestamp()[1] for m in messages],
32 |             [m.topic() for m in messages],
33 |             [m.partition() for m in messages],
34 |             [m.offset() for m in messages],
35 |             [None if m.key() is None else m.key().decode("utf-8") for m in messages],
36 |             [
37 |                 None if m.value() is None else m.value().decode("utf-8")
38 |                 for m in messages
39 |             ],
40 |         ],
41 |         schema=KEY_VALUE_SCHEMA,
42 |     )
43 | 
44 | 
45 | # --8<-- [end:converter]
46 | 
47 | # --8<-- [start:dag]
48 | from beavers import Dag
49 | from beavers.perspective_wrapper import PerspectiveTableDefinition
50 | 
51 | 
52 | def create_test_dag() -> Dag:
53 |     dag = Dag()
54 |     stream = dag.pa.source_table(
55 |         name="key_value",
56 |         schema=KEY_VALUE_SCHEMA,
57 |     )
58 |     dag.psp.to_perspective(
59 |         stream,
60 |         PerspectiveTableDefinition(
61 |             name="key_value",
62 |             index_column="key",
63 |         ),
64 |     )
65 |     return dag
66 | 
67 | 
68 | # --8<-- [end:dag]
69 | 
70 | # --8<-- [start:run]
71 | from beavers.kafka import KafkaDriver, SourceTopic
72 | from beavers.perspective_wrapper import run_web_application
73 | 
74 | 
75 | def run_dashboard(
76 |     topic: str = "key-value",
77 |     port: int = 8082,
78 |     consumer_config: Optional[dict] = None,
79 | ):
80 |     if consumer_config is None:
81 |         consumer_config = {"bootstrap.servers": "localhost:9092", "group.id": "beavers"}
82 | 
83 |     dag = create_test_dag()
84 | 
85 |     kafka_driver = KafkaDriver.create(
86 |         dag=dag,
87 |         producer_config={},
88 |         consumer_config=consumer_config,
89 |         source_topics={
90 |             "key_value": SourceTopic.from_earliest(topic, kafka_messages_to_pyarrow)
91 |         },
92 |         sink_topics={},
93 |     )
94 | 
95 |     run_web_application(kafka_driver=kafka_driver, port=port)
96 | 
97 | 
98 | # --8<-- [end:run]
99 | 


--------------------------------------------------------------------------------
/examples/polars_concepts.py:
--------------------------------------------------------------------------------
  1 | # ruff: noqa: E402
  2 | # isort: skip_file
  3 | 
  4 | import polars.testing
  5 | 
  6 | 
  7 | # --8<-- [start:business_logic_price]
  8 | import polars as pl
  9 | 
 10 | PRICE_SCHEMA = pl.Schema(
 11 |     [
 12 |         ("ticker", pl.String()),
 13 |         ("price", pl.Float64()),
 14 |     ]
 15 | )
 16 | 
 17 | price_table = pl.DataFrame(
 18 |     [
 19 |         {"ticker": "AAPL", "price": 174.79},
 20 |         {"ticker": "GOOGL", "price": 130.25},
 21 |         {"ticker": "MSFT", "price": 317.01},
 22 |         {"ticker": "F", "price": 12.43},
 23 |         {"ticker": "GM", "price": 35.28},
 24 |     ],
 25 |     schema=PRICE_SCHEMA,
 26 | )
 27 | # --8<-- [end:business_logic_price]
 28 | 
 29 | # print(price_table.to_pandas().to_markdown(index=False))
 30 | 
 31 | # --8<-- [start:business_logic_composition]
 32 | ETF_COMPOSITION_SCHEMA = pl.Schema(
 33 |     [
 34 |         ("etf", pl.String()),
 35 |         ("ticker", pl.String()),
 36 |         ("quantity", pl.Float64()),
 37 |     ]
 38 | )
 39 | 
 40 | 
 41 | etf_composition_table = pl.DataFrame(
 42 |     [
 43 |         {"etf": "TECH", "ticker": "AAPL", "quantity": 2.0},
 44 |         {"etf": "TECH", "ticker": "GOOGL", "quantity": 2.0},
 45 |         {"etf": "TECH", "ticker": "MSFT", "quantity": 1.0},
 46 |         {"etf": "CARS", "ticker": "F", "quantity": 3.0},
 47 |         {"etf": "CARS", "ticker": "GM", "quantity": 2.0},
 48 |     ],
 49 |     schema=ETF_COMPOSITION_SCHEMA,
 50 | )
 51 | # --8<-- [end:business_logic_composition]
 52 | 
 53 | # print(etf_composition_table.to_pandas().to_markdown(index=False, floatfmt=".1f"))
 54 | 
 55 | 
 56 | # --8<-- [start:business_logic_calculation]
 57 | ETF_VALUE_SCHEMA = pl.Schema(
 58 |     [
 59 |         ("etf", pl.String()),
 60 |         ("value", pl.Float64()),
 61 |     ]
 62 | )
 63 | 
 64 | 
 65 | def calculate_etf_value(
 66 |     etf_composition: pl.DataFrame, price: pl.DataFrame
 67 | ) -> pl.DataFrame:
 68 |     return (
 69 |         etf_composition.join(price, on=["ticker"])
 70 |         .select(pl.col("etf"), (pl.col("price") * pl.col("quantity")).alias("value"))
 71 |         .group_by("etf", maintain_order=True)
 72 |         .agg(pl.col("value").sum())
 73 |         .cast(ETF_VALUE_SCHEMA)
 74 |     )
 75 | 
 76 | 
 77 | etf_value_table = calculate_etf_value(
 78 |     etf_composition=etf_composition_table, price=price_table
 79 | )
 80 | # --8<-- [end:business_logic_calculation]
 81 | 
 82 | 
 83 | # print(etf_value_table.to_pandas().to_markdown(index=False, floatfmt=".2f"))
 84 | 
 85 | # --8<-- [start:dag_source]
 86 | from beavers import Dag
 87 | 
 88 | dag = Dag()
 89 | price_source = dag.pl.source_table(schema=PRICE_SCHEMA, name="price")
 90 | etf_composition_source = dag.pl.source_table(
 91 |     schema=ETF_COMPOSITION_SCHEMA, name="etf_composition"
 92 | )
 93 | # --8<-- [end:dag_source]
 94 | 
 95 | # --8<-- [start:dag_state]
 96 | price_state = dag.pl.last_by_keys(price_source, ["ticker"])
 97 | etf_composition_state = dag.pl.last_by_keys(
 98 |     etf_composition_source,
 99 |     ["etf", "ticker"],
100 | )
101 | # --8<-- [end:dag_state]
102 | 
103 | 
104 | # --8<-- [start:dag_calculation]
105 | etf_value_state = dag.state(calculate_etf_value).map(
106 |     etf_composition_state,
107 |     price_state,
108 | )
109 | # --8<-- [end:dag_calculation]
110 | 
111 | 
112 | # --8<-- [start:dag_test]
113 | price_source.set_stream(price_table)
114 | etf_composition_source.set_stream(etf_composition_table)
115 | dag.execute()
116 | polars.testing.assert_frame_equal(etf_value_state.get_value(), etf_value_table)
117 | # --8<-- [end:dag_test]
118 | 
119 | 
120 | # --8<-- [start:spurious_update]
121 | new_price_updates = pl.DataFrame(
122 |     [{"ticker": "GME", "price": 123.0}],
123 |     PRICE_SCHEMA,
124 | )
125 | price_source.set_stream(new_price_updates)
126 | dag.execute()
127 | assert len(etf_value_state.get_value()) == 2
128 | assert etf_value_state.get_cycle_id() == dag.get_cycle_id()
129 | # --8<-- [end:spurious_update]
130 | 
131 | # --8<-- [start:updated_because_of_composition]
132 | updated_because_of_composition = dag.pl.get_series(
133 |     etf_composition_source,
134 |     "etf",
135 | )
136 | # --8<-- [end:updated_because_of_composition]
137 | 
138 | 
139 | # --8<-- [start:updated_because_of_price]
140 | def get_etf_to_update_because_of_price(
141 |     etf_composition_state: pl.DataFrame, price_update: pl.DataFrame
142 | ) -> pl.Series:
143 |     updated_tickers = price_update["ticker"].unique()
144 |     return etf_composition_state.filter(pl.col("ticker").is_in(updated_tickers))[
145 |         "etf"
146 |     ].unique()
147 | 
148 | 
149 | updated_because_of_price = dag.stream(
150 |     get_etf_to_update_because_of_price, empty=pl.Series(name="etf", dtype=pl.String())
151 | ).map(etf_composition_state, price_source)
152 | # --8<-- [end:updated_because_of_price]
153 | 
154 | # --8<-- [start:update_all]
155 | stale_etfs = dag.pl.concat_series(
156 |     updated_because_of_price, updated_because_of_composition
157 | )
158 | 
159 | 
160 | def get_composition_for_etfs(
161 |     etf_composition_state: pl.DataFrame,
162 |     etfs: pl.Series,
163 | ) -> pl.DataFrame:
164 |     return etf_composition_state.filter(pl.col("etf").is_in(etfs))
165 | 
166 | 
167 | stale_etf_compositions = dag.pl.table_stream(
168 |     get_composition_for_etfs, ETF_COMPOSITION_SCHEMA
169 | ).map(etf_composition_state, stale_etfs)
170 | 
171 | updated_etf = dag.pl.table_stream(calculate_etf_value, ETF_VALUE_SCHEMA).map(
172 |     stale_etf_compositions, price_state
173 | )
174 | # --8<-- [end:update_all]
175 | 
176 | # --8<-- [start:update_all_test]
177 | price_source.set_stream(
178 |     pl.DataFrame(
179 |         [{"ticker": "MSFT", "price": 317.05}],
180 |         schema=PRICE_SCHEMA,
181 |     )
182 | )
183 | dag.execute()
184 | assert len(updated_etf.get_value()) == 1
185 | # --8<-- [end:update_all_test]
186 | 
187 | # print(updated_etf.get_value().to_pandas().to_markdown(index=False))
188 | 


--------------------------------------------------------------------------------
/examples/pyarrow_concepts.py:
--------------------------------------------------------------------------------
  1 | # ruff: noqa: E402
  2 | # isort: skip_file
  3 | 
  4 | # --8<-- [start:business_logic_price]
  5 | import pyarrow as pa
  6 | 
  7 | PRICE_SCHEMA = pa.schema(
  8 |     [
  9 |         pa.field("ticker", pa.string()),
 10 |         pa.field("price", pa.float64()),
 11 |     ]
 12 | )
 13 | 
 14 | price_table = pa.Table.from_pylist(
 15 |     [
 16 |         {"ticker": "AAPL", "price": 174.79},
 17 |         {"ticker": "GOOGL", "price": 130.25},
 18 |         {"ticker": "MSFT", "price": 317.01},
 19 |         {"ticker": "F", "price": 12.43},
 20 |         {"ticker": "GM", "price": 35.28},
 21 |     ],
 22 |     schema=PRICE_SCHEMA,
 23 | )
 24 | # --8<-- [end:business_logic_price]
 25 | 
 26 | # print(price_table.to_pandas().to_markdown(index=False))
 27 | 
 28 | # --8<-- [start:business_logic_composition]
 29 | ETF_COMPOSITION_SCHEMA = pa.schema(
 30 |     [
 31 |         pa.field("etf", pa.string()),
 32 |         pa.field("ticker", pa.string()),
 33 |         pa.field("quantity", pa.float64()),
 34 |     ]
 35 | )
 36 | 
 37 | 
 38 | etf_composition_table = pa.Table.from_pylist(
 39 |     [
 40 |         {"etf": "TECH", "ticker": "AAPL", "quantity": 2.0},
 41 |         {"etf": "TECH", "ticker": "GOOGL", "quantity": 2.0},
 42 |         {"etf": "TECH", "ticker": "MSFT", "quantity": 1.0},
 43 |         {"etf": "CARS", "ticker": "F", "quantity": 3.0},
 44 |         {"etf": "CARS", "ticker": "GM", "quantity": 2.0},
 45 |     ],
 46 |     schema=ETF_COMPOSITION_SCHEMA,
 47 | )
 48 | # --8<-- [end:business_logic_composition]
 49 | 
 50 | # print(etf_composition_table.to_pandas().to_markdown(index=False, floatfmt=".1f"))
 51 | 
 52 | 
 53 | # --8<-- [start:business_logic_calculation]
 54 | import pyarrow.compute as pc
 55 | 
 56 | ETF_VALUE_SCHEMA = pa.schema(
 57 |     [
 58 |         pa.field("etf", pa.string()),
 59 |         pa.field("value", pa.float64()),
 60 |     ]
 61 | )
 62 | 
 63 | 
 64 | def calculate_etf_value(etf_composition: pa.Table, price: pa.Table) -> pa.Table:
 65 |     positions_with_prices = etf_composition.join(price, keys=["ticker"])
 66 |     values = pc.multiply(
 67 |         positions_with_prices["price"], positions_with_prices["quantity"]
 68 |     )
 69 |     positions_with_prices = positions_with_prices.append_column("value", values)
 70 |     return (
 71 |         positions_with_prices.group_by("etf")
 72 |         .aggregate([("value", "sum")])
 73 |         .rename_columns(ETF_VALUE_SCHEMA.names)
 74 |     )
 75 | 
 76 | 
 77 | etf_value_table = calculate_etf_value(
 78 |     etf_composition=etf_composition_table, price=price_table
 79 | )
 80 | # --8<-- [end:business_logic_calculation]
 81 | 
 82 | 
 83 | # print(etf_value_table.to_pandas().to_markdown(index=False, floatfmt=".2f"))
 84 | 
 85 | # --8<-- [start:dag_source]
 86 | from beavers import Dag
 87 | 
 88 | dag = Dag()
 89 | price_source = dag.pa.source_table(schema=PRICE_SCHEMA, name="price")
 90 | etf_composition_source = dag.pa.source_table(
 91 |     schema=ETF_COMPOSITION_SCHEMA, name="etf_composition"
 92 | )
 93 | # --8<-- [end:dag_source]
 94 | 
 95 | # --8<-- [start:dag_state]
 96 | price_state = dag.pa.last_by_keys(price_source, ["ticker"])
 97 | etf_composition_state = dag.pa.last_by_keys(
 98 |     etf_composition_source,
 99 |     ["etf", "ticker"],
100 | )
101 | # --8<-- [end:dag_state]
102 | 
103 | 
104 | # --8<-- [start:dag_calculation]
105 | etf_value_state = dag.state(calculate_etf_value).map(
106 |     etf_composition_state,
107 |     price_state,
108 | )
109 | # --8<-- [end:dag_calculation]
110 | 
111 | 
112 | # --8<-- [start:dag_test]
113 | price_source.set_stream(price_table)
114 | etf_composition_source.set_stream(etf_composition_table)
115 | dag.execute()
116 | assert etf_value_state.get_value() == etf_value_table
117 | # --8<-- [end:dag_test]
118 | 
119 | 
120 | # --8<-- [start:spurious_update]
121 | new_price_updates = pa.Table.from_pylist(
122 |     [{"ticker": "GME", "price": 123.0}],
123 |     PRICE_SCHEMA,
124 | )
125 | price_source.set_stream(new_price_updates)
126 | dag.execute()
127 | assert len(etf_value_state.get_value()) == 2
128 | assert etf_value_state.get_cycle_id() == dag.get_cycle_id()
129 | # --8<-- [end:spurious_update]
130 | 
131 | # --8<-- [start:updated_because_of_composition]
132 | updated_because_of_composition = dag.pa.get_column(
133 |     etf_composition_source,
134 |     "etf",
135 | )
136 | # --8<-- [end:updated_because_of_composition]
137 | 
138 | 
139 | # --8<-- [start:updated_because_of_price]
140 | def get_etf_to_update_because_of_price(
141 |     etf_composition_state: pa.Table, price_update: pa.Table
142 | ) -> pa.Array:
143 |     updated_tickers = pc.unique(price_update["ticker"])
144 |     return pc.unique(
145 |         etf_composition_state.filter(
146 |             pc.is_in(etf_composition_state["ticker"], updated_tickers)
147 |         )["etf"]
148 |     )
149 | 
150 | 
151 | updated_because_of_price = dag.stream(
152 |     get_etf_to_update_because_of_price, pa.array([], pa.string())
153 | ).map(etf_composition_state, price_source)
154 | # --8<-- [end:updated_because_of_price]
155 | 
156 | # --8<-- [start:update_all]
157 | stale_etfs = dag.pa.concat_arrays(
158 |     updated_because_of_price, updated_because_of_composition
159 | )
160 | 
161 | 
162 | def get_composition_for_etfs(
163 |     etf_composition_state: pa.Table, etfs: pa.Array
164 | ) -> pa.Table:
165 |     return etf_composition_state.filter(
166 |         pc.is_in(
167 |             etf_composition_state["etf"],
168 |             etfs,
169 |         )
170 |     )
171 | 
172 | 
173 | stale_etf_compositions = dag.pa.table_stream(
174 |     get_composition_for_etfs, ETF_COMPOSITION_SCHEMA
175 | ).map(etf_composition_state, stale_etfs)
176 | 
177 | updated_etf = dag.pa.table_stream(calculate_etf_value, ETF_VALUE_SCHEMA).map(
178 |     stale_etf_compositions, price_state
179 | )
180 | # --8<-- [end:update_all]
181 | 
182 | # --8<-- [start:update_all_test]
183 | price_source.set_stream(
184 |     pa.Table.from_pylist(
185 |         [{"ticker": "MSFT", "price": 317.05}],
186 |         PRICE_SCHEMA,
187 |     )
188 | )
189 | dag.execute()
190 | assert len(updated_etf.get_value()) == 1
191 | # --8<-- [end:update_all_test]
192 | 
193 | # print(updated_etf.get_value().to_pandas().to_markdown(index=False))
194 | 


--------------------------------------------------------------------------------
/examples/replay_concepts.py:
--------------------------------------------------------------------------------
  1 | # isort: skip_file
  2 | # ruff: noqa: E402
  3 | import operator
  4 | 
  5 | import beavers
  6 | 
  7 | 
  8 | # --8<-- [start:simple_dag]
  9 | dag = beavers.Dag()
 10 | my_source = dag.source_stream(name="my_source")
 11 | my_sink = dag.sink("my_sink", my_source)
 12 | # --8<-- [end:simple_dag]
 13 | 
 14 | # --8<-- [start:simple_data_class]
 15 | import dataclasses
 16 | import pandas as pd
 17 | 
 18 | 
 19 | @dataclasses.dataclass(frozen=True)
 20 | class Message:
 21 |     timestamp: pd.Timestamp
 22 |     message: str
 23 | 
 24 | 
 25 | # --8<-- [end:simple_data_class]
 26 | 
 27 | # --8<-- [start:manual_replay]
 28 | my_source.set_stream(
 29 |     [
 30 |         Message(pd.Timestamp("2023-01-01T00:00:00Z"), "hello"),
 31 |         Message(pd.Timestamp("2023-01-01T00:00:30Z"), "How are you"),
 32 |     ]
 33 | )
 34 | dag.execute(pd.Timestamp("2023-01-01T00:01:00Z"))
 35 | assert my_sink.get_sink_value() == [
 36 |     Message(pd.Timestamp("2023-01-01T00:00:00Z"), "hello"),
 37 |     Message(pd.Timestamp("2023-01-01T00:00:30Z"), "How are you"),
 38 | ]
 39 | # --8<-- [end:manual_replay]
 40 | 
 41 | 
 42 | # --8<-- [start:data_source]
 43 | import beavers.replay
 44 | 
 45 | 
 46 | @dataclasses.dataclass(frozen=True)
 47 | class MessageDataSource:
 48 |     messages: list[Message]
 49 | 
 50 |     def read_to(self, timestamp: pd.Timestamp) -> list[Message]:
 51 |         results = []
 52 |         while self.messages and self.messages[0].timestamp <= timestamp:
 53 |             results.append(self.messages.pop(0))
 54 |         return results
 55 | 
 56 |     def get_next(self) -> pd.Timestamp:
 57 |         if self.messages:
 58 |             return self.messages[0].timestamp
 59 |         else:
 60 |             return beavers.replay.UTC_MAX
 61 | 
 62 | 
 63 | # --8<-- [end:data_source]
 64 | 
 65 | 
 66 | # --8<-- [start:replay_context]
 67 | from beavers.replay import ReplayContext
 68 | 
 69 | replay_context = ReplayContext(
 70 |     start=pd.to_datetime("2023-01-01T00:00:00Z"),
 71 |     end=pd.to_datetime("2023-01-02T00:00:00Z"),
 72 |     frequency=pd.to_timedelta("1h"),
 73 | )
 74 | # --8<-- [end:replay_context]
 75 | 
 76 | 
 77 | # --8<-- [start:data_source_provider]
 78 | @dataclasses.dataclass(frozen=True)
 79 | class CsvDataSourceProvider:
 80 |     file_name: str
 81 | 
 82 |     def __call__(
 83 |         self, replay_context: ReplayContext
 84 |     ) -> beavers.replay.DataSource[list[Message]]:
 85 |         df = pd.read_csv(self.file_name, parse_dates=["timestamp"])
 86 |         messages = [Message(*row) for _, row in df.iterrows()]
 87 |         messages.sort(key=lambda x: x.timestamp)
 88 |         return MessageDataSource(messages)
 89 | 
 90 | 
 91 | # --8<-- [end:data_source_provider]
 92 | 
 93 | 
 94 | # --8<-- [start:data_sink]
 95 | @dataclasses.dataclass(frozen=True)
 96 | class CsvDataSink:
 97 |     destination: str
 98 |     data: list[Message] = dataclasses.field(default_factory=list)
 99 | 
100 |     def append(self, timestamp: pd.Timestamp, data: list[Message]):
101 |         self.data.extend(data)
102 | 
103 |     def close(self):
104 |         pd.DataFrame([dataclasses.asdict(value) for value in self.data]).to_csv(
105 |             self.destination, index=False
106 |         )
107 | 
108 | 
109 | # --8<-- [end:data_sink]
110 | 
111 | 
112 | # --8<-- [start:data_sink_provider]
113 | @dataclasses.dataclass(frozen=True)
114 | class CsvDataSinkProvider:
115 |     destination: str
116 | 
117 |     def __call__(self, replay_context: ReplayContext) -> CsvDataSink:
118 |         return CsvDataSink(self.destination)
119 | 
120 | 
121 | # --8<-- [end:data_sink_provider]
122 | 
123 | 
124 | # This is just to print the csv file:
125 | file = "data.csv"
126 | df = pd.DataFrame(
127 |     {
128 |         "timestamp": [
129 |             pd.Timestamp("2023-01-01T01:00:00Z"),
130 |             pd.Timestamp("2023-01-01T01:01:00Z"),
131 |         ],
132 |         "message": ["Hello", "How are you"],
133 |     }
134 | )
135 | df.to_csv("input.csv", index=False)
136 | 
137 | df_after = pd.read_csv("input.csv", parse_dates=["timestamp"])
138 | pd.testing.assert_frame_equal(df, df_after)
139 | 
140 | messages = [Message(*row) for _, row in df_after.iterrows()]
141 | 
142 | df2 = pd.DataFrame(
143 |     {
144 |         "timestamp": [
145 |             pd.Timestamp("2023-01-02T01:00:00Z"),
146 |             pd.Timestamp("2023-01-02T01:01:00Z"),
147 |         ],
148 |         "message": ["I'm fine", "Thanks"],
149 |     }
150 | )
151 | df.to_csv("input_2023-01-01.csv", index=False)
152 | df2.to_csv("input_2023-01-02.csv", index=False)
153 | df2[:0].to_csv("input_2023-01-03.csv", index=False)
154 | 
155 | 
156 | # --8<-- [start:replay_driver]
157 | from beavers.replay import ReplayDriver
158 | 
159 | replay_driver = beavers.replay.ReplayDriver.create(
160 |     dag=dag,
161 |     replay_context=replay_context,
162 |     data_source_providers={"my_source": CsvDataSourceProvider("input.csv")},
163 |     data_sink_providers={"my_sink": CsvDataSinkProvider("output.csv")},
164 | )
165 | replay_driver.run()
166 | # --8<-- [end:replay_driver]
167 | 
168 | 
169 | # --8<-- [start:iterator_data_source_adapter]
170 | from beavers.replay import IteratorDataSourceAdapter
171 | 
172 | 
173 | @dataclasses.dataclass(frozen=True)
174 | class PartitionedCsvDataSourceProvider:
175 |     source_format: str
176 | 
177 |     def __call__(self, replay_context: ReplayContext):
178 |         file_names = [
179 |             self.source_format.format(date=date)
180 |             for date in pd.date_range(replay_context.start, replay_context.end)
181 |         ]
182 |         generator = (self._load_one_file(file_name) for file_name in file_names)
183 |         return IteratorDataSourceAdapter(
184 |             sources=generator,
185 |             empty=[],
186 |             concatenator=operator.add,
187 |         )
188 | 
189 |     def _load_one_file(self, file_name: str) -> MessageDataSource:
190 |         return MessageDataSource(
191 |             [
192 |                 Message(*row)
193 |                 for _, row in pd.read_csv(
194 |                     file_name, parse_dates=["timestamp"]
195 |                 ).iterrows()
196 |             ]
197 |         )
198 | 
199 | 
200 | source_provider = PartitionedCsvDataSourceProvider("input_{date:%Y-%m-%d}.csv")
201 | # --8<-- [end:iterator_data_source_adapter]
202 | 
203 | # --8<-- [start:iterator_data_source_adapter_run]
204 | ReplayDriver.create(
205 |     dag=dag,
206 |     replay_context=ReplayContext(
207 |         start=pd.to_datetime("2023-01-01T00:00:00Z"),
208 |         end=pd.to_datetime("2023-01-03T00:00:00Z"),
209 |         frequency=pd.to_timedelta("1h"),
210 |     ),
211 |     data_source_providers={
212 |         "my_source": PartitionedCsvDataSourceProvider("input_{date:%Y-%m-%d}.csv")
213 |     },
214 |     data_sink_providers={"my_sink": CsvDataSinkProvider("output.csv")},
215 | ).run()
216 | 
217 | # --8<-- [end:iterator_data_source_adapter_run]
218 | 


--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
 1 | site_name: Beavers
 2 | site_url: https://beavers.readthedocs.io/en/latest/
 3 | repo_url: https://github.com/tradewelltech/beavers
 4 | theme:
 5 |   name: material
 6 |   features:
 7 |     - navigation.tabs
 8 |     - navigation.tabs.sticky
 9 |     - content.code.annotate
10 |     - content.tabs.link
11 |     - content.code.copy
12 |     - header.autohide
13 |     - navigation.indexes
14 |     - navigation.instant
15 |     - navigation.tracking
16 |     - search.highlight
17 |     - search.share
18 |     - search.suggest
19 |   palette:
20 |     scheme: slate
21 |     accent: green
22 |   logo: static/icons/beavers/logo.svg
23 |   favicon: static/icons/beavers/icon.png
24 | 
25 | plugins:
26 |   - search
27 |   - mkdocstrings:
28 |       default_handler: python
29 |       handlers:
30 |         python:
31 |           options:
32 |             show_source: false
33 | 
34 | markdown_extensions:
35 |   - def_list
36 |   - pymdownx.inlinehilite
37 |   - pymdownx.superfences
38 |   - pymdownx.snippets:
39 |   - pymdownx.emoji
40 |   - pymdownx.highlight
41 |   - attr_list
42 |   - md_in_html
43 | extra:
44 |   project_name: "beavers"
45 | 
46 | 
47 | nav:
48 |   - Home:
49 |     - index.md
50 |   - Concepts:
51 |     - concepts/dag.md
52 |     - concepts/advanced.md
53 |     - concepts/replay.md
54 |     - concepts/kafka.md
55 |     - concepts/pandas.md
56 |     - concepts/pyarrow.md
57 |     - concepts/polars.md
58 |     - concepts/perspective.md
59 |   - API Reference:
60 |     - reference/dag.md
61 |     - reference/replay.md
62 |     - reference/kafka.md
63 |     - reference/pandas_wrapper.md
64 |     - reference/pyarrow_wrapper.md
65 |   - install.md
66 |   - contributing.md
67 |   - faq.md
68 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
  1 | [build-system]
  2 | build-backend = "poetry_dynamic_versioning.backend"
  3 | requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning"]
  4 | 
  5 | [project]
  6 | authors = [
  7 |   {name = "Tradewell Tech", email = "engineering@tradewelltech.co"}
  8 | ]
  9 | classifiers = [
 10 |   "Development Status :: 5 - Production/Stable",
 11 |   "License :: OSI Approved :: Apache Software License",
 12 |   "Natural Language :: English",
 13 |   "Programming Language :: Python :: 3.10",
 14 |   "Programming Language :: Python :: 3.11",
 15 |   "Programming Language :: Python :: 3.12",
 16 |   "Programming Language :: Python :: 3.13"
 17 | ]
 18 | dependencies = [
 19 |   "confluent_kafka>=2.1.1",
 20 |   "pandas",
 21 |   "perspective-python>=3.0.0",
 22 |   "polars",
 23 |   "pyarrow",
 24 |   "tornado"
 25 | ]
 26 | description = "Python stream processing"
 27 | documentation = "https://beavers.readthedocs.io/en/latest/"
 28 | keywords = ["apache-arrow", "streaming", "data"]
 29 | license = "Apache-2.0"
 30 | maintainers = [
 31 |   {name = "0x26res", email = "0x26res@gmail.com"}
 32 | ]
 33 | name = "beavers"
 34 | packages = [
 35 |   {include = "beavers"}
 36 | ]
 37 | readme = "README.md"
 38 | repository = "https://github.com/tradewelltech/beavers"
 39 | requires-python = ">=3.10,<4"
 40 | version = "0.0.0"
 41 | 
 42 | [project.optional-dependencies]
 43 | confluent-kafka = ["confluent-kafka"]
 44 | perspective-python = ["perspective-python", "tornado"]
 45 | polars = ["polars"]
 46 | pyarrow = ["pyarrow"]
 47 | 
 48 | [project.urls]
 49 | "Bug Tracker" = "https://github.com/tradewelltech/beavers/issues"
 50 | "Changelog" = "https://github.com/tradewelltech/beavers/blob/main/CHANGELOG.md"
 51 | 
 52 | [tool.bandit]
 53 | skips = ["B101", "B311"]
 54 | 
 55 | [tool.black]
 56 | exclude = "venv/|tox/"
 57 | target-version = ["py310"]
 58 | 
 59 | [tool.coverage.report]
 60 | # https://github.com/nedbat/coveragepy/issues/831#issuecomment-517778185
 61 | exclude_lines = ["if TYPE_CHECKING:"]
 62 | 
 63 | [tool.coverage.run]
 64 | omit = [
 65 |   # This is hard to test, and the API is about to change a lot
 66 |   "*/beavers/perspective_wrapper.py"
 67 | ]
 68 | 
 69 | [tool.poetry.group.dev.dependencies]
 70 | black = ">=22.10.0"
 71 | click = ">=8.1.7"
 72 | coverage = ">=6.5.0"
 73 | flake8 = ">=5.0.4"
 74 | git-changelog = ">=2.2.0"
 75 | isort = ">=5.10.1"
 76 | mock = "*"
 77 | pip-tools = ">=6.12.1"
 78 | pre-commit = ">=2.20.0"
 79 | pylint = ">=2.15.0"
 80 | pytest = ">=7.2.0"
 81 | pytest-asyncio = "*"
 82 | tabulate = "*"
 83 | 
 84 | [tool.poetry.group.docs]
 85 | optional = true
 86 | 
 87 | [tool.poetry.group.docs.dependencies]
 88 | markdown-include = "*"
 89 | mkdocs = ">=1.5.3"
 90 | mkdocs-material = ">=9.3.2"
 91 | mkdocs-material-extensions = "*"
 92 | mkdocstrings = {version = ">=0.21.2", extras = ["python"]}
 93 | pymdown-extensions = "*"
 94 | tornado = "*"
 95 | 
 96 | [tool.poetry-dynamic-versioning]
 97 | enable = true
 98 | 
 99 | [tool.poetry-dynamic-versioning.substitution]
100 | files = ["*/__init__.py"]
101 | folders = [{path = "beavers"}]
102 | 
103 | [tool.pydocstyle]
104 | ignore = ["D102", "D107", "D203", "D212"]
105 | 
106 | [tool.pytest.ini_options]
107 | asyncio_default_fixture_loop_scope = "function"
108 | asyncio_mode = "auto"
109 | 
110 | [tool.ruff]
111 | line-length = 88
112 | 
113 | [tool.ruff.lint.isort]
114 | known-first-party = ["beavers", "tradewell_proto"]
115 | 


--------------------------------------------------------------------------------
/scripts/README.md:
--------------------------------------------------------------------------------
 1 | # Scripts
 2 | 
 3 | These script are helpful for testing beavers with simple real time applications
 4 | 
 5 | ## Set up
 6 | 
 7 | Use kafka-kraft in docker for kafka:
 8 | 
 9 | ```shell
10 | docker run --name=simple_kafka -p 9092:9092 -d bashj79/kafka-kraft
11 | ```
12 | 
13 | ## `kafka_test_bench`
14 | 
15 | Tests a simple application with kafka, making sure it replays in order.
16 | The "timestamp" of the output messages should be in order across topics when replaying.
17 | 
18 | 
19 | ### Create Topics
20 | 
21 | ```shell
22 | docker exec -it simple_kafka /opt/kafka/bin/kafka-topics.sh --bootstrap-server=localhost:9092 --create --topic=left --partitions=1 --replication-factor=1 
23 | docker exec -it simple_kafka /opt/kafka/bin/kafka-topics.sh --bootstrap-server=localhost:9092 --create --topic=right --partitions=1 --replication-factor=1 
24 | docker exec -it simple_kafka /opt/kafka/bin/kafka-topics.sh --bootstrap-server=localhost:9092 --create --topic=both --partitions=1 --replication-factor=1 
25 | ```
26 | 
27 | ### Run the Beavers job
28 | 
29 | ```shell
30 | python -m scripts.kafka_test_bench --batch-size=2
31 | ```
32 | 
33 | ### Publish data
34 | 
35 | ```shell
36 | docker exec -it simple_kafka /opt/kafka/bin/kafka-console-producer.sh --bootstrap-server=localhost:9092 --topic=left
37 | docker exec -it simple_kafka /opt/kafka/bin/kafka-console-producer.sh --bootstrap-server=localhost:9092 --topic=right
38 | ```
39 | 
40 | ### See out put data 
41 | 
42 | ```shell
43 | docker exec -it simple_kafka /opt/kafka/bin/kafka-console-consumer.sh \
44 |   --bootstrap-server=localhost:9092 \
45 |   --topic=both \
46 |   --property print.key=true \
47 |   --from-beginning
48 | ```
49 | 
50 | ## `perpective_test_bench.py`
51 | 
52 | ### Create the topic
53 | 
54 | ```shell
55 | docker exec -it simple_kafka /opt/kafka/bin/kafka-topics.sh --bootstrap-server=localhost:9092 --create --topic=key-value --partitions=1 --replication-factor=1 
56 | ```
57 | 
58 | ### Publish data
59 | 
60 | ```shell
61 | docker exec -it simple_kafka /opt/kafka/bin/kafka-console-producer.sh \
62 |   --topic=key-value \
63 |   --bootstrap-server=localhost:9092 \
64 |    --property parse.key=true \
65 |    --property key.separator=, 
66 | ```
67 | 


--------------------------------------------------------------------------------
/scripts/kafka_test_bench.py:
--------------------------------------------------------------------------------
  1 | import functools
  2 | import json
  3 | import logging
  4 | from operator import itemgetter
  5 | from typing import Any, Callable, Sequence
  6 | 
  7 | import click
  8 | import confluent_kafka
  9 | import pandas as pd
 10 | 
 11 | from beavers import Dag
 12 | from beavers.kafka import KafkaDriver, KafkaProducerMessage, SourceTopic
 13 | 
 14 | 
 15 | def create_test_dag() -> "Dag":
 16 |     dag = Dag()
 17 |     left_stream = dag.source_stream(name="left")
 18 |     right_stream = dag.source_stream(name="right")
 19 |     both_stream = dag.stream(
 20 |         lambda left, right: sorted(left + right, key=itemgetter("timestamp"))
 21 |     ).map(left_stream, right_stream)
 22 |     dag.sink("both", both_stream)
 23 |     return dag
 24 | 
 25 | 
 26 | def kafka_messages_to_json(
 27 |     messages: Sequence[confluent_kafka.Message],
 28 | ) -> list[dict[str, Any]]:
 29 |     return [
 30 |         {
 31 |             "topic": message.topic(),
 32 |             "partition": message.partition(),
 33 |             "offset": message.offset(),
 34 |             "timestamp": str(
 35 |                 pd.to_datetime(message.timestamp()[1], unit="ms", utc=True)
 36 |             ),
 37 |             "key": message.key().encode("utf-8") if message.key() else None,
 38 |             "value": message.value().decode("utf-8"),
 39 |         }
 40 |         for message in messages
 41 |     ]
 42 | 
 43 | 
 44 | def kafka_message_serializer(
 45 |     payloads: list[dict[str, Any]], topic: str
 46 | ) -> list[KafkaProducerMessage]:
 47 |     return [
 48 |         KafkaProducerMessage(topic, key=None, value=json.dumps(payload))
 49 |         for payload in payloads
 50 |     ]
 51 | 
 52 | 
 53 | SOURCE_TOPIC_CREATORS: dict[str, Callable[[str], SourceTopic]] = {
 54 |     "latest": functools.partial(
 55 |         SourceTopic.from_latest, message_deserializer=kafka_messages_to_json
 56 |     ),
 57 |     "earliest": functools.partial(
 58 |         SourceTopic.from_earliest, message_deserializer=kafka_messages_to_json
 59 |     ),
 60 |     "15min": functools.partial(
 61 |         SourceTopic.from_relative_time,
 62 |         message_deserializer=kafka_messages_to_json,
 63 |         relative_time=pd.to_timedelta("15min"),
 64 |     ),
 65 |     "start-of-day": functools.partial(
 66 |         SourceTopic.from_start_of_day,
 67 |         message_deserializer=kafka_messages_to_json,
 68 |         start_of_day_time=pd.to_timedelta("00:00:00"),
 69 |         start_of_day_timezone="UTC",
 70 |     ),
 71 |     "absolute-time": functools.partial(
 72 |         SourceTopic.from_absolute_time,
 73 |         message_deserializer=kafka_messages_to_json,
 74 |         absolute_time=pd.Timestamp.utcnow().normalize(),
 75 |     ),
 76 |     "committed": functools.partial(
 77 |         SourceTopic.from_committed,
 78 |         message_deserializer=kafka_messages_to_json,
 79 |     ),
 80 | }
 81 | 
 82 | 
 83 | @click.command()
 84 | @click.option("--left-topic", type=click.STRING, default="left")
 85 | @click.option(
 86 |     "--left-offset", type=click.Choice(SOURCE_TOPIC_CREATORS.keys()), default="earliest"
 87 | )
 88 | @click.option("--right-topic", type=click.STRING, default="right")
 89 | @click.option(
 90 |     "--right-offset",
 91 |     type=click.Choice(SOURCE_TOPIC_CREATORS.keys()),
 92 |     default="earliest",
 93 | )
 94 | @click.option("--both-topic", type=click.STRING, default="both")
 95 | @click.option(
 96 |     "--consumer-config",
 97 |     type=json.loads,
 98 |     default='{"bootstrap.servers": "localhost:9092", "group.id": "beavers"}',
 99 | )
100 | @click.option(
101 |     "--producer-config",
102 |     type=json.loads,
103 |     default='{"bootstrap.servers": "localhost:9092"}',
104 | )
105 | @click.option("--batch-size", type=click.INT, default="2")
106 | def kafka_test_bench(
107 |     left_topic: str,
108 |     left_offset: str,
109 |     right_topic: str,
110 |     right_offset: str,
111 |     both_topic: str,
112 |     consumer_config: dict,
113 |     producer_config: dict,
114 |     batch_size: int,
115 | ):
116 |     logging.basicConfig(
117 |         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
118 |         level=logging.DEBUG,
119 |     )
120 | 
121 |     dag = create_test_dag()
122 | 
123 |     driver = KafkaDriver.create(
124 |         dag=dag,
125 |         producer_config=producer_config,
126 |         consumer_config=consumer_config,
127 |         source_topics={
128 |             "left": SOURCE_TOPIC_CREATORS[left_offset](left_topic),
129 |             "right": SOURCE_TOPIC_CREATORS[right_offset](right_topic),
130 |         },
131 |         sink_topics={
132 |             "both": functools.partial(kafka_message_serializer, topic=both_topic)
133 |         },
134 |         batch_size=batch_size,
135 |     )
136 |     while True:
137 |         driver.run_cycle()
138 | 
139 | 
140 | if __name__ == "__main__":
141 |     kafka_test_bench()
142 | 


--------------------------------------------------------------------------------
/scripts/perpective_test_bench.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | import click
 4 | 
 5 | from examples.perspective_concepts import run_dashboard
 6 | 
 7 | 
 8 | @click.command()
 9 | @click.option("--topic", type=click.STRING, default="key-value")
10 | @click.option("--port", type=click.INT, default=8082)
11 | @click.option(
12 |     "--consumer-config",
13 |     type=json.loads,
14 |     default='{"bootstrap.servers": "localhost:9092", "group.id": "beavers"}',
15 | )
16 | def perspective_test_bench(
17 |     topic: str,
18 |     port: int,
19 |     consumer_config: dict,
20 | ):
21 |     run_dashboard(topic=topic, port=port, consumer_config=consumer_config)
22 | 
23 | 
24 | if __name__ == "__main__":
25 |     perspective_test_bench()
26 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | from _pytest.assertion import register_assert_rewrite
2 | 
3 | register_assert_rewrite("beavers.testing")
4 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tradewelltech/beavers/ec9979086868589ab82b47ce55fa11cc31b32c16/tests/conftest.py


--------------------------------------------------------------------------------
/tests/test_docs.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | 
 3 | 
 4 | def test_readme_and_index_same():
 5 |     """Check the README matches the doc home page"""
 6 |     root = Path(__file__).parent.parent
 7 |     readme = root / "README.md"
 8 |     index = root / "docs" / "index.md"
 9 | 
10 |     with readme.open() as fp:
11 |         readme_content = fp.read()
12 | 
13 |     with index.open() as fp:
14 |         # Skip first and last line
15 |         index_content = "".join(fp.readlines()[1:-1])
16 | 
17 |     assert index_content in readme_content
18 | 


--------------------------------------------------------------------------------
/tests/test_etfs.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | from beavers.testing import DagTestBench
 4 | from examples import etfs
 5 | 
 6 | 
 7 | def test_run_dag():
 8 |     dag = etfs.create_dag()
 9 |     bench = DagTestBench(dag)
10 | 
11 |     # Price and ETF come in:
12 |     timestamp_0 = pd.to_datetime("2023-06-10 12:00:00+0000")
13 |     (
14 |         bench.set_source(
15 |             "price",
16 |             [
17 |                 etfs.PriceRecord(timestamp_0, "AAPL", 180.0),
18 |                 etfs.PriceRecord(timestamp_0, "GOOG", 120.0),
19 |             ],
20 |         )
21 |         .set_source(
22 |             "etf_composition",
23 |             [etfs.EtfComposition(timestamp_0, "TECH", {"AAPL": 1.0, "GOOG": 1.5})],
24 |         )
25 |         .execute(timestamp_0)
26 |         .assert_sink_list("etf_price", [etfs.PriceRecord(timestamp_0, "TECH", 144.0)])
27 |     )
28 | 
29 |     # AAPL price update:
30 |     timestamp_1 = timestamp_0 + pd.to_timedelta("1s")
31 |     (
32 |         bench.set_source(
33 |             "price",
34 |             [
35 |                 etfs.PriceRecord(timestamp_1, "AAPL", 200.0),
36 |             ],
37 |         )
38 |         .execute(timestamp_1)
39 |         .assert_sink_list("etf_price", [etfs.PriceRecord(timestamp_1, "TECH", 152.0)])
40 |     )
41 | 
42 |     # Unrelated price updates:
43 |     timestamp_2 = timestamp_0 + pd.to_timedelta("2s")
44 |     (
45 |         bench.set_source(
46 |             "price",
47 |             [
48 |                 etfs.PriceRecord(timestamp_2, "MSFT", 330.0),
49 |             ],
50 |         )
51 |         .execute(timestamp_2)
52 |         .assert_sink_not_updated("etf_price")
53 |     )
54 | 
55 |     # New ETF comes in
56 |     timestamp_3 = timestamp_0 + pd.to_timedelta("4s")
57 |     (
58 |         bench.set_source(
59 |             "etf_composition",
60 |             [etfs.EtfComposition(timestamp_3, "SOFT", {"MSFT": 0.5, "GOOG": 1.0})],
61 |         )
62 |         .execute(timestamp_3)
63 |         .assert_sink_list("etf_price", [etfs.PriceRecord(timestamp_3, "SOFT", 190.0)])
64 |     )
65 | 
66 |     # ETF extends with missing price:
67 |     timestamp_4 = timestamp_0 + pd.to_timedelta("4s")
68 |     (
69 |         bench.set_source(
70 |             "etf_composition",
71 |             [
72 |                 etfs.EtfComposition(
73 |                     timestamp_4, "SOFT", {"MSFT": 0.5, "GOOG": 1.0, "ORCL": 0.5}
74 |                 )
75 |             ],
76 |         )
77 |         .execute(timestamp_4)
78 |         .assert_sink_list("etf_price", [etfs.PriceRecord(timestamp_4, "SOFT", None)])
79 |     )
80 | 


--------------------------------------------------------------------------------
/tests/test_pandas_wrapper.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import pytest
 4 | 
 5 | from beavers import Dag
 6 | from beavers.pandas_wrapper import _empty_df, _get_stream_dtypes, _LastTracker
 7 | 
 8 | DTYPES = pd.Series(
 9 |     {
10 |         "col1": np.int64,
11 |         "col2": np.object_,
12 |     }
13 | )
14 | DF = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
15 | DF_UPDATE = pd.DataFrame({"col1": [1, 2, 2], "col2": ["e", "f", "g"]})
16 | 
17 | 
18 | def test_dtypes():
19 |     df = _empty_df(dtypes=DTYPES)
20 |     pd.testing.assert_series_equal(df.dtypes, DTYPES)
21 | 
22 | 
23 | def test_source_df():
24 |     dag = Dag()
25 |     source = dag.pd.source_df(dtypes=DTYPES)
26 | 
27 |     dag.execute()
28 |     pd.testing.assert_series_equal(source.get_value().dtypes, DTYPES)
29 | 
30 |     source.set_stream(DF)
31 |     dag.execute()
32 |     pd.testing.assert_frame_equal(source.get_value(), DF)
33 | 
34 | 
35 | def test_table_stream():
36 |     dag = Dag()
37 |     source = dag.pd.source_df(dtypes=DTYPES)
38 |     stream = dag.pd.df_stream(lambda x: x[x["col1"] > 1], DTYPES).map(source)
39 | 
40 |     dag.execute()
41 |     pd.testing.assert_frame_equal(stream.get_value(), _empty_df(DTYPES))
42 | 
43 |     source.set_stream(DF)
44 |     dag.execute()
45 |     pd.testing.assert_frame_equal(stream.get_value(), DF[lambda x: x["col1"] > 1])
46 | 
47 | 
48 | def test_get_stream_dtypes():
49 |     dag = Dag()
50 |     source = dag.pd.source_df(dtypes=DTYPES)
51 |     pd.testing.assert_series_equal(_get_stream_dtypes(source), DTYPES)
52 | 
53 |     state = dag.state(lambda: "foo").map()
54 |     with pytest.raises(TypeError, match=r"Argument should be a stream Node"):
55 |         pd.testing.assert_series_equal(_get_stream_dtypes(state), DTYPES)
56 | 
57 |     list_node = dag.source_stream()
58 |     with pytest.raises(TypeError, match=r"Argument should be a Node\[pd.DataFrame\]"):
59 |         pd.testing.assert_series_equal(_get_stream_dtypes(list_node), DTYPES)
60 | 
61 | 
62 | def test_latest_tracker():
63 |     tracker = _LastTracker(["col1"], _empty_df(DTYPES))
64 |     pd.testing.assert_frame_equal(tracker(_empty_df(DTYPES)), _empty_df(DTYPES))
65 |     pd.testing.assert_frame_equal(tracker(DF), DF)
66 |     pd.testing.assert_frame_equal(tracker(DF), DF)
67 | 
68 |     pd.testing.assert_frame_equal(
69 |         tracker(DF_UPDATE), pd.DataFrame({"col1": [3, 1, 2], "col2": ["c", "e", "g"]})
70 |     )
71 | 
72 | 
73 | def test_last_by_keys():
74 |     dag = Dag()
75 |     source = dag.pd.source_df(dtypes=DTYPES)
76 |     latest = dag.pd.last_by_keys(source, ["col1"])
77 | 
78 |     dag.execute()
79 |     pd.testing.assert_frame_equal(latest.get_value(), _empty_df(DTYPES))
80 | 
81 |     source.set_stream(DF)
82 |     dag.execute()
83 |     pd.testing.assert_frame_equal(latest.get_value(), DF)
84 | 
85 |     source.set_stream(DF)
86 |     dag.execute()
87 |     pd.testing.assert_frame_equal(latest.get_value(), DF)
88 | 


--------------------------------------------------------------------------------
/tests/test_perpective_wrapper.py:
--------------------------------------------------------------------------------
  1 | from unittest.mock import MagicMock
  2 | 
  3 | import perspective
  4 | import pyarrow as pa
  5 | import pytest
  6 | from mock import mock
  7 | from perspective import Server
  8 | from tornado.testing import AsyncHTTPTestCase
  9 | from tornado.web import Application
 10 | 
 11 | from beavers import Dag
 12 | from beavers.perspective_wrapper import (
 13 |     DATA_TYPES,
 14 |     PerspectiveTableDefinition,
 15 |     TableRequestHandler,
 16 |     _PerspectiveNode,
 17 |     _table_to_bytes,
 18 |     _TableConfig,
 19 |     _UpdateRunner,
 20 |     perspective_thread,
 21 | )
 22 | 
 23 | PERSPECTIVE_TABLE_SCHEMA = pa.schema(
 24 |     [
 25 |         pa.field("index", pa.string()),
 26 |         pa.field("remove", pa.string()),
 27 |     ]
 28 | )
 29 | PERSPECTIVE_TABLE_DEFINITION = config = PerspectiveTableDefinition(
 30 |     name="name",
 31 |     index_column="index",
 32 |     remove_column="remove",
 33 | )
 34 | 
 35 | 
 36 | def test_config_validate():
 37 |     definition = PERSPECTIVE_TABLE_DEFINITION
 38 | 
 39 |     with pytest.raises(AssertionError, match="index"):
 40 |         definition.validate(pa.schema([]))
 41 | 
 42 |     with pytest.raises(AssertionError, match="remove"):
 43 |         definition.validate(pa.schema([pa.field("index", pa.string())]))
 44 | 
 45 |     definition.validate(PERSPECTIVE_TABLE_SCHEMA)
 46 | 
 47 | 
 48 | def test_to_table_config():
 49 |     assert _TableConfig.from_definition(
 50 |         PERSPECTIVE_TABLE_DEFINITION, PERSPECTIVE_TABLE_SCHEMA
 51 |     ) == _TableConfig(
 52 |         name="name", index="index", columns=["index", "remove"], sort=[], filters=[]
 53 |     )
 54 | 
 55 | 
 56 | def test_table_to_bytes():
 57 |     results = _table_to_bytes(PERSPECTIVE_TABLE_SCHEMA.empty_table())
 58 |     assert isinstance(results, bytes)
 59 |     assert len(results) > 100
 60 | 
 61 | 
 62 | def test_update_runner():
 63 |     mock = MagicMock()
 64 | 
 65 |     runner = _UpdateRunner(mock)
 66 |     runner()
 67 |     assert mock.run_cycle.called
 68 | 
 69 | 
 70 | def test_add_node():
 71 |     dag = Dag()
 72 |     source = dag.pa.source_table(schema=PERSPECTIVE_TABLE_SCHEMA)
 73 |     state = dag.state(lambda x: x).map(source)
 74 |     assert dag.psp.to_perspective(source, PERSPECTIVE_TABLE_DEFINITION) is None
 75 | 
 76 |     with pytest.raises(AssertionError, match="Must provide a schema for state nodes"):
 77 |         dag.psp.to_perspective(state, PERSPECTIVE_TABLE_DEFINITION)
 78 | 
 79 |     dag.psp.to_perspective(
 80 |         state, PERSPECTIVE_TABLE_DEFINITION, schema=PERSPECTIVE_TABLE_SCHEMA
 81 |     )
 82 | 
 83 |     for node in dag._nodes:
 84 |         if isinstance(node._function, _PerspectiveNode):
 85 |             assert node._function.table is None
 86 |             node._function.table = MagicMock()
 87 | 
 88 |     dag.execute()
 89 | 
 90 |     nodes = [
 91 |         n._function for n in dag._nodes if isinstance(n._function, _PerspectiveNode)
 92 |     ]
 93 |     assert len(nodes) == 2
 94 |     assert nodes[0].get_table_config() == _TableConfig(
 95 |         name="name", index="index", columns=["index", "remove"], sort=[], filters=[]
 96 |     )
 97 | 
 98 | 
 99 | class FakeLoop:
100 |     @staticmethod
101 |     def current():
102 |         return FakeLoop()
103 | 
104 |     def add_callback(self):
105 |         pass
106 | 
107 |     def time(self):
108 |         return 0
109 | 
110 |     def add_timeout(self, *args, **kwargs):
111 |         pass
112 | 
113 |     def start(self):
114 |         pass
115 | 
116 | 
117 | @mock.patch("tornado.ioloop.IOLoop", FakeLoop)
118 | def test_perspective_thread():
119 |     manager = Server()
120 | 
121 |     perspective_thread(manager, MagicMock(), [])
122 | 
123 | 
124 | class TestHandler(AsyncHTTPTestCase):
125 |     def get_app(self):
126 |         table_configs = [
127 |             _TableConfig(
128 |                 "table1", index="col_1", columns=["col_1", "col_2"], sort=(), filters=()
129 |             )
130 |         ]
131 |         return Application(
132 |             [
133 |                 (
134 |                     r"/([a-z0-9_]*)",
135 |                     TableRequestHandler,
136 |                     {"table_configs": table_configs},
137 |                 ),
138 |             ]
139 |         )
140 | 
141 |     def test_table(self):
142 |         response = self.fetch("/")
143 |         assert response.code == 200
144 |         assert b'["col_1", "col_2"]' in response.body
145 | 
146 | 
147 | def test_schema():
148 |     server = perspective.Server()
149 |     client = server.new_local_client()
150 | 
151 |     client.table({str(i): v[1] for i, v in enumerate(DATA_TYPES)})
152 | 


--------------------------------------------------------------------------------
/tests/test_polars_wrapper.py:
--------------------------------------------------------------------------------
  1 | import polars as pl
  2 | import polars.testing
  3 | import pytest
  4 | 
  5 | from beavers import Dag
  6 | from beavers.polars_wrapper import _get_stream_schema, _get_stream_dtype
  7 | 
  8 | SIMPLE_SCHEMA = pl.Schema(
  9 |     [
 10 |         ("col1", pl.Int32()),
 11 |         ("col2", pl.Utf8()),
 12 |     ]
 13 | )
 14 | EMPTY_FRAME = pl.DataFrame(schema=SIMPLE_SCHEMA)
 15 | SIMPLE_FRAME = pl.DataFrame([[1, 2, 3], ["a", "b", "c"]], schema=SIMPLE_SCHEMA)
 16 | SIMPLE_FRAME_2 = table = pl.DataFrame([[1, 2], ["d", "e"]], schema=SIMPLE_SCHEMA)
 17 | 
 18 | 
 19 | def test_source_stream():
 20 |     dag = Dag()
 21 | 
 22 |     node = dag.pl.source_table(schema=SIMPLE_SCHEMA)
 23 |     polars.testing.assert_frame_equal(
 24 |         node._empty_factory(), pl.DataFrame(schema=SIMPLE_SCHEMA)
 25 |     )
 26 | 
 27 |     node.set_stream(SIMPLE_FRAME)
 28 |     dag.execute()
 29 |     polars.testing.assert_frame_equal(node.get_value(), SIMPLE_FRAME)
 30 | 
 31 |     dag.execute()
 32 |     polars.testing.assert_frame_equal(
 33 |         node.get_value(), pl.DataFrame(schema=SIMPLE_SCHEMA)
 34 |     )
 35 | 
 36 | 
 37 | def test_table_stream():
 38 |     dag = Dag()
 39 | 
 40 |     schema = pl.Schema([("col1", pl.Int32())])
 41 |     source = dag.pl.source_table(SIMPLE_SCHEMA)
 42 |     node = dag.pl.table_stream(lambda x: x.select(["col1"]), schema).map(source)
 43 | 
 44 |     dag.execute()
 45 |     polars.testing.assert_frame_equal(node.get_value(), pl.DataFrame(schema=schema))
 46 | 
 47 |     source.set_stream(SIMPLE_FRAME)
 48 |     dag.execute()
 49 |     polars.testing.assert_frame_equal(node.get_value(), SIMPLE_FRAME.select(["col1"]))
 50 | 
 51 | 
 52 | def test_filter_stream():
 53 |     dag = Dag()
 54 | 
 55 |     source = dag.pl.source_table(SIMPLE_SCHEMA)
 56 |     filtered = dag.pl.filter_stream(source, pl.col("col1") > 1, pl.col("col2") == "a")
 57 | 
 58 |     dag.execute()
 59 |     polars.testing.assert_frame_equal(
 60 |         filtered.get_value(), pl.DataFrame(schema=SIMPLE_SCHEMA)
 61 |     )
 62 | 
 63 |     source.set_stream(SIMPLE_FRAME)
 64 |     dag.execute()
 65 |     polars.testing.assert_frame_equal(
 66 |         filtered.get_value(),
 67 |         SIMPLE_FRAME.filter(pl.col("col1") > 1, pl.col("col2") == "a"),
 68 |     )
 69 | 
 70 | 
 71 | def test_get_stream_schema():
 72 |     dag = Dag()
 73 | 
 74 |     polars_source = dag.pl.source_table(SIMPLE_SCHEMA)
 75 |     assert _get_stream_schema(polars_source) == SIMPLE_SCHEMA
 76 | 
 77 |     list_source = dag.source_stream(empty=[], name="source1")
 78 |     with pytest.raises(TypeError, match=r"Argument should be a Node\[pl\.DataFrame\]"):
 79 |         _get_stream_schema(list_source)
 80 | 
 81 | 
 82 | def test_last_by():
 83 |     dag = Dag()
 84 | 
 85 |     source = dag.pl.source_table(SIMPLE_SCHEMA)
 86 |     last_by = dag.pl.last_by_keys(source, ["col1"])
 87 | 
 88 |     dag.execute()
 89 |     polars.testing.assert_frame_equal(
 90 |         last_by.get_value(), pl.DataFrame(schema=SIMPLE_SCHEMA)
 91 |     )
 92 | 
 93 |     source.set_stream(SIMPLE_FRAME)
 94 |     dag.execute()
 95 |     polars.testing.assert_frame_equal(last_by.get_value(), SIMPLE_FRAME)
 96 | 
 97 |     source.set_stream(SIMPLE_FRAME_2)
 98 |     dag.execute()
 99 |     assert str(last_by.get_value()) == str(
100 |         pl.DataFrame([[1, 2, 3], ["d", "e", "c"]], schema=SIMPLE_SCHEMA)
101 |     )
102 | 
103 | 
104 | def test_last_by_order_of_column():
105 |     dag = Dag()
106 | 
107 |     source = dag.pl.source_table(SIMPLE_SCHEMA)
108 |     last_by = dag.pl.last_by_keys(source, ["col2"])
109 | 
110 |     dag.execute()
111 |     polars.testing.assert_frame_equal(
112 |         last_by.get_value(), pl.DataFrame(schema=SIMPLE_SCHEMA)
113 |     )
114 | 
115 |     source.set_stream(SIMPLE_FRAME)
116 |     dag.execute()
117 |     polars.testing.assert_frame_equal(last_by.get_value(), SIMPLE_FRAME)
118 | 
119 | 
120 | def test_last_by_bad_keys():
121 |     dag = Dag()
122 |     source = dag.pl.source_table(SIMPLE_SCHEMA)
123 |     with pytest.raises(AssertionError, match="Keys must be strings"):
124 |         dag.pl.last_by_keys(source, [1])
125 | 
126 | 
127 | def test_concat_series():
128 |     dag = Dag()
129 |     left_source = dag.pl.source_table(SIMPLE_SCHEMA)
130 |     left = dag.pl.get_series(left_source, "col1")
131 |     right_source = dag.pl.source_table(SIMPLE_SCHEMA)
132 |     right = dag.pl.get_series(right_source, "col1")
133 | 
134 |     both = dag.pl.concat_series(left, right)
135 | 
136 |     dag.execute()
137 |     polars.testing.assert_series_equal(
138 |         both.get_value(), pl.Series(dtype=pl.Int32(), name="col1")
139 |     )
140 | 
141 |     left_source.set_stream(SIMPLE_FRAME)
142 |     dag.execute()
143 |     polars.testing.assert_series_equal(
144 |         both.get_value(), pl.Series(values=[1, 2, 3], dtype=pl.Int32(), name="col1")
145 |     )
146 | 
147 |     left_source.set_stream(SIMPLE_FRAME)
148 |     right_source.set_stream(SIMPLE_FRAME_2)
149 |     dag.execute()
150 |     polars.testing.assert_series_equal(
151 |         both.get_value(),
152 |         pl.Series(values=[1, 2, 3, 1, 2], dtype=pl.Int32(), name="col1"),
153 |     )
154 | 
155 |     right_source.set_stream(SIMPLE_FRAME_2)
156 |     dag.execute()
157 |     polars.testing.assert_series_equal(
158 |         both.get_value(),
159 |         pl.Series(values=[1, 2], dtype=pl.Int32(), name="col1"),
160 |     )
161 | 
162 | 
163 | def test_concat_series_bad_no_series():
164 |     dag = Dag()
165 |     with pytest.raises(ValueError, match="Must pass at least one series"):
166 |         dag.pl.concat_series()
167 | 
168 | 
169 | def test_concat_series_bad_mismatching_series():
170 |     dag = Dag()
171 |     source = dag.pl.source_table(SIMPLE_SCHEMA)
172 |     left = dag.pl.get_series(source, "col1")
173 |     right = dag.pl.get_series(source, "col2")
174 |     with pytest.raises(TypeError, match="Series type mismatch Int32 vs String"):
175 |         dag.pl.concat_series(left, right)
176 | 
177 | 
178 | def test_get_series():
179 |     dag = Dag()
180 |     left_source = dag.pl.source_table(SIMPLE_SCHEMA)
181 |     left_series = dag.pl.get_series(left_source, "col1")
182 | 
183 |     dag.execute()
184 |     polars.testing.assert_series_equal(left_series.get_value(), EMPTY_FRAME["col1"])
185 | 
186 |     left_source.set_stream(SIMPLE_FRAME)
187 |     dag.execute()
188 |     polars.testing.assert_series_equal(left_series.get_value(), SIMPLE_FRAME["col1"])
189 | 
190 |     dag.execute()
191 |     polars.testing.assert_series_equal(left_series.get_value(), EMPTY_FRAME["col1"])
192 | 
193 | 
194 | def test_get_stream_dtype_bad():
195 |     with pytest.raises(TypeError, match=r"Argument should be a Node\[pl\.Series\]"):
196 |         _get_stream_dtype(Dag().source_stream())
197 | 


--------------------------------------------------------------------------------
/tests/test_pyarrow_kafka.py:
--------------------------------------------------------------------------------
 1 | from beavers.pyarrow_kafka import JsonDeserializer, JsonSerializer
 2 | from tests.test_kafka import mock_kafka_message
 3 | from tests.test_util import TEST_TABLE
 4 | 
 5 | 
 6 | def test_json_deserializer_empty():
 7 |     deserializer = JsonDeserializer(TEST_TABLE.schema)
 8 |     assert deserializer([]) == TEST_TABLE.schema.empty_table()
 9 | 
10 | 
11 | def test_end_to_end():
12 |     deserializer = JsonDeserializer(TEST_TABLE.schema)
13 |     serializer = JsonSerializer("topic-1")
14 |     out_messages = serializer(TEST_TABLE)
15 |     in_messages = [
16 |         mock_kafka_message(topic=m.topic, value=m.value) for m in out_messages
17 |     ]
18 |     assert deserializer(in_messages) == TEST_TABLE
19 | 


--------------------------------------------------------------------------------
/tests/test_pyarrow_replay.py:
--------------------------------------------------------------------------------
 1 | from operator import itemgetter
 2 | 
 3 | import pandas as pd
 4 | import pyarrow as pa
 5 | import pyarrow.csv
 6 | import pytest
 7 | 
 8 | from beavers.dag import UTC_MAX
 9 | from beavers.pyarrow_replay import ArrowTableDataSink, ArrowTableDataSource
10 | from tests.test_util import TEST_TABLE
11 | 
12 | 
13 | def test_arrow_table_data_source():
14 |     source = ArrowTableDataSource(TEST_TABLE, itemgetter("timestamp"))
15 | 
16 |     assert source.get_next() == pd.to_datetime("2023-01-01T00:00:00Z")
17 |     assert source.read_to(pd.to_datetime("2023-01-01T00:00:00Z")) == TEST_TABLE[:1]
18 |     assert source.read_to(pd.to_datetime("2023-01-01T00:00:00Z")) == TEST_TABLE[:0]
19 |     assert source.get_next() == pd.to_datetime("2023-01-02T00:00:00Z")
20 |     assert source.read_to(pd.to_datetime("2023-01-02T00:00:00Z")) == TEST_TABLE[1:]
21 |     assert source.get_next() == UTC_MAX
22 |     assert source.read_to(UTC_MAX) == TEST_TABLE[:0]
23 | 
24 | 
25 | def test_arrow_table_data_source_ooo():
26 |     with pytest.raises(
27 |         AssertionError, match="Timestamp column should be monotonic increasing"
28 |     ):
29 |         ArrowTableDataSource(
30 |             pa.table(
31 |                 {
32 |                     "timestamp": [
33 |                         pd.to_datetime("2023-01-02T00:00:00Z"),
34 |                         pd.to_datetime("2023-01-01T00:00:00Z"),
35 |                     ],
36 |                     "value": [1, 2],
37 |                 }
38 |             ),
39 |             itemgetter("timestamp"),
40 |         )
41 | 
42 | 
43 | def test_arrow_table_data_sink(tmpdir):
44 |     file = tmpdir / "file.csv"
45 |     sink = ArrowTableDataSink(lambda table: pyarrow.csv.write_csv(table, file))
46 | 
47 |     sink.close()
48 |     assert not file.exists()
49 | 
50 |     sink.append(UTC_MAX, TEST_TABLE)
51 |     sink.close()
52 |     assert file.exists()
53 | 


--------------------------------------------------------------------------------
/tests/test_pyarrow_wrapper.py:
--------------------------------------------------------------------------------
  1 | import pyarrow as pa
  2 | import pyarrow.compute as pc
  3 | import pytest
  4 | 
  5 | from beavers import Dag
  6 | from beavers.pyarrow_wrapper import _concat_arrow_arrays, _get_last_by, _LastByKey
  7 | 
  8 | SIMPLE_SCHEMA = pa.schema(
  9 |     [
 10 |         pa.field("col1", pa.int32()),
 11 |         pa.field("col2", pa.string()),
 12 |         pa.field("col3", pa.timestamp("ns", "UTC")),
 13 |     ]
 14 | )
 15 | SIMPLE_TABLE = pa.table([[1, 2, 3], ["a", "b", "c"], [0, 0, 0]], schema=SIMPLE_SCHEMA)
 16 | SIMPLE_TABLE_2 = table = pa.table([[1, 2], ["d", "e"], [0, 0]], schema=SIMPLE_SCHEMA)
 17 | 
 18 | 
 19 | def test_source_stream():
 20 |     dag = Dag()
 21 | 
 22 |     node = dag.pa.source_table(schema=SIMPLE_SCHEMA)
 23 |     assert node._empty_factory() == SIMPLE_SCHEMA.empty_table()
 24 | 
 25 |     node.set_stream(SIMPLE_TABLE)
 26 |     dag.execute()
 27 |     assert node.get_value() == SIMPLE_TABLE
 28 | 
 29 |     dag.execute()
 30 |     assert node.get_value() == SIMPLE_SCHEMA.empty_table()
 31 | 
 32 | 
 33 | def test_source_stream_name():
 34 |     dag = Dag()
 35 | 
 36 |     node = dag.pa.source_table(schema=SIMPLE_SCHEMA, name="source_1")
 37 |     assert dag.get_sources() == {"source_1": node}
 38 | 
 39 | 
 40 | def test_table_stream():
 41 |     dag = Dag()
 42 | 
 43 |     source = dag.pa.source_table(SIMPLE_SCHEMA)
 44 |     node = dag.pa.table_stream(
 45 |         lambda x: x.select(["col1"]),
 46 |         pa.schema([pa.field("col1", pa.int32())]),
 47 |     ).map(source)
 48 | 
 49 |     source.set_stream(SIMPLE_TABLE)
 50 |     dag.execute()
 51 |     assert node.get_value() == SIMPLE_TABLE.select(["col1"])
 52 | 
 53 | 
 54 | def test_filter_stream():
 55 |     dag = Dag()
 56 | 
 57 |     source = dag.pa.source_table(SIMPLE_SCHEMA)
 58 |     node = dag.pa.filter_stream(
 59 |         lambda x, y: pc.equal(x["col1"], y), source, dag.const(1)
 60 |     )
 61 |     SIMPLE_SCHEMA.empty_table()
 62 |     source.set_stream(SIMPLE_TABLE)
 63 |     dag.execute()
 64 |     assert node.get_value() == SIMPLE_TABLE[0:1]
 65 | 
 66 |     dag.execute()
 67 |     assert node.get_value() == SIMPLE_SCHEMA.empty_table()
 68 | 
 69 | 
 70 | def _predicate(table: pa.Table) -> pa.Array:
 71 |     return pc.equal(table["col1"], 1)
 72 | 
 73 | 
 74 | def test_filter_stream_bad_arguments():
 75 |     dag = Dag()
 76 | 
 77 |     state_node = dag.state(lambda: "HELLO").map()
 78 |     with pytest.raises(TypeError, match=r"Argument should be a stream Node"):
 79 |         dag.pa.filter_stream(_predicate, state_node)
 80 | 
 81 |     list_stream_node = dag.source_stream()
 82 |     with pytest.raises(TypeError, match=r"Argument should be a Node\[pa\.Table\]"):
 83 |         dag.pa.filter_stream(_predicate, list_stream_node)
 84 | 
 85 | 
 86 | def test_learn_expression_type():
 87 |     field = pc.field("col1")
 88 |     assert isinstance(field, pc.Expression)
 89 |     greater_with_pc = pc.greater(field, 2)
 90 |     assert SIMPLE_TABLE.filter(greater_with_pc) == SIMPLE_TABLE[-1:]
 91 |     greater_with_python = field > 2
 92 |     assert SIMPLE_TABLE.filter(greater_with_python) == SIMPLE_TABLE[-1:]
 93 |     with pytest.raises(TypeError):
 94 |         pc.min(SIMPLE_TABLE, field)
 95 | 
 96 | 
 97 | def test_group_by_last():
 98 |     with pytest.raises(
 99 |         pa.ArrowNotImplementedError,
100 |         match="Using ordered aggregator"
101 |         " in multiple threaded execution is not supported",
102 |     ):
103 |         SIMPLE_TABLE.group_by("col1").aggregate([("col2", "last")])
104 | 
105 | 
106 | def test_get_latest():
107 |     table = pa.table(
108 |         [[1, 2, 3, 1, 2], ["a", "b", "c", "d", "e"], [0] * 5], schema=SIMPLE_SCHEMA
109 |     )
110 |     assert _get_last_by(table, ["col1"]) == table[2:]
111 |     assert _get_last_by(table, ["col1", "col2"]) == table
112 | 
113 | 
114 | def test_get_last_by_batches():
115 |     table = pa.concat_tables([SIMPLE_TABLE, SIMPLE_TABLE])
116 |     assert _get_last_by(table, ["col1"]) == SIMPLE_TABLE
117 | 
118 | 
119 | def test_get_last_by_all_columns():
120 |     table = pa.concat_tables([SIMPLE_TABLE, SIMPLE_TABLE])
121 |     assert _get_last_by(table, ["col1", "col2"]) == SIMPLE_TABLE
122 | 
123 | 
124 | def test_latest_tracker():
125 |     tracker = _LastByKey(["col1"], SIMPLE_SCHEMA.empty_table())
126 | 
127 |     assert tracker(SIMPLE_SCHEMA.empty_table()) == SIMPLE_SCHEMA.empty_table()
128 |     assert tracker(SIMPLE_TABLE) == SIMPLE_TABLE
129 |     assert tracker(SIMPLE_TABLE_2) == pa.table(
130 |         [[3, 1, 2], ["c", "d", "e"], [0] * 3], schema=SIMPLE_SCHEMA
131 |     )
132 | 
133 | 
134 | def test_last_by_keys():
135 |     dag = Dag()
136 |     source = dag.pa.source_table(SIMPLE_SCHEMA)
137 |     latest = dag.pa.last_by_keys(source, ["col1"])
138 | 
139 |     dag.execute()
140 |     assert latest.get_value() == SIMPLE_SCHEMA.empty_table()
141 | 
142 |     source.set_stream(SIMPLE_TABLE)
143 |     dag.execute()
144 |     assert latest.get_value() == SIMPLE_TABLE
145 | 
146 |     dag.execute()
147 |     assert latest.get_value() == SIMPLE_TABLE
148 | 
149 |     source.set_stream(SIMPLE_TABLE_2)
150 |     dag.execute()
151 |     assert latest.get_value() == pa.table(
152 |         [[3, 1, 2], ["c", "d", "e"], [0] * 3], schema=SIMPLE_SCHEMA
153 |     )
154 | 
155 | 
156 | def test_last_by_keys_bad():
157 |     dag = Dag()
158 | 
159 |     with pytest.raises(
160 |         AttributeError, match=r"'str' object has no attribute '_get_empty'"
161 |     ):
162 |         dag.pa.last_by_keys("Not a node", ["col1"])
163 |     with pytest.raises(TypeError, match=r"Argument should be a Node\[pa.Table\]"):
164 |         dag.pa.last_by_keys(dag.source_stream(), ["col1"])
165 |     with pytest.raises(TypeError, match=r"Argument should be a stream Node"):
166 |         dag.pa.last_by_keys(dag.state(lambda: None).map(), ["col1"])
167 | 
168 |     source = dag.pa.source_table(SIMPLE_SCHEMA)
169 | 
170 |     with pytest.raises(TypeError, match="123"):
171 |         dag.pa.last_by_keys(source, 123)
172 |     with pytest.raises(TypeError, match="123"):
173 |         dag.pa.last_by_keys(source, [123])
174 |     with pytest.raises(
175 |         TypeError, match=r"field colz no in schema: \['col1', 'col2', 'col3'\]"
176 |     ):
177 |         dag.pa.last_by_keys(source, ["colz"])
178 | 
179 | 
180 | def test_get_column():
181 |     dag = Dag()
182 |     source = dag.pa.source_table(SIMPLE_SCHEMA)
183 |     array = dag.pa.get_column(source, "col1")
184 | 
185 |     dag.execute()
186 |     assert array.get_value() == pa.chunked_array([pa.array([], pa.int32())])
187 | 
188 |     source.set_stream(SIMPLE_TABLE)
189 |     dag.execute()
190 |     assert array.get_value() == SIMPLE_TABLE["col1"]
191 | 
192 |     dag.execute()
193 |     assert array.get_value() == pa.chunked_array([pa.array([], pa.int32())])
194 | 
195 |     source.set_stream(SIMPLE_TABLE_2)
196 |     dag.execute()
197 |     assert array.get_value() == SIMPLE_TABLE_2["col1"]
198 | 
199 | 
200 | def test_get_column_bad():
201 |     dag = Dag()
202 | 
203 |     with pytest.raises(
204 |         AttributeError, match=r"'str' object has no attribute '_get_empty'"
205 |     ):
206 |         dag.pa.get_column("Not a node", "col1")
207 |     with pytest.raises(TypeError, match=r"Argument should be a Node\[pa.Table\]"):
208 |         dag.pa.get_column(dag.source_stream(), "col1")
209 |     with pytest.raises(TypeError, match=r"Argument should be a stream Node"):
210 |         dag.pa.get_column(dag.state(lambda: None).map(), "col1")
211 | 
212 |     source = dag.pa.source_table(SIMPLE_SCHEMA)
213 | 
214 |     with pytest.raises(TypeError, match="123"):
215 |         dag.pa.get_column(source, 123)
216 |     with pytest.raises(
217 |         TypeError, match=r"field colz no in schema: \['col1', 'col2', 'col3'\]"
218 |     ):
219 |         dag.pa.get_column(source, "colz")
220 | 
221 | 
222 | def test_concat_arrays_ok():
223 |     dag = Dag()
224 |     left = dag.source_stream(empty=pa.array([], pa.string()))
225 |     right = dag.source_stream(empty=pa.array([], pa.string()))
226 |     both = dag.pa.concat_arrays(left, right)
227 | 
228 |     dag.execute()
229 |     assert both.get_value() == pa.chunked_array([], pa.string())
230 | 
231 |     left.set_stream(pa.array(["a", "b"]))
232 |     right.set_stream(pa.array(["c"]))
233 |     dag.execute()
234 |     assert both.get_value() == pa.chunked_array(["a", "b", "c"], pa.string())
235 | 
236 |     dag.execute()
237 |     assert both.get_value() == pa.chunked_array([], pa.string())
238 | 
239 | 
240 | def test_concat_arrays_bad():
241 |     dag = Dag()
242 | 
243 |     with pytest.raises(ValueError, match=r"Must pass at least one array"):
244 |         dag.pa.concat_arrays()
245 |     with pytest.raises(TypeError, match=r"Argument should be a stream Node"):
246 |         dag.pa.concat_arrays(dag.state(lambda: None).map())
247 |     with pytest.raises(TypeError, match=r"Argument should be a Node\[pa\.Array\]"):
248 |         dag.pa.concat_arrays(dag.source_stream())
249 |     with pytest.raises(TypeError, match=r"Array type mismatch string vs int32"):
250 |         dag.pa.concat_arrays(
251 |             dag.source_stream(empty=pa.array([], pa.string())),
252 |             dag.source_stream(empty=pa.array([], pa.int32())),
253 |         )
254 | 
255 | 
256 | def test_concat_arrow_arrays_mixed():
257 |     assert _concat_arrow_arrays(
258 |         [
259 |             pa.array([], pa.string()),
260 |             pa.chunked_array(pa.array([], pa.string())),
261 |         ]
262 |     ) == pa.chunked_array([], pa.string())
263 | 
264 | 
265 | def test_concat_arrow_arrays_bad():
266 |     with pytest.raises(TypeError, match="123"):
267 |         _concat_arrow_arrays([123])
268 | 


--------------------------------------------------------------------------------
/tests/test_replay.py:
--------------------------------------------------------------------------------
  1 | import dataclasses
  2 | from operator import attrgetter
  3 | 
  4 | import pandas as pd
  5 | import pytest
  6 | 
  7 | from beavers.dag import UTC_MAX, Dag
  8 | from beavers.replay import (
  9 |     DataSource,
 10 |     IteratorDataSourceAdapter,
 11 |     NoOpDataSinkProvider,
 12 |     ReplayContext,
 13 |     ReplayDriver,
 14 |     T,
 15 |     _create_sinks,
 16 |     _create_sources,
 17 | )
 18 | from tests.test_util import ListDataSink, ListDataSource
 19 | 
 20 | 
 21 | @dataclasses.dataclass(frozen=True)
 22 | class Word:
 23 |     timestamp: pd.Timestamp
 24 |     value: str
 25 | 
 26 | 
 27 | @pytest.fixture
 28 | def replay_context() -> ReplayContext:
 29 |     return ReplayContext(
 30 |         pd.to_datetime("2023-01-01", utc=True),
 31 |         pd.to_datetime("2023-01-02", utc=True),
 32 |         pd.to_timedelta("1min"),
 33 |     )
 34 | 
 35 | 
 36 | def create_data_source(context: ReplayContext):
 37 |     return ListDataSource(
 38 |         [Word(context.start + pd.Timedelta(minutes=i), "hello") for i in range(10)],
 39 |         attrgetter("timestamp"),
 40 |     )
 41 | 
 42 | 
 43 | def test_create_sources_mismatch(replay_context: ReplayContext):
 44 |     with pytest.raises(
 45 |         ValueError,
 46 |         match=r"Source node and DataSource names don't match: \[\]  vs \['words'\]",
 47 |     ):
 48 |         _create_sources(Dag(), replay_context, {"words": create_data_source})
 49 | 
 50 | 
 51 | def test_create_sources_match(replay_context: ReplayContext):
 52 |     dag = Dag()
 53 |     node = dag.source_stream(empty=[], name="words")
 54 | 
 55 |     results = _create_sources(dag, replay_context, {"words": create_data_source})
 56 |     assert len(results) == 1
 57 |     assert results[0].name == "words"
 58 |     assert results[0].node == node
 59 |     assert isinstance(results[0].data_source, ListDataSource)
 60 | 
 61 | 
 62 | def test_create_sinks_mismatch(replay_context: ReplayContext):
 63 |     sink = ListDataSink()
 64 |     with pytest.raises(
 65 |         ValueError,
 66 |         match=r"Sink node and DataSink names don't match: \[\]  vs \['words'\]",
 67 |     ):
 68 |         _create_sinks(Dag(), replay_context, {"words": lambda _: sink})
 69 | 
 70 | 
 71 | def test_create_sinks_match(replay_context: ReplayContext):
 72 |     sink = ListDataSink()
 73 |     dag = Dag()
 74 |     source_node = dag.source_stream(empty=[], name="words")
 75 |     sink_node = dag.sink("words", source_node)
 76 |     results = _create_sinks(dag, replay_context, {"words": lambda _: sink})
 77 |     assert len(results) == 1
 78 |     assert results[0].name == "words"
 79 |     assert results[0].nodes == [sink_node]
 80 |     assert results[0].data_sink is sink
 81 | 
 82 | 
 83 | def test_pass_through_replay(replay_context: ReplayContext):
 84 |     source = create_data_source(replay_context)
 85 |     sink = ListDataSink()
 86 |     dag = Dag()
 87 |     source_node = dag.source_stream(empty=[], name="words")
 88 |     dag.sink("words", source_node)
 89 | 
 90 |     driver = ReplayDriver.create(
 91 |         dag,
 92 |         replay_context,
 93 |         {"words": lambda _: source},
 94 |         {"words": lambda _: sink},
 95 |     )
 96 |     driver.run()
 97 |     assert sink._data == source._data
 98 | 
 99 | 
100 | def test_no_op_through_replay(replay_context: ReplayContext):
101 |     """
102 |     Test a corner case of the driver were a sink did not update during a cycle
103 |     """
104 |     sink = ListDataSink()
105 |     dag = Dag()
106 |     dag.source_stream(empty=[], name="words_1")
107 |     source_2 = dag.source_stream(empty=[], name="words_2")
108 |     dag.sink("words", source_2)
109 | 
110 |     driver = ReplayDriver.create(
111 |         dag,
112 |         replay_context,
113 |         {
114 |             "words_1": create_data_source,
115 |             "words_2": lambda _: ListDataSource([], attrgetter("timestamp")),
116 |         },
117 |         {"words": lambda _: sink},
118 |     )
119 |     driver.run()
120 |     assert sink._data == []
121 | 
122 | 
123 | def create_data_groups() -> list[list[Word]]:
124 |     timestamp = pd.to_datetime("2022-01-01", utc=True)
125 |     return [
126 |         [
127 |             Word(timestamp + pd.Timedelta(minutes=0), "hello"),
128 |             Word(timestamp + pd.Timedelta(minutes=1), "world"),
129 |         ],
130 |         [
131 |             Word(timestamp + pd.Timedelta(minutes=2), "hello"),
132 |             Word(timestamp + pd.Timedelta(minutes=2), "world"),
133 |         ],
134 |         [
135 |             Word(timestamp + pd.Timedelta(minutes=3), "hello"),
136 |             Word(timestamp + pd.Timedelta(minutes=3), "world"),
137 |             Word(timestamp + pd.Timedelta(minutes=3), "world"),
138 |             Word(timestamp + pd.Timedelta(minutes=4), "world"),
139 |         ],
140 |         [],
141 |         [
142 |             Word(timestamp + pd.Timedelta(minutes=5), "hello"),
143 |             Word(timestamp + pd.Timedelta(minutes=5), "world"),
144 |         ],
145 |     ]
146 | 
147 | 
148 | def create_adapter(data_groups: list[list[Word]]) -> DataSource[list[Word]]:
149 |     return IteratorDataSourceAdapter(
150 |         (
151 |             ListDataSource(data_group, attrgetter("timestamp"))
152 |             for data_group in data_groups
153 |         ),
154 |         [],
155 |         lambda left, right: left + right,
156 |     )
157 | 
158 | 
159 | def test_iterator_data_source_adapter_run_all():
160 |     data_groups = create_data_groups()
161 |     adapter = create_adapter(data_groups)
162 |     assert adapter.read_to(UTC_MAX) == [
163 |         word for data_group in data_groups for word in data_group
164 |     ]
165 |     assert adapter.read_to(UTC_MAX) == []
166 | 
167 | 
168 | def test_iterator_data_source_adapter_run_one_by_one():
169 |     timestamp = pd.to_datetime("2022-01-01", utc=True)
170 |     data_groups = create_data_groups()
171 |     adapter = create_adapter(data_groups)
172 |     assert adapter.get_next() == timestamp
173 |     assert adapter.read_to(timestamp) == [data_groups[0][0]]
174 |     assert adapter.read_to(timestamp) == []
175 |     assert adapter.read_to(timestamp + pd.Timedelta(minutes=1)) == [data_groups[0][1]]
176 |     assert adapter.read_to(timestamp + pd.Timedelta(minutes=1)) == []
177 |     assert (
178 |         adapter.read_to(timestamp + pd.Timedelta(minutes=3))
179 |         == data_groups[1] + data_groups[2][:-1]
180 |     )
181 |     assert adapter.read_to(timestamp + pd.Timedelta(minutes=4)) == data_groups[2][-1:]
182 |     assert adapter.read_to(timestamp + pd.Timedelta(minutes=5)) == data_groups[4]
183 |     assert adapter.read_to(timestamp + pd.Timedelta(minutes=6)) == []
184 |     assert adapter.read_to(UTC_MAX) == []
185 | 
186 | 
187 | def test_iterator_data_source_empty():
188 |     adapter = create_adapter([])
189 |     assert adapter.get_next() == UTC_MAX
190 |     assert adapter.read_to(UTC_MAX) == []
191 |     assert adapter.get_next() == UTC_MAX
192 |     assert adapter.read_to(UTC_MAX) == []
193 | 
194 | 
195 | def test_iterator_data_source_all_empty():
196 |     adapter = create_adapter([[], []])
197 |     assert adapter.get_next() == UTC_MAX
198 |     assert adapter.read_to(UTC_MAX) == []
199 |     assert adapter.get_next() == UTC_MAX
200 |     assert adapter.read_to(UTC_MAX) == []
201 | 
202 | 
203 | class CornerCaseTester(DataSource[list[Word]]):
204 |     def __init__(self, timestamp: pd.Timestamp):
205 |         self._timestamp = timestamp
206 |         self._read = False
207 | 
208 |     def read_to(self, timestamp: pd.Timestamp) -> list[T]:
209 |         self._read = True
210 |         return []
211 | 
212 |     def get_next(self) -> pd.Timestamp:
213 |         if self._read:
214 |             return UTC_MAX
215 |         else:
216 |             return self._timestamp
217 | 
218 | 
219 | def test_iterator_data_source_cutoff():
220 |     """
221 |     Test a tricky corner case were the underlying DataSource of
222 |      IteratorDataSourceAdapter doesn't behave as expected.
223 |     """
224 |     timestamp = pd.to_datetime("2022-01-01", utc=True)
225 |     adapter = IteratorDataSourceAdapter(
226 |         (
227 |             source
228 |             for source in [
229 |                 CornerCaseTester(timestamp + pd.Timedelta(minutes=1)),
230 |                 ListDataSource(
231 |                     [Word(timestamp + pd.Timedelta(minutes=2), "hello")],
232 |                     attrgetter("timestamp"),
233 |                 ),
234 |             ]
235 |         ),
236 |         [],
237 |         lambda left, right: left + right,
238 |     )
239 | 
240 |     assert adapter.read_to(UTC_MAX) == [
241 |         Word(
242 |             timestamp=pd.Timestamp("2022-01-01 00:02:00+0000", tz="UTC"), value="hello"
243 |         )
244 |     ]
245 | 
246 | 
247 | def test_replay_read_sources():
248 |     source = ListDataSource(
249 |         [
250 |             Word(pd.to_datetime("2023-01-01 00:01:00Z"), "1"),
251 |             Word(pd.to_datetime("2023-01-01 00:02:00Z"), "2"),
252 |             Word(pd.to_datetime("2023-01-01 12:01:00Z"), "3"),
253 |             Word(pd.to_datetime("2023-01-01 12:04:00Z"), "4"),
254 |         ],
255 |         attrgetter("timestamp"),
256 |     )
257 | 
258 |     dag = Dag()
259 |     dag.source_stream([], name="hello")
260 |     driver = ReplayDriver.create(
261 |         dag=dag,
262 |         replay_context=ReplayContext(
263 |             pd.to_datetime("2023-01-01", utc=True),
264 |             pd.to_datetime("2023-01-02", utc=True) - pd.to_timedelta("1ns"),
265 |             pd.to_timedelta("12h"),
266 |         ),
267 |         data_source_providers={"hello": lambda x: source},
268 |         data_sink_providers={},
269 |     )
270 | 
271 |     records, timestamp = driver.read_sources()
272 |     assert timestamp == pd.to_datetime("2023-01-01 00:01:00Z", utc=True)
273 |     assert records == 0
274 | 
275 | 
276 | def test_replay_run_cycle():
277 |     source = ListDataSource(
278 |         [
279 |             Word(pd.to_datetime("2023-01-01 00:01:00Z"), "1"),
280 |             Word(pd.to_datetime("2023-01-01 00:02:00Z"), "2"),
281 |             Word(pd.to_datetime("2023-01-01 12:01:00Z"), "3"),
282 |             Word(pd.to_datetime("2023-01-01 12:04:00Z"), "4"),
283 |         ],
284 |         attrgetter("timestamp"),
285 |     )
286 | 
287 |     dag = Dag()
288 |     dag.source_stream([], name="hello")
289 |     driver = ReplayDriver.create(
290 |         dag=dag,
291 |         replay_context=ReplayContext(
292 |             pd.to_datetime("2023-01-01", utc=True),
293 |             pd.to_datetime("2023-01-02", utc=True) - pd.to_timedelta("1ns"),
294 |             pd.to_timedelta("12h"),
295 |         ),
296 |         data_source_providers={"hello": lambda x: source},
297 |         data_sink_providers={},
298 |     )
299 | 
300 |     metrics = driver.run_cycle()
301 |     assert metrics is None
302 |     assert driver.current_time == pd.to_datetime("2023-01-01 12:00:00Z")
303 | 
304 |     metrics = driver.run_cycle()
305 |     assert metrics.timestamp == pd.to_datetime("2023-01-01 12:00:00Z")
306 |     assert metrics.source_records == 2
307 |     assert metrics.sink_records == 0
308 |     assert metrics.cycle_time_ns > 0
309 |     assert metrics.warp_ratio > 0.0
310 |     assert driver.current_time == pd.to_datetime("2023-01-02 00:00:00Z")
311 | 
312 |     metrics = driver.run_cycle()
313 |     assert metrics.timestamp == pd.to_datetime("2023-01-01 23:59:59.999999999Z")
314 |     assert metrics.source_records == 2
315 |     assert metrics.sink_records == 0
316 |     assert metrics.cycle_time_ns > 0
317 |     assert metrics.warp_ratio > 0.0
318 |     assert driver.current_time == pd.to_datetime("2023-01-02 12:00:00Z")
319 |     assert driver.is_done()
320 | 
321 | 
322 | def test_no_op():
323 |     provider = NoOpDataSinkProvider()
324 |     data_sink = provider(ReplayContext(UTC_MAX, UTC_MAX, pd.to_timedelta("1s")))
325 |     data_sink.append(UTC_MAX, None)
326 |     data_sink.close()
327 | 


--------------------------------------------------------------------------------
/tests/test_util.py:
--------------------------------------------------------------------------------
  1 | import collections
  2 | import dataclasses
  3 | import random
  4 | from typing import Callable, Dict, Generic, TypeVar
  5 | 
  6 | import pandas as pd
  7 | import pyarrow as pa
  8 | 
  9 | from beavers.dag import UTC_MAX, Dag, TimerManager
 10 | from beavers.replay import DataSink, DataSource
 11 | 
 12 | T = TypeVar("T")
 13 | 
 14 | TEST_TABLE = pa.table(
 15 |     {
 16 |         "timestamp": [
 17 |             pd.to_datetime("2023-01-01T00:00:00Z"),
 18 |             pd.to_datetime("2023-01-02T00:00:00Z"),
 19 |         ],
 20 |         "value": [1, 2],
 21 |     }
 22 | )
 23 | 
 24 | 
 25 | class GetLatest(Generic[T]):
 26 |     def __init__(self, default: T):
 27 |         self._value = default
 28 | 
 29 |     def __call__(self, values: list[T]) -> T:
 30 |         if values:
 31 |             self._value = values[-1]
 32 |         return self._value
 33 | 
 34 | 
 35 | def add(left, right):
 36 |     return left + right
 37 | 
 38 | 
 39 | def add_with_noise(left, right):
 40 |     return left + right + random.randint(0, 1000)  # nosec
 41 | 
 42 | 
 43 | def add_no_42(left, right):
 44 |     results = add(left, right)
 45 |     if results == 42:
 46 |         raise ValueError(f"{left} + {right} == 42")
 47 |     else:
 48 |         return results
 49 | 
 50 | 
 51 | class AddOther:
 52 |     def __init__(self, other):
 53 |         self._other = other
 54 | 
 55 |     def set_other(self, other):
 56 |         self._other = other
 57 | 
 58 |     def __call__(self, value):
 59 |         return self._other + value
 60 | 
 61 | 
 62 | def select(key, **values):
 63 |     return values[key]
 64 | 
 65 | 
 66 | class WordCount:
 67 |     def __init__(self):
 68 |         self._counts = collections.defaultdict(lambda: 0)
 69 | 
 70 |     def __call__(self, words: list[str]) -> dict[str, int]:
 71 |         for word in words:
 72 |             self._counts[word] += 1
 73 | 
 74 |         return self._counts
 75 | 
 76 | 
 77 | def join_counts(**kwargs: Dict[str, int]) -> pd.DataFrame:
 78 |     return pd.concat(
 79 |         [pd.Series(value, name=key) for key, value in kwargs.items()], axis=1
 80 |     ).fillna(0)
 81 | 
 82 | 
 83 | @dataclasses.dataclass(frozen=True)
 84 | class TimerEntry:
 85 |     timestamp: pd.Timestamp
 86 |     values: list[int]
 87 | 
 88 | 
 89 | class SetATimer:
 90 |     def __init__(self):
 91 |         self._entry = None
 92 | 
 93 |     def __call__(
 94 |         self, entries: list[TimerEntry], now: pd.Timestamp, timer_manager: TimerManager
 95 |     ) -> list[int]:
 96 |         if entries:
 97 |             self._entry = entries[-1]
 98 |             timer_manager.set_next_timer(self._entry.timestamp)
 99 |         if self._entry is not None and now >= self._entry.timestamp:
100 |             results = self._entry.values
101 |             self._entry = None
102 |             return results
103 |         else:
104 |             return []
105 | 
106 | 
107 | def create_word_count_dag() -> tuple[Dag, WordCount]:
108 |     dag = Dag()
109 |     messages_stream = dag.source_stream([], name="messages")
110 |     word_count = WordCount()
111 |     state = dag.state(word_count).map(messages_stream)
112 |     changed_key = dag.stream(lambda x: sorted(set(x)), []).map(messages_stream)
113 |     records = dag.stream(lambda x, y: {v: y[v] for v in x}, {}).map(changed_key, state)
114 |     dag.sink("results", records)
115 |     return dag, word_count
116 | 
117 | 
118 | class ListDataSource(DataSource[list[T]]):
119 |     def __init__(self, data: list[T], extractor: Callable[[T], pd.Timestamp]):
120 |         self._data = data
121 |         self._extractor = extractor
122 |         self._position = 0
123 | 
124 |     def read_to(self, timestamp: pd.Timestamp) -> list[T]:
125 |         results = []
126 |         while (
127 |             self._position < len(self._data)
128 |             and self._extractor(self._data[self._position]) <= timestamp
129 |         ):
130 |             results.append(self._data[self._position])
131 |             self._position += 1
132 |         return results
133 | 
134 |     def get_next(self) -> pd.Timestamp:
135 |         if self._position >= len(self._data):
136 |             return UTC_MAX
137 |         else:
138 |             return self._extractor(self._data[self._position])
139 | 
140 | 
141 | class ListDataSink(DataSink[list[T]]):
142 |     def __init__(self):
143 |         self._data = []
144 | 
145 |     def append(self, timestamp: pd.Timestamp, data: list[T]):
146 |         self._data.extend(data)
147 | 
148 |     def close(self):
149 |         pass
150 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | isolated_build = True
 3 | envlist =
 4 |     py310,
 5 |     py311,
 6 |     py312,
 7 |     py313,
 8 |     linting,
 9 | 
10 | [testenv]
11 | allowlist_externals = poetry
12 | commands_pre =
13 |     poetry install --no-root --sync --extras pyarrow --extras confluent_kafka --extras polars
14 | changedir = {envtmpdir}
15 | commands =
16 |     poetry run coverage run --source=beavers --rcfile={toxinidir}/pyproject.toml --branch -m pytest {toxinidir}/tests
17 |     poetry run python {toxinidir}/examples/advanced_concepts.py
18 |     poetry run python {toxinidir}/examples/dag_concepts.py
19 |     poetry run python {toxinidir}/examples/etfs.py
20 |     poetry run python {toxinidir}/examples/pandas_concepts.py
21 |     poetry run python {toxinidir}/examples/polars_concepts.py
22 |     poetry run python {toxinidir}/examples/pyarrow_concepts.py
23 |     poetry run python {toxinidir}/examples/replay_concepts.py
24 |     poetry run coverage report --rcfile={toxinidir}/pyproject.toml -m --fail-under 95
25 |     poetry run coverage xml --rcfile={toxinidir}/pyproject.toml -o {toxinidir}/coverage.xml
26 | 
27 | [testenv:linting]
28 | deps = pre-commit
29 | commands = pre-commit run --all-files --show-diff-on-failure
30 | 
31 | [gh-actions]
32 | python =
33 |     3.10: py310, linting
34 |     3.11: py311
35 |     3.12: py312
36 |     3.13: py313
37 | 


--------------------------------------------------------------------------------