├── .github
│   └── workflows
│       ├── ci.yaml
│       └── publish.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── beavers
│   ├── __init__.py
│   ├── assets
│   │   └── favicon.ico
│   ├── dag.py
│   ├── kafka.py
│   ├── pandas_wrapper.py
│   ├── perspective_wrapper.py
│   ├── polars_wrapper.py
│   ├── pyarrow_kafka.py
│   ├── pyarrow_replay.py
│   ├── pyarrow_wrapper.py
│   ├── replay.py
│   ├── table.html
│   └── testing.py
├── docs
│   ├── concepts
│   │   ├── advanced.md
│   │   ├── dag.md
│   │   ├── kafka.md
│   │   ├── pandas.md
│   │   ├── perspective.md
│   │   ├── polars.md
│   │   ├── pyarrow.md
│   │   └── replay.md
│   ├── contributing.md
│   ├── faq.md
│   ├── index.md
│   ├── install.md
│   ├── reference
│   │   ├── dag.md
│   │   ├── kafka.md
│   │   ├── pandas_wrapper.md
│   │   ├── pyarrow_wrapper.md
│   │   └── replay.md
│   ├── requirements.in
│   ├── requirements.txt
│   └── static
│       └── icons
│           └── beavers
│               ├── icon.png
│               └── logo.svg
├── examples
│   ├── __init__.py
│   ├── advanced_concepts.py
│   ├── dag_concepts.py
│   ├── etfs.py
│   ├── kafka_concepts.py
│   ├── pandas_concepts.py
│   ├── perspective_concepts.py
│   ├── polars_concepts.py
│   ├── pyarrow_concepts.py
│   └── replay_concepts.py
├── mkdocs.yml
├── poetry.lock
├── pyproject.toml
├── scripts
│   ├── README.md
│   ├── kafka_test_bench.py
│   └── perpective_test_bench.py
├── tests
│   ├── __init__.py
│   ├── conftest.py
│   ├── test_dag.py
│   ├── test_docs.py
│   ├── test_etfs.py
│   ├── test_kafka.py
│   ├── test_pandas_wrapper.py
│   ├── test_perpective_wrapper.py
│   ├── test_polars_wrapper.py
│   ├── test_pyarrow_kafka.py
│   ├── test_pyarrow_replay.py
│   ├── test_pyarrow_wrapper.py
│   ├── test_replay.py
│   └── test_util.py
└── tox.ini
/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
1 | name: beavers CI
2 |
3 | on:
4 | push:
5 | branches: [ main ]
6 | pull_request:
7 | branches: [ main ]
8 |
9 | jobs:
10 | build:
11 | runs-on: ubuntu-latest
12 | strategy:
13 | matrix:
14 | python-version:
15 | - "3.10"
16 | - "3.11"
17 | - "3.12"
18 | - "3.13"
19 | fail-fast: false
20 | steps:
21 | - name: Checkout sources
22 | uses: actions/checkout@v4
23 |
24 | - name: Setup Python
25 | uses: actions/setup-python@v5
26 | with:
27 | python-version: ${{ matrix.python-version }}
28 | cache: "pip"
29 |
30 | - name: Install pip
31 | run: "python -m pip install --upgrade pip"
32 | - name: Install tox and poetry
33 | run: "python -m pip install tox tox-gh-actions poetry==2.1.1"
34 | - name: Install poetry plugin
35 | run: 'poetry self add "poetry-dynamic-versioning[plugin]"'
36 |
37 | - name: Run tox
38 | run: tox
39 |
40 | - name: Upload coverage to Codecov
41 | uses: codecov/codecov-action@v4
42 | if: "matrix.python-version == '3.10'"
43 | with:
44 | fail_ci_if_error: true
45 | token: ${{ secrets.CODECOV_TOKEN }}
46 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yaml:
--------------------------------------------------------------------------------
1 | name: Publish to PyPI
2 |
3 | on:
4 | release:
5 | types: [ published ]
6 | branches: [ main ]
7 | workflow_dispatch:
8 |
9 | jobs:
10 | build-and-publish:
11 | runs-on: ubuntu-latest
12 |
13 | steps:
14 | - name: Checkout sources
15 | uses: actions/checkout@v3
16 |
17 | - name: Setup Python
18 | uses: actions/setup-python@v4
19 | with:
20 | python-version: "3.10"
21 |
22 | - name: Install poetry and dependencies
23 | run: |
24 | python -m pip install --upgrade pip
25 | python -m pip install poetry==2.1.1
26 | poetry self add "poetry-dynamic-versioning[plugin]"
27 |
28 | - name: Configure poetry
29 | env:
30 | pypi_token: ${{ secrets.PyPI_TOKEN }}
31 | run: poetry config pypi-token.pypi $pypi_token
32 |
33 | - name: Build and publish
34 | run: poetry publish --build
35 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
162 | # Custom
163 | /.idea
164 | /.pytest_cache
165 | /.ruff_cache
166 | /venv
167 | *.csv
168 | coverage.xml
169 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | default_language_version:
2 | python: python3.10
3 | repos:
4 | - repo: https://github.com/pycqa/pydocstyle
5 | rev: 6.3.0
6 | hooks:
7 | - id: pydocstyle
8 | files: ^beavers/(dag|replay|kafka|arrow).py
9 | additional_dependencies:
10 | - tomli
11 |
12 | - repo: https://github.com/pre-commit/pre-commit-hooks
13 | rev: v5.0.0
14 | hooks:
15 | - id: check-toml
16 | - id: check-yaml
17 | - id: end-of-file-fixer
18 | - id: mixed-line-ending
19 | - repo: https://github.com/charliermarsh/ruff-pre-commit
20 | rev: v0.11.12
21 | hooks:
22 | - id: ruff
23 | args: ['--fix']
24 | - id: ruff-format
25 | - repo: https://github.com/PyCQA/bandit
26 | rev: 1.8.3
27 | hooks:
28 | - id: bandit
29 | additional_dependencies:
30 | - tomli
31 | args:
32 | - "--config=pyproject.toml"
33 | - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks
34 | rev: v2.14.0
35 | hooks:
36 | - id: pretty-format-toml
37 | files: "^.*.toml"
38 | args:
39 | - "--autofix"
40 | - repo: https://github.com/python-poetry/poetry
41 | rev: 2.1.3
42 | hooks:
43 | - id: poetry-check
44 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | build:
4 | os: ubuntu-22.04
5 | tools:
6 | python: "3.11"
7 |
8 | mkdocs:
9 | configuration: mkdocs.yml
10 |
11 | python:
12 | install:
13 | - requirements: docs/requirements.txt
14 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | All notable changes to this project will be documented in this file.
4 |
5 | The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
6 | and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
7 |
8 |
9 | ## [v0.13.0](https://github.com/tradewelltech/beavers/releases/tag/v0.13.0) - 2025-02-13
10 |
11 | [Compare with v0.12.1](https://github.com/tradewelltech/beavers/compare/v0.12.1...v0.13.0)
12 |
13 | ### Added
14 |
15 | - Add missing badge (#72) ([5bf44e9](https://github.com/tradewelltech/beavers/commit/5bf44e982740651ccf1a168ce88b4376519181ee) by 0x26res).
16 | - Add polars support ([f30da87](https://github.com/tradewelltech/beavers/commit/f30da8779c2a683f2ec2d9607134658ac70d4afb) by aandres3).
17 |
18 | ## [v0.12.1](https://github.com/tradewelltech/beavers/releases/tag/v0.12.1) - 2025-02-03
19 |
20 | [Compare with v0.12.0](https://github.com/tradewelltech/beavers/compare/v0.12.0...v0.12.1)
21 |
22 | ## [v0.12.0](https://github.com/tradewelltech/beavers/releases/tag/v0.12.0) - 2024-11-25
23 |
24 | [Compare with v0.11.0](https://github.com/tradewelltech/beavers/compare/v0.11.0...v0.12.0)
25 |
26 | ### Added
27 |
28 | - Add changelog link ([b84d6e6](https://github.com/tradewelltech/beavers/commit/b84d6e6ef42c590379f9bdd16319b1ecb9978b52) by aandres3).
29 |
30 |
31 | ## [v0.11.0](https://github.com/tradewelltech/beavers/releases/tag/v0.11.0) - 2024-11-15
32 |
33 | [Compare with v0.10.0](https://github.com/tradewelltech/beavers/compare/v0.10.0...v0.11.0)
34 |
35 | ### Added
36 |
37 | - Add python 3.13 ([1984bb2](https://github.com/tradewelltech/beavers/commit/1984bb2c7b14126084d5497243418f8bc0123494) by aandres3).
38 |
39 | ### Fixed
40 |
41 | - Fix perspective html (#70) ([ebc090d](https://github.com/tradewelltech/beavers/commit/ebc090d5a9ac7bbf31384a826cf94326426386e0) by 0x26res).
42 |
43 | ## [v0.10.0](https://github.com/tradewelltech/beavers/releases/tag/v0.10.0) - 2024-11-11
44 |
45 | [Compare with v0.9.1](https://github.com/tradewelltech/beavers/compare/v0.9.1...v0.10.0)
46 |
47 | ### Added
48 |
49 | - Add latest version of everything ([f339a52](https://github.com/tradewelltech/beavers/commit/f339a52ac8046e72f64ba4f838259d90b0791a6d) by aandres3).
50 |
51 | ### Fixed
52 |
53 | - Fix wrong offset resolution (#65) ([610bad6](https://github.com/tradewelltech/beavers/commit/610bad6cdadb29014ddc098b79e2ca5df18f1c71) by 0x26res).
54 |
55 |
56 | ## [v0.9.1](https://github.com/tradewelltech/beavers/releases/tag/v0.9.1) - 2024-09-20
57 |
58 | [Compare with v0.9.0](https://github.com/tradewelltech/beavers/compare/v0.9.0...v0.9.1)
59 |
60 | ### Fixed
61 |
62 | - Fix following perspective update ([f06f375](https://github.com/tradewelltech/beavers/commit/f06f375028c99017231faf9f5ab78c3f7f4e028e) by aandres).
63 |
64 | ## [v0.9.0](https://github.com/tradewelltech/beavers/releases/tag/v0.9.0) - 2024-07-30
65 |
66 | [Compare with v0.8.0](https://github.com/tradewelltech/beavers/compare/v0.8.0...v0.9.0)
67 |
68 | ### Added
69 |
70 | - Add perspective tools ([07878be](https://github.com/tradewelltech/beavers/commit/07878bec527d6e2523345ca437e6a64b77c47182) by aandres).
71 |
72 | ## [v0.8.0](https://github.com/tradewelltech/beavers/releases/tag/v0.8.0) - 2024-07-01
73 |
74 | [Compare with v0.7.0](https://github.com/tradewelltech/beavers/compare/v0.7.0...v0.8.0)
75 |
76 | ### Added
77 |
78 | - Add constructor to mock consumer ([370d5d6](https://github.com/tradewelltech/beavers/commit/370d5d68eb60662a110026ab7844fc3d9c6bf59b) by aandres).
79 | - Add log message for resolved offsets ([0816ea3](https://github.com/tradewelltech/beavers/commit/0816ea3bde7ec0b667b3d6b62935ebc2d7228adf) by aandres).
80 |
81 | ### Fixed
82 |
83 | - Fix offset resolution on end of topic ([ff76c35](https://github.com/tradewelltech/beavers/commit/ff76c3519d4ae36040cf138059952c9304bc1b3d) by aandres).
84 |
85 | ## [v0.7.0](https://github.com/tradewelltech/beavers/releases/tag/v0.7.0) - 2024-06-25
86 |
87 | [Compare with v0.6.0](https://github.com/tradewelltech/beavers/compare/v0.6.0...v0.7.0)
88 |
89 | ### Added
90 |
91 | - Add poll time metrics ([efa487a](https://github.com/tradewelltech/beavers/commit/efa487a3e86f7748c160413ccba749e277e1bc5e) by aandres).
92 |
93 | ## [v0.6.0](https://github.com/tradewelltech/beavers/releases/tag/v0.6.0) - 2024-06-24
94 |
95 | [Compare with v0.5.0](https://github.com/tradewelltech/beavers/compare/v0.5.0...v0.6.0)
96 |
97 | ### Added
98 |
99 | - Add some missing replay code (#56) ([9973baa](https://github.com/tradewelltech/beavers/commit/9973baa73fd781656938578f9f0cefe7a283a389) by 0x26res).
100 | - Add contributing and code of conduct guide, update deps (#55) ([3bd1147](https://github.com/tradewelltech/beavers/commit/3bd114724b5f2ac1095b00b8e90a55dd3a7333ab) by 0x26res).
101 |
102 | ### Fixed
103 |
104 | - fix: make group optional (#54) ([03d27af](https://github.com/tradewelltech/beavers/commit/03d27af029d95be874a0b6b5e5cbc625945b984b) by 0x26res).
105 |
106 | ### Changed
107 |
108 | - Change engine to dag, add talk to the doc ([cd57456](https://github.com/tradewelltech/beavers/commit/cd57456a271f99a81602f7d7d385f0caea84acd2) by aandres).
109 |
110 | ## [v0.5.0](https://github.com/tradewelltech/beavers/releases/tag/v0.5.0) - 2024-01-23
111 |
112 | [Compare with v0.4.0](https://github.com/tradewelltech/beavers/compare/v0.4.0...v0.5.0)
113 |
114 | ### Added
115 |
116 | - Add python 12 support (#53) ([344ff69](https://github.com/tradewelltech/beavers/commit/344ff69309d81780d9d08effc2fdfe3b1f8d9b22) by 0x26res).
117 | - Add prune ([4e5b06f](https://github.com/tradewelltech/beavers/commit/4e5b06f073c2e210f4cca8d67f096698c52c3fa9) by aandres).
118 | - Add kafka json to arrow support (#50) ([120c116](https://github.com/tradewelltech/beavers/commit/120c116d13ab46604d54088bb07d851ff5d3fd00) by 0x26res).
119 |
120 |
121 | ## [v0.4.0](https://github.com/tradewelltech/beavers/releases/tag/v0.4.0) - 2023-11-26
122 |
123 | [Compare with v0.3.1](https://github.com/tradewelltech/beavers/compare/v0.3.1...v0.4.0)
124 |
125 | ### Added
126 |
127 | - Add some arrow replay code ([d8026ec](https://github.com/tradewelltech/beavers/commit/d8026ecf744886b0bb7406814904adb3308ba0b9) by 0x26res).
128 |
129 | ## [v0.3.1](https://github.com/tradewelltech/beavers/releases/tag/v0.3.1) - 2023-10-26
130 |
131 | [Compare with v0.3.0](https://github.com/tradewelltech/beavers/compare/v0.3.0...v0.3.1)
132 | ### Added
133 |
134 | - Add pandas module (#47) ([ac81344](https://github.com/tradewelltech/beavers/commit/ac8134452c3a9636ea5a119e65db87df5a245271) by 0x26res).
135 |
136 | ## [v0.3.0](https://github.com/tradewelltech/beavers/releases/tag/v0.3.0) - 2023-09-29
137 |
138 | [Compare with v0.2.0](https://github.com/tradewelltech/beavers/compare/v0.2.0...v0.3.0)
139 |
140 | ### Added
141 |
142 | - Add faq, make kafka extra dep, update readme, use poetry in tox. (#44) ([de0ddf5](https://github.com/tradewelltech/beavers/commit/de0ddf5baa51fbf5a9b818364e8a2e589a2b0974) by 0x26res).
143 | - Add pyarrow module (#42) ([1117f37](https://github.com/tradewelltech/beavers/commit/1117f375b36a5eac1468c3a5888f1fdc6e9f1ba7) by 0x26res).
144 | - Add developer page (#41) ([b717b62](https://github.com/tradewelltech/beavers/commit/b717b6224bf9e5fd585ff6b0bed77b3333ad2a68) by 0x26res).
145 | - Add logos ([7f6b1cf](https://github.com/tradewelltech/beavers/commit/7f6b1cfc09453927ede5e485c242311362b1e417) by aandres).
146 |
147 | ### Fixed
148 |
149 | - Fix logo (#45) ([f24f0dc](https://github.com/tradewelltech/beavers/commit/f24f0dcb8a911f193aa045da0b6a0f20a69fc64e) by 0x26res).
150 | - Fix tests ([cc52ae6](https://github.com/tradewelltech/beavers/commit/cc52ae6f454d6cf3afd98b6804fd750de5a2eab1) by aandres).
151 |
152 | ### Changed
153 |
154 | - change update docs deps (#40) ([04bf706](https://github.com/tradewelltech/beavers/commit/04bf706f9277285b9dac922bb0255402d095da6e) by 0x26res).
155 |
156 | ## [v0.2.0](https://github.com/tradewelltech/beavers/releases/tag/v0.2.0) - 2023-09-19
157 |
158 | [Compare with v0.1.0](https://github.com/tradewelltech/beavers/compare/v0.1.0...v0.2.0)
159 |
160 | ### Added
161 |
162 | - Add changelog ([7ee7685](https://github.com/tradewelltech/beavers/commit/7ee76853ff4186dc1b7c9449022511a6ad477fbe) by aandres).
163 | - Add empty factory ([ee07562](https://github.com/tradewelltech/beavers/commit/ee0756289d4ed79787e760de4441933afd1aa9d7) by aandres).
164 | - Add offset policies, fix committed ([99c1ad7](https://github.com/tradewelltech/beavers/commit/99c1ad76f6d49f4a641749bdea5ec60e73392507) by aandres).
165 | - Add logging ([c8449ab](https://github.com/tradewelltech/beavers/commit/c8449aba69d18ec070755e1efbd89f083b639289) by aandres).
166 | - Add test script ([077bfc2](https://github.com/tradewelltech/beavers/commit/077bfc278809676e048ba121119e1ec67a97bb5f) by aandres).
167 | - Add kafka doc ([806a471](https://github.com/tradewelltech/beavers/commit/806a47188fa4b2c7234f3059975668142fb3c49b) by aandres).
168 |
169 | ### Fixed
170 |
171 | - Fix test, fix coverage ([6f0e371](https://github.com/tradewelltech/beavers/commit/6f0e371916c2ba61147f61adfd5995c32fe63212) by aandres).
172 | - Fix covertage ([9db6eec](https://github.com/tradewelltech/beavers/commit/9db6eec070d4e7783bc6028f85ad468b0b26e7c8) by aandres).
173 | - Fix example ([39f4b44](https://github.com/tradewelltech/beavers/commit/39f4b44f48b2b5efe2761f762e7d85ee256df76d) by aandres).
174 |
175 | ## [v0.1.0](https://github.com/tradewelltech/beavers/releases/tag/v0.1.0) - 2023-08-24
176 |
177 | [Compare with v0.0.4](https://github.com/tradewelltech/beavers/compare/v0.0.4...v0.1.0)
178 |
179 | ## [v0.0.4](https://github.com/tradewelltech/beavers/releases/tag/v0.0.4) - 2023-08-22
180 |
181 | [Compare with v0.0.3](https://github.com/tradewelltech/beavers/compare/v0.0.3...v0.0.4)
182 |
183 | ### Added
184 |
185 | - Add dag metrics ([c46a4ee](https://github.com/tradewelltech/beavers/commit/c46a4eec655984c2525fe094942fd002deeb5645) by aandres).
186 | - Add missing assert ([86b924f](https://github.com/tradewelltech/beavers/commit/86b924f06d78cf3b3a8b98e8137275490b61f815) by aandres).
187 | - Add replay doc ([d5b9b43](https://github.com/tradewelltech/beavers/commit/d5b9b43bd3012e292ad86219c5fd304d3fb11198) by aandres).
188 | - Add repaly metrics ([ba274ef](https://github.com/tradewelltech/beavers/commit/ba274ef7d53cda1e380a7defbd5d4884cf018e4a) by aandres).
189 | - Add test ([8e87c6e](https://github.com/tradewelltech/beavers/commit/8e87c6e8a76b6dadcedf810b0373d12cba7f3309) by aandres).
190 | - Add install section ([520ced1](https://github.com/tradewelltech/beavers/commit/520ced1def5b7508507df6cd65339515680b41fe) by aandres).
191 |
192 | ### Fixed
193 |
194 | - Fix equality check on nodes ([fa1a09f](https://github.com/tradewelltech/beavers/commit/fa1a09f300b2dd2c307a09f80b8ab37cfd949ea4) by aandres).
195 | - fix test ([85005d5](https://github.com/tradewelltech/beavers/commit/85005d5abcc82685396c39bcf1618aacf0b8ed75) by aandres).
196 | - Fix tox ([7bef814](https://github.com/tradewelltech/beavers/commit/7bef81471d21b405c5982ca19baf1b7ae345f930) by aandres).
197 |
198 | ### Removed
199 |
200 | - Remove dead code ([af932d4](https://github.com/tradewelltech/beavers/commit/af932d41ab86fde774dd77f67070ba98a9977df4) by aandres).
201 |
202 | ## [v0.0.3](https://github.com/tradewelltech/beavers/releases/tag/v0.0.3) - 2023-07-05
203 |
204 | [Compare with v0.0.2](https://github.com/tradewelltech/beavers/compare/v0.0.2...v0.0.3)
205 |
206 | ### Added
207 |
208 | - Add doc ([cb624c7](https://github.com/tradewelltech/beavers/commit/cb624c706920134d362430b0a094b0c722890e43) by aandres).
209 | - Add kafka ([92c37fb](https://github.com/tradewelltech/beavers/commit/92c37fba76b8c26943327834198a24505d0bea79) by aandres).
210 |
211 | ### Fixed
212 |
213 | - Fix kafka test coverage ([ecbc890](https://github.com/tradewelltech/beavers/commit/ecbc890f1adddaf236631e95ccf41ed6002430f3) by aandres).
214 | - Fix icon ([8887278](https://github.com/tradewelltech/beavers/commit/88872786071f882f23335c47721fd53a23771b2e) by aandres).
215 |
216 | ## [v0.0.2](https://github.com/tradewelltech/beavers/releases/tag/v0.0.2) - 2023-06-30
217 |
218 | [Compare with v0.0.1](https://github.com/tradewelltech/beavers/compare/v0.0.1...v0.0.2)
219 |
220 | ### Added
221 |
222 | - Add advanced concept ([3450d72](https://github.com/tradewelltech/beavers/commit/3450d728872962dff7101189d20a4e81a48d8e2e) by aandres).
223 | - Add concept page, rename stabilize ([9c0b9eb](https://github.com/tradewelltech/beavers/commit/9c0b9eba0bf0bd604e0195530bc25e2fb767509a) by aandres).
224 | - Add doc to main api ([4048ae7](https://github.com/tradewelltech/beavers/commit/4048ae7c29c56ffa789b3c1c4f7a3c53aba44a75) by aandres).
225 | - Add const test ([e1af0bd](https://github.com/tradewelltech/beavers/commit/e1af0bdf61d144e76c029421a274433f6967df4c) by aandres).
226 | - Add hook for pydoc ([ad10948](https://github.com/tradewelltech/beavers/commit/ad109481ff06ea4ae26acd3e1279fc056fd5ee54) by aandres).
227 | - Add replay ([c807bef](https://github.com/tradewelltech/beavers/commit/c807bef6354573124d410e13c85450d0cdacf681) by aandres).
228 | - Add ETF example ([e3c4c2e](https://github.com/tradewelltech/beavers/commit/e3c4c2e9f3423e814d47c1dc40e182c88f05c9ba) by aandres).
229 |
230 | ### Fixed
231 |
232 | - fix typos ([8df6f74](https://github.com/tradewelltech/beavers/commit/8df6f7412ae96a6cbe55b1941d6475d3754fc0de) by aandres).
233 | - Fix coverage ([fafaa9a](https://github.com/tradewelltech/beavers/commit/fafaa9a49c4c038094058ab8f99346c9e45e9dde) by aandres).
234 | - Fix test coverage ([41938e9](https://github.com/tradewelltech/beavers/commit/41938e9c0c558d56cb89144fb539d00cf85254cf) by aandres).
235 | - Fix ci ([9c46069](https://github.com/tradewelltech/beavers/commit/9c46069ce380cc59a5c53aa1743a9f369d7283bf) by aandres).
236 |
237 | ### Removed
238 |
239 | - Remove trailing blank space ([22195ca](https://github.com/tradewelltech/beavers/commit/22195ca075c77deef92f0a7ea00025f0f1a71561) by aandres).
240 |
241 | ## [v0.0.1](https://github.com/tradewelltech/beavers/releases/tag/v0.0.1) - 2023-05-10
242 |
243 | [Compare with v0.0.1.rc](https://github.com/tradewelltech/beavers/compare/v0.0.1.rc...v0.0.1)
244 |
245 | ### Added
246 |
247 | - Add ci badge ([fdad06c](https://github.com/tradewelltech/beavers/commit/fdad06ca65ed1135d052c4e9e4a13e48b50cdabe) by aandres).
248 | - Add material ([31a46e4](https://github.com/tradewelltech/beavers/commit/31a46e4f5e39824736064fcc13d7fea600be5ac9) by aandres).
249 | - Add python doc requirements ([e1bcd00](https://github.com/tradewelltech/beavers/commit/e1bcd00aba018dba8e16d781b0f6ca9e783105c0) by aandres).
250 | - Add docs ([3c1e87a](https://github.com/tradewelltech/beavers/commit/3c1e87aa14d3d132189d7d5a3bbe66e6df0a57c5) by aandres).
251 | - Add coverage to deps ([ced0670](https://github.com/tradewelltech/beavers/commit/ced0670226f4ef43539efa75b1e6e455efda1df2) by aandres).
252 |
253 | ### Fixed
254 |
255 | - Fix branch ([9847cb9](https://github.com/tradewelltech/beavers/commit/9847cb9b4fd4d59c3060318805caeffbe8582cf7) by aandres).
256 | - Fix read the docs ([ecf5d25](https://github.com/tradewelltech/beavers/commit/ecf5d25cefe8be2c0448913d4f1ef100753a644a) by aandres).
257 |
258 | ### Removed
259 |
260 | - Remove duplicate and snyk ([b7e8539](https://github.com/tradewelltech/beavers/commit/b7e8539a682162de0fff1a9b6a5f55ca5f550da2) by aandres).
261 |
262 | ## [v0.0.1.rc](https://github.com/tradewelltech/beavers/releases/tag/v0.0.1.rc) - 2023-05-09
263 |
264 | [Compare with first commit](https://github.com/tradewelltech/beavers/compare/1cc83cb780e53ef55308100c655c321dcc945d3b...v0.0.1.rc)
265 |
266 | ### Added
267 |
268 | - add pre commit ([12d7ffa](https://github.com/tradewelltech/beavers/commit/12d7ffa203c8c88cbb68f683fc2d992960e170fe) by aandres).
269 | - Add engine code ([e2f0949](https://github.com/tradewelltech/beavers/commit/e2f0949dd5dc69692455c7564c5f6bcfd997754d) by aandres).
270 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 |
2 | # Contributor Covenant Code of Conduct
3 |
4 | ## Our Pledge
5 |
6 | We as members, contributors, and leaders pledge to make participation in our
7 | community a harassment-free experience for everyone, regardless of age, body
8 | size, visible or invisible disability, ethnicity, sex characteristics, gender
9 | identity and expression, level of experience, education, socio-economic status,
10 | nationality, personal appearance, race, religion, or sexual identity
11 | and orientation.
12 |
13 | We pledge to act and interact in ways that contribute to an open, welcoming,
14 | diverse, inclusive, and healthy community.
15 |
16 | ## Our Standards
17 |
18 | Examples of behavior that contributes to a positive environment for our
19 | community include:
20 |
21 | * Demonstrating empathy and kindness toward other people
22 | * Being respectful of differing opinions, viewpoints, and experiences
23 | * Giving and gracefully accepting constructive feedback
24 | * Accepting responsibility and apologizing to those affected by our mistakes,
25 | and learning from the experience
26 | * Focusing on what is best not just for us as individuals, but for the
27 | overall community
28 |
29 | Examples of unacceptable behavior include:
30 |
31 | * The use of sexualized language or imagery, and sexual attention or
32 | advances of any kind
33 | * Trolling, insulting or derogatory comments, and personal or political attacks
34 | * Public or private harassment
35 | * Publishing others' private information, such as a physical or email
36 | address, without their explicit permission
37 | * Other conduct which could reasonably be considered inappropriate in a
38 | professional setting
39 |
40 | ## Enforcement Responsibilities
41 |
42 | Community leaders are responsible for clarifying and enforcing our standards of
43 | acceptable behavior and will take appropriate and fair corrective action in
44 | response to any behavior that they deem inappropriate, threatening, offensive,
45 | or harmful.
46 |
47 | Community leaders have the right and responsibility to remove, edit, or reject
48 | comments, commits, code, wiki edits, issues, and other contributions that are
49 | not aligned to this Code of Conduct, and will communicate reasons for moderation
50 | decisions when appropriate.
51 |
52 | ## Scope
53 |
54 | This Code of Conduct applies within all community spaces, and also applies when
55 | an individual is officially representing the community in public spaces.
56 | Examples of representing our community include using an official email address,
57 | posting via an official social media account, or acting as an appointed
58 | representative at an online or offline event.
59 |
60 | ## Enforcement
61 |
62 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
63 | reported to the community leaders responsible for enforcement at
64 | [INSERT CONTACT METHOD].
65 | All complaints will be reviewed and investigated promptly and fairly.
66 |
67 | All community leaders are obligated to respect the privacy and security of the
68 | reporter of any incident.
69 |
70 | ## Enforcement Guidelines
71 |
72 | Community leaders will follow these Community Impact Guidelines in determining
73 | the consequences for any action they deem in violation of this Code of Conduct:
74 |
75 | ### 1. Correction
76 |
77 | **Community Impact**: Use of inappropriate language or other behavior deemed
78 | unprofessional or unwelcome in the community.
79 |
80 | **Consequence**: A private, written warning from community leaders, providing
81 | clarity around the nature of the violation and an explanation of why the
82 | behavior was inappropriate. A public apology may be requested.
83 |
84 | ### 2. Warning
85 |
86 | **Community Impact**: A violation through a single incident or series
87 | of actions.
88 |
89 | **Consequence**: A warning with consequences for continued behavior. No
90 | interaction with the people involved, including unsolicited interaction with
91 | those enforcing the Code of Conduct, for a specified period of time. This
92 | includes avoiding interactions in community spaces as well as external channels
93 | like social media. Violating these terms may lead to a temporary or
94 | permanent ban.
95 |
96 | ### 3. Temporary Ban
97 |
98 | **Community Impact**: A serious violation of community standards, including
99 | sustained inappropriate behavior.
100 |
101 | **Consequence**: A temporary ban from any sort of interaction or public
102 | communication with the community for a specified period of time. No public or
103 | private interaction with the people involved, including unsolicited interaction
104 | with those enforcing the Code of Conduct, is allowed during this period.
105 | Violating these terms may lead to a permanent ban.
106 |
107 | ### 4. Permanent Ban
108 |
109 | **Community Impact**: Demonstrating a pattern of violation of community
110 | standards, including sustained inappropriate behavior, harassment of an
111 | individual, or aggression toward or disparagement of classes of individuals.
112 |
113 | **Consequence**: A permanent ban from any sort of public interaction within
114 | the community.
115 |
116 | ## Attribution
117 |
118 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
119 | version 2.0, available at
120 | [https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0].
121 |
122 | Community Impact Guidelines were inspired by
123 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
124 |
125 | For answers to common questions about this code of conduct, see the FAQ at
126 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available
127 | at [https://www.contributor-covenant.org/translations][translations].
128 |
129 | [homepage]: https://www.contributor-covenant.org
130 | [v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html
131 | [Mozilla CoC]: https://github.com/mozilla/diversity
132 | [FAQ]: https://www.contributor-covenant.org/faq
133 | [translations]: https://www.contributor-covenant.org/translations
134 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to Beavers
2 |
3 | See the [contributing](https://beavers.readthedocs.io/en/latest/contributing/) section of the doc.
4 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | [![PyPI Version][pypi-image]][pypi-url]
3 | [![Python Version][versions-image]][versions-url]
4 | [![Github Stars][stars-image]][stars-url]
5 | [![codecov][codecov-image]][codecov-url]
6 | [![Build Status][build-image]][build-url]
7 | [![Documentation][doc-image]][doc-url]
8 | [![License][license-image]][license-url]
9 | [![Downloads][downloads-image]][downloads-url]
10 | [![Downloads][downloads-month-image]][downloads-month-url]
11 | [![Code style: black][codestyle-image]][codestyle-url]
12 | [![snyk][snyk-image]][snyk-url]
13 |
14 |
15 |
16 |
17 | ![Beavers Logo][5]
18 |
19 | # Beavers
20 |
21 | [Documentation][6] / [Installation][7] / [Repository][1] / [PyPI][8]
22 |
23 | [Beavers][1] is a Python library for stream processing, optimized for analytics.
24 |
25 | It is used at [Tradewell Technologies][2],
26 | to calculate analytics and serve model predictions,
27 | for both real-time and batch jobs.
28 |
29 | ## Key Features
30 |
31 | - Works in **real time** (e.g. reading from Kafka) and **replay mode** (e.g. reading from Parquet files).
32 | - Optimized for analytics, using micro-batches (instead of processing records one by one).
33 | - Similar to [incremental][3], it updates nodes in a dag incrementally.
34 | - Taking inspiration from [kafka streams][4], there are two types of nodes in the dag:
35 | - **Stream**: ephemeral micro-batches of events (cleared after every cycle).
36 | - **State**: durable state derived from streams.
37 | - Clear separation between the business logic and the IO.
38 | So the same dag can be used in real-time mode, in replay mode, or in tests.
39 | - Functional interface: no inheritance or decorator required.
40 | - Support for complicated joins, not just "linear" data flow.
41 |
42 | ## Limitations
43 |
44 | - No concurrency support.
45 | To speed up calculations, use libraries like pandas, pyarrow or polars.
46 | - No async code.
47 | To speed up IO, use the Kafka driver's native thread or the Parquet IO thread pool.
48 | - No support for persistent state.
49 | Instead of saving state, replay historic data from Kafka to prime stateful nodes.
50 |
51 | ## Talks
52 |
53 | - [Unified batch and stream processing in python | PyData Global 2023][9]
54 |
55 | [1]: https://github.com/tradewelltech/beavers
56 | [2]: https://www.tradewelltech.co/
57 | [3]: https://github.com/janestreet/incremental
58 | [4]: https://www.confluent.io/blog/kafka-streams-tables-part-1-event-streaming/
59 | [5]: https://raw.githubusercontent.com/tradewelltech/beavers/master/docs/static/icons/beavers/logo.svg
60 | [6]: https://beavers.readthedocs.io/en/latest/
61 | [7]: https://beavers.readthedocs.io/en/latest/install/
62 | [8]: https://pypi.org/project/beavers/
63 | [9]: https://www.youtube.com/watch?v=8pUwsGA8SQM
64 |
65 | [pypi-image]: https://img.shields.io/pypi/v/beavers
66 | [pypi-url]: https://pypi.org/project/beavers/
67 | [build-image]: https://github.com/tradewelltech/beavers/actions/workflows/ci.yaml/badge.svg
68 | [build-url]: https://github.com/tradewelltech/beavers/actions/workflows/ci.yaml
69 | [stars-image]: https://img.shields.io/github/stars/tradewelltech/beavers
70 | [stars-url]: https://github.com/tradewelltech/beavers
71 | [versions-image]: https://img.shields.io/pypi/pyversions/beavers
72 | [versions-url]: https://pypi.org/project/beavers/
73 | [doc-image]: https://readthedocs.org/projects/beavers/badge/?version=latest
74 | [doc-url]: https://beavers.readthedocs.io/en/latest/?badge=latest
75 | [license-image]: http://img.shields.io/:license-Apache%202-blue.svg
76 | [license-url]: https://github.com/tradewelltech/beavers/blob/main/LICENSE
77 | [codecov-image]: https://codecov.io/gh/tradewelltech/beavers/branch/main/graph/badge.svg?token=GY6KL7NT1Q
78 | [codecov-url]: https://codecov.io/gh/tradewelltech/beavers
79 | [downloads-image]: https://pepy.tech/badge/beavers
80 | [downloads-url]: https://static.pepy.tech/badge/beavers
81 | [downloads-month-image]: https://pepy.tech/badge/beavers/month
82 | [downloads-month-url]: https://static.pepy.tech/badge/beavers/month
83 | [codestyle-image]: https://img.shields.io/badge/code%20style-black-000000.svg
84 | [codestyle-url]: https://github.com/ambv/black
85 | [snyk-image]: https://snyk.io/advisor/python/beavers/badge.svg
86 | [snyk-url]: https://snyk.io/advisor/python/beavers
87 |
--------------------------------------------------------------------------------
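
A minimal sketch of the stream/state model described in the README above, assuming the public `Dag`/`Node` API (the `source_stream`, `stream`, `state` and `map` calls are visible in `beavers/pandas_wrapper.py`; `set_stream`, `execute` and `get_value` are taken from the project documentation and are assumptions here, not shown in this dump):

```python
from beavers import Dag

dag = Dag()

# Stream node: an ephemeral micro-batch of events, cleared after every cycle.
source = dag.source_stream(empty=[], name="numbers")
doubled = dag.stream(lambda values: [v * 2 for v in values], empty=[]).map(source)


# State node: durable state derived from the stream (here, a running total).
class RunningTotal:
    def __init__(self) -> None:
        self.total = 0

    def __call__(self, values: list[int]) -> int:
        self.total += sum(values)
        return self.total


total = dag.state(RunningTotal()).map(doubled)

# One cycle: feed a micro-batch, run the dag, read the result.
source.set_stream([1, 2, 3])  # assumed API for priming a source, per the docs
dag.execute()
print(total.get_value())  # 12

# A second cycle: the stream is cleared, the state keeps accumulating.
source.set_stream([4])
dag.execute()
print(total.get_value())  # 20
```

Because the dag holds only business logic, the same graph can be driven by Kafka in real time or by a replay driver over historic files.
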
/beavers/__init__.py:
--------------------------------------------------------------------------------
1 | from beavers.dag import Dag, Node, TimerManager
2 |
3 | __version__ = "0.0.0"
4 | __all__ = ["Dag", "Node", "TimerManager"]
5 |
--------------------------------------------------------------------------------
/beavers/assets/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tradewelltech/beavers/ec9979086868589ab82b47ce55fa11cc31b32c16/beavers/assets/favicon.ico
--------------------------------------------------------------------------------
/beavers/pandas_wrapper.py:
--------------------------------------------------------------------------------
1 | """Module for building dags using pandas."""
2 |
3 | import dataclasses
4 | from typing import Callable, Optional, ParamSpec
5 |
6 | import pandas as pd
7 |
8 | from beavers import Dag, Node
9 | from beavers.dag import NodePrototype
10 |
11 | P = ParamSpec("P")
12 |
13 |
14 | def _empty_df(dtypes: pd.Series) -> pd.DataFrame:
15 | return pd.DataFrame(columns=dtypes.index).astype(dtypes)
16 |
17 |
18 | def _get_stream_dtypes(node: Node[pd.DataFrame]) -> pd.Series:
19 | empty = node._get_empty()
20 | if not isinstance(empty, pd.DataFrame):
21 | raise TypeError(f"Argument should be a {Node.__name__}[pd.DataFrame]")
22 | else:
23 | return empty.dtypes
24 |
25 |
26 | @dataclasses.dataclass()
27 | class _LastTracker:
28 | key_columns: list[str]
29 | current: pd.DataFrame
30 |
31 | def __call__(self, stream: pd.DataFrame):
32 | self.current = (
33 | pd.concat([self.current, stream])
34 | .groupby(self.key_columns, as_index=False)
35 | .tail(1)
36 | .reset_index(drop=True)
37 | )
38 |
39 | return self.current
40 |
41 |
42 | @dataclasses.dataclass(frozen=True)
43 | class PandasWrapper:
44 | """Helper call for adding pandas Nodes to a Dag."""
45 |
46 | _dag: Dag
47 |
48 | def source_df(
49 | self, dtypes: pd.Series, name: Optional[str] = None
50 | ) -> Node[pd.DataFrame]:
51 | empty = _empty_df(dtypes)
52 | return self._dag.source_stream(empty, name=name)
53 |
54 | def df_stream(
55 | self, function: Callable[P, pd.DataFrame], dtypes: pd.Series
56 | ) -> NodePrototype[pd.DataFrame]:
57 | return self._dag.stream(function, empty=_empty_df(dtypes))
58 |
59 | def last_by_keys(
60 | self, stream: Node[pd.DataFrame], keys: list[str]
61 | ) -> Node[pd.DataFrame]:
62 | """Build a state of the latest row by keys."""
63 | dtypes = _get_stream_dtypes(stream)
64 | for key in keys:
65 | assert key in dtypes, key
66 | return self._dag.state(_LastTracker(keys, _empty_df(dtypes))).map(stream)
67 |
--------------------------------------------------------------------------------
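
As a usage illustration for the module above, the sketch below wires `source_df` and `last_by_keys` together. It builds `PandasWrapper` directly from the dataclass definition shown above (the library may also expose it through an accessor on `Dag`), and the `set_stream`/`execute`/`get_value` calls belong to the core `beavers.dag` module, which is not part of this file, so treat them as assumptions:

```python
import pandas as pd

from beavers import Dag
from beavers.pandas_wrapper import PandasWrapper

dag = Dag()
pandas_wrapper = PandasWrapper(dag)  # direct construction; _dag is the only field

# The source schema is declared as a pandas dtypes Series.
dtypes = pd.Series({"ticker": "object", "price": "float64"})
prices = pandas_wrapper.source_df(dtypes=dtypes, name="prices")

# State node keeping the latest row per ticker (implemented by _LastTracker above).
latest = pandas_wrapper.last_by_keys(prices, ["ticker"])

# One cycle: feed a micro-batch, run the dag, read the state.
prices.set_stream(
    pd.DataFrame({"ticker": ["AAPL", "AAPL", "MSFT"], "price": [1.0, 2.0, 3.0]})
)
dag.execute()
print(latest.get_value())  # one row per ticker, keeping the last AAPL price (2.0)
```
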
/beavers/perspective_wrapper.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | import pathlib
3 | from typing import Any, Literal, Optional, Sequence
4 |
5 | import perspective
6 | import pyarrow as pa
7 | import tornado
8 | from perspective.handlers.tornado import PerspectiveTornadoHandler
9 |
10 | from beavers import Dag, Node
11 | from beavers.kafka import KafkaDriver
12 |
13 | COMPARATORS = (
14 | "==",
15 | "!=",
16 | ">",
17 | ">=",
18 | "<",
19 | "<=",
20 | "begins with",
21 | "contains",
22 | "ends with",
23 | "in",
24 | "not in",
25 | "is not null",
26 | "is null",
27 | )
28 |
29 | _SOURCE_DIRECTORY = pathlib.Path(__file__).parent
30 | TABLE_PATH = str(_SOURCE_DIRECTORY / "table.html")
31 | ASSETS_DIRECTORY = str(_SOURCE_DIRECTORY / "assets")
32 |
33 |
34 | @dataclasses.dataclass(frozen=True)
35 | class PerspectiveTableDefinition:
36 | """
37 | API table definition
38 | """
39 |
40 | name: str
41 | index_column: str
42 | remove_column: Optional[str] = None
43 | sort: list[tuple[str, Literal["asc", "desc"]]] = dataclasses.field(
44 | default_factory=list
45 | )
46 | filters: list[tuple[str, str, Any]] = dataclasses.field(default_factory=list)
47 | hidden_columns: Sequence[str] = ()
48 | limit: Optional[int] = None
49 |
50 | def validate(self, schema: pa.Schema):
51 | assert self.index_column in schema.names, self.index_column
52 | if self.remove_column is not None:
53 | assert isinstance(self.remove_column, str)
54 | assert self.remove_column in schema.names, self.remove_column
55 |
56 | assert isinstance(self.sort, list)
57 | for column, order in self.sort:
58 | assert isinstance(column, str)
59 | assert column in schema.names
60 | assert order in ("asc", "desc")
61 | for column in self.hidden_columns:
62 | assert isinstance(column, str)
63 | assert column in schema.names
64 | for each_filter in self.filters:
65 | assert len(each_filter) in (2, 3)
66 | assert isinstance(each_filter[0], str), each_filter
67 | assert each_filter[1] in COMPARATORS
68 |
69 |
70 | @dataclasses.dataclass(frozen=True)
71 | class _TableConfig:
72 | """
73 | Internal perspective table config, which is passed to the html template
74 | """
75 |
76 | name: str
77 | index: str
78 | columns: list[str]
79 | sort: Sequence[tuple[str, Literal["asc", "desc"]]]
80 | filters: Sequence[tuple[str, str, Any]]
81 |
82 | @staticmethod
83 | def from_definition(definition: PerspectiveTableDefinition, schema: pa.Schema):
84 | return _TableConfig(
85 | name=definition.name,
86 | index=definition.index_column,
87 | columns=[f for f in schema.names if f not in definition.hidden_columns],
88 | sort=[] if definition.sort is None else definition.sort,
89 | filters=definition.filters,
90 | )
91 |
92 |
93 | class TableRequestHandler(tornado.web.RequestHandler):
94 | """Renders the table.html template, using the provided configurations"""
95 |
96 | _tables: Optional[dict[str, _TableConfig]] = None
97 | _default_table: Optional[str] = None
98 |
99 | def initialize(self, table_configs: list[_TableConfig]) -> None:
100 | self._tables = {
101 | table_config.name: table_config for table_config in table_configs
102 | }
103 | self._default_table = table_configs[0].name
104 |
105 | async def get(self, path: str) -> None:
106 | table_name = path or self._default_table
107 | table_config = self._tables[table_name]
108 |
109 | await self.render(
110 | TABLE_PATH,
111 | table_config=table_config,
112 | perspective_version=perspective.__version__,
113 | )
114 |
115 |
116 | def _table_to_bytes(table: pa.Table) -> bytes:
117 | """Serialize a table as bytes, to pass it to a perspective table"""
118 | with pa.BufferOutputStream() as sink:
119 | with pa.ipc.new_stream(sink, table.schema) as writer:
120 | for batch in table.to_batches():
121 | writer.write_batch(batch)
122 | return sink.getvalue().to_pybytes()
123 |
124 |
125 | @dataclasses.dataclass(frozen=True)
126 | class _UpdateRunner:
127 | kafka_driver: KafkaDriver
128 |
129 | def __call__(self):
130 | self.kafka_driver.run_cycle(0.0)
131 |
132 |
133 | @dataclasses.dataclass()
134 | class _PerspectiveNode:
135 | table_definition: PerspectiveTableDefinition
136 | schema: pa.Schema
137 | table: perspective.Table | None = None
138 |
139 | def __call__(self, table: pa.Table) -> None:
140 | """Pass the arrow data to perspective"""
141 | self.table.update(_table_to_bytes(table))
142 |
143 | def get_table_config(self) -> _TableConfig:
144 | return _TableConfig.from_definition(self.table_definition, self.schema)
145 |
146 |
147 | @dataclasses.dataclass(frozen=True)
148 | class PerspectiveDagWrapper:
149 | """Helper for adding perspective Nodes to a Dag."""
150 |
151 | _dag: Dag
152 |
153 | def to_perspective(
154 | self,
155 | node: Node,
156 | table_definition: PerspectiveTableDefinition,
157 | schema: Optional[pa.Schema] = None,
158 | ) -> None:
159 | """Add a source stream of type `pa.Table`."""
160 | if schema is None:
161 | assert node._is_stream(), "Must provide a schema for state nodes"
162 | empty = node._empty_factory()
163 | assert isinstance(empty, pa.Table), "Only pyarrow.Table nodes supported"
164 | schema = empty.schema
165 | table_definition.validate(schema)
166 | self._dag.state(
167 | _PerspectiveNode(
168 | table_definition,
169 | schema,
170 | table=None,
171 | )
172 | ).map(node)
173 |
174 |
175 | DATA_TYPES = [
176 | (pa.types.is_integer, "integer"),
177 | (pa.types.is_floating, "float"),
178 | (pa.types.is_boolean, "boolean"),
179 | (pa.types.is_date, "date"),
180 | (pa.types.is_string, "string"),
181 | (pa.types.is_timestamp, "datetime"),
182 | ]
183 |
184 |
185 | def to_perspective_type(data_type: pa.DataType) -> Any:
186 | for predicate, perspective_type in DATA_TYPES:
187 | if predicate(data_type):
188 | return perspective_type
189 | raise TypeError(f"Unsupported type: {data_type}")
190 |
191 |
192 | def to_perspective_schema(schema: pa.Schema) -> dict[str, Any]:
193 | return {f.name: to_perspective_type(f.type) for f in schema}
194 |
195 |
196 | def perspective_thread(
197 | perspective_server: perspective.Server,
198 | kafka_driver: KafkaDriver,
199 | nodes: list[_PerspectiveNode],
200 | ):
201 | local_client = perspective_server.new_local_client()
202 | for node in nodes:
203 | assert node.table is None
204 | node.table = local_client.table(
205 | to_perspective_schema(node.schema),
206 | name=node.table_definition.name,
207 | index=node.table_definition.index_column,
208 | )
209 |
210 | callback = tornado.ioloop.PeriodicCallback(
211 | callback=_UpdateRunner(kafka_driver), callback_time=1_000
212 | )
213 | callback.start()
214 |
215 |
216 | def run_web_application(
217 | kafka_driver: KafkaDriver,
218 | assets_directory: str = ASSETS_DIRECTORY,
219 | port: int = 8082,
220 | ) -> None:
221 | server = perspective.Server()
222 |
223 | nodes: list[_PerspectiveNode] = []
224 | for node in kafka_driver._dag._nodes:
225 | if isinstance(node._function, _PerspectiveNode):
226 | nodes.append(node._function)
227 | assert len(nodes) > 0, "No perspective table nodes"
228 | assert len({n.table_definition.name for n in nodes}) == len(nodes), (
229 | "Duplicate table name"
230 | )
231 |
232 | web_app = tornado.web.Application(
233 | [
234 | (
235 | r"/websocket",
236 | PerspectiveTornadoHandler,
237 | {"perspective_server": server},
238 | ),
239 | (
240 | r"/assets/(.*)",
241 | tornado.web.StaticFileHandler,
242 | {"path": assets_directory, "default_filename": None},
243 | ),
244 | (
245 | r"/([a-z0-9_]*)",
246 | TableRequestHandler,
247 | {"table_configs": [node.get_table_config() for node in nodes]},
248 | ),
249 | ],
250 | serve_traceback=True,
251 | )
252 | web_app.listen(port)
253 | loop = tornado.ioloop.IOLoop.current()
254 | loop.call_later(0, perspective_thread, server, kafka_driver, nodes)
255 | loop.start()
256 |
--------------------------------------------------------------------------------
/beavers/polars_wrapper.py:
--------------------------------------------------------------------------------
1 | """Module for building dags using polars."""
2 |
3 | import dataclasses
4 | from operator import itemgetter
5 | from typing import Callable, Optional, ParamSpec, Iterable, Any
6 |
7 | import polars as pl
8 | from polars._typing import IntoExprColumn
9 |
10 | from beavers.dag import Dag, Node, NodePrototype
11 |
12 | P = ParamSpec("P")
13 |
14 |
15 | @dataclasses.dataclass()
16 | class _LastByKey:
17 | key_columns: tuple[str, ...]
18 | current: pl.DataFrame
19 |
20 | def __call__(self, stream: pl.DataFrame) -> pl.DataFrame:
21 | self.current = (
22 | pl.concat([self.current, stream])
23 | .group_by(self.key_columns, maintain_order=True)
24 | .last()
25 | .select(self.current.columns)
26 | )
27 | return self.current
28 |
29 |
30 | def _get_stream_schema(node: Node[pl.DataFrame]) -> pl.Schema:
31 | empty = node._get_empty()
32 | if not isinstance(empty, pl.DataFrame):
33 | raise TypeError(f"Argument should be a {Node.__name__}[pl.DataFrame]")
34 | else:
35 | return empty.schema
36 |
37 |
38 | def _get_stream_dtype(node: Node[pl.Series]) -> pl.DataType:
39 | empty = node._get_empty()
40 | if not isinstance(empty, pl.Series):
41 | raise TypeError(f"Argument should be a {Node.__name__}[pl.Series]")
42 | else:
43 | return empty.dtype
44 |
45 |
46 | @dataclasses.dataclass(frozen=True)
47 | class _TableFilter:
48 | predicate: tuple[IntoExprColumn | Iterable[IntoExprColumn], ...]
49 | constraints: dict[str, Any]
50 |
51 | def __call__(self, table: pl.DataFrame) -> pl.DataFrame:
52 | return table.filter(*self.predicate, **self.constraints)
53 |
54 |
55 | @dataclasses.dataclass(frozen=True)
56 | class PolarsDagWrapper:
57 | """Helper for adding polars Nodes to a Dag."""
58 |
59 | _dag: Dag
60 |
61 | def source_table(
62 | self, schema: pl.Schema, name: Optional[str] = None
63 | ) -> Node[pl.DataFrame]:
64 | """Add a source stream of type `pl.DataFrame`."""
65 |
66 | return self._dag.source_stream(empty=schema.to_frame(), name=name)
67 |
68 | def table_stream(
69 | self, function: Callable[P, pl.DataFrame], schema: pl.Schema
70 | ) -> NodePrototype[pl.DataFrame]:
71 | """Add a stream node of output type `pl.DataFrame`"""
72 | return self._dag.stream(function, empty=schema.to_frame())
73 |
74 | def filter_stream(
75 | self,
76 | stream: Node[pl.DataFrame],
77 | *predicates: IntoExprColumn | Iterable[IntoExprColumn],
78 | **constraints: Any,
79 | ) -> Node[pl.DataFrame]:
80 | """Filter a stream Node of type `pl.DataFrame`."""
81 | schema = _get_stream_schema(stream)
82 | return self._dag.stream(
83 | _TableFilter(tuple(predicates), dict(constraints)),
84 | empty=schema.to_frame(),
85 | ).map(stream)
86 |
87 | def last_by_keys(
88 | self, stream: Node[pl.DataFrame], keys: list[str]
89 | ) -> Node[pl.DataFrame]:
90 | """Build a state of the latest row by keys."""
91 | schema = _get_stream_schema(stream)
92 | for key in keys:
93 | assert isinstance(key, str), "Keys must be strings"
94 | return self._dag.state(_LastByKey(tuple(keys), schema.to_frame())).map(stream)
95 |
96 | def concat_series(self, *streams: Node[pl.Series]) -> Node[pl.Series]:
97 | if len(streams) == 0:
98 | raise ValueError("Must pass at least one series")
99 | series_type = None
100 | for stream in streams:
101 | each_type = _get_stream_dtype(stream)
102 | if series_type is None:
103 | series_type = each_type
104 | elif series_type != each_type:
105 | raise TypeError(f"Series type mismatch {series_type} vs {each_type}")
106 |
107 | empty = pl.Series(dtype=series_type)
108 | return self._dag.stream(lambda *x: pl.concat(x), empty=empty).map(*streams)
109 |
110 | def get_series(self, stream: Node[pl.DataFrame], name: str) -> Node[pl.Series]:
111 | empty = _get_stream_schema(stream).to_frame()[name]
112 | return self._dag.stream(itemgetter(name), empty=empty).map(stream)
113 |
--------------------------------------------------------------------------------
/beavers/pyarrow_kafka.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | import io
3 | import json
4 |
5 | import confluent_kafka
6 | import pyarrow as pa
7 | import pyarrow.json
8 |
9 | from beavers.kafka import (
10 | KafkaMessageDeserializer,
11 | KafkaMessageSerializer,
12 | KafkaProducerMessage,
13 | )
14 |
15 |
16 | @dataclasses.dataclass(frozen=True)
17 | class JsonDeserializer(KafkaMessageDeserializer[pa.Table]):
18 | schema: pa.Schema
19 |
20 |     def __call__(self, messages: list[confluent_kafka.Message]) -> pa.Table:
21 | if messages:
22 | with io.BytesIO() as buffer:
23 | for message in messages:
24 | buffer.write(message.value())
25 | buffer.write(b"\n")
26 | buffer.seek(0)
27 | return pyarrow.json.read_json(
28 | buffer,
29 | parse_options=pyarrow.json.ParseOptions(
30 | explicit_schema=self.schema
31 | ),
32 | )
33 | else:
34 | return self.schema.empty_table()
35 |
36 |
37 | @dataclasses.dataclass(frozen=True)
38 | class JsonSerializer(KafkaMessageSerializer[pa.Table]):
39 | topic: str
40 |
41 |     def __call__(self, table: pa.Table) -> list[KafkaProducerMessage]:
42 | return [
43 | KafkaProducerMessage(
44 | self.topic,
45 | key=None,
46 | value=json.dumps(message, default=str).encode("utf-8"),
47 | )
48 | for message in table.to_pylist()
49 | ]
50 |
--------------------------------------------------------------------------------
/beavers/pyarrow_replay.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | from typing import Callable
3 |
4 | import pandas as pd
5 | import pyarrow as pa
6 |
7 | from beavers.dag import UTC_MAX
8 | from beavers.replay import DataSink, DataSource
9 |
10 |
11 | class ArrowTableDataSource(DataSource[pa.Table]):
12 | def __init__(
13 | self, table: pa.Table, timestamp_extractor: Callable[[pa.Table], pa.Array]
14 | ):
15 | assert callable(timestamp_extractor)
16 | self._table = table
17 | self._empty_table = table.schema.empty_table()
18 | self._timestamp_column = timestamp_extractor(table).to_pandas(
19 | date_as_object=False
20 | )
21 | assert self._timestamp_column.is_monotonic_increasing, (
22 | "Timestamp column should be monotonic increasing"
23 | )
24 | self._index = 0
25 |
26 | def read_to(self, timestamp: pd.Timestamp) -> pa.Table:
27 | new_index = self._timestamp_column.searchsorted(timestamp, side="right")
28 | if new_index > self._index:
29 | from_index = self._index
30 | self._index = new_index
31 | return self._table.slice(from_index, new_index - from_index)
32 | else:
33 | results = self._empty_table
34 | return results
35 |
36 | def get_next(self) -> pd.Timestamp:
37 | if self._index >= len(self._table):
38 | return UTC_MAX
39 | else:
40 | return self._timestamp_column.iloc[self._index]
41 |
42 |
43 | @dataclasses.dataclass
44 | class ArrowTableDataSink(DataSink[pa.Table]):
45 | saver: Callable[[pa.Table], None]
46 | chunks: list[pa.Table] = dataclasses.field(default_factory=list)
47 |
48 | def append(self, timestamp: pd.Timestamp, data: pa.Table):
49 | self.chunks.append(data)
50 |
51 | def close(self):
52 | if self.chunks:
53 | results = pa.concat_tables(self.chunks)
54 | self.saver(results)
55 |
--------------------------------------------------------------------------------
/beavers/pyarrow_wrapper.py:
--------------------------------------------------------------------------------
1 | """Module for building dags using pyarrow."""
2 |
3 | import dataclasses
4 | from typing import Callable, Iterable, Optional, ParamSpec, Sequence
5 |
6 | import numpy as np
7 | import pyarrow as pa
8 |
9 | from beavers.dag import Dag, Node, NodePrototype, _check_function
10 |
11 | P = ParamSpec("P")
12 |
13 |
14 | @dataclasses.dataclass(frozen=True)
15 | class _TableFilter:
16 | predicate: Callable[[pa.Table, ...], pa.Array]
17 |
18 | def __call__(self, table: pa.Table, *args, **kwargs) -> pa.Table:
19 | return table.filter(self.predicate(table, *args, **kwargs))
20 |
21 |
22 | def _get_last_by(table: pa.Table, keys: Sequence[str]) -> pa.Table:
23 | return table.take(
24 | table.select(keys)
25 | .append_column("_beavers_index", pa.array(np.arange(len(table))))
26 | .group_by(keys)
27 | .aggregate([("_beavers_index", "max")])["_beavers_index_max"]
28 | .sort()
29 | )
30 |
31 |
32 | def _concat_arrow_arrays(
33 | arrow_arrays: Sequence[pa.ChunkedArray],
34 | ) -> pa.ChunkedArray:
35 | arrays: list[pa.Array] = []
36 | for arrow_array in arrow_arrays:
37 | if isinstance(arrow_array, pa.ChunkedArray):
38 | arrays.extend(arrow_array.iterchunks())
39 | elif isinstance(arrow_array, pa.Array):
40 | arrays.append(arrow_array)
41 | else:
42 | raise TypeError(arrow_array)
43 |
44 | return pa.chunked_array(arrays)
45 |
46 |
47 | def _check_column(column: str, schema: pa.Schema):
48 | if not isinstance(column, str):
49 | raise TypeError(column)
50 | elif column not in schema.names:
51 | raise TypeError(f"field {column} no in schema: {schema.names}")
52 |
53 |
54 | def _check_array(node: Node[pa.Array | pa.ChunkedArray]) -> pa.DataType:
55 | empty = node._get_empty()
56 | if not isinstance(empty, (pa.Array, pa.ChunkedArray)):
57 | raise TypeError(f"Argument should be a {Node.__name__}[pa.Array]")
58 | else:
59 | return empty.type
60 |
61 |
62 | def _check_columns(columns: list[str], schema: pa.Schema) -> list[str]:
63 | if not isinstance(columns, Iterable):
64 | raise TypeError(columns)
65 | for column in columns:
66 | if not isinstance(column, str):
67 | raise TypeError(column)
68 | elif column not in schema.names:
69 | raise TypeError(f"field {column} no in schema: {schema.names}")
70 | return list(columns)
71 |
72 |
73 | def _get_stream_schema(node: Node[pa.Table]) -> pa.Schema:
74 | empty = node._get_empty()
75 | if not isinstance(empty, pa.Table):
76 | raise TypeError(f"Argument should be a {Node.__name__}[pa.Table]")
77 | else:
78 | return empty.schema
79 |
80 |
81 | @dataclasses.dataclass()
82 | class _LastByKey:
83 | key_columns: tuple[str, ...]
84 | current: pa.Table
85 |
86 | def __call__(self, stream: pa.Table) -> pa.Table:
87 | self.current = _get_last_by(
88 | pa.concat_tables([self.current, stream]), self.key_columns
89 | )
90 | return self.current
91 |
92 |
93 | @dataclasses.dataclass(frozen=True)
94 | class ArrowDagWrapper:
95 | """Helper for adding pyarrow Nodes to a Dag."""
96 |
97 | _dag: Dag
98 |
99 | def source_table(
100 | self, schema: pa.Schema, name: Optional[str] = None
101 | ) -> Node[pa.Table]:
102 | """Add a source stream of type `pa.Table`."""
103 | return self._dag.source_stream(empty=schema.empty_table(), name=name)
104 |
105 | def table_stream(
106 | self, function: Callable[P, pa.Table], schema: pa.Schema
107 | ) -> NodePrototype[pa.Table]:
108 | """Add a stream node of output type `pa.Table`"""
109 | return self._dag.stream(function, empty=schema.empty_table())
110 |
111 | def filter_stream(
112 | self,
113 | predicate: Callable[[pa.Table, ...], pa.Array],
114 | stream: Node[pa.Table],
115 | *args: Node,
116 | **kwargs: Node,
117 | ) -> Node[pa.Table]:
118 | """Filter a stream Node of type `pa.Table`."""
119 |         function = _TableFilter(predicate)
120 | schema = _get_stream_schema(stream)
121 | _check_function(function)
122 | return self._dag.stream(function, empty=schema.empty_table()).map(
123 | stream, *args, **kwargs
124 | )
125 |
126 | def last_by_keys(
127 | self, stream: Node[pa.Table], keys: Sequence[str]
128 | ) -> Node[pa.Table]:
129 | """Build a state of the latest row by keys."""
130 | schema = _get_stream_schema(stream)
131 | keys = _check_columns(keys, schema)
132 | return self._dag.state(_LastByKey(keys, schema.empty_table())).map(stream)
133 |
134 | def get_column(self, stream: Node[pa.Table], key: str) -> Node[pa.ChunkedArray]:
135 | """Return a column from a stream node of type pa.Table."""
136 | schema = _get_stream_schema(stream)
137 | _check_column(key, schema)
138 | field = schema.field(key)
139 | empty = pa.chunked_array([pa.array([], field.type)])
140 | return self._dag.stream(lambda x: x[key], empty=empty).map(stream)
141 |
142 | def concat_arrays(
143 | self, *streams: Node[pa.Array | pa.ChunkedArray]
144 | ) -> Node[pa.ChunkedArray]:
145 | if len(streams) == 0:
146 | raise ValueError("Must pass at least one array")
147 | array_type = None
148 | for stream in streams:
149 | each_type = _check_array(stream)
150 | if array_type is None:
151 | array_type = each_type
152 | elif array_type != each_type:
153 | raise TypeError(f"Array type mismatch {array_type} vs {each_type}")
154 |
155 | empty = pa.chunked_array([pa.array([], array_type)])
156 | return self._dag.stream(lambda *x: _concat_arrow_arrays(x), empty=empty).map(
157 | *streams
158 | )
159 |
--------------------------------------------------------------------------------
/beavers/replay.py:
--------------------------------------------------------------------------------
1 | """Module for replaying historical data."""
2 |
3 | import abc
4 | import collections.abc
5 | import dataclasses
6 | import logging
7 | import time
8 | from typing import Callable, Generic, Iterator, Optional, Protocol, TypeVar
9 |
10 | import pandas as pd
11 |
12 | from beavers.dag import UTC_MAX, Dag, Node
13 |
14 | logger = logging.getLogger(__name__)
15 |
16 | T = TypeVar("T")
17 |
18 |
19 | @dataclasses.dataclass(frozen=True)
20 | class ReplayContext:
21 | """
22 | Stores the information about a replay.
23 |
24 | Attributes
25 | ----------
26 | start: pd.Timestamp
27 | Start of the replay
28 | end: pd.Timestamp
29 | End of the replay.
30 | This is exclusive, the replay will stop 1ns before
31 |     frequency: pd.Timedelta
32 |         How often the replay should run
33 |
34 | """
35 |
36 | start: pd.Timestamp
37 | end: pd.Timestamp
38 | frequency: pd.Timedelta
39 |
40 | def __post_init__(self):
41 | """Check arguments are valid."""
42 | assert self.start.tzname() == "UTC"
43 | assert self.end.tzname() == "UTC"
44 |
45 |
46 | class DataSource(Protocol[T]):
47 | """Interface for replaying historical data from a file or database."""
48 |
49 | def read_to(self, timestamp: pd.Timestamp) -> T:
50 | """
51 | Read from the data source, all the way to the provided timestamp (inclusive).
52 |
53 | This function is stateful and must remember the previous timestamp
54 | for which data was read.
55 |
56 | Parameters
57 | ----------
58 | timestamp
59 | End of the time interval for which data is required (inclusive)
60 |
61 | Returns
62 | -------
63 | data
64 | The data for the interval (or empty if no data is found)
65 |
66 | """
67 |
68 | def get_next(self) -> pd.Timestamp:
69 | """
70 | Return the next timestamp for which there is data.
71 |
72 | If no data is available this should return `UTC_MAX`
73 |
74 |
75 | Returns
76 | -------
77 | timestamp: pd.Timestamp
78 | Timestamp of the next available data point (or `UTC_MAX` if no more data
79 | is available)
80 |
81 | """
82 |
83 |
84 | class DataSink(Protocol[T]):
85 | """Interface for saving the results of a replay to a file or database."""
86 |
87 | def append(self, timestamp: pd.Timestamp, data: T):
88 | """
89 | Append data for the current cycle.
90 |
91 | Parameters
92 | ----------
93 | timestamp:
94 | End of the time interval for which data was replayed (inclusive)
95 | data:
96 | The generated data
97 |
98 | """
99 |
100 | def close(self):
101 | """Flush the data and clean up resources."""
102 |
103 |
104 | class DataSourceProvider(Protocol[T]):
105 | """Interface for the provision of `DataSource`."""
106 |
107 | def __call__(self, replay_context: ReplayContext) -> DataSource[T]:
108 | """
109 | Create a `DataSource` for the given replay_context.
110 |
111 | Parameters
112 | ----------
113 | replay_context:
114 | Information about the replay that's about to run
115 |
116 | Returns
117 | -------
118 | DataSource[T]:
119 | Source for the replay
120 |
121 | """
122 |
123 |
124 | class DataSinkProvider(Protocol[T]):
125 | """Interface for the provision of `DataSink`."""
126 |
127 | @abc.abstractmethod
128 | def __call__(self, replay_context: ReplayContext) -> DataSink[T]:
129 | """
130 | Create a `DataSink` for the given replay_context.
131 |
132 | Parameters
133 | ----------
134 | replay_context:
135 | Information about the replay that's about to run
136 |
137 | Returns
138 | -------
139 | DataSink[T]:
140 | Sink for the replay
141 |
142 | """
143 |
144 |
145 | @dataclasses.dataclass(frozen=True)
146 | class _ReplaySource(Generic[T]):
147 | """Internal class used to store `DataSource` at runtime."""
148 |
149 | name: str
150 | node: Node[T]
151 | data_source: DataSource[T]
152 |
153 |
154 | @dataclasses.dataclass(frozen=True)
155 | class _ReplaySink(Generic[T]):
156 | """Internal class used to store `DataSink` at runtime."""
157 |
158 | name: str
159 | nodes: list[Node[T]]
160 | data_sink: DataSink[T]
161 |
162 |
163 | @dataclasses.dataclass(frozen=True)
164 | class ReplayCycleMetrics:
165 | """Metrics for each replay cycle."""
166 |
167 | timestamp: pd.Timestamp
168 | cycle_id: int
169 | source_records: int
170 | sink_records: int
171 | cycle_time_ns: int
172 | warp_ratio: float
173 |
174 |
175 | @dataclasses.dataclass
176 | class ReplayDriver:
177 | """
178 | Orchestrate the replay of data for dag.
179 |
180 | This will:
181 |
182 | - create the relevant `DataSource`s
183 | - create the relevant `DataSink`s
184 | - stream the data from the sources
185 | - inject the input data in the dag source nodes
186 | - execute the dag
187 | - collect the output data and pass it to the sink
188 | - close the sink at the end of the run
189 |
190 | Notes
191 | -----
192 | Do not call the constructor directly, use `create` instead
193 |
194 | """
195 |
196 | dag: Dag
197 | replay_context: ReplayContext
198 | sources: list[_ReplaySource]
199 | sinks: list[_ReplaySink]
200 | current_time: pd.Timestamp
201 |
202 | @staticmethod
203 | def create(
204 | dag: Dag,
205 | replay_context: ReplayContext,
206 | data_source_providers: dict[str, DataSourceProvider],
207 | data_sink_providers: dict[str, DataSinkProvider],
208 | ) -> "ReplayDriver":
209 | return ReplayDriver(
210 | dag,
211 | replay_context,
212 | _create_sources(dag, replay_context, data_source_providers),
213 | _create_sinks(dag, replay_context, data_sink_providers),
214 | current_time=replay_context.start,
215 | )
216 |
217 | def run(self):
218 | while not self.is_done():
219 | self.run_cycle()
220 | for sink in self.sinks:
221 | sink.data_sink.close()
222 |
223 | def is_done(self) -> bool:
224 | return self.current_time > self.replay_context.end
225 |
226 | def run_cycle(self) -> Optional[ReplayCycleMetrics]:
227 | st = time.time_ns()
228 | source_records, next_timestamp = self.read_sources()
229 | if source_records or self.dag.get_next_timer() <= self.current_time:
230 | timestamp = min(self.current_time, self.replay_context.end)
231 | self.dag.execute(timestamp)
232 | sink_records = self.flush_sinks()
233 | et = time.time_ns()
234 | warp_ratio = self.replay_context.frequency.value / (et - st)
235 | metrics = ReplayCycleMetrics(
236 | timestamp=timestamp,
237 | cycle_id=self.dag.get_cycle_id(),
238 | source_records=source_records,
239 | sink_records=sink_records,
240 | cycle_time_ns=et - st,
241 | warp_ratio=warp_ratio,
242 | )
243 | logger.info(
244 | f"Running cycle={metrics.cycle_id} "
245 | f"timestamp={metrics.timestamp} "
246 | f"source_records={metrics.source_records} "
247 | f"sink_records={metrics.sink_records} "
248 | f"warp={warp_ratio:.1f}"
249 | )
250 | else:
251 | metrics = None
252 |
253 | self.current_time = max(
254 | next_timestamp, self.current_time + self.replay_context.frequency
255 | ).ceil(self.replay_context.frequency)
256 | return metrics
257 |
258 | def read_sources(self) -> tuple[int, pd.Timestamp]:
259 | records = 0
260 | next_timestamp = self.replay_context.end
261 | for replay_source in self.sources:
262 | source_data = replay_source.data_source.read_to(self.current_time)
263 | next_timestamp = min(next_timestamp, replay_source.data_source.get_next())
264 | if len(source_data) > 0:
265 | replay_source.node.set_stream(source_data)
266 | records += len(source_data)
267 | return records, next_timestamp
268 |
269 | def flush_sinks(self) -> int:
270 | records = 0
271 | for sink in self.sinks:
272 | for node in sink.nodes:
273 | if node.get_cycle_id() == self.dag.get_cycle_id():
274 | sink_value = node.get_sink_value()
275 | records += (
276 | len(sink_value)
277 | if isinstance(sink_value, collections.abc.Sized)
278 | else 1
279 | )
280 |                     sink.data_sink.append(self.current_time, sink_value)
281 | return records
282 |
283 |
284 | def _create_sources(
285 | dag: Dag,
286 | replay_context: ReplayContext,
287 | data_source_providers: dict[str, DataSourceProvider],
288 | ) -> list[_ReplaySource]:
289 | source_nodes = dag.get_sources()
290 | nodes_names = sorted(source_nodes.keys())
291 | source_names = sorted(data_source_providers.keys())
292 | if nodes_names != source_names:
293 | raise ValueError(
294 | "Source node and DataSource names don't match: "
295 | f"{nodes_names} vs {source_names}"
296 | )
297 | return [
298 | _ReplaySource(
299 | name, source_nodes[name], data_source_providers[name](replay_context)
300 | )
301 | for name in data_source_providers.keys()
302 | ]
303 |
304 |
305 | def _create_sinks(
306 | dag: Dag,
307 | replay_context: ReplayContext,
308 | data_sink_providers: dict[str, DataSinkProvider],
309 | ) -> list[_ReplaySink]:
310 | sink_nodes = dag.get_sinks()
311 | nodes_names = sorted(sink_nodes.keys())
312 | sink_names = sorted(data_sink_providers.keys())
313 | if nodes_names != sink_names:
314 | raise ValueError(
315 | f"Sink node and DataSink names don't match: {nodes_names} vs {sink_names}"
316 | )
317 | return [
318 | _ReplaySink(name, sink_nodes[name], data_sink_providers[name](replay_context))
319 | for name in data_sink_providers.keys()
320 | ]
321 |
322 |
323 | class IteratorDataSourceAdapter(DataSource[T]):
324 | """
325 | Adapter between an iterator of `DataSource` and a DataSource.
326 |
327 |     This can be used to stitch together several `DataSource`s, one per incremental date range.
328 | """
329 |
330 | def __init__(
331 | self,
332 | sources: Iterator[DataSource[T]],
333 | empty: T,
334 | concatenator: Callable[[T, T], T],
335 | ):
336 | self._sources = sources
337 | self._empty = empty
338 | self._concatenator = concatenator
339 | self._current = self._next()
340 |
341 | def read_to(self, timestamp: pd.Timestamp) -> T:
342 | if self._current is None:
343 | return self._empty
344 | else:
345 | this_batch = self._current.read_to(timestamp)
346 | while self._current is not None and self._current.get_next() == UTC_MAX:
347 | self._current = self._next()
348 | next_batch = (
349 | self._empty
350 | if self._current is None
351 | else self._current.read_to(timestamp)
352 | )
353 | if next_batch and this_batch:
354 | this_batch = self._concatenator(this_batch, next_batch)
355 | elif next_batch:
356 | this_batch = next_batch
357 |
358 | return this_batch
359 |
360 | def get_next(self) -> pd.Timestamp:
361 | if self._current is None:
362 | return UTC_MAX
363 | else:
364 | return self._current.get_next()
365 |
366 | def _next(self) -> Optional[DataSource]:
367 | try:
368 | return next(self._sources)
369 | except StopIteration:
370 | return None
371 |
372 |
373 | class NoOpDataSink(DataSink):
374 | """DataSink that does nothing."""
375 |
376 | def append(self, timestamp: pd.Timestamp, data: T):
377 | pass
378 |
379 | def close(self):
380 | pass
381 |
382 |
383 | class NoOpDataSinkProvider:
384 | """DataSinkProvider that provides a NoOpDataSink."""
385 |
386 | def __call__(self, context: ReplayContext) -> DataSink[T]:
387 | return NoOpDataSink()
388 |
--------------------------------------------------------------------------------
/beavers/table.html:
--------------------------------------------------------------------------------
1 | <!-- Tornado template rendered by TableRequestHandler.
2 |      The original markup was stripped from this dump; only the page title
3 |      "{{table_config.name}} Beavers" survived extraction. -->
--------------------------------------------------------------------------------
/beavers/testing.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Optional, Sequence, TypeVar
2 |
3 | import pandas as pd
4 |
5 | from beavers.dag import Dag
6 |
7 | T = TypeVar("T")
8 |
9 |
10 | class DagTestBench:
11 | def __init__(self, dag: Dag):
12 | self.dag = dag
13 | for output_name, output_sinks in self.dag.get_sinks().items():
14 | assert len(output_sinks) == 1, output_name
15 |
16 | def set_source(
17 | self,
18 | source_name: str,
19 | source_data: Any,
20 | ) -> "DagTestBench":
21 | source = self.dag.get_sources()[source_name]
22 | source.set_stream(source_data)
23 | return self
24 |
25 | def execute(self, now: Optional[pd.Timestamp] = None) -> "DagTestBench":
26 | self.dag.execute(now)
27 | return self
28 |
29 | def assert_sink_list(
30 | self,
31 | sink_name: str,
32 | expected_messages: Sequence[T],
33 | ) -> "DagTestBench":
34 | sinks = self.dag.get_sinks()[sink_name]
35 | assert len(sinks) == 1
36 | cycle_id = sinks[0].get_cycle_id()
37 | assert cycle_id == self.dag.get_cycle_id()
38 | actual_messages = sinks[0].get_sink_value()
39 | assert len(actual_messages) == len(expected_messages), (
40 | f"Sink {sink_name} value size mismatch"
41 | )
42 | for actual_message, expected_message in zip(actual_messages, expected_messages):
43 | assert actual_message == expected_message
44 | return self
45 |
46 | def assert_sink_not_updated(self, sink_name: str) -> "DagTestBench":
47 | sinks = self.dag.get_sinks()[sink_name]
48 | assert len(sinks) == 1
49 | cycle_id = sinks[0].get_cycle_id()
50 | assert cycle_id < self.dag.get_cycle_id(), (
51 | f"Sink {sink_name} got updated this cycle"
52 | )
53 | return self
54 |
--------------------------------------------------------------------------------
/docs/concepts/advanced.md:
--------------------------------------------------------------------------------
1 | # Advanced
2 |
3 | This section discusses advanced features that control how updates propagate in the DAG.
4 |
5 | ## How updates propagate in the DAG
6 |
7 | - Nodes are notified if any of their input nodes were updated during the current execution cycle
8 | ```python
9 | --8<-- "examples/advanced_concepts.py:propagate_any"
10 | ```
11 | - You can check if a node updated by looking at its `cycle_id`
12 | ```python
13 | --8<-- "examples/advanced_concepts.py:propagate_cycle_id"
14 | ```
15 | - If several inputs of a node get updated during the same cycle, the node will be executed once (and not once per input)
16 | ```python
17 | --8<-- "examples/advanced_concepts.py:propagate_both"
18 | ```
19 | - Stream nodes (and sources) are not considered updated if their output is empty
20 | ```python
21 | --8<-- "examples/advanced_concepts.py:propagate_empty"
22 | ```
23 |
24 |
25 | ## Now node
26 |
27 | Beavers can be used in both `live` and `replay` mode.
28 | In `replay` mode, the wall clock isn't relevant.
29 | To access the current time of the replay, you should use the now node:
30 |
31 | ```python
32 | --8<-- "examples/advanced_concepts.py:now_node"
33 | ```
34 |
35 | The now node is shared for the whole DAG.
36 | Its value gets updated silently.
37 |
38 | ## TimerManager
39 |
40 | To be notified when time passes, nodes can subscribe to a `TimerManager` node.
41 |
42 | ```python
43 | --8<-- "examples/advanced_concepts.py:timer_manager"
44 | ```
45 |
46 | ## Silent updates
47 |
48 | Some nodes may update too often, or their updates may not be relevant to other nodes.
49 | In this case it's possible to silence them:
50 |
51 | ```python
52 | --8<-- "examples/advanced_concepts.py:silence"
53 | ```
54 |
55 | `silence` returns a new silenced node (rather than modifying the existing node).
56 |
57 | ## Value Cutoff
58 |
59 | By default, state nodes will update every time they are notified.
60 | The framework doesn't check that their value has changed.
61 |
62 | You can add a cutoff, to prevent updates when the value hasn't changed:
63 |
64 | ```python
65 | --8<-- "examples/advanced_concepts.py:cutoff"
66 | ```
67 |
68 | You can also provide a custom comparator to allow some tolerance when deciding if a value has changed:
69 |
70 | ```python
71 | --8<-- "examples/advanced_concepts.py:cutoff_custom"
72 | ```
73 |
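74 | To close this section, here is a minimal, self-contained sketch of the propagation rules above
75 | (default `[]` empty values, update detection via `cycle_id`).
76 | It is illustrative only and separate from `examples/advanced_concepts.py`:
77 | 
78 | ```python
79 | from beavers.dag import Dag
80 | 
81 | dag = Dag()
82 | source = dag.source_stream(name="numbers")
83 | doubled = dag.stream(lambda xs: [x * 2 for x in xs]).map(source)
84 | 
85 | source.set_stream([1, 2, 3])
86 | dag.execute()
87 | # the stream node was notified and updated this cycle:
88 | assert doubled.get_cycle_id() == dag.get_cycle_id()
89 | 
90 | dag.execute()
91 | # no new data: the source is empty, so `doubled` is not notified and keeps its old cycle id
92 | assert doubled.get_cycle_id() < dag.get_cycle_id()
93 | ```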
--------------------------------------------------------------------------------
/docs/concepts/dag.md:
--------------------------------------------------------------------------------
1 |
2 | # DAG
3 |
4 | At its core, `beavers` executes a Directed Acyclic Graph (DAG), where each node is a python function.
5 | This section discusses the different types of nodes in the DAG.
6 |
7 | ## Stream Source
8 |
9 | A stream source is a node whose value can be set externally.
10 |
11 | When `Dag.execute` is called, the updated value is propagated in the DAG
12 |
13 | ```python
14 | --8<-- "examples/dag_concepts.py:source_stream"
15 | ```
16 |
17 | If the DAG is executed again, the value of the source stream will be reset to its empty value.
18 |
19 | ```python
20 | --8<-- "examples/dag_concepts.py:source_stream_again"
21 | ```
22 |
23 | The default empty value is set to `[]`, but it can be customized:
24 |
25 | ```python
26 | --8<-- "examples/dag_concepts.py:source_stream_empty"
27 | ```
28 |
29 | A source stream can be given a name, so it can be retrieved later (and its value set):
30 |
31 | ```python
32 | --8<-- "examples/dag_concepts.py:source_stream_name"
33 | ```
34 |
35 | ## Stream Node
36 |
37 | A stream node uses the output of other nodes to calculate its updated value.
38 |
39 | ```python
40 | --8<-- "examples/dag_concepts.py:stream_node"
41 | ```
42 |
43 | If the DAG is executed again, the value of the stream node will be reset to its empty value.
44 |
45 | ```python
46 | --8<-- "examples/dag_concepts.py:stream_node_again"
47 | ```
48 |
49 | The default empty value is set to `[]`, but it can be customized:
50 | ```python
51 | --8<-- "examples/dag_concepts.py:stream_node_empty"
52 | ```
53 |
54 | The function provided to the node can be any callable, like a lambda:
55 | ```python
56 | --8<-- "examples/dag_concepts.py:stream_node_lambda"
57 | ```
58 |
59 | Or a class defining `__call__`:
60 | ```python
61 | --8<-- "examples/dag_concepts.py:stream_node_callable"
62 | ```
63 |
64 | ## State Node
65 |
66 | A state node retains its value from one DAG execution to the next, even if it didn't update:
67 | ```python
68 | --8<-- "examples/dag_concepts.py:state_node"
69 | ```
70 |
71 | Because they retain their value when they are not updated, state nodes don't require an empty value
72 |
73 | ## Const Node
74 |
75 | A const node is a node whose value doesn't change.
76 | ```python
77 | --8<-- "examples/dag_concepts.py:const_node"
78 | ```
79 |
80 | Const nodes behave like state nodes (their value isn't reset when they don't update).
81 |
82 | ## Connecting Nodes (aka `map`)
83 |
84 | Nodes are connected by calling the `map` function.
85 | Any stream or state node can be connected to state nodes, stream nodes or const nodes.
86 |
87 | > :warning: The `map` function doesn't execute the underlying node.
88 | > Instead it adds a node to the DAG
89 |
90 | The map function can use positional arguments:
91 |
92 | ```python
93 | --8<-- "examples/dag_concepts.py:map_positional"
94 | ```
95 | Or keyword arguments:
96 |
97 | ```python
98 | --8<-- "examples/dag_concepts.py:map_key_word"
99 | ```
100 |
101 | ## State vs Stream
102 |
103 | Stream Nodes:
104 |
105 | - need their return type to implement `collections.abc.Sized`
106 | - need an empty value to be specified (which defaults to `[]`)
107 | - have their value reset to empty when they don't update
108 | - are not considered updated if they return empty
109 |
110 | State Nodes:
111 |
112 | - Can return any type
113 | - don't require an empty value
114 | - retain their value on cycles where they don't update
115 | - are always considered updated if they are called
116 |
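117 | As an illustration of the difference, here is a minimal sketch (separate from the examples above)
118 | with one stream node and one state node fed by the same source:
119 | 
120 | ```python
121 | from beavers.dag import Dag
122 | 
123 | 
124 | class RunningTotal:
125 |     """State: accumulate the sum of all values seen so far."""
126 | 
127 |     def __init__(self):
128 |         self.total = 0
129 | 
130 |     def __call__(self, values: list[int]) -> int:
131 |         self.total += sum(values)
132 |         return self.total
133 | 
134 | 
135 | dag = Dag()
136 | source = dag.source_stream(name="values")  # stream source, empty defaults to []
137 | doubled = dag.stream(lambda xs: [x * 2 for x in xs]).map(source)  # stream node
138 | total = dag.state(RunningTotal()).map(source)  # state node
139 | 
140 | source.set_stream([1, 2, 3])
141 | dag.execute()  # doubled produces [2, 4, 6], total becomes 6
142 | 
143 | dag.execute()  # no new data: doubled resets to [], total keeps its value of 6
144 | ```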
--------------------------------------------------------------------------------
/docs/concepts/kafka.md:
--------------------------------------------------------------------------------
1 | # Live with Kafka
2 |
3 | This section explains how to run a beavers application in real time using kafka.
4 |
5 | ## Count Word Example
6 |
7 | Starting with a simple "count word" dag with one source going to one sink:
8 |
9 | ```python
10 | --8<-- "examples/kafka_concepts.py:dag"
11 | ```
12 |
13 | This dag has a source node called `words` and a sink node called `counts`.
14 |
15 | ## Defining Kafka Source
16 |
17 | We will be receiving data from kafka, on a topic called `words`.
18 |
19 | First we need to define how we deserialize messages coming from kafka:
20 |
21 | ```python
22 | --8<-- "examples/kafka_concepts.py:deserializer"
23 | ```
24 |
25 | Then, we put together the `SourceTopic` with its:
26 |
27 | - topic (`words`)
28 | - deserializer (`deserialize_messages`)
29 | - replay policy (`from_latest`)
30 |
31 | ```python
32 | --8<-- "examples/kafka_concepts.py:kafka_source"
33 | ```
34 |
35 | There are multiple kafka replay policies available; see the API doc for the full list.
36 |
37 | ## Defining Kafka Sink
38 |
39 | We will be sending the results to the `counts` topic.
40 | The key will be the word. The value will be the latest count.
41 |
42 | First we need to define a serializer, which converts each count to a `KafkaProducerMessage`
43 |
44 | ```python
45 | --8<-- "examples/kafka_concepts.py:serializer"
46 | ```
47 |
48 | The serializer is responsible for providing the topic for each outgoing message.
49 |
50 | ## Putting it together with KafkaDriver
51 |
52 | The `KafkaDriver` takes care of creating the kafka producer and consumer, and passing the messages through:
53 |
54 | ```python
55 | --8<-- "examples/kafka_concepts.py:kafka_driver"
56 | ```
57 |
58 | ## Beavers Kafka Features
59 |
60 | - One consumer: There is only one consumer (rather than one consumer for each topic)
61 | - One producer: There is only one producer (rather than one producer for each topic)
62 | - When polling messages, beavers tries to read all available messages, up to a limit of `batch_size=5000` (which is configurable in the KafkaDriver)
63 | - When replaying past data, beavers orchestrates topics/partitions so data is replayed in order, across topics, based on each message's timestamp.
64 | - When replaying past data, some newer messages have to be held.
65 |   To avoid memory issues, the number of held messages is capped at `batch_size*5`.
66 |   Once the number of held messages gets too high, partitions that are ahead of the watermark are paused.
67 |   These partitions are resumed once the application catches up.
68 |
69 |
70 | ## Beavers Kafka Limitations
71 |
72 | - One beavers application consumes every partition for requested topics (no load balancing/scaling)
73 |
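74 | Going back to the serializer section above: as a rough, illustrative sketch (the real code lives in
75 | `examples/kafka_concepts.py`), a word-count serializer could look like the snippet below.
76 | It assumes the `counts` sink produces a `dict[str, int]` of word to count:
77 | 
78 | ```python
79 | from beavers.kafka import KafkaProducerMessage
80 | 
81 | 
82 | def serialize_counts(counts: dict[str, int]) -> list[KafkaProducerMessage]:
83 |     # one message per word, keyed by the word, published on the "counts" topic
84 |     return [
85 |         KafkaProducerMessage(
86 |             "counts",
87 |             key=word.encode("utf-8"),
88 |             value=str(count).encode("utf-8"),
89 |         )
90 |         for word, count in counts.items()
91 |     ]
92 | ```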
--------------------------------------------------------------------------------
/docs/concepts/pandas.md:
--------------------------------------------------------------------------------
1 | # Pandas integration
2 |
3 | This section explains how to use beavers with pandas.
4 |
5 | ## ETF value calculation example
6 |
7 | In this example we want to calculate the value of ETFs.
8 | If you are not familiar with ETFs, think about them as just a basket of shares.
9 |
10 | Starting with a table of individual share prices:
11 | ```python
12 | --8<-- "examples/pandas_concepts.py:business_logic_price"
13 | ```
14 |
15 | | ticker | price |
16 | |:---------|--------:|
17 | | AAPL | 174.79 |
18 | | GOOGL | 130.25 |
19 | | MSFT | 317.01 |
20 | | F | 12.43 |
21 | | GM | 35.28 |
22 |
23 | And another table containing the composition of each ETF:
24 | ```python
25 | --8<-- "examples/pandas_concepts.py:business_logic_composition"
26 | ```
27 |
28 | | etf | ticker | quantity |
29 | |:------|:---------|-----------:|
30 | | TECH | AAPL | 2.0 |
31 | | TECH | GOOGL | 2.0 |
32 | | TECH | MSFT | 1.0 |
33 | | CARS | F | 3.0 |
34 | | CARS | GM | 1.0 |
35 |
36 | In a few lines of `pandas` we can derive the value of each ETF:
37 | ```python
38 | --8<-- "examples/pandas_concepts.py:business_logic_calculation"
39 | ```
40 |
41 | | etf | value |
42 | |:-----|--------:|
43 | | TECH | 927.09 |
44 | | CARS | 72.57 |
45 |
46 | ## ETF value calculation DAG
47 |
48 | Once the business logic of the calculation is written and tested, it can be added to a Dag.
49 | We'll be using the Dag `pd` helper, which makes it easier to deal with `pandas` tables in beavers.
50 |
51 | First we define two source streams, made of `pandas.DataFrame`:
52 | ```python
53 | --8<-- "examples/pandas_concepts.py:dag_source"
54 | ```
55 |
56 | Then we keep track of the latest value for each source stream:
57 | ```python
58 | --8<-- "examples/pandas_concepts.py:dag_state"
59 | ```
60 |
61 | Lastly we put together the share prices and ETF composition:
62 | ```python
63 | --8<-- "examples/pandas_concepts.py:dag_calculation"
64 | ```
65 |
66 | And that's it:
67 |
68 | ```python
69 | --8<-- "examples/pandas_concepts.py:dag_test"
70 | ```
71 |
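72 | The `pd` helper hides most of the plumbing.
73 | If you are curious, the same kind of wiring can be sketched with the core `Dag` API directly
74 | (illustrative only, and not the code from `examples/pandas_concepts.py`):
75 | 
76 | ```python
77 | import pandas as pd
78 | 
79 | from beavers.dag import Dag
80 | 
81 | EMPTY_PRICE = pd.DataFrame({"ticker": pd.Series(dtype=str), "price": pd.Series(dtype=float)})
82 | 
83 | 
84 | class LatestByTicker:
85 |     """Keep the last price seen for each ticker."""
86 | 
87 |     def __init__(self):
88 |         self.current = EMPTY_PRICE
89 | 
90 |     def __call__(self, prices: pd.DataFrame) -> pd.DataFrame:
91 |         self.current = (
92 |             pd.concat([self.current, prices]).groupby("ticker", as_index=False).last()
93 |         )
94 |         return self.current
95 | 
96 | 
97 | dag = Dag()
98 | price_source = dag.source_stream(empty=EMPTY_PRICE, name="price")
99 | latest_price = dag.state(LatestByTicker()).map(price_source)
100 | ```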
--------------------------------------------------------------------------------
/docs/concepts/perspective.md:
--------------------------------------------------------------------------------
1 | # Perspective Integration
2 |
3 | This section explains how to build a live web dashboard with [Perspective](https://github.com/finos/perspective) and Beavers.
4 |
5 | In Beavers, you can connect any node of type `pyarrow.Table` to a perspective table.
6 | All you need to do is call `dag.psp.to_perspective`, and provide a `PerspectiveTableDefinition`.
7 |
8 |
9 | ## Key Value Example
10 |
11 | We'll write a super simple key-value store application.
12 | It listens to a topic and displays the value of kafka messages by key, along with their timestamp.
13 |
14 | ## Install
15 |
16 | ```shell
17 | pip install beavers[pyarrow, perspective-python]
18 | ```
19 |
20 | ## Defining the schema of incoming message
21 |
22 | First we define a schema for the incoming "key value" messages:
23 |
24 | - a timestamp, in millis
25 | - a key (string)
26 | - a value (string)
27 |
28 | ```python
29 | --8<-- "examples/perspective_concepts.py:schema"
30 | ```
31 |
32 | ## Convert kafka messages to arrow Table
33 |
34 | Then we write a function that converts kafka messages to an apache arrow table of "key value" messages:
35 |
36 | ```python
37 | --8<-- "examples/perspective_concepts.py:converter"
38 | ```
39 |
40 |
41 | ## Create a dag
42 |
43 | We create a super simple dag.
44 | It has a source, called `key_value`, which is a table of "key value" messages.
45 | The source is plugged into a perspective table, called... `key_value`, whose index is the `key` column
46 |
47 | ```python
48 | --8<-- "examples/perspective_concepts.py:dag"
49 | ```
50 |
51 | ## Run the dashboard
52 |
53 | Lastly, we put everything together in an application
54 | ```python
55 | --8<-- "examples/perspective_concepts.py:run"
56 | ```
57 |
58 | You should be able to see it at http://localhost:8082/key_value
59 |
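60 | For reference, the `to_perspective` call itself can be sketched as below.
61 | The field values are illustrative, and we assume `PerspectiveTableDefinition` accepts them as
62 | keyword arguments with sensible defaults for the remaining fields
63 | (check `beavers.perspective_wrapper` for the exact signature):
64 | 
65 | ```python
66 | import pyarrow as pa
67 | 
68 | from beavers.dag import Dag
69 | from beavers.perspective_wrapper import PerspectiveTableDefinition
70 | 
71 | schema = pa.schema(
72 |     [
73 |         pa.field("timestamp", pa.timestamp("ms", tz="UTC")),
74 |         pa.field("key", pa.string()),
75 |         pa.field("value", pa.string()),
76 |     ]
77 | )
78 | 
79 | dag = Dag()
80 | key_value = dag.pa.source_table(schema, name="key_value")
81 | dag.psp.to_perspective(
82 |     key_value,
83 |     # assumption: only name and index_column are required, other fields have defaults
84 |     PerspectiveTableDefinition(name="key_value", index_column="key"),
85 | )
86 | ```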
--------------------------------------------------------------------------------
/docs/concepts/polars.md:
--------------------------------------------------------------------------------
1 | # Polars integration
2 |
3 | This section explains how to use beavers with polars.
4 |
5 | ## ETF value calculation example
6 |
7 | In this example we want to calculate the value of ETFs.
8 |
9 | Starting with a data frame of individual share prices:
10 | ```python
11 | --8<-- "examples/polars_concepts.py:business_logic_price"
12 | ```
13 |
14 | | ticker | price |
15 | |:---------|--------:|
16 | | AAPL | 174.79 |
17 | | GOOGL | 130.25 |
18 | | MSFT | 317.01 |
19 | | F | 12.43 |
20 | | GM | 35.28 |
21 |
22 | And another data frame containing the composition of each ETF:
23 | ```python
24 | --8<-- "examples/polars_concepts.py:business_logic_composition"
25 | ```
26 |
27 | | etf | ticker | quantity |
28 | |:------|:---------|-----------:|
29 | | TECH | AAPL | 2.0 |
30 | | TECH | GOOGL | 2.0 |
31 | | TECH | MSFT | 1.0 |
32 | | CARS | F | 3.0 |
33 | | CARS | GM | 1.0 |
34 |
35 | In a few lines of `polars` we can derive the value of each ETF:
36 | ```python
37 | --8<-- "examples/polars_concepts.py:business_logic_calculation"
38 | ```
39 |
40 | | etf | value |
41 | |:-----|--------:|
42 | | TECH | 927.09 |
43 | | CARS | 72.57 |
44 |
45 | ## ETF value calculation DAG
46 |
47 | Once the business logic of the calculation is written and tested, it can be added to a Dag.
48 | We'll be using the Dag `pl` helper, which makes it easier to deal with `polars` data frames in beavers.
49 |
50 | First we define two source streams, made of `polars.DataFrame`:
51 | ```python
52 | --8<-- "examples/polars_concepts.py:dag_source"
53 | ```
54 |
55 | Then we keep track of the latest value for each source stream:
56 | ```python
57 | --8<-- "examples/polars_concepts.py:dag_state"
58 | ```
59 |
60 | Lastly we put together the share prices and ETF composition:
61 | ```python
62 | --8<-- "examples/polars_concepts.py:dag_calculation"
63 | ```
64 |
65 | And that's it:
66 |
67 | ```python
68 | --8<-- "examples/polars_concepts.py:dag_test"
69 | ```
70 |
71 |
72 | ## Taming updates
73 |
74 | This simple dag does the job of calculating the ETF value in real time.
75 | But there is one issue.
76 | The value of every ETF would update every time either `price` or `etf_composition` updates.
77 | Even if the update comes on a ticker that is not relevant to the ETFs we are tracking.
78 |
79 | In the example below, when the price of GameStop updates, we recalculate the value of every ETF.
80 | Even though their value hasn't changed:
81 | ```python
82 | --8<-- "examples/polars_concepts.py:spurious_update"
83 | ```
84 |
85 | To tame updates we need to identify which ETF needs updating.
86 |
87 | ETF values can update because their composition has changed:
88 | ```python
89 | --8<-- "examples/polars_concepts.py:updated_because_of_composition"
90 | ```
91 |
92 | Or because one of their components has updated:
93 | ```python
94 | --8<-- "examples/polars_concepts.py:updated_because_of_price"
95 | ```
96 |
97 | We can then put it back together and only calculate updates for relevant ETFs:
98 | ```python
99 | --8<-- "examples/polars_concepts.py:update_all"
100 | ```
101 |
102 |
103 | And see that only the value of the "TECH" ETF updates when a tech stock updates:
104 | ```python
105 | --8<-- "examples/polars_concepts.py:update_all_test"
106 | ```
107 |
108 | | etf | value |
109 | |:------|--------:|
110 | | TECH | 927.13 |
111 |
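112 | For reference, here is what the `pl` helper calls used on this page look like in isolation
113 | (a minimal sketch with illustrative column names and dtypes):
114 | 
115 | ```python
116 | import polars as pl
117 | 
118 | from beavers.dag import Dag
119 | 
120 | dag = Dag()
121 | price_schema = pl.Schema({"ticker": pl.String, "price": pl.Float64})
122 | price_stream = dag.pl.source_table(price_schema, name="price")
123 | # state: keep the latest price per ticker across cycles
124 | latest_price = dag.pl.last_by_keys(price_stream, ["ticker"])
125 | # stream: only keep rows above a threshold within each cycle
126 | expensive = dag.pl.filter_stream(price_stream, pl.col("price") > 100.0)
127 | ```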
--------------------------------------------------------------------------------
/docs/concepts/pyarrow.md:
--------------------------------------------------------------------------------
1 | # Pyarrow integration
2 |
3 | This section explains how to use beavers with pyarrow.
4 |
5 | ## ETF value calculation example
6 |
7 | In this example we want to calculate the value of ETFs.
8 | If you are not familiar with ETFs, think of them as just a basket of shares.
9 |
10 | Starting with a table of individual share prices:
11 | ```python
12 | --8<-- "examples/pyarrow_concepts.py:business_logic_price"
13 | ```
14 |
15 | | ticker | price |
16 | |:---------|--------:|
17 | | AAPL | 174.79 |
18 | | GOOGL | 130.25 |
19 | | MSFT | 317.01 |
20 | | F | 12.43 |
21 | | GM | 35.28 |
22 |
23 | And another table containing the composition of each ETF:
24 | ```python
25 | --8<-- "examples/pyarrow_concepts.py:business_logic_composition"
26 | ```
27 |
28 | | etf | ticker | quantity |
29 | |:------|:---------|-----------:|
30 | | TECH | AAPL | 2.0 |
31 | | TECH | GOOGL | 2.0 |
32 | | TECH | MSFT | 1.0 |
33 | | CARS | F | 3.0 |
34 | | CARS | GM | 1.0 |
35 |
36 | In a few lines of `pyarrow` we can derive the value of each ETF:
37 | ```python
38 | --8<-- "examples/pyarrow_concepts.py:business_logic_calculation"
39 | ```
40 |
41 | | etf | value |
42 | |:-----|--------:|
43 | | TECH | 927.09 |
44 | | CARS | 72.57 |
45 |
46 | ## ETF value calculation DAG
47 |
48 | Once the business logic of the calculation is written and tested, it can be added to a Dag.
49 | We'll be using the Dag `pa` helper, which makes it easier to deal with `pyarrow` tables in beavers.
50 |
51 | First we define two source streams, made of `pyarrow.Table`:
52 | ```python
53 | --8<-- "examples/pyarrow_concepts.py:dag_source"
54 | ```
55 |
56 | Then we keep track of the latest value for each source stream:
57 | ```python
58 | --8<-- "examples/pyarrow_concepts.py:dag_state"
59 | ```
60 |
61 | Lastly we put together the share prices and ETF composition:
62 | ```python
63 | --8<-- "examples/pyarrow_concepts.py:dag_calculation"
64 | ```
65 |
66 | And that's it:
67 |
68 | ```python
69 | --8<-- "examples/pyarrow_concepts.py:dag_test"
70 | ```
71 |
72 |
73 | ## Taming updates
74 |
75 | This simple dag does the job of calculating the ETF value in real time.
76 | But there is one issue.
77 | The value of every ETF would update every time either `price` or `etf_composition` updates.
78 | Even if the update comes on a ticker that is not relevant to the ETFs we are tracking.
79 |
80 | In the example below, when the price of GameStop updates, we recalculate the value of every ETF.
81 | Even though their value hasn't changed:
82 | ```python
83 | --8<-- "examples/pyarrow_concepts.py:spurious_update"
84 | ```
85 |
86 | To tame updates we need to identify which ETF needs updating.
87 |
88 | ETF values can update because their composition has changed:
89 | ```python
90 | --8<-- "examples/pyarrow_concepts.py:updated_because_of_composition"
91 | ```
92 |
93 | Or because one of their components has updated:
94 | ```python
95 | --8<-- "examples/pyarrow_concepts.py:updated_because_of_price"
96 | ```
97 |
98 | We can then put it back together and only calculate updates for relevant ETFs:
99 | ```python
100 | --8<-- "examples/pyarrow_concepts.py:update_all"
101 | ```
102 |
103 |
104 | And see that only the value of the "TECH" ETF updates when a tech stock updates:
105 | ```python
106 | --8<-- "examples/pyarrow_concepts.py:update_all_test"
107 | ```
108 |
109 | | etf | value |
110 | |:------|--------:|
111 | | TECH | 927.13 |
112 |
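113 | For reference, here is what the `pa` helper calls used on this page look like in isolation
114 | (a minimal sketch with illustrative column names):
115 | 
116 | ```python
117 | import pyarrow as pa
118 | import pyarrow.compute as pc
119 | 
120 | from beavers.dag import Dag
121 | 
122 | dag = Dag()
123 | price_schema = pa.schema([pa.field("ticker", pa.string()), pa.field("price", pa.float64())])
124 | price_stream = dag.pa.source_table(price_schema, name="price")
125 | # state: keep the latest price per ticker across cycles
126 | latest_price = dag.pa.last_by_keys(price_stream, ["ticker"])
127 | # stream: only keep rows above a threshold within each cycle
128 | expensive = dag.pa.filter_stream(lambda table: pc.greater(table["price"], 100.0), price_stream)
129 | ```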
--------------------------------------------------------------------------------
/docs/concepts/replay.md:
--------------------------------------------------------------------------------
1 | # Replay
2 |
3 | This section explains how to run a beavers application using historical data, typically stored in files or databases.
4 |
5 | ## Manual Replay
6 |
7 | Starting with a simple dag with one source going to one sink:
8 |
9 | ```python
10 | --8<-- "examples/replay_concepts.py:simple_dag"
11 | ```
12 |
13 | Assuming your data has this shape:
14 | ```python
15 | --8<-- "examples/replay_concepts.py:simple_data_class"
16 | ```
17 |
18 | You could replay the data manually yourself and run the dag at regular intervals:
19 | ```python
20 | --8<-- "examples/replay_concepts.py:manual_replay"
21 | ```
22 |
23 | But this requires a lot of boilerplate code and becomes cumbersome very quickly.
24 |
25 | ## Replay Framework
26 |
27 | The replay framework uses a few key abstractions to define how the data is loaded and injected into the dag.
28 |
29 | ### `DataSource`
30 |
31 | A `DataSource` provides a way of streaming data.
32 | ```python
33 | --8<-- "examples/replay_concepts.py:data_source"
34 | ```
35 |
36 | By convention, `DataSource`s:
37 |
38 | - return `UTC_MAX` when there is no more data
39 | - are stateful and need to remember what has already been read.
40 |
41 | ### `ReplayContext`
42 |
43 | The `ReplayContext` contains timing information:
44 | ```python
45 | --8<-- "examples/replay_concepts.py:replay_context"
46 | ```
47 |
48 | :warning: By convention all timestamps are UTC
49 |
50 |
51 | ### `DataSourceProvider`
52 |
53 | A `DataSourceProvider` provides a way of creating `DataSource`.
54 |
55 | For example, if the data is stored in a csv file:
56 |
57 | ```csv
58 | timestamp,message
59 | 2023-01-01 01:00:00+00:00,Hello
60 | 2023-01-01 01:01:00+00:00,How are you
61 | ```
62 |
63 | Provided with the `ReplayContext`, our `DataSourceProvider` will load the csv file and return a `DataSource`:
64 |
65 | ```python
66 | --8<-- "examples/replay_concepts.py:data_source_provider"
67 | ```
68 |
69 |
70 | ### `DataSink`
71 |
72 | A `DataSink` provides a way of capturing the output of nodes and saving the data:
73 |
74 |
75 | ```python
76 | --8<-- "examples/replay_concepts.py:data_sink"
77 | ```
78 |
79 | ### `DataSinkProvider`
80 |
81 | A `DataSinkProvider` provides a way of creating `DataSink`.
82 |
83 | In this example we save the data to csv:
84 |
85 |
86 | ```python
87 | --8<-- "examples/replay_concepts.py:data_sink_provider"
88 | ```
89 |
90 |
91 | ### `ReplayDriver`
92 |
93 | The replay driver is responsible for putting the dag, context, sources and sinks together, and orchestrating the replay.
94 |
95 | ```python
96 | --8<-- "examples/replay_concepts.py:replay_driver"
97 | ```
98 |
99 |
100 | ## Reading Files Partitioned By Time
101 |
102 | Assuming:
103 |
104 | - you want to replay a dag for a long period of time.
105 | - all that historic data doesn't fit in memory
106 | - the data is partitioned by time period. For example one file per day, `input_2023-01-01.csv`.
107 |
108 | It's then possible, with the `IteratorDataSourceAdapter`, to load each file one by one as needed.
109 |
110 | In this example, csv files are stored under . We need to provide:
111 |
112 | - a generator that will yield a `DataSource` for each file, in order
113 | - a way to concatenate the output of 2 `DataSource`. In this case we'll use `+` to merge two lists
114 | - an empty value, for the case where there is no more data or we have gone past the last file.
115 |
116 | ```python
117 | --8<-- "examples/replay_concepts.py:iterator_data_source_adapter"
118 | ```
119 |
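120 | For pyarrow users, beavers also ships `ArrowTableDataSource` and `ArrowTableDataSink` in
121 | `beavers.pyarrow_replay`.
122 | Below is a rough sketch of stitching daily files together with the adapter; the parquet file
123 | layout and the `timestamp` column (which must be monotonically increasing) are assumptions
124 | made for the sake of the example:
125 | 
126 | ```python
127 | import pandas as pd
128 | import pyarrow as pa
129 | import pyarrow.parquet as pq
130 | 
131 | from beavers.pyarrow_replay import ArrowTableDataSource
132 | from beavers.replay import IteratorDataSourceAdapter, ReplayContext
133 | 
134 | 
135 | def make_arrow_source_provider(schema: pa.Schema):
136 |     """Return a `DataSourceProvider` reading one (hypothetical) parquet file per day."""
137 | 
138 |     def provider(context: ReplayContext) -> IteratorDataSourceAdapter:
139 |         def daily_sources():
140 |             for date in pd.date_range(context.start, context.end, freq="1D"):
141 |                 table = pq.read_table(f"input_{date.date()}.parquet")
142 |                 yield ArrowTableDataSource(table, lambda t: t["timestamp"])
143 | 
144 |         return IteratorDataSourceAdapter(
145 |             sources=daily_sources(),
146 |             empty=schema.empty_table(),
147 |             concatenator=lambda left, right: pa.concat_tables([left, right]),
148 |         )
149 | 
150 |     return provider
151 | ```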
--------------------------------------------------------------------------------
/docs/contributing.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | Welcome! We're happy to have you here. Thank you in advance for your contribution to Beavers.
4 |
5 | ## Development environment set up
6 |
7 | ```shell
8 | python3 -m venv --clear venv
9 | source venv/bin/activate
10 | poetry self add "poetry-dynamic-versioning[plugin]"
11 | poetry install
12 | pre-commit install
13 | ```
14 |
15 | ## Testing
16 |
17 | To run tests fast:
18 |
19 | ```shell
20 | pytest -n auto tests
21 | ```
22 |
23 | To get coverage:
24 |
25 | ```shell
26 | coverage run --branch --rcfile=./pyproject.toml --include "./beavers/*" -m pytest tests
27 | coverage report --show-missing
28 | ```
29 |
30 | ## Generating the change log
31 |
32 | We use [git-change-log](https://pawamoy.github.io/git-changelog/usage/) to generate our CHANGELOG.md
33 |
34 | Please follow the [basic convention](https://pawamoy.github.io/git-changelog/usage/#basic-convention) for commit
35 | message.
36 |
37 | To update the change log, run:
38 |
39 | ```shell
40 | git-changelog -io CHANGELOG.md
41 | ```
42 |
43 | ## New Release
44 |
45 | For a new release, first prepare the change log, then push and merge it.
46 |
47 | ```shell
48 | git-changelog --bump=auto -io CHANGELOG.md
49 | ```
50 |
51 | Then tag and push:
52 |
53 | ```shell
54 | git tag vX.X.X
55 | git push origin vX.X.X
56 | ```
57 |
58 | Lastly on github, go to tags and create a release.
59 | The CI will then deploy to pypi automatically.
60 |
61 | ## Testing the documentation
62 |
63 | ```shell
64 | mkdocs serve --livereload --watch=./
65 | ```
66 |
67 | ## Updating dependencies
68 |
69 | - For the repo `poetry update`
70 | - For the doc: `(cd docs/; pip-compile ./requirements.in > ./requirements.txt)`
71 | - For pre-commit: `pre-commit autoupdate`
72 |
73 | ## Resources
74 |
75 | The repo set up is inspired by this [guide](https://mathspp.com/blog/how-to-create-a-python-package-in-2022)
76 |
--------------------------------------------------------------------------------
/docs/faq.md:
--------------------------------------------------------------------------------
1 | # FAQ
2 |
3 | ## Why is it called beavers?
4 |
5 | Beavers are very clever animals that build dams to regulate the flow of rivers.
6 | Likewise, the beavers library builds a dam around your data to regulate how it is processed by your applications.
7 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | ![Beavers Logo][5]
2 |
3 | # Beavers
4 |
5 | [Documentation][6] / [Installation][7] / [Repository][1] / [PyPI][8]
6 |
7 | [Beavers][1] is a python library for stream processing, optimized for analytics.
8 |
9 | It is used at [Tradewell Technologies][2],
10 | to calculate analytics and serve model predictions,
11 | for both realtime and batch jobs.
12 |
13 | ## Key Features
14 |
15 | - Works in **real time** (eg: reading from Kafka) and **replay mode** (eg: reading from Parquet files).
16 | - Optimized for analytics, using micro-batches (instead of processing records one by one).
17 | - Similar to [incremental][3], it updates nodes in a dag incrementally.
18 | - Taking inspiration from [kafka streams][4], there are two types of nodes in the dag:
19 | - **Stream**: ephemeral micro-batches of events (cleared after every cycle).
20 | - **State**: durable state derived from streams.
21 | - Clear separation between the business logic and the IO.
22 | So the same dag can be used in real time mode, replay mode or can be easily tested.
23 | - Functional interface: no inheritance or decorator required.
24 | - Support for complicated joins, not just "linear" data flow.
25 |
26 | ## Limitations
27 |
28 | - No concurrency support.
29 | To speed up calculation use libraries like pandas, pyarrow or polars.
30 | - No async code.
31 | To speed up IO use kafka driver native thread or parquet IO thread pool.
32 | - No support for persistent state.
33 | Instead of saving state, replay historic data from kafka to prime stateful nodes.
34 |
35 | ## Talks
36 |
37 | - [Unified batch and stream processing in python | PyData Global 2023][9]
38 |
39 | [1]: https://github.com/tradewelltech/beavers
40 | [2]: https://www.tradewelltech.co/
41 | [3]: https://github.com/janestreet/incremental
42 | [4]: https://www.confluent.io/blog/kafka-streams-tables-part-1-event-streaming/
43 | [5]: https://raw.githubusercontent.com/tradewelltech/beavers/master/docs/static/icons/beavers/logo.svg
44 | [6]: https://beavers.readthedocs.io/en/latest/
45 | [7]: https://beavers.readthedocs.io/en/latest/install/
46 | [8]: https://pypi.org/project/beavers/
47 | [9]: https://www.youtube.com/watch?v=8pUwsGA8SQM
48 |
--------------------------------------------------------------------------------
/docs/install.md:
--------------------------------------------------------------------------------
1 | # Install
2 |
3 | ## Basic Install
4 |
5 | ```sh
6 | pip install beavers
7 | ```
8 |
9 | ## Extras
10 |
11 | To install with extras such as Arrow, Kafka or Perspective (quoted so the shell does not expand the brackets):
12 |
13 | ```sh
14 | pip install "beavers[pyarrow,confluent-kafka,perspective-python]"
15 | ```
16 |
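## Checking the Install

To verify the installation, the top-level `Dag` class (used throughout the examples) should be importable:

```sh
python -c "from beavers import Dag; print(Dag)"
```
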
--------------------------------------------------------------------------------
/docs/reference/dag.md:
--------------------------------------------------------------------------------
1 | ::: beavers.dag
2 | options:
3 | heading_level: 2
4 | show_source: false
5 |
--------------------------------------------------------------------------------
/docs/reference/kafka.md:
--------------------------------------------------------------------------------
1 | ::: beavers.kafka
2 | options:
3 | heading_level: 2
4 | show_source: false
5 |
--------------------------------------------------------------------------------
/docs/reference/pandas_wrapper.md:
--------------------------------------------------------------------------------
1 | ::: beavers.pandas_wrapper
2 | options:
3 | heading_level: 2
4 | show_source: false
5 |
--------------------------------------------------------------------------------
/docs/reference/pyarrow_wrapper.md:
--------------------------------------------------------------------------------
1 | ::: beavers.pyarrow_wrapper
2 | options:
3 | heading_level: 2
4 | show_source: false
5 |
--------------------------------------------------------------------------------
/docs/reference/replay.md:
--------------------------------------------------------------------------------
1 | ::: beavers.replay
2 | options:
3 | heading_level: 2
4 | show_source: false
5 |
--------------------------------------------------------------------------------
/docs/requirements.in:
--------------------------------------------------------------------------------
1 | markdown-include
2 | mkdocs
3 | mkdocs-material
4 | mkdocs-material-extensions
5 | mkdocstrings[python]
6 | pymdown-extensions
7 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file is autogenerated by pip-compile with Python 3.10
3 | # by the following command:
4 | #
5 | # pip-compile ./requirements.in
6 | #
7 | babel==2.17.0
8 | # via mkdocs-material
9 | backrefs==5.8
10 | # via mkdocs-material
11 | certifi==2025.4.26
12 | # via requests
13 | charset-normalizer==3.4.2
14 | # via requests
15 | click==8.2.1
16 | # via mkdocs
17 | colorama==0.4.6
18 | # via
19 | # griffe
20 | # mkdocs-material
21 | ghp-import==2.1.0
22 | # via mkdocs
23 | griffe==1.7.3
24 | # via mkdocstrings-python
25 | idna==3.10
26 | # via requests
27 | jinja2==3.1.6
28 | # via
29 | # mkdocs
30 | # mkdocs-material
31 | # mkdocstrings
32 | markdown==3.8
33 | # via
34 | # markdown-include
35 | # mkdocs
36 | # mkdocs-autorefs
37 | # mkdocs-material
38 | # mkdocstrings
39 | # pymdown-extensions
40 | markdown-include==0.8.1
41 | # via -r ./requirements.in
42 | markupsafe==3.0.2
43 | # via
44 | # jinja2
45 | # mkdocs
46 | # mkdocs-autorefs
47 | # mkdocstrings
48 | mergedeep==1.3.4
49 | # via
50 | # mkdocs
51 | # mkdocs-get-deps
52 | mkdocs==1.6.1
53 | # via
54 | # -r ./requirements.in
55 | # mkdocs-autorefs
56 | # mkdocs-material
57 | # mkdocstrings
58 | mkdocs-autorefs==1.4.2
59 | # via
60 | # mkdocstrings
61 | # mkdocstrings-python
62 | mkdocs-get-deps==0.2.0
63 | # via mkdocs
64 | mkdocs-material==9.6.14
65 | # via -r ./requirements.in
66 | mkdocs-material-extensions==1.3.1
67 | # via
68 | # -r ./requirements.in
69 | # mkdocs-material
70 | mkdocstrings[python]==0.29.1
71 | # via
72 | # -r ./requirements.in
73 | # mkdocstrings-python
74 | mkdocstrings-python==1.16.12
75 | # via mkdocstrings
76 | packaging==25.0
77 | # via mkdocs
78 | paginate==0.5.7
79 | # via mkdocs-material
80 | pathspec==0.12.1
81 | # via mkdocs
82 | platformdirs==4.3.8
83 | # via mkdocs-get-deps
84 | pygments==2.19.1
85 | # via mkdocs-material
86 | pymdown-extensions==10.15
87 | # via
88 | # -r ./requirements.in
89 | # mkdocs-material
90 | # mkdocstrings
91 | python-dateutil==2.9.0.post0
92 | # via ghp-import
93 | pyyaml==6.0.2
94 | # via
95 | # mkdocs
96 | # mkdocs-get-deps
97 | # pymdown-extensions
98 | # pyyaml-env-tag
99 | pyyaml-env-tag==1.1
100 | # via mkdocs
101 | requests==2.32.3
102 | # via mkdocs-material
103 | six==1.17.0
104 | # via python-dateutil
105 | typing-extensions==4.14.0
106 | # via mkdocstrings-python
107 | urllib3==2.4.0
108 | # via requests
109 | watchdog==6.0.0
110 | # via mkdocs
111 |
--------------------------------------------------------------------------------
/docs/static/icons/beavers/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tradewelltech/beavers/ec9979086868589ab82b47ce55fa11cc31b32c16/docs/static/icons/beavers/icon.png
--------------------------------------------------------------------------------
/docs/static/icons/beavers/logo.svg:
--------------------------------------------------------------------------------
1 |
26 |
--------------------------------------------------------------------------------
/examples/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tradewelltech/beavers/ec9979086868589ab82b47ce55fa11cc31b32c16/examples/__init__.py
--------------------------------------------------------------------------------
/examples/advanced_concepts.py:
--------------------------------------------------------------------------------
1 | # ruff: noqa: E402
2 | # isort: skip_file
3 | import pandas as pd
4 |
5 | from beavers import Dag
6 |
7 | dag = Dag()
8 |
9 | # --8<-- [start:propagate_any]
10 | source_1 = dag.source_stream()
11 | source_2 = dag.source_stream()
12 | node = dag.stream(lambda x, y: x + y).map(source_1, source_2)
13 |
14 | source_1.set_stream([1, 2, 3])
15 | dag.execute()
16 | assert node.get_value() == [1, 2, 3] # source_1 updated
17 |
18 | source_2.set_stream([4, 5, 6])
19 | dag.execute()
20 | assert node.get_value() == [4, 5, 6] # source_2 updated
21 |
22 | dag.execute()
23 | assert node.get_value() == [] # no updates, reset to empty
24 | # --8<-- [end:propagate_any]
25 |
26 | # --8<-- [start:propagate_cycle_id]
27 | source_1.set_stream([1, 2, 3])
28 | dag.execute()
29 | assert node.get_value() == [1, 2, 3]
30 | assert node.get_cycle_id() == dag.get_cycle_id()
31 |
32 | dag.execute()
33 | assert node.get_value() == []
34 | assert node.get_cycle_id() == dag.get_cycle_id() - 1
35 | # --8<-- [end:propagate_cycle_id]
36 |
37 |
38 | # --8<-- [start:propagate_both]
39 | source_1.set_stream([1, 2, 3])
40 | source_2.set_stream([4, 5, 6])
41 | dag.execute()
42 | assert node.get_value() == [1, 2, 3, 4, 5, 6]
43 | assert node.get_cycle_id() == dag.get_cycle_id()
44 | # --8<-- [end:propagate_both]
45 |
46 |
47 | # --8<-- [start:propagate_empty]
48 | def even_only(values: list[int]) -> list[int]:
49 | return [v for v in values if (v % 2) == 0]
50 |
51 |
52 | even = dag.stream(even_only).map(source_1)
53 |
54 | source_1.set_stream([1, 2, 3])
55 | dag.execute()
56 | assert even.get_value() == [2]
57 | assert even.get_cycle_id() == dag.get_cycle_id()
58 |
59 | source_1.set_stream([1, 3])
60 | dag.execute()
61 | assert even.get_value() == []
62 | assert even.get_cycle_id() == dag.get_cycle_id() - 1
63 | # --8<-- [end:propagate_empty]
64 |
65 |
66 | # --8<-- [start:now_node]
67 | def get_delay(timestamps: list[pd.Timestamp], now: pd.Timestamp) -> list[pd.Timedelta]:
68 | return [now - timestamp for timestamp in timestamps]
69 |
70 |
71 | timestamp_stream = dag.source_stream()
72 | delay = dag.stream(get_delay).map(timestamp_stream, dag.now())
73 |
74 | timestamp_stream.set_stream(
75 | [
76 | pd.to_datetime("2022-01-01", utc=True),
77 | pd.to_datetime("2022-01-02", utc=True),
78 | pd.to_datetime("2022-01-03", utc=True),
79 | ]
80 | )
81 | dag.execute(timestamp=pd.to_datetime("2022-01-04", utc=True))
82 | assert delay.get_value() == [
83 | pd.to_timedelta("3d"),
84 | pd.to_timedelta("2d"),
85 | pd.to_timedelta("1d"),
86 | ]
87 |
88 | # --8<-- [end:now_node]
89 |
90 | # --8<-- [start:timer_manager]
91 | from beavers import TimerManager
92 |
93 |
94 | def get_year(now: pd.Timestamp, timer_manager: TimerManager):
95 | if not timer_manager.has_next_timer():
96 | timer_manager.set_next_timer(
97 | pd.Timestamp(year=now.year + 1, day=1, month=1, tzinfo=now.tzinfo)
98 | )
99 |
100 | return now.year
101 |
102 |
103 | year = dag.state(get_year).map(dag.now(), dag.timer_manager())
104 |
105 | dag.execute(pd.to_datetime("2022-01-01", utc=True))
106 | assert year.get_value() == 2022
107 | assert year.get_cycle_id() == dag.get_cycle_id()
108 |
109 | dag.execute(pd.to_datetime("2022-01-02", utc=True))
110 | assert year.get_value() == 2022
111 | assert year.get_cycle_id() == dag.get_cycle_id() - 1
112 |
113 | dag.execute(pd.to_datetime("2023-01-02", utc=True))
114 | assert year.get_value() == 2023
115 | assert year.get_cycle_id() == dag.get_cycle_id()
116 | # --8<-- [end:timer_manager]
117 |
118 |
119 | # --8<-- [start:silence]
120 | source_1 = dag.source_stream()
121 | source_1_silence = dag.silence(source_1)
122 | source_2 = dag.source_stream()
123 |
124 | both = dag.stream(lambda x, y: x + y).map(source_1_silence, source_2)
125 |
126 | source_1.set_stream([1, 2, 3])
127 | source_2.set_stream([4, 5, 6])
128 | dag.execute()
129 | assert both.get_value() == [1, 2, 3, 4, 5, 6]
130 | assert both.get_cycle_id() == dag.get_cycle_id()
131 |
132 | source_1.set_stream([1, 2, 3])
133 | dag.execute()
134 | assert both.get_value() == []
135 | assert (
136 | both.get_cycle_id() == dag.get_cycle_id() - 1
137 | ) # No update because source_1 is silent
138 |
139 | # --8<-- [end:silence]
140 |
141 |
142 | # --8<-- [start:cutoff]
143 | class GetMax:
144 | def __init__(self):
145 | self._max = 0.0
146 |
147 | def __call__(self, values: list[float]) -> float:
148 | self._max = max(self._max, *values)
149 | return self._max
150 |
151 |
152 | source = dag.source_stream()
153 | get_max = dag.state(GetMax()).map(source)
154 | get_max_cutoff = dag.cutoff(get_max)
155 |
156 | source.set_stream([1.0, 2.0])
157 | dag.execute()
158 | assert get_max.get_value() == 2.0
159 | assert get_max.get_cycle_id() == dag.get_cycle_id()
160 | assert get_max_cutoff.get_cycle_id() == dag.get_cycle_id()
161 |
162 | source.set_stream([1.0])
163 | dag.execute()
164 | assert get_max.get_value() == 2.0
165 | assert get_max.get_cycle_id() == dag.get_cycle_id()
166 | assert get_max_cutoff.get_cycle_id() == dag.get_cycle_id() - 1
167 |
168 | source.set_stream([3.0])
169 | dag.execute()
170 | assert get_max.get_value() == 3.0
171 | assert get_max.get_cycle_id() == dag.get_cycle_id()
172 | assert get_max_cutoff.get_cycle_id() == dag.get_cycle_id()
173 | # --8<-- [end:cutoff]
174 |
175 | # --8<-- [start:cutoff_custom]
176 | get_max_cutoff_custom = dag.cutoff(get_max, lambda x, y: abs(x - y) < 0.1)
177 |
178 | source.set_stream([4.0])
179 | dag.execute()
180 | assert get_max.get_value() == 4.0
181 | assert get_max.get_cycle_id() == dag.get_cycle_id()
182 | assert get_max_cutoff_custom.get_cycle_id() == dag.get_cycle_id()
183 |
184 |
185 | source.set_stream([4.05])
186 | dag.execute()
187 | assert get_max.get_value() == 4.05
188 | assert get_max.get_cycle_id() == dag.get_cycle_id()
189 | assert get_max_cutoff_custom.get_value() == 4.0
190 | assert get_max_cutoff_custom.get_cycle_id() == dag.get_cycle_id() - 1
191 |
192 |
193 | source.set_stream([4.11])
194 | dag.execute()
195 | assert get_max.get_value() == 4.11
196 | assert get_max.get_cycle_id() == dag.get_cycle_id()
197 | assert get_max_cutoff_custom.get_value() == 4.11
198 | assert get_max_cutoff_custom.get_cycle_id() == dag.get_cycle_id()
199 | # --8<-- [end:cutoff_custom]
200 |
--------------------------------------------------------------------------------
/examples/dag_concepts.py:
--------------------------------------------------------------------------------
1 | # isort: skip_file
2 |
3 | # --8<-- [start:source_stream]
4 | from beavers import Dag
5 |
6 | dag = Dag()
7 |
8 | source_stream = dag.source_stream()
9 |
10 | source_stream.set_stream([1, 2, 3])
11 | dag.execute()
12 | assert source_stream.get_value() == [1, 2, 3]
13 | # --8<-- [end:source_stream]
14 |
15 |
16 | # --8<-- [start:source_stream_again]
17 | dag.execute()
18 | assert source_stream.get_value() == []
19 | # --8<-- [end:source_stream_again]
20 |
21 | # --8<-- [start:source_stream_name]
22 | my_source_stream = dag.source_stream(name="my_source")
23 | dag.get_sources()["my_source"].set_stream([4, 5, 6])
24 | dag.execute()
25 | assert my_source_stream.get_value() == [4, 5, 6]
26 | # --8<-- [end:source_stream_name]
27 |
28 | # --8<-- [start:source_stream_empty]
29 | dict_source_stream = dag.source_stream(empty_factory=dict)
30 | dict_source_stream.set_stream({"hello": "world"})
31 | dag.execute()
32 | assert dict_source_stream.get_value() == {"hello": "world"}
33 | dag.execute()
34 | assert dict_source_stream.get_value() == {}
35 | # --8<-- [end:source_stream_empty]
36 |
37 |
38 | # --8<-- [start:stream_node]
39 | def multiply_by_2(values: list[int]) -> list[int]:
40 | return [v * 2 for v in values]
41 |
42 |
43 | stream_node = dag.stream(multiply_by_2).map(source_stream)
44 |
45 | source_stream.set_stream([1, 2, 3])
46 | dag.execute()
47 | assert stream_node.get_value() == [2, 4, 6]
48 | # --8<-- [end:stream_node]
49 |
50 |
51 | # --8<-- [start:stream_node_again]
52 | dag.execute()
53 | assert stream_node.get_value() == []
54 | # --8<-- [end:stream_node_again]
55 |
56 |
57 | # --8<-- [start:stream_node_empty]
58 | set_stream_node = dag.stream(set, empty_factory=set).map(source_stream)
59 | source_stream.set_stream([1, 2, 3, 1, 2, 3])
60 | dag.execute()
61 | assert set_stream_node.get_value() == {1, 2, 3}
62 | dag.execute()
63 | assert set_stream_node.get_value() == set()
64 | # --8<-- [end:stream_node_empty]
65 |
66 |
67 | # --8<-- [start:stream_node_lambda]
68 | lambda_stream_node = dag.stream(lambda x: x[:-1]).map(source_stream)
69 | source_stream.set_stream([1, 2, 3])
70 | dag.execute()
71 | assert lambda_stream_node.get_value() == [1, 2]
72 | # --8<-- [end:stream_node_lambda]
73 |
74 |
75 | # --8<-- [start:stream_node_callable]
76 | class MultiplyBy:
77 | def __init__(self, by: int):
78 | self.by = by
79 |
80 | def __call__(self, values: list[int]) -> list[int]:
81 | return [v * self.by for v in values]
82 |
83 |
84 | callable_stream_node = dag.stream(MultiplyBy(3)).map(source_stream)
85 | source_stream.set_stream([1, 2, 3])
86 | dag.execute()
87 | assert callable_stream_node.get_value() == [3, 6, 9]
88 | # --8<-- [end:stream_node_callable]
89 |
90 |
91 | # --8<-- [start:state_node]
92 | class Accumulator:
93 | def __init__(self):
94 | self._count = 0
95 |
96 | def __call__(self, values: list[int]) -> int:
97 | self._count += sum(values)
98 | return self._count
99 |
100 |
101 | state_node = dag.state(Accumulator()).map(source_stream)
102 | source_stream.set_stream([1, 2, 3])
103 | dag.execute()
104 | assert state_node.get_value() == 6
105 | dag.execute()
106 | assert state_node.get_value() == 6
107 | # --8<-- [end:state_node]
108 |
109 |
110 | # --8<-- [start:const_node]
111 | const_node = dag.const(2)
112 | assert const_node.get_value() == 2
113 | # --8<-- [end:const_node]
114 |
115 |
116 | # --8<-- [start:map_positional]
117 | to_append = dag.const([3])
118 | positional_stream = dag.stream(lambda x, y: x + y).map(source_stream, to_append)
119 | source_stream.set_stream([1, 2])
120 | dag.execute()
121 | assert positional_stream.get_value() == [1, 2, 3]
122 | # --8<-- [end:map_positional]
123 |
124 |
125 | # --8<-- [start:map_key_word]
126 | key_word = dag.stream(lambda x, y: x + y).map(x=source_stream, y=to_append)
127 | # --8<-- [end:map_key_word]
128 |
--------------------------------------------------------------------------------
/examples/etfs.py:
--------------------------------------------------------------------------------
1 | """
2 | Example of ETF nav (Net Asset Value) calculation
3 | """
4 |
5 | import dataclasses
6 | import random
7 | from operator import attrgetter
8 | from typing import Callable, Generic, Optional, TypeVar
9 |
10 | import numpy as np
11 | import pandas as pd
12 |
13 | from beavers import Dag
14 |
15 | K = TypeVar("K")
16 | V = TypeVar("V")
17 |
18 |
19 | @dataclasses.dataclass(frozen=True)
20 | class PriceRecord:
21 | timestamp: pd.Timestamp
22 | ticker: str
23 | price: Optional[float]
24 |
25 |
26 | @dataclasses.dataclass(frozen=True)
27 | class EtfComposition:
28 | timestamp: pd.Timestamp
29 | ticker: str
30 | weights: dict[str, float]
31 |
32 |
33 | class GetLatest(Generic[K, V]):
34 | def __init__(self, key_extractor: Callable[[V], K]):
35 | self._key_extractor = key_extractor
36 | self._latest = {}
37 |
38 | def __call__(self, updates: list[V]) -> dict[str, V]:
39 | for update in updates:
40 | self._latest[self._key_extractor(update)] = update
41 | return self._latest
42 |
43 |
44 | class GetUnique(Generic[K, V]):
45 | def __init__(self, key_extractor: Callable[[V], K]):
46 | self._key_extractor = key_extractor
47 |
48 | def __call__(self, updates: list[V]) -> list[str]:
49 | return sorted(list({self._key_extractor(update) for update in updates}))
50 |
51 |
52 | def create_day_test_prices(date: pd.Timestamp) -> list[PriceRecord]:
53 | end = date + pd.offsets.Day()
54 | return sorted(
55 | [
56 | PriceRecord(
57 | timestamp=pd.Timestamp(
58 | np.random.randint(date.value, end.value), unit="ns"
59 | ),
60 | ticker=random.choice(["AAPL", "GOOGL", "MSFT"]), # nosec B311
61 | price=random.random(), # nosec B311
62 | )
63 | for _ in range(random.randint(0, 1000)) # nosec B311
64 | ],
65 | key=lambda x: x.timestamp,
66 | )
67 |
68 |
69 | def calculate_nav(
70 | composition: EtfComposition, prices: dict[str, PriceRecord]
71 | ) -> PriceRecord:
72 | timestamp = composition.timestamp
73 | quotient = 0.0
74 | dividend = 0.0
75 | error = False
76 | for ticker, weight in composition.weights.items():
77 | try:
78 | price = prices[ticker]
79 | except KeyError:
80 | error = True
81 | else:
82 | quotient += price.price * weight
83 | dividend += weight
84 | timestamp = max(timestamp, price.timestamp)
85 |
86 | return PriceRecord(
87 | timestamp,
88 | composition.ticker,
89 | None if dividend == 0.0 or error else quotient / dividend,
90 | )
91 |
92 |
93 | def calculate_navs(
94 | updated_tickers: set[str],
95 | etf_compositions: dict[str, EtfComposition],
96 | prices: dict[str, PriceRecord],
97 | ) -> list[PriceRecord]:
98 | return [
99 | calculate_nav(etf_composition, prices)
100 | for etf_composition in etf_compositions.values()
101 | if (
102 | etf_composition.ticker in updated_tickers
103 | or (updated_tickers & etf_composition.weights.keys())
104 | )
105 | ]
106 |
107 |
108 | def get_updated_tickers(
109 | updated_prices: list[PriceRecord],
110 | updated_etf_compositions: list[EtfComposition],
111 | ) -> set[str]:
112 | return set(p.ticker for p in updated_prices) | set(
113 | e.ticker for e in updated_etf_compositions
114 | )
115 |
116 |
117 | def create_dag() -> Dag:
118 | dag = Dag()
119 | price_stream = dag.source_stream([], name="price")
120 | etf_composition_stream = dag.source_stream([], name="etf_composition")
121 | price_latest = dag.state(GetLatest(attrgetter("ticker"))).map(price_stream)
122 | etf_composition_latest = dag.state(GetLatest(attrgetter("ticker"))).map(
123 | etf_composition_stream
124 | )
125 |
126 | updated_tickers = dag.stream(get_updated_tickers, set()).map(
127 | price_stream, etf_composition_stream
128 | )
129 | updated_navs = dag.stream(calculate_navs, []).map(
130 | updated_tickers, etf_composition_latest, price_latest
131 | )
132 | dag.sink("etf_price", updated_navs)
133 | return dag
134 |
--------------------------------------------------------------------------------
/examples/kafka_concepts.py:
--------------------------------------------------------------------------------
1 | # ruff: noqa: E402
2 | # isort: skip_file
3 |
4 |
5 | import confluent_kafka
6 | import pandas as pd
7 |
8 | # --8<-- [start:dag]
9 | from beavers import Dag
10 |
11 |
12 | class CountWords:
13 | state = {}
14 |
15 | def __call__(self, new_words: list[str]) -> dict[str, int]:
16 | for word in new_words:
17 | self.state[word] = self.state.get(word, 0) + 1
18 | return self.state
19 |
20 |
21 | def update_stream(
22 | state: dict[str, int], updated_words: list[str]
23 | ) -> list[tuple[str, int]]:
24 | return [(word, state[word]) for word in set(updated_words)]
25 |
26 |
27 | dag = Dag()
28 | word_source = dag.source_stream(name="words")
29 | count_state = dag.state(CountWords()).map(word_source)
30 | count_stream = dag.stream(update_stream, []).map(count_state, word_source)
31 | dag.sink("counts", count_stream)
32 | # --8<-- [end:dag]
33 |
34 |
35 | # --8<-- [start:deserializer]
36 | def deserialize_messages(messages: list[confluent_kafka.Message]) -> list[str]:
37 | return [message.value() for message in messages]
38 |
39 |
40 | # --8<-- [end:deserializer]
41 |
42 | # --8<-- [start:kafka_source]
43 | from beavers.kafka import SourceTopic, KafkaDriver
44 |
45 | source_topic = SourceTopic.from_start_of_day(
46 | "words", deserialize_messages, pd.to_timedelta("15min"), "UTC"
47 | )
48 | # --8<-- [end:kafka_source]
49 |
50 |
51 | # --8<-- [start:serializer]
52 | from beavers.kafka import KafkaProducerMessage
53 |
54 |
55 | def serialize_counts(values: list[tuple[str, int]]) -> list[KafkaProducerMessage]:
56 | return [
57 | KafkaProducerMessage(
58 | topic="counts",
59 | key=word,
60 | value=str(count),
61 | )
62 | for word, count in values
63 | ]
64 |
65 |
66 | # --8<-- [end:serializer]
67 |
68 |
69 | # --8<-- [start:kafka_driver]
70 | kafka_driver = KafkaDriver.create(
71 | dag=dag,
72 | consumer_config={
73 | "group.id": "beavers",
74 | "bootstrap.servers": "localhost:9092",
75 | },
76 | producer_config={"bootstrap.servers": "localhost:9092"},
77 | source_topics={"words": source_topic},
78 | sink_topics={"counts": serialize_counts},
79 | )
80 | while True:
81 | kafka_driver.run_cycle()
82 | # --8<-- [end:kafka_driver]
83 |
84 |
85 | # Note: you can test it with the following commands
86 | # kafka-topics --create --topic words --bootstrap-server=localhost:9092
87 | # kafka-console-producer --topic words --bootstrap-server=localhost:9092
88 | # kafka-console-consumer --topic=counts --bootstrap-server=localhost:9092 \
89 | # --property print.key=true
90 |
--------------------------------------------------------------------------------
/examples/pandas_concepts.py:
--------------------------------------------------------------------------------
1 | # ruff: noqa: E402
2 | # isort: skip_file
3 |
4 | # --8<-- [start:business_logic_price]
5 | import pandas as pd
6 |
7 | price_table = pd.DataFrame.from_records(
8 | [
9 | {"ticker": "AAPL", "price": 174.79},
10 | {"ticker": "GOOGL", "price": 130.25},
11 | {"ticker": "MSFT", "price": 317.01},
12 | {"ticker": "F", "price": 12.43},
13 | {"ticker": "GM", "price": 35.28},
14 | ],
15 | )
16 |
17 | price_dtypes = price_table.dtypes
18 |
19 | # --8<-- [end:business_logic_price]
20 |
21 | # print(price_table.to_markdown(index=False))
22 |
23 | # --8<-- [start:business_logic_composition]
24 | etf_composition_table = pd.DataFrame.from_records(
25 | [
26 | {"etf": "TECH", "ticker": "AAPL", "quantity": 2.0},
27 | {"etf": "TECH", "ticker": "GOOGL", "quantity": 2.0},
28 | {"etf": "TECH", "ticker": "MSFT", "quantity": 1.0},
29 | {"etf": "CARS", "ticker": "F", "quantity": 3.0},
30 | {"etf": "CARS", "ticker": "GM", "quantity": 2.0},
31 | ],
32 | )
33 |
34 | etf_composition_dtypes = etf_composition_table.dtypes
35 | # --8<-- [end:business_logic_composition]
36 |
37 | # print(etf_composition_table.to_markdown(index=False, floatfmt=".1f"))
38 |
39 |
40 | # --8<-- [start:business_logic_calculation]
41 | def calculate_etf_value(
42 | etf_composition: pd.DataFrame, price: pd.DataFrame
43 | ) -> pd.DataFrame:
44 | return (
45 | etf_composition.merge(price, left_on="ticker", right_on="ticker", how="left")
46 | .assign(values=lambda x: x["price"] * x["quantity"])
47 | .groupby("etf")
48 | .aggregate([("value", "sum")])
49 | )
50 |
51 |
52 | etf_value_table = calculate_etf_value(
53 | etf_composition=etf_composition_table, price=price_table
54 | )
55 | # --8<-- [end:business_logic_calculation]
56 |
57 |
58 | # print(etf_value_table.to_markdown(index=False, floatfmt=".2f"))
59 |
60 | # --8<-- [start:dag_source]
61 | from beavers import Dag
62 |
63 | dag = Dag()
64 | price_source = dag.pd.source_df(dtypes=price_dtypes, name="price")
65 | etf_composition_source = dag.pd.source_df(
66 | dtypes=etf_composition_dtypes, name="etf_composition"
67 | )
68 | # --8<-- [end:dag_source]
69 |
70 | # --8<-- [start:dag_state]
71 | price_state = dag.pd.last_by_keys(price_source, ["ticker"])
72 | etf_composition_state = dag.pd.last_by_keys(
73 | etf_composition_source,
74 | ["etf", "ticker"],
75 | )
76 | # --8<-- [end:dag_state]
77 |
78 |
79 | # --8<-- [start:dag_calculation]
80 | etf_value_state = dag.state(calculate_etf_value).map(
81 | etf_composition_state,
82 | price_state,
83 | )
84 | # --8<-- [end:dag_calculation]
85 |
86 |
87 | # --8<-- [start:dag_test]
88 | price_source.set_stream(price_table)
89 | etf_composition_source.set_stream(etf_composition_table)
90 | dag.execute()
91 | pd.testing.assert_frame_equal(etf_value_state.get_value(), etf_value_table)
92 | # --8<-- [end:dag_test]
93 |
--------------------------------------------------------------------------------
/examples/perspective_concepts.py:
--------------------------------------------------------------------------------
1 | # ruff: noqa: E402
2 | # isort: skip_file
3 |
4 | from typing import Sequence, Optional
5 |
6 | # --8<-- [start:schema]
7 | import pyarrow as pa
8 |
9 |
10 | KEY_VALUE_SCHEMA = pa.schema(
11 | [
12 | pa.field("timestamp", pa.timestamp("ms", "UTC")),
13 | pa.field("topic", pa.string()),
14 | pa.field("partition", pa.int32()),
15 | pa.field("offset", pa.int64()),
16 | pa.field("key", pa.string()),
17 | pa.field("value", pa.string()),
18 | ]
19 | )
20 | # --8<-- [end:schema]
21 |
22 | # --8<-- [start:converter]
23 | import confluent_kafka
24 |
25 |
26 | def kafka_messages_to_pyarrow(
27 | messages: Sequence[confluent_kafka.Message],
28 | ) -> pa.Table:
29 | return pa.table(
30 | [
31 | [m.timestamp()[1] for m in messages],
32 | [m.topic() for m in messages],
33 | [m.partition() for m in messages],
34 | [m.offset() for m in messages],
35 | [None if m.key() is None else m.key().decode("utf-8") for m in messages],
36 | [
37 | None if m.value() is None else m.value().decode("utf-8")
38 | for m in messages
39 | ],
40 | ],
41 | schema=KEY_VALUE_SCHEMA,
42 | )
43 |
44 |
45 | # --8<-- [end:converter]
46 |
47 | # --8<-- [start:dag]
48 | from beavers import Dag
49 | from beavers.perspective_wrapper import PerspectiveTableDefinition
50 |
51 |
52 | def create_test_dag() -> Dag:
53 | dag = Dag()
54 | stream = dag.pa.source_table(
55 | name="key_value",
56 | schema=KEY_VALUE_SCHEMA,
57 | )
58 | dag.psp.to_perspective(
59 | stream,
60 | PerspectiveTableDefinition(
61 | name="key_value",
62 | index_column="key",
63 | ),
64 | )
65 | return dag
66 |
67 |
68 | # --8<-- [end:dag]
69 |
70 | # --8<-- [start:run]
71 | from beavers.kafka import KafkaDriver, SourceTopic
72 | from beavers.perspective_wrapper import run_web_application
73 |
74 |
75 | def run_dashboard(
76 | topic: str = "key-value",
77 | port: int = 8082,
78 | consumer_config: Optional[dict] = None,
79 | ):
80 | if consumer_config is None:
81 | consumer_config = {"bootstrap.servers": "localhost:9092", "group.id": "beavers"}
82 |
83 | dag = create_test_dag()
84 |
85 | kafka_driver = KafkaDriver.create(
86 | dag=dag,
87 | producer_config={},
88 | consumer_config=consumer_config,
89 | source_topics={
90 | "key_value": SourceTopic.from_earliest(topic, kafka_messages_to_pyarrow)
91 | },
92 | sink_topics={},
93 | )
94 |
95 | run_web_application(kafka_driver=kafka_driver, port=port)
96 |
97 |
98 | # --8<-- [end:run]
99 |
--------------------------------------------------------------------------------
/examples/polars_concepts.py:
--------------------------------------------------------------------------------
1 | # ruff: noqa: E402
2 | # isort: skip_file
3 |
4 | import polars.testing
5 |
6 |
7 | # --8<-- [start:business_logic_price]
8 | import polars as pl
9 |
10 | PRICE_SCHEMA = pl.Schema(
11 | [
12 | ("ticker", pl.String()),
13 | ("price", pl.Float64()),
14 | ]
15 | )
16 |
17 | price_table = pl.DataFrame(
18 | [
19 | {"ticker": "AAPL", "price": 174.79},
20 | {"ticker": "GOOGL", "price": 130.25},
21 | {"ticker": "MSFT", "price": 317.01},
22 | {"ticker": "F", "price": 12.43},
23 | {"ticker": "GM", "price": 35.28},
24 | ],
25 | schema=PRICE_SCHEMA,
26 | )
27 | # --8<-- [end:business_logic_price]
28 |
29 | # print(price_table.to_pandas().to_markdown(index=False))
30 |
31 | # --8<-- [start:business_logic_composition]
32 | ETF_COMPOSITION_SCHEMA = pl.Schema(
33 | [
34 | ("etf", pl.String()),
35 | ("ticker", pl.String()),
36 | ("quantity", pl.Float64()),
37 | ]
38 | )
39 |
40 |
41 | etf_composition_table = pl.DataFrame(
42 | [
43 | {"etf": "TECH", "ticker": "AAPL", "quantity": 2.0},
44 | {"etf": "TECH", "ticker": "GOOGL", "quantity": 2.0},
45 | {"etf": "TECH", "ticker": "MSFT", "quantity": 1.0},
46 | {"etf": "CARS", "ticker": "F", "quantity": 3.0},
47 | {"etf": "CARS", "ticker": "GM", "quantity": 2.0},
48 | ],
49 | schema=ETF_COMPOSITION_SCHEMA,
50 | )
51 | # --8<-- [end:business_logic_composition]
52 |
53 | # print(etf_composition_table.to_pandas().to_markdown(index=False, floatfmt=".1f"))
54 |
55 |
56 | # --8<-- [start:business_logic_calculation]
57 | ETF_VALUE_SCHEMA = pl.Schema(
58 | [
59 | ("etf", pl.String()),
60 | ("value", pl.Float64()),
61 | ]
62 | )
63 |
64 |
65 | def calculate_etf_value(
66 | etf_composition: pl.DataFrame, price: pl.DataFrame
67 | ) -> pl.DataFrame:
68 | return (
69 | etf_composition.join(price, on=["ticker"])
70 | .select(pl.col("etf"), (pl.col("price") * pl.col("quantity")).alias("value"))
71 | .group_by("etf", maintain_order=True)
72 | .agg(pl.col("value").sum())
73 | .cast(ETF_VALUE_SCHEMA)
74 | )
75 |
76 |
77 | etf_value_table = calculate_etf_value(
78 | etf_composition=etf_composition_table, price=price_table
79 | )
80 | # --8<-- [end:business_logic_calculation]
81 |
82 |
83 | # print(etf_value_table.to_pandas().to_markdown(index=False, floatfmt=".2f"))
84 |
85 | # --8<-- [start:dag_source]
86 | from beavers import Dag
87 |
88 | dag = Dag()
89 | price_source = dag.pl.source_table(schema=PRICE_SCHEMA, name="price")
90 | etf_composition_source = dag.pl.source_table(
91 | schema=ETF_COMPOSITION_SCHEMA, name="etf_composition"
92 | )
93 | # --8<-- [end:dag_source]
94 |
95 | # --8<-- [start:dag_state]
96 | price_state = dag.pl.last_by_keys(price_source, ["ticker"])
97 | etf_composition_state = dag.pl.last_by_keys(
98 | etf_composition_source,
99 | ["etf", "ticker"],
100 | )
101 | # --8<-- [end:dag_state]
102 |
103 |
104 | # --8<-- [start:dag_calculation]
105 | etf_value_state = dag.state(calculate_etf_value).map(
106 | etf_composition_state,
107 | price_state,
108 | )
109 | # --8<-- [end:dag_calculation]
110 |
111 |
112 | # --8<-- [start:dag_test]
113 | price_source.set_stream(price_table)
114 | etf_composition_source.set_stream(etf_composition_table)
115 | dag.execute()
116 | polars.testing.assert_frame_equal(etf_value_state.get_value(), etf_value_table)
117 | # --8<-- [end:dag_test]
118 |
119 |
120 | # --8<-- [start:spurious_update]
121 | new_price_updates = pl.DataFrame(
122 | [{"ticker": "GME", "price": 123.0}],
123 | PRICE_SCHEMA,
124 | )
125 | price_source.set_stream(new_price_updates)
126 | dag.execute()
127 | assert len(etf_value_state.get_value()) == 2
128 | assert etf_value_state.get_cycle_id() == dag.get_cycle_id()
129 | # --8<-- [end:spurious_update]
130 |
131 | # --8<-- [start:updated_because_of_composition]
132 | updated_because_of_composition = dag.pl.get_series(
133 | etf_composition_source,
134 | "etf",
135 | )
136 | # --8<-- [end:updated_because_of_composition]
137 |
138 |
139 | # --8<-- [start:updated_because_of_price]
140 | def get_etf_to_update_because_of_price(
141 | etf_composition_state: pl.DataFrame, price_update: pl.DataFrame
142 | ) -> pl.Series:
143 | updated_tickers = price_update["ticker"].unique()
144 | return etf_composition_state.filter(pl.col("ticker").is_in(updated_tickers))[
145 | "etf"
146 | ].unique()
147 |
148 |
149 | updated_because_of_price = dag.stream(
150 | get_etf_to_update_because_of_price, empty=pl.Series(name="etf", dtype=pl.String())
151 | ).map(etf_composition_state, price_source)
152 | # --8<-- [end:updated_because_of_price]
153 |
154 | # --8<-- [start:update_all]
155 | stale_etfs = dag.pl.concat_series(
156 | updated_because_of_price, updated_because_of_composition
157 | )
158 |
159 |
160 | def get_composition_for_etfs(
161 | etf_composition_state: pl.DataFrame,
162 | etfs: pl.Series,
163 | ) -> pl.DataFrame:
164 | return etf_composition_state.filter(pl.col("etf").is_in(etfs))
165 |
166 |
167 | stale_etf_compositions = dag.pl.table_stream(
168 | get_composition_for_etfs, ETF_COMPOSITION_SCHEMA
169 | ).map(etf_composition_state, stale_etfs)
170 |
171 | updated_etf = dag.pl.table_stream(calculate_etf_value, ETF_VALUE_SCHEMA).map(
172 | stale_etf_compositions, price_state
173 | )
174 | # --8<-- [end:update_all]
175 |
176 | # --8<-- [start:update_all_test]
177 | price_source.set_stream(
178 | pl.DataFrame(
179 | [{"ticker": "MSFT", "price": 317.05}],
180 | schema=PRICE_SCHEMA,
181 | )
182 | )
183 | dag.execute()
184 | assert len(updated_etf.get_value()) == 1
185 | # --8<-- [end:update_all_test]
186 |
187 | # print(updated_etf.get_value().to_pandas().to_markdown(index=False))
188 |
--------------------------------------------------------------------------------
/examples/pyarrow_concepts.py:
--------------------------------------------------------------------------------
1 | # ruff: noqa: E402
2 | # isort: skip_file
3 |
4 | # --8<-- [start:business_logic_price]
5 | import pyarrow as pa
6 |
7 | PRICE_SCHEMA = pa.schema(
8 | [
9 | pa.field("ticker", pa.string()),
10 | pa.field("price", pa.float64()),
11 | ]
12 | )
13 |
14 | price_table = pa.Table.from_pylist(
15 | [
16 | {"ticker": "AAPL", "price": 174.79},
17 | {"ticker": "GOOGL", "price": 130.25},
18 | {"ticker": "MSFT", "price": 317.01},
19 | {"ticker": "F", "price": 12.43},
20 | {"ticker": "GM", "price": 35.28},
21 | ],
22 | schema=PRICE_SCHEMA,
23 | )
24 | # --8<-- [end:business_logic_price]
25 |
26 | # print(price_table.to_pandas().to_markdown(index=False))
27 |
28 | # --8<-- [start:business_logic_composition]
29 | ETF_COMPOSITION_SCHEMA = pa.schema(
30 | [
31 | pa.field("etf", pa.string()),
32 | pa.field("ticker", pa.string()),
33 | pa.field("quantity", pa.float64()),
34 | ]
35 | )
36 |
37 |
38 | etf_composition_table = pa.Table.from_pylist(
39 | [
40 | {"etf": "TECH", "ticker": "AAPL", "quantity": 2.0},
41 | {"etf": "TECH", "ticker": "GOOGL", "quantity": 2.0},
42 | {"etf": "TECH", "ticker": "MSFT", "quantity": 1.0},
43 | {"etf": "CARS", "ticker": "F", "quantity": 3.0},
44 | {"etf": "CARS", "ticker": "GM", "quantity": 2.0},
45 | ],
46 | schema=ETF_COMPOSITION_SCHEMA,
47 | )
48 | # --8<-- [end:business_logic_composition]
49 |
50 | # print(etf_composition_table.to_pandas().to_markdown(index=False, floatfmt=".1f"))
51 |
52 |
53 | # --8<-- [start:business_logic_calculation]
54 | import pyarrow.compute as pc
55 |
56 | ETF_VALUE_SCHEMA = pa.schema(
57 | [
58 | pa.field("etf", pa.string()),
59 | pa.field("value", pa.float64()),
60 | ]
61 | )
62 |
63 |
64 | def calculate_etf_value(etf_composition: pa.Table, price: pa.Table) -> pa.Table:
65 | positions_with_prices = etf_composition.join(price, keys=["ticker"])
66 | values = pc.multiply(
67 | positions_with_prices["price"], positions_with_prices["quantity"]
68 | )
69 | positions_with_prices = positions_with_prices.append_column("value", values)
70 | return (
71 | positions_with_prices.group_by("etf")
72 | .aggregate([("value", "sum")])
73 | .rename_columns(ETF_VALUE_SCHEMA.names)
74 | )
75 |
76 |
77 | etf_value_table = calculate_etf_value(
78 | etf_composition=etf_composition_table, price=price_table
79 | )
80 | # --8<-- [end:business_logic_calculation]
81 |
82 |
83 | # print(etf_value_table.to_pandas().to_markdown(index=False, floatfmt=".2f"))
84 |
85 | # --8<-- [start:dag_source]
86 | from beavers import Dag
87 |
88 | dag = Dag()
89 | price_source = dag.pa.source_table(schema=PRICE_SCHEMA, name="price")
90 | etf_composition_source = dag.pa.source_table(
91 | schema=ETF_COMPOSITION_SCHEMA, name="etf_composition"
92 | )
93 | # --8<-- [end:dag_source]
94 |
95 | # --8<-- [start:dag_state]
96 | price_state = dag.pa.last_by_keys(price_source, ["ticker"])
97 | etf_composition_state = dag.pa.last_by_keys(
98 | etf_composition_source,
99 | ["etf", "ticker"],
100 | )
101 | # --8<-- [end:dag_state]
102 |
103 |
104 | # --8<-- [start:dag_calculation]
105 | etf_value_state = dag.state(calculate_etf_value).map(
106 | etf_composition_state,
107 | price_state,
108 | )
109 | # --8<-- [end:dag_calculation]
110 |
111 |
112 | # --8<-- [start:dag_test]
113 | price_source.set_stream(price_table)
114 | etf_composition_source.set_stream(etf_composition_table)
115 | dag.execute()
116 | assert etf_value_state.get_value() == etf_value_table
117 | # --8<-- [end:dag_test]
118 |
119 |
120 | # --8<-- [start:spurious_update]
121 | new_price_updates = pa.Table.from_pylist(
122 | [{"ticker": "GME", "price": 123.0}],
123 | PRICE_SCHEMA,
124 | )
125 | price_source.set_stream(new_price_updates)
126 | dag.execute()
127 | assert len(etf_value_state.get_value()) == 2
128 | assert etf_value_state.get_cycle_id() == dag.get_cycle_id()
129 | # --8<-- [end:spurious_update]
130 |
131 | # --8<-- [start:updated_because_of_composition]
132 | updated_because_of_composition = dag.pa.get_column(
133 | etf_composition_source,
134 | "etf",
135 | )
136 | # --8<-- [end:updated_because_of_composition]
137 |
138 |
139 | # --8<-- [start:updated_because_of_price]
140 | def get_etf_to_update_because_of_price(
141 | etf_composition_state: pa.Table, price_update: pa.Table
142 | ) -> pa.Array:
143 | updated_tickers = pc.unique(price_update["ticker"])
144 | return pc.unique(
145 | etf_composition_state.filter(
146 | pc.is_in(etf_composition_state["ticker"], updated_tickers)
147 | )["etf"]
148 | )
149 |
150 |
151 | updated_because_of_price = dag.stream(
152 | get_etf_to_update_because_of_price, pa.array([], pa.string())
153 | ).map(etf_composition_state, price_source)
154 | # --8<-- [end:updated_because_of_price]
155 |
156 | # --8<-- [start:update_all]
157 | stale_etfs = dag.pa.concat_arrays(
158 | updated_because_of_price, updated_because_of_composition
159 | )
160 |
161 |
162 | def get_composition_for_etfs(
163 | etf_composition_state: pa.Table, etfs: pa.Array
164 | ) -> pa.Table:
165 | return etf_composition_state.filter(
166 | pc.is_in(
167 | etf_composition_state["etf"],
168 | etfs,
169 | )
170 | )
171 |
172 |
173 | stale_etf_compositions = dag.pa.table_stream(
174 | get_composition_for_etfs, ETF_COMPOSITION_SCHEMA
175 | ).map(etf_composition_state, stale_etfs)
176 |
177 | updated_etf = dag.pa.table_stream(calculate_etf_value, ETF_VALUE_SCHEMA).map(
178 | stale_etf_compositions, price_state
179 | )
180 | # --8<-- [end:update_all]
181 |
182 | # --8<-- [start:update_all_test]
183 | price_source.set_stream(
184 | pa.Table.from_pylist(
185 | [{"ticker": "MSFT", "price": 317.05}],
186 | PRICE_SCHEMA,
187 | )
188 | )
189 | dag.execute()
190 | assert len(updated_etf.get_value()) == 1
191 | # --8<-- [end:update_all_test]
192 |
193 | # print(updated_etf.get_value().to_pandas().to_markdown(index=False))
194 |
--------------------------------------------------------------------------------
/examples/replay_concepts.py:
--------------------------------------------------------------------------------
1 | # isort: skip_file
2 | # ruff: noqa: E402
3 | import operator
4 |
5 | import beavers
6 |
7 |
8 | # --8<-- [start:simple_dag]
9 | dag = beavers.Dag()
10 | my_source = dag.source_stream(name="my_source")
11 | my_sink = dag.sink("my_sink", my_source)
12 | # --8<-- [end:simple_dag]
13 |
14 | # --8<-- [start:simple_data_class]
15 | import dataclasses
16 | import pandas as pd
17 |
18 |
19 | @dataclasses.dataclass(frozen=True)
20 | class Message:
21 | timestamp: pd.Timestamp
22 | message: str
23 |
24 |
25 | # --8<-- [end:simple_data_class]
26 |
27 | # --8<-- [start:manual_replay]
28 | my_source.set_stream(
29 | [
30 | Message(pd.Timestamp("2023-01-01T00:00:00Z"), "hello"),
31 | Message(pd.Timestamp("2023-01-01T00:00:30Z"), "How are you"),
32 | ]
33 | )
34 | dag.execute(pd.Timestamp("2023-01-01T00:01:00Z"))
35 | assert my_sink.get_sink_value() == [
36 | Message(pd.Timestamp("2023-01-01T00:00:00Z"), "hello"),
37 | Message(pd.Timestamp("2023-01-01T00:00:30Z"), "How are you"),
38 | ]
39 | # --8<-- [end:manual_replay]
40 |
41 |
42 | # --8<-- [start:data_source]
43 | import beavers.replay
44 |
45 |
46 | @dataclasses.dataclass(frozen=True)
47 | class MessageDataSource:
48 | messages: list[Message]
49 |
50 | def read_to(self, timestamp: pd.Timestamp) -> list[Message]:
51 | results = []
52 | while self.messages and self.messages[0].timestamp <= timestamp:
53 | results.append(self.messages.pop(0))
54 | return results
55 |
56 | def get_next(self) -> pd.Timestamp:
57 | if self.messages:
58 | return self.messages[0].timestamp
59 | else:
60 | return beavers.replay.UTC_MAX
61 |
62 |
63 | # --8<-- [end:data_source]
64 |
65 |
66 | # --8<-- [start:replay_context]
67 | from beavers.replay import ReplayContext
68 |
69 | replay_context = ReplayContext(
70 | start=pd.to_datetime("2023-01-01T00:00:00Z"),
71 | end=pd.to_datetime("2023-01-02T00:00:00Z"),
72 | frequency=pd.to_timedelta("1h"),
73 | )
74 | # --8<-- [end:replay_context]
75 |
76 |
77 | # --8<-- [start:data_source_provider]
78 | @dataclasses.dataclass(frozen=True)
79 | class CsvDataSourceProvider:
80 | file_name: str
81 |
82 | def __call__(
83 | self, replay_context: ReplayContext
84 | ) -> beavers.replay.DataSource[list[Message]]:
85 | df = pd.read_csv(self.file_name, parse_dates=["timestamp"])
86 | messages = [Message(*row) for _, row in df.iterrows()]
87 | messages.sort(key=lambda x: x.timestamp)
88 | return MessageDataSource(messages)
89 |
90 |
91 | # --8<-- [end:data_source_provider]
92 |
93 |
94 | # --8<-- [start:data_sink]
95 | @dataclasses.dataclass(frozen=True)
96 | class CsvDataSink:
97 | destination: str
98 | data: list[Message] = dataclasses.field(default_factory=list)
99 |
100 | def append(self, timestamp: pd.Timestamp, data: list[Message]):
101 | self.data.extend(data)
102 |
103 | def close(self):
104 | pd.DataFrame([dataclasses.asdict(value) for value in self.data]).to_csv(
105 | self.destination, index=False
106 | )
107 |
108 |
109 | # --8<-- [end:data_sink]
110 |
111 |
112 | # --8<-- [start:data_sink_provider]
113 | @dataclasses.dataclass(frozen=True)
114 | class CsvDataSinkProvider:
115 | destination: str
116 |
117 | def __call__(self, replay_context: ReplayContext) -> CsvDataSink:
118 | return CsvDataSink(self.destination)
119 |
120 |
121 | # --8<-- [end:data_sink_provider]
122 |
123 |
124 | # This is just to create the csv input files used by the replay examples below:
125 | file = "data.csv"
126 | df = pd.DataFrame(
127 | {
128 | "timestamp": [
129 | pd.Timestamp("2023-01-01T01:00:00Z"),
130 | pd.Timestamp("2023-01-01T01:01:00Z"),
131 | ],
132 | "message": ["Hello", "How are you"],
133 | }
134 | )
135 | df.to_csv("input.csv", index=False)
136 |
137 | df_after = pd.read_csv("input.csv", parse_dates=["timestamp"])
138 | pd.testing.assert_frame_equal(df, df_after)
139 |
140 | messages = [Message(*row) for _, row in df_after.iterrows()]
141 |
142 | df2 = pd.DataFrame(
143 | {
144 | "timestamp": [
145 | pd.Timestamp("2023-01-02T01:00:00Z"),
146 | pd.Timestamp("2023-01-02T01:01:00Z"),
147 | ],
148 | "message": ["I'm fine", "Thanks"],
149 | }
150 | )
151 | df.to_csv("input_2023-01-01.csv", index=False)
152 | df2.to_csv("input_2023-01-02.csv", index=False)
153 | df2[:0].to_csv("input_2023-01-03.csv", index=False)
154 |
155 |
156 | # --8<-- [start:replay_driver]
157 | from beavers.replay import ReplayDriver
158 |
159 | replay_driver = beavers.replay.ReplayDriver.create(
160 | dag=dag,
161 | replay_context=replay_context,
162 | data_source_providers={"my_source": CsvDataSourceProvider("input.csv")},
163 | data_sink_providers={"my_sink": CsvDataSinkProvider("output.csv")},
164 | )
165 | replay_driver.run()
166 | # --8<-- [end:replay_driver]
167 |
168 |
169 | # --8<-- [start:iterator_data_source_adapter]
170 | from beavers.replay import IteratorDataSourceAdapter
171 |
172 |
173 | @dataclasses.dataclass(frozen=True)
174 | class PartitionedCsvDataSourceProvider:
175 | source_format: str
176 |
177 | def __call__(self, replay_context: ReplayContext):
178 | file_names = [
179 | self.source_format.format(date=date)
180 | for date in pd.date_range(replay_context.start, replay_context.end)
181 | ]
182 | generator = (self._load_one_file(file_name) for file_name in file_names)
183 | return IteratorDataSourceAdapter(
184 | sources=generator,
185 | empty=[],
186 | concatenator=operator.add,
187 | )
188 |
189 | def _load_one_file(self, file_name: str) -> MessageDataSource:
190 | return MessageDataSource(
191 | [
192 | Message(*row)
193 | for _, row in pd.read_csv(
194 | file_name, parse_dates=["timestamp"]
195 | ).iterrows()
196 | ]
197 | )
198 |
199 |
200 | source_provider = PartitionedCsvDataSourceProvider("input_{date:%Y-%m-%d}.csv")
201 | # --8<-- [end:iterator_data_source_adapter]
202 |
203 | # --8<-- [start:iterator_data_source_adapter_run]
204 | ReplayDriver.create(
205 | dag=dag,
206 | replay_context=ReplayContext(
207 | start=pd.to_datetime("2023-01-01T00:00:00Z"),
208 | end=pd.to_datetime("2023-01-03T00:00:00Z"),
209 | frequency=pd.to_timedelta("1h"),
210 | ),
211 | data_source_providers={
212 | "my_source": PartitionedCsvDataSourceProvider("input_{date:%Y-%m-%d}.csv")
213 | },
214 | data_sink_providers={"my_sink": CsvDataSinkProvider("output.csv")},
215 | ).run()
216 |
217 | # --8<-- [end:iterator_data_source_adapter_run]
218 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: Beavers
2 | site_url: https://beavers.readthedocs.io/en/latest/
3 | repo_url: https://github.com/tradewelltech/beavers
4 | theme:
5 | name: material
6 | features:
7 | - navigation.tabs
8 | - navigation.tabs.sticky
9 | - content.code.annotate
10 | - content.tabs.link
11 | - content.code.copy
12 | - header.autohide
13 | - navigation.indexes
14 | - navigation.instant
15 | - navigation.tracking
16 | - search.highlight
17 | - search.share
18 | - search.suggest
19 | palette:
20 | scheme: slate
21 | accent: green
22 | logo: static/icons/beavers/logo.svg
23 | favicon: static/icons/beavers/icon.png
24 |
25 | plugins:
26 | - search
27 | - mkdocstrings:
28 | default_handler: python
29 | handlers:
30 | python:
31 | options:
32 | show_source: false
33 |
34 | markdown_extensions:
35 | - def_list
36 | - pymdownx.inlinehilite
37 | - pymdownx.superfences
38 | - pymdownx.snippets:
39 | - pymdownx.emoji
40 | - pymdownx.highlight
41 | - attr_list
42 | - md_in_html
43 | extra:
44 | project_name: "beavers"
45 |
46 |
47 | nav:
48 | - Home:
49 | - index.md
50 | - Concepts:
51 | - concepts/dag.md
52 | - concepts/advanced.md
53 | - concepts/replay.md
54 | - concepts/kafka.md
55 | - concepts/pandas.md
56 | - concepts/pyarrow.md
57 | - concepts/polars.md
58 | - concepts/perspective.md
59 | - API Reference:
60 | - reference/dag.md
61 | - reference/replay.md
62 | - reference/kafka.md
63 | - reference/pandas_wrapper.md
64 | - reference/pyarrow_wrapper.md
65 | - install.md
66 | - contributing.md
67 | - faq.md
68 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | build-backend = "poetry_dynamic_versioning.backend"
3 | requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning"]
4 |
5 | [project]
6 | authors = [
7 | {name = "Tradewell Tech", email = "engineering@tradewelltech.co"}
8 | ]
9 | classifiers = [
10 | "Development Status :: 5 - Production/Stable",
11 | "License :: OSI Approved :: Apache Software License",
12 | "Natural Language :: English",
13 | "Programming Language :: Python :: 3.10",
14 | "Programming Language :: Python :: 3.11",
15 | "Programming Language :: Python :: 3.12",
16 | "Programming Language :: Python :: 3.13"
17 | ]
18 | dependencies = [
19 | "confluent_kafka>=2.1.1",
20 | "pandas",
21 | "perspective-python>=3.0.0",
22 | "polars",
23 | "pyarrow",
24 | "tornado"
25 | ]
26 | description = "Python stream processing"
27 | documentation = "https://beavers.readthedocs.io/en/latest/"
28 | keywords = ["apache-arrow", "streaming", "data"]
29 | license = "Apache-2.0"
30 | maintainers = [
31 | {name = "0x26res", email = "0x26res@gmail.com"}
32 | ]
33 | name = "beavers"
34 | packages = [
35 | {include = "beavers"}
36 | ]
37 | readme = "README.md"
38 | repository = "https://github.com/tradewelltech/beavers"
39 | requires-python = ">=3.10,<4"
40 | version = "0.0.0"
41 |
42 | [project.optional-dependencies]
43 | confluent-kafka = ["confluent-kafka"]
44 | perspective-python = ["perspective-python", "tornado"]
45 | polars = ["polars"]
46 | pyarrow = ["pyarrow"]
47 |
48 | [project.urls]
49 | "Bug Tracker" = "https://github.com/tradewelltech/beavers/issues"
50 | "Changelog" = "https://github.com/tradewelltech/beavers/blob/main/CHANGELOG.md"
51 |
52 | [tool.bandit]
53 | skips = ["B101", "B311"]
54 |
55 | [tool.black]
56 | exclude = "venv/|tox/"
57 | target-version = ["py310"]
58 |
59 | [tool.coverage.report]
60 | # https://github.com/nedbat/coveragepy/issues/831#issuecomment-517778185
61 | exclude_lines = ["if TYPE_CHECKING:"]
62 |
63 | [tool.coverage.run]
64 | omit = [
65 | # This is hard to test, and the API is about to change a lot
66 | "*/beavers/perspective_wrapper.py"
67 | ]
68 |
69 | [tool.poetry.group.dev.dependencies]
70 | black = ">=22.10.0"
71 | click = ">=8.1.7"
72 | coverage = ">=6.5.0"
73 | flake8 = ">=5.0.4"
74 | git-changelog = ">=2.2.0"
75 | isort = ">=5.10.1"
76 | mock = "*"
77 | pip-tools = ">=6.12.1"
78 | pre-commit = ">=2.20.0"
79 | pylint = ">=2.15.0"
80 | pytest = ">=7.2.0"
81 | pytest-asyncio = "*"
82 | tabulate = "*"
83 |
84 | [tool.poetry.group.docs]
85 | optional = true
86 |
87 | [tool.poetry.group.docs.dependencies]
88 | markdown-include = "*"
89 | mkdocs = ">=1.5.3"
90 | mkdocs-material = ">=9.3.2"
91 | mkdocs-material-extensions = "*"
92 | mkdocstrings = {version = ">=0.21.2", extras = ["python"]}
93 | pymdown-extensions = "*"
94 | tornado = "*"
95 |
96 | [tool.poetry-dynamic-versioning]
97 | enable = true
98 |
99 | [tool.poetry-dynamic-versioning.substitution]
100 | files = ["*/__init__.py"]
101 | folders = [{path = "beavers"}]
102 |
103 | [tool.pydocstyle]
104 | ignore = ["D102", "D107", "D203", "D212"]
105 |
106 | [tool.pytest.ini_options]
107 | asyncio_default_fixture_loop_scope = "function"
108 | asyncio_mode = "auto"
109 |
110 | [tool.ruff]
111 | line-length = 88
112 |
113 | [tool.ruff.lint.isort]
114 | known-first-party = ["beavers", "tradewell_proto"]
115 |
--------------------------------------------------------------------------------
/scripts/README.md:
--------------------------------------------------------------------------------
1 | # Scripts
2 |
3 | These script are helpful for testing beavers with simple real time applications
4 |
5 | ## Set up
6 |
7 | Use kafka-kraft in docker for kafka:
8 |
9 | ```shell
10 | docker run --name=simple_kafka -p 9092:9092 -d bashj79/kafka-kraft
11 | ```
12 |
13 | ## `kafka_test_bench`
14 |
15 | Tests a simple application with kafka, making sure it replays in order.
16 | The "timestamp" of the output messages should be in order across topics when replaying.
17 |
18 |
19 | ### Create Topics
20 |
21 | ```shell
22 | docker exec -it simple_kafka /opt/kafka/bin/kafka-topics.sh --bootstrap-server=localhost:9092 --create --topic=left --partitions=1 --replication-factor=1
23 | docker exec -it simple_kafka /opt/kafka/bin/kafka-topics.sh --bootstrap-server=localhost:9092 --create --topic=right --partitions=1 --replication-factor=1
24 | docker exec -it simple_kafka /opt/kafka/bin/kafka-topics.sh --bootstrap-server=localhost:9092 --create --topic=both --partitions=1 --replication-factor=1
25 | ```
26 |
27 | ### Run the Beavers job
28 |
29 | ```shell
30 | python -m scripts.kafka_test_bench --batch-size=2
31 | ```
32 |
33 | ### Publish data
34 |
35 | ```shell
36 | docker exec -it simple_kafka /opt/kafka/bin/kafka-console-producer.sh --bootstrap-server=localhost:9092 --topic=left
37 | docker exec -it simple_kafka /opt/kafka/bin/kafka-console-producer.sh --bootstrap-server=localhost:9092 --topic=right
38 | ```
39 |
40 | ### See output data
41 |
42 | ```shell
43 | docker exec -it simple_kafka /opt/kafka/bin/kafka-console-consumer.sh \
44 | --bootstrap-server=localhost:9092 \
45 | --topic=both \
46 | --property print.key=true \
47 | --from-beginning
48 | ```
49 |
50 | ## `perpective_test_bench.py`
51 |
52 | ### Create the topic
53 |
54 | ```shell
55 | docker exec -it simple_kafka /opt/kafka/bin/kafka-topics.sh --bootstrap-server=localhost:9092 --create --topic=key-value --partitions=1 --replication-factor=1
56 | ```
57 |
58 | ### Publish data
59 |
60 | ```shell
61 | docker exec -it simple_kafka /opt/kafka/bin/kafka-console-producer.sh \
62 | --topic=key-value \
63 | --bootstrap-server=localhost:9092 \
64 | --property parse.key=true \
65 | --property key.separator=,
66 | ```
67 |
--------------------------------------------------------------------------------
/scripts/kafka_test_bench.py:
--------------------------------------------------------------------------------
1 | import functools
2 | import json
3 | import logging
4 | from operator import itemgetter
5 | from typing import Any, Callable, Sequence
6 |
7 | import click
8 | import confluent_kafka
9 | import pandas as pd
10 |
11 | from beavers import Dag
12 | from beavers.kafka import KafkaDriver, KafkaProducerMessage, SourceTopic
13 |
14 |
15 | def create_test_dag() -> "Dag":
16 | dag = Dag()
17 | left_stream = dag.source_stream(name="left")
18 | right_stream = dag.source_stream(name="right")
19 | both_stream = dag.stream(
20 | lambda left, right: sorted(left + right, key=itemgetter("timestamp"))
21 | ).map(left_stream, right_stream)
22 | dag.sink("both", both_stream)
23 | return dag
24 |
25 |
26 | def kafka_messages_to_json(
27 | messages: Sequence[confluent_kafka.Message],
28 | ) -> list[dict[str, Any]]:
29 | return [
30 | {
31 | "topic": message.topic(),
32 | "partition": message.partition(),
33 | "offset": message.offset(),
34 | "timestamp": str(
35 | pd.to_datetime(message.timestamp()[1], unit="ms", utc=True)
36 | ),
37 |             "key": message.key().decode("utf-8") if message.key() else None,
38 | "value": message.value().decode("utf-8"),
39 | }
40 | for message in messages
41 | ]
42 |
43 |
44 | def kafka_message_serializer(
45 | payloads: list[dict[str, Any]], topic: str
46 | ) -> list[KafkaProducerMessage]:
47 | return [
48 | KafkaProducerMessage(topic, key=None, value=json.dumps(payload))
49 | for payload in payloads
50 | ]
51 |
52 |
53 | SOURCE_TOPIC_CREATORS: dict[str, Callable[[str], SourceTopic]] = {
54 | "latest": functools.partial(
55 | SourceTopic.from_latest, message_deserializer=kafka_messages_to_json
56 | ),
57 | "earliest": functools.partial(
58 | SourceTopic.from_earliest, message_deserializer=kafka_messages_to_json
59 | ),
60 | "15min": functools.partial(
61 | SourceTopic.from_relative_time,
62 | message_deserializer=kafka_messages_to_json,
63 | relative_time=pd.to_timedelta("15min"),
64 | ),
65 | "start-of-day": functools.partial(
66 | SourceTopic.from_start_of_day,
67 | message_deserializer=kafka_messages_to_json,
68 | start_of_day_time=pd.to_timedelta("00:00:00"),
69 | start_of_day_timezone="UTC",
70 | ),
71 | "absolute-time": functools.partial(
72 | SourceTopic.from_absolute_time,
73 | message_deserializer=kafka_messages_to_json,
74 | absolute_time=pd.Timestamp.utcnow().normalize(),
75 | ),
76 | "committed": functools.partial(
77 | SourceTopic.from_committed,
78 | message_deserializer=kafka_messages_to_json,
79 | ),
80 | }
81 |
82 |
83 | @click.command()
84 | @click.option("--left-topic", type=click.STRING, default="left")
85 | @click.option(
86 | "--left-offset", type=click.Choice(SOURCE_TOPIC_CREATORS.keys()), default="earliest"
87 | )
88 | @click.option("--right-topic", type=click.STRING, default="right")
89 | @click.option(
90 | "--right-offset",
91 | type=click.Choice(SOURCE_TOPIC_CREATORS.keys()),
92 | default="earliest",
93 | )
94 | @click.option("--both-topic", type=click.STRING, default="both")
95 | @click.option(
96 | "--consumer-config",
97 | type=json.loads,
98 | default='{"bootstrap.servers": "localhost:9092", "group.id": "beavers"}',
99 | )
100 | @click.option(
101 | "--producer-config",
102 | type=json.loads,
103 | default='{"bootstrap.servers": "localhost:9092"}',
104 | )
105 | @click.option("--batch-size", type=click.INT, default=2)
106 | def kafka_test_bench(
107 | left_topic: str,
108 | left_offset: str,
109 | right_topic: str,
110 | right_offset: str,
111 | both_topic: str,
112 | consumer_config: dict,
113 | producer_config: dict,
114 | batch_size: int,
115 | ):
116 | logging.basicConfig(
117 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
118 | level=logging.DEBUG,
119 | )
120 |
121 | dag = create_test_dag()
122 |
123 | driver = KafkaDriver.create(
124 | dag=dag,
125 | producer_config=producer_config,
126 | consumer_config=consumer_config,
127 | source_topics={
128 | "left": SOURCE_TOPIC_CREATORS[left_offset](left_topic),
129 | "right": SOURCE_TOPIC_CREATORS[right_offset](right_topic),
130 | },
131 | sink_topics={
132 | "both": functools.partial(kafka_message_serializer, topic=both_topic)
133 | },
134 | batch_size=batch_size,
135 | )
136 | while True:
137 | driver.run_cycle()
138 |
139 |
140 | if __name__ == "__main__":
141 | kafka_test_bench()
142 |
--------------------------------------------------------------------------------
/scripts/perpective_test_bench.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import click
4 |
5 | from examples.perspective_concepts import run_dashboard
6 |
7 |
8 | @click.command()
9 | @click.option("--topic", type=click.STRING, default="key-value")
10 | @click.option("--port", type=click.INT, default=8082)
11 | @click.option(
12 | "--consumer-config",
13 | type=json.loads,
14 | default='{"bootstrap.servers": "localhost:9092", "group.id": "beavers"}',
15 | )
16 | def perspective_test_bench(
17 | topic: str,
18 | port: int,
19 | consumer_config: dict,
20 | ):
21 | run_dashboard(topic=topic, port=port, consumer_config=consumer_config)
22 |
23 |
24 | if __name__ == "__main__":
25 | perspective_test_bench()
26 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | from _pytest.assertion import register_assert_rewrite
2 |
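3 | # Opt beavers.testing into pytest assertion rewriting so DagTestBench assertions report detailed failures.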
3 | register_assert_rewrite("beavers.testing")
4 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tradewelltech/beavers/ec9979086868589ab82b47ce55fa11cc31b32c16/tests/conftest.py
--------------------------------------------------------------------------------
/tests/test_docs.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 |
4 | def test_readme_and_index_same():
5 | """Check the README matches the doc home page"""
6 | root = Path(__file__).parent.parent
7 | readme = root / "README.md"
8 | index = root / "docs" / "index.md"
9 |
10 | with readme.open() as fp:
11 | readme_content = fp.read()
12 |
13 | with index.open() as fp:
14 | # Skip first and last line
15 | index_content = "".join(fp.readlines()[1:-1])
16 |
17 | assert index_content in readme_content
18 |
--------------------------------------------------------------------------------
/tests/test_etfs.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | from beavers.testing import DagTestBench
4 | from examples import etfs
5 |
6 |
7 | def test_run_dag():
8 | dag = etfs.create_dag()
9 | bench = DagTestBench(dag)
10 |
11 | # Price and ETF come in:
12 | timestamp_0 = pd.to_datetime("2023-06-10 12:00:00+0000")
13 | (
14 | bench.set_source(
15 | "price",
16 | [
17 | etfs.PriceRecord(timestamp_0, "AAPL", 180.0),
18 | etfs.PriceRecord(timestamp_0, "GOOG", 120.0),
19 | ],
20 | )
21 | .set_source(
22 | "etf_composition",
23 | [etfs.EtfComposition(timestamp_0, "TECH", {"AAPL": 1.0, "GOOG": 1.5})],
24 | )
25 | .execute(timestamp_0)
26 | .assert_sink_list("etf_price", [etfs.PriceRecord(timestamp_0, "TECH", 144.0)])
27 | )
28 |
29 | # AAPL price update:
30 | timestamp_1 = timestamp_0 + pd.to_timedelta("1s")
31 | (
32 | bench.set_source(
33 | "price",
34 | [
35 | etfs.PriceRecord(timestamp_1, "AAPL", 200.0),
36 | ],
37 | )
38 | .execute(timestamp_1)
39 | .assert_sink_list("etf_price", [etfs.PriceRecord(timestamp_1, "TECH", 152.0)])
40 | )
41 |
42 | # Unrelated price updates:
43 | timestamp_2 = timestamp_0 + pd.to_timedelta("2s")
44 | (
45 | bench.set_source(
46 | "price",
47 | [
48 | etfs.PriceRecord(timestamp_2, "MSFT", 330.0),
49 | ],
50 | )
51 | .execute(timestamp_2)
52 | .assert_sink_not_updated("etf_price")
53 | )
54 |
55 | # New ETF comes in
56 | timestamp_3 = timestamp_0 + pd.to_timedelta("4s")
57 | (
58 | bench.set_source(
59 | "etf_composition",
60 | [etfs.EtfComposition(timestamp_3, "SOFT", {"MSFT": 0.5, "GOOG": 1.0})],
61 | )
62 | .execute(timestamp_3)
63 | .assert_sink_list("etf_price", [etfs.PriceRecord(timestamp_3, "SOFT", 190.0)])
64 | )
65 |
66 | # ETF extends with missing price:
67 | timestamp_4 = timestamp_0 + pd.to_timedelta("4s")
68 | (
69 | bench.set_source(
70 | "etf_composition",
71 | [
72 | etfs.EtfComposition(
73 | timestamp_4, "SOFT", {"MSFT": 0.5, "GOOG": 1.0, "ORCL": 0.5}
74 | )
75 | ],
76 | )
77 | .execute(timestamp_4)
78 | .assert_sink_list("etf_price", [etfs.PriceRecord(timestamp_4, "SOFT", None)])
79 | )
80 |
--------------------------------------------------------------------------------
/tests/test_pandas_wrapper.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pytest
4 |
5 | from beavers import Dag
6 | from beavers.pandas_wrapper import _empty_df, _get_stream_dtypes, _LastTracker
7 |
8 | DTYPES = pd.Series(
9 | {
10 | "col1": np.int64,
11 | "col2": np.object_,
12 | }
13 | )
14 | DF = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
15 | DF_UPDATE = pd.DataFrame({"col1": [1, 2, 2], "col2": ["e", "f", "g"]})
16 |
17 |
18 | def test_dtypes():
19 | df = _empty_df(dtypes=DTYPES)
20 | pd.testing.assert_series_equal(df.dtypes, DTYPES)
21 |
22 |
23 | def test_source_df():
24 | dag = Dag()
25 | source = dag.pd.source_df(dtypes=DTYPES)
26 |
27 | dag.execute()
28 | pd.testing.assert_series_equal(source.get_value().dtypes, DTYPES)
29 |
30 | source.set_stream(DF)
31 | dag.execute()
32 | pd.testing.assert_frame_equal(source.get_value(), DF)
33 |
34 |
35 | def test_table_stream():
36 | dag = Dag()
37 | source = dag.pd.source_df(dtypes=DTYPES)
38 | stream = dag.pd.df_stream(lambda x: x[x["col1"] > 1], DTYPES).map(source)
39 |
40 | dag.execute()
41 | pd.testing.assert_frame_equal(stream.get_value(), _empty_df(DTYPES))
42 |
43 | source.set_stream(DF)
44 | dag.execute()
45 | pd.testing.assert_frame_equal(stream.get_value(), DF[lambda x: x["col1"] > 1])
46 |
47 |
48 | def test_get_stream_dtypes():
49 | dag = Dag()
50 | source = dag.pd.source_df(dtypes=DTYPES)
51 | pd.testing.assert_series_equal(_get_stream_dtypes(source), DTYPES)
52 |
53 | state = dag.state(lambda: "foo").map()
54 | with pytest.raises(TypeError, match=r"Argument should be a stream Node"):
55 | pd.testing.assert_series_equal(_get_stream_dtypes(state), DTYPES)
56 |
57 | list_node = dag.source_stream()
58 | with pytest.raises(TypeError, match=r"Argument should be a Node\[pd.DataFrame\]"):
59 | pd.testing.assert_series_equal(_get_stream_dtypes(list_node), DTYPES)
60 |
61 |
62 | def test_latest_tracker():
63 | tracker = _LastTracker(["col1"], _empty_df(DTYPES))
64 | pd.testing.assert_frame_equal(tracker(_empty_df(DTYPES)), _empty_df(DTYPES))
65 | pd.testing.assert_frame_equal(tracker(DF), DF)
66 | pd.testing.assert_frame_equal(tracker(DF), DF)
67 |
68 | pd.testing.assert_frame_equal(
69 | tracker(DF_UPDATE), pd.DataFrame({"col1": [3, 1, 2], "col2": ["c", "e", "g"]})
70 | )
71 |
72 |
73 | def test_last_by_keys():
74 | dag = Dag()
75 | source = dag.pd.source_df(dtypes=DTYPES)
76 | latest = dag.pd.last_by_keys(source, ["col1"])
77 |
78 | dag.execute()
79 | pd.testing.assert_frame_equal(latest.get_value(), _empty_df(DTYPES))
80 |
81 | source.set_stream(DF)
82 | dag.execute()
83 | pd.testing.assert_frame_equal(latest.get_value(), DF)
84 |
85 | source.set_stream(DF)
86 | dag.execute()
87 | pd.testing.assert_frame_equal(latest.get_value(), DF)
88 |
--------------------------------------------------------------------------------
/tests/test_perpective_wrapper.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import MagicMock
2 |
3 | import perspective
4 | import pyarrow as pa
5 | import pytest
6 | from mock import mock
7 | from perspective import Server
8 | from tornado.testing import AsyncHTTPTestCase
9 | from tornado.web import Application
10 |
11 | from beavers import Dag
12 | from beavers.perspective_wrapper import (
13 | DATA_TYPES,
14 | PerspectiveTableDefinition,
15 | TableRequestHandler,
16 | _PerspectiveNode,
17 | _table_to_bytes,
18 | _TableConfig,
19 | _UpdateRunner,
20 | perspective_thread,
21 | )
22 |
23 | PERSPECTIVE_TABLE_SCHEMA = pa.schema(
24 | [
25 | pa.field("index", pa.string()),
26 | pa.field("remove", pa.string()),
27 | ]
28 | )
29 | PERSPECTIVE_TABLE_DEFINITION = PerspectiveTableDefinition(
30 | name="name",
31 | index_column="index",
32 | remove_column="remove",
33 | )
34 |
35 |
36 | def test_config_validate():
37 | definition = PERSPECTIVE_TABLE_DEFINITION
38 |
39 | with pytest.raises(AssertionError, match="index"):
40 | definition.validate(pa.schema([]))
41 |
42 | with pytest.raises(AssertionError, match="remove"):
43 | definition.validate(pa.schema([pa.field("index", pa.string())]))
44 |
45 | definition.validate(PERSPECTIVE_TABLE_SCHEMA)
46 |
47 |
48 | def test_to_table_config():
49 | assert _TableConfig.from_definition(
50 | PERSPECTIVE_TABLE_DEFINITION, PERSPECTIVE_TABLE_SCHEMA
51 | ) == _TableConfig(
52 | name="name", index="index", columns=["index", "remove"], sort=[], filters=[]
53 | )
54 |
55 |
56 | def test_table_to_bytes():
57 | results = _table_to_bytes(PERSPECTIVE_TABLE_SCHEMA.empty_table())
58 | assert isinstance(results, bytes)
59 | assert len(results) > 100
60 |
61 |
62 | def test_update_runner():
63 |     driver = MagicMock()
64 |
65 |     runner = _UpdateRunner(driver)
66 |     runner()
67 |     assert driver.run_cycle.called
68 |
69 |
70 | def test_add_node():
71 | dag = Dag()
72 | source = dag.pa.source_table(schema=PERSPECTIVE_TABLE_SCHEMA)
73 | state = dag.state(lambda x: x).map(source)
74 | assert dag.psp.to_perspective(source, PERSPECTIVE_TABLE_DEFINITION) is None
75 |
76 | with pytest.raises(AssertionError, match="Must provide a schema for state nodes"):
77 | dag.psp.to_perspective(state, PERSPECTIVE_TABLE_DEFINITION)
78 |
79 | dag.psp.to_perspective(
80 | state, PERSPECTIVE_TABLE_DEFINITION, schema=PERSPECTIVE_TABLE_SCHEMA
81 | )
82 |
83 | for node in dag._nodes:
84 | if isinstance(node._function, _PerspectiveNode):
85 | assert node._function.table is None
86 | node._function.table = MagicMock()
87 |
88 | dag.execute()
89 |
90 | nodes = [
91 | n._function for n in dag._nodes if isinstance(n._function, _PerspectiveNode)
92 | ]
93 | assert len(nodes) == 2
94 | assert nodes[0].get_table_config() == _TableConfig(
95 | name="name", index="index", columns=["index", "remove"], sort=[], filters=[]
96 | )
97 |
98 |
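99 | # Minimal stand-in for tornado's IOLoop, patched in below so perspective_thread can run without a real event loop.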
99 | class FakeLoop:
100 | @staticmethod
101 | def current():
102 | return FakeLoop()
103 |
104 | def add_callback(self):
105 | pass
106 |
107 | def time(self):
108 | return 0
109 |
110 | def add_timeout(self, *args, **kwargs):
111 | pass
112 |
113 | def start(self):
114 | pass
115 |
116 |
117 | @mock.patch("tornado.ioloop.IOLoop", FakeLoop)
118 | def test_perspective_thread():
119 | manager = Server()
120 |
121 | perspective_thread(manager, MagicMock(), [])
122 |
123 |
124 | class TestHandler(AsyncHTTPTestCase):
125 | def get_app(self):
126 | table_configs = [
127 | _TableConfig(
128 | "table1", index="col_1", columns=["col_1", "col_2"], sort=(), filters=()
129 | )
130 | ]
131 | return Application(
132 | [
133 | (
134 | r"/([a-z0-9_]*)",
135 | TableRequestHandler,
136 | {"table_configs": table_configs},
137 | ),
138 | ]
139 | )
140 |
141 | def test_table(self):
142 | response = self.fetch("/")
143 | assert response.code == 200
144 | assert b'["col_1", "col_2"]' in response.body
145 |
146 |
147 | def test_schema():
148 | server = perspective.Server()
149 | client = server.new_local_client()
150 |
151 | client.table({str(i): v[1] for i, v in enumerate(DATA_TYPES)})
152 |
--------------------------------------------------------------------------------
/tests/test_polars_wrapper.py:
--------------------------------------------------------------------------------
1 | import polars as pl
2 | import polars.testing
3 | import pytest
4 |
5 | from beavers import Dag
6 | from beavers.polars_wrapper import _get_stream_schema, _get_stream_dtype
7 |
8 | SIMPLE_SCHEMA = pl.Schema(
9 | [
10 | ("col1", pl.Int32()),
11 | ("col2", pl.Utf8()),
12 | ]
13 | )
14 | EMPTY_FRAME = pl.DataFrame(schema=SIMPLE_SCHEMA)
15 | SIMPLE_FRAME = pl.DataFrame([[1, 2, 3], ["a", "b", "c"]], schema=SIMPLE_SCHEMA)
16 | SIMPLE_FRAME_2 = pl.DataFrame([[1, 2], ["d", "e"]], schema=SIMPLE_SCHEMA)
17 |
18 |
19 | def test_source_stream():
20 | dag = Dag()
21 |
22 | node = dag.pl.source_table(schema=SIMPLE_SCHEMA)
23 | polars.testing.assert_frame_equal(
24 | node._empty_factory(), pl.DataFrame(schema=SIMPLE_SCHEMA)
25 | )
26 |
27 | node.set_stream(SIMPLE_FRAME)
28 | dag.execute()
29 | polars.testing.assert_frame_equal(node.get_value(), SIMPLE_FRAME)
30 |
31 | dag.execute()
32 | polars.testing.assert_frame_equal(
33 | node.get_value(), pl.DataFrame(schema=SIMPLE_SCHEMA)
34 | )
35 |
36 |
37 | def test_table_stream():
38 | dag = Dag()
39 |
40 | schema = pl.Schema([("col1", pl.Int32())])
41 | source = dag.pl.source_table(SIMPLE_SCHEMA)
42 | node = dag.pl.table_stream(lambda x: x.select(["col1"]), schema).map(source)
43 |
44 | dag.execute()
45 | polars.testing.assert_frame_equal(node.get_value(), pl.DataFrame(schema=schema))
46 |
47 | source.set_stream(SIMPLE_FRAME)
48 | dag.execute()
49 | polars.testing.assert_frame_equal(node.get_value(), SIMPLE_FRAME.select(["col1"]))
50 |
51 |
52 | def test_filter_stream():
53 | dag = Dag()
54 |
55 | source = dag.pl.source_table(SIMPLE_SCHEMA)
56 | filtered = dag.pl.filter_stream(source, pl.col("col1") > 1, pl.col("col2") == "a")
57 |
58 | dag.execute()
59 | polars.testing.assert_frame_equal(
60 | filtered.get_value(), pl.DataFrame(schema=SIMPLE_SCHEMA)
61 | )
62 |
63 | source.set_stream(SIMPLE_FRAME)
64 | dag.execute()
65 | polars.testing.assert_frame_equal(
66 | filtered.get_value(),
67 | SIMPLE_FRAME.filter(pl.col("col1") > 1, pl.col("col2") == "a"),
68 | )
69 |
70 |
71 | def test_get_stream_schema():
72 | dag = Dag()
73 |
74 | polars_source = dag.pl.source_table(SIMPLE_SCHEMA)
75 | assert _get_stream_schema(polars_source) == SIMPLE_SCHEMA
76 |
77 | list_source = dag.source_stream(empty=[], name="source1")
78 | with pytest.raises(TypeError, match=r"Argument should be a Node\[pl\.DataFrame\]"):
79 | _get_stream_schema(list_source)
80 |
81 |
82 | def test_last_by():
83 | dag = Dag()
84 |
85 | source = dag.pl.source_table(SIMPLE_SCHEMA)
86 | last_by = dag.pl.last_by_keys(source, ["col1"])
87 |
88 | dag.execute()
89 | polars.testing.assert_frame_equal(
90 | last_by.get_value(), pl.DataFrame(schema=SIMPLE_SCHEMA)
91 | )
92 |
93 | source.set_stream(SIMPLE_FRAME)
94 | dag.execute()
95 | polars.testing.assert_frame_equal(last_by.get_value(), SIMPLE_FRAME)
96 |
97 | source.set_stream(SIMPLE_FRAME_2)
98 | dag.execute()
99 | assert str(last_by.get_value()) == str(
100 | pl.DataFrame([[1, 2, 3], ["d", "e", "c"]], schema=SIMPLE_SCHEMA)
101 | )
102 |
103 |
104 | def test_last_by_order_of_column():
105 | dag = Dag()
106 |
107 | source = dag.pl.source_table(SIMPLE_SCHEMA)
108 | last_by = dag.pl.last_by_keys(source, ["col2"])
109 |
110 | dag.execute()
111 | polars.testing.assert_frame_equal(
112 | last_by.get_value(), pl.DataFrame(schema=SIMPLE_SCHEMA)
113 | )
114 |
115 | source.set_stream(SIMPLE_FRAME)
116 | dag.execute()
117 | polars.testing.assert_frame_equal(last_by.get_value(), SIMPLE_FRAME)
118 |
119 |
120 | def test_last_by_bad_keys():
121 | dag = Dag()
122 | source = dag.pl.source_table(SIMPLE_SCHEMA)
123 | with pytest.raises(AssertionError, match="Keys must be strings"):
124 | dag.pl.last_by_keys(source, [1])
125 |
126 |
127 | def test_concat_series():
128 | dag = Dag()
129 | left_source = dag.pl.source_table(SIMPLE_SCHEMA)
130 | left = dag.pl.get_series(left_source, "col1")
131 | right_source = dag.pl.source_table(SIMPLE_SCHEMA)
132 | right = dag.pl.get_series(right_source, "col1")
133 |
134 | both = dag.pl.concat_series(left, right)
135 |
136 | dag.execute()
137 | polars.testing.assert_series_equal(
138 | both.get_value(), pl.Series(dtype=pl.Int32(), name="col1")
139 | )
140 |
141 | left_source.set_stream(SIMPLE_FRAME)
142 | dag.execute()
143 | polars.testing.assert_series_equal(
144 | both.get_value(), pl.Series(values=[1, 2, 3], dtype=pl.Int32(), name="col1")
145 | )
146 |
147 | left_source.set_stream(SIMPLE_FRAME)
148 | right_source.set_stream(SIMPLE_FRAME_2)
149 | dag.execute()
150 | polars.testing.assert_series_equal(
151 | both.get_value(),
152 | pl.Series(values=[1, 2, 3, 1, 2], dtype=pl.Int32(), name="col1"),
153 | )
154 |
155 | right_source.set_stream(SIMPLE_FRAME_2)
156 | dag.execute()
157 | polars.testing.assert_series_equal(
158 | both.get_value(),
159 | pl.Series(values=[1, 2], dtype=pl.Int32(), name="col1"),
160 | )
161 |
162 |
163 | def test_concat_series_bad_no_series():
164 | dag = Dag()
165 | with pytest.raises(ValueError, match="Must pass at least one series"):
166 | dag.pl.concat_series()
167 |
168 |
169 | def test_concat_series_bad_mismatching_series():
170 | dag = Dag()
171 | source = dag.pl.source_table(SIMPLE_SCHEMA)
172 | left = dag.pl.get_series(source, "col1")
173 | right = dag.pl.get_series(source, "col2")
174 | with pytest.raises(TypeError, match="Series type mismatch Int32 vs String"):
175 | dag.pl.concat_series(left, right)
176 |
177 |
178 | def test_get_series():
179 | dag = Dag()
180 | left_source = dag.pl.source_table(SIMPLE_SCHEMA)
181 | left_series = dag.pl.get_series(left_source, "col1")
182 |
183 | dag.execute()
184 | polars.testing.assert_series_equal(left_series.get_value(), EMPTY_FRAME["col1"])
185 |
186 | left_source.set_stream(SIMPLE_FRAME)
187 | dag.execute()
188 | polars.testing.assert_series_equal(left_series.get_value(), SIMPLE_FRAME["col1"])
189 |
190 | dag.execute()
191 | polars.testing.assert_series_equal(left_series.get_value(), EMPTY_FRAME["col1"])
192 |
193 |
194 | def test_get_stream_dtype_bad():
195 | with pytest.raises(TypeError, match=r"Argument should be a Node\[pl\.Series\]"):
196 | _get_stream_dtype(Dag().source_stream())
197 |
--------------------------------------------------------------------------------
/tests/test_pyarrow_kafka.py:
--------------------------------------------------------------------------------
1 | from beavers.pyarrow_kafka import JsonDeserializer, JsonSerializer
2 | from tests.test_kafka import mock_kafka_message
3 | from tests.test_util import TEST_TABLE
4 |
5 |
6 | def test_json_deserializer_empty():
7 | deserializer = JsonDeserializer(TEST_TABLE.schema)
8 | assert deserializer([]) == TEST_TABLE.schema.empty_table()
9 |
10 |
11 | def test_end_to_end():
12 | deserializer = JsonDeserializer(TEST_TABLE.schema)
13 | serializer = JsonSerializer("topic-1")
14 | out_messages = serializer(TEST_TABLE)
15 | in_messages = [
16 | mock_kafka_message(topic=m.topic, value=m.value) for m in out_messages
17 | ]
18 | assert deserializer(in_messages) == TEST_TABLE
19 |
--------------------------------------------------------------------------------
/tests/test_pyarrow_replay.py:
--------------------------------------------------------------------------------
1 | from operator import itemgetter
2 |
3 | import pandas as pd
4 | import pyarrow as pa
5 | import pyarrow.csv
6 | import pytest
7 |
8 | from beavers.dag import UTC_MAX
9 | from beavers.pyarrow_replay import ArrowTableDataSink, ArrowTableDataSource
10 | from tests.test_util import TEST_TABLE
11 |
12 |
13 | def test_arrow_table_data_source():
14 | source = ArrowTableDataSource(TEST_TABLE, itemgetter("timestamp"))
15 |
16 | assert source.get_next() == pd.to_datetime("2023-01-01T00:00:00Z")
17 | assert source.read_to(pd.to_datetime("2023-01-01T00:00:00Z")) == TEST_TABLE[:1]
18 | assert source.read_to(pd.to_datetime("2023-01-01T00:00:00Z")) == TEST_TABLE[:0]
19 | assert source.get_next() == pd.to_datetime("2023-01-02T00:00:00Z")
20 | assert source.read_to(pd.to_datetime("2023-01-02T00:00:00Z")) == TEST_TABLE[1:]
21 | assert source.get_next() == UTC_MAX
22 | assert source.read_to(UTC_MAX) == TEST_TABLE[:0]
23 |
24 |
25 | def test_arrow_table_data_source_ooo():
26 | with pytest.raises(
27 | AssertionError, match="Timestamp column should be monotonic increasing"
28 | ):
29 | ArrowTableDataSource(
30 | pa.table(
31 | {
32 | "timestamp": [
33 | pd.to_datetime("2023-01-02T00:00:00Z"),
34 | pd.to_datetime("2023-01-01T00:00:00Z"),
35 | ],
36 | "value": [1, 2],
37 | }
38 | ),
39 | itemgetter("timestamp"),
40 | )
41 |
42 |
43 | def test_arrow_table_data_sink(tmpdir):
44 | file = tmpdir / "file.csv"
45 | sink = ArrowTableDataSink(lambda table: pyarrow.csv.write_csv(table, file))
46 |
47 | sink.close()
48 | assert not file.exists()
49 |
50 | sink.append(UTC_MAX, TEST_TABLE)
51 | sink.close()
52 | assert file.exists()
53 |
--------------------------------------------------------------------------------
/tests/test_pyarrow_wrapper.py:
--------------------------------------------------------------------------------
1 | import pyarrow as pa
2 | import pyarrow.compute as pc
3 | import pytest
4 |
5 | from beavers import Dag
6 | from beavers.pyarrow_wrapper import _concat_arrow_arrays, _get_last_by, _LastByKey
7 |
8 | SIMPLE_SCHEMA = pa.schema(
9 | [
10 | pa.field("col1", pa.int32()),
11 | pa.field("col2", pa.string()),
12 | pa.field("col3", pa.timestamp("ns", "UTC")),
13 | ]
14 | )
15 | SIMPLE_TABLE = pa.table([[1, 2, 3], ["a", "b", "c"], [0, 0, 0]], schema=SIMPLE_SCHEMA)
16 | SIMPLE_TABLE_2 = pa.table([[1, 2], ["d", "e"], [0, 0]], schema=SIMPLE_SCHEMA)
17 |
18 |
19 | def test_source_stream():
20 | dag = Dag()
21 |
22 | node = dag.pa.source_table(schema=SIMPLE_SCHEMA)
23 | assert node._empty_factory() == SIMPLE_SCHEMA.empty_table()
24 |
25 | node.set_stream(SIMPLE_TABLE)
26 | dag.execute()
27 | assert node.get_value() == SIMPLE_TABLE
28 |
29 | dag.execute()
30 | assert node.get_value() == SIMPLE_SCHEMA.empty_table()
31 |
32 |
33 | def test_source_stream_name():
34 | dag = Dag()
35 |
36 | node = dag.pa.source_table(schema=SIMPLE_SCHEMA, name="source_1")
37 | assert dag.get_sources() == {"source_1": node}
38 |
39 |
40 | def test_table_stream():
41 | dag = Dag()
42 |
43 | source = dag.pa.source_table(SIMPLE_SCHEMA)
44 | node = dag.pa.table_stream(
45 | lambda x: x.select(["col1"]),
46 | pa.schema([pa.field("col1", pa.int32())]),
47 | ).map(source)
48 |
49 | source.set_stream(SIMPLE_TABLE)
50 | dag.execute()
51 | assert node.get_value() == SIMPLE_TABLE.select(["col1"])
52 |
53 |
54 | def test_filter_stream():
55 | dag = Dag()
56 |
57 | source = dag.pa.source_table(SIMPLE_SCHEMA)
58 | node = dag.pa.filter_stream(
59 | lambda x, y: pc.equal(x["col1"], y), source, dag.const(1)
60 | )
61 |
62 | source.set_stream(SIMPLE_TABLE)
63 | dag.execute()
64 | assert node.get_value() == SIMPLE_TABLE[0:1]
65 |
66 | dag.execute()
67 | assert node.get_value() == SIMPLE_SCHEMA.empty_table()
68 |
69 |
70 | def _predicate(table: pa.Table) -> pa.Array:
71 | return pc.equal(table["col1"], 1)
72 |
73 |
74 | def test_filter_stream_bad_arguments():
75 | dag = Dag()
76 |
77 | state_node = dag.state(lambda: "HELLO").map()
78 | with pytest.raises(TypeError, match=r"Argument should be a stream Node"):
79 | dag.pa.filter_stream(_predicate, state_node)
80 |
81 | list_stream_node = dag.source_stream()
82 | with pytest.raises(TypeError, match=r"Argument should be a Node\[pa\.Table\]"):
83 | dag.pa.filter_stream(_predicate, list_stream_node)
84 |
85 |
86 | def test_learn_expression_type():
87 | field = pc.field("col1")
88 | assert isinstance(field, pc.Expression)
89 | greater_with_pc = pc.greater(field, 2)
90 | assert SIMPLE_TABLE.filter(greater_with_pc) == SIMPLE_TABLE[-1:]
91 | greater_with_python = field > 2
92 | assert SIMPLE_TABLE.filter(greater_with_python) == SIMPLE_TABLE[-1:]
93 | with pytest.raises(TypeError):
94 | pc.min(SIMPLE_TABLE, field)
95 |
96 |
97 | def test_group_by_last():
98 | with pytest.raises(
99 | pa.ArrowNotImplementedError,
100 | match="Using ordered aggregator"
101 | " in multiple threaded execution is not supported",
102 | ):
103 | SIMPLE_TABLE.group_by("col1").aggregate([("col2", "last")])
104 |
105 |
106 | def test_get_latest():
107 | table = pa.table(
108 | [[1, 2, 3, 1, 2], ["a", "b", "c", "d", "e"], [0] * 5], schema=SIMPLE_SCHEMA
109 | )
110 | assert _get_last_by(table, ["col1"]) == table[2:]
111 | assert _get_last_by(table, ["col1", "col2"]) == table
112 |
113 |
114 | def test_get_last_by_batches():
115 | table = pa.concat_tables([SIMPLE_TABLE, SIMPLE_TABLE])
116 | assert _get_last_by(table, ["col1"]) == SIMPLE_TABLE
117 |
118 |
119 | def test_get_last_by_all_columns():
120 | table = pa.concat_tables([SIMPLE_TABLE, SIMPLE_TABLE])
121 | assert _get_last_by(table, ["col1", "col2"]) == SIMPLE_TABLE
122 |
123 |
124 | def test_latest_tracker():
125 | tracker = _LastByKey(["col1"], SIMPLE_SCHEMA.empty_table())
126 |
127 | assert tracker(SIMPLE_SCHEMA.empty_table()) == SIMPLE_SCHEMA.empty_table()
128 | assert tracker(SIMPLE_TABLE) == SIMPLE_TABLE
129 | assert tracker(SIMPLE_TABLE_2) == pa.table(
130 | [[3, 1, 2], ["c", "d", "e"], [0] * 3], schema=SIMPLE_SCHEMA
131 | )
132 |
133 |
134 | def test_last_by_keys():
135 | dag = Dag()
136 | source = dag.pa.source_table(SIMPLE_SCHEMA)
137 | latest = dag.pa.last_by_keys(source, ["col1"])
138 |
139 | dag.execute()
140 | assert latest.get_value() == SIMPLE_SCHEMA.empty_table()
141 |
142 | source.set_stream(SIMPLE_TABLE)
143 | dag.execute()
144 | assert latest.get_value() == SIMPLE_TABLE
145 |
146 | dag.execute()
147 | assert latest.get_value() == SIMPLE_TABLE
148 |
149 | source.set_stream(SIMPLE_TABLE_2)
150 | dag.execute()
151 | assert latest.get_value() == pa.table(
152 | [[3, 1, 2], ["c", "d", "e"], [0] * 3], schema=SIMPLE_SCHEMA
153 | )
154 |
155 |
156 | def test_last_by_keys_bad():
157 | dag = Dag()
158 |
159 | with pytest.raises(
160 | AttributeError, match=r"'str' object has no attribute '_get_empty'"
161 | ):
162 | dag.pa.last_by_keys("Not a node", ["col1"])
163 | with pytest.raises(TypeError, match=r"Argument should be a Node\[pa.Table\]"):
164 | dag.pa.last_by_keys(dag.source_stream(), ["col1"])
165 | with pytest.raises(TypeError, match=r"Argument should be a stream Node"):
166 | dag.pa.last_by_keys(dag.state(lambda: None).map(), ["col1"])
167 |
168 | source = dag.pa.source_table(SIMPLE_SCHEMA)
169 |
170 | with pytest.raises(TypeError, match="123"):
171 | dag.pa.last_by_keys(source, 123)
172 | with pytest.raises(TypeError, match="123"):
173 | dag.pa.last_by_keys(source, [123])
174 | with pytest.raises(
175 | TypeError, match=r"field colz no in schema: \['col1', 'col2', 'col3'\]"
176 | ):
177 | dag.pa.last_by_keys(source, ["colz"])
178 |
179 |
180 | def test_get_column():
181 | dag = Dag()
182 | source = dag.pa.source_table(SIMPLE_SCHEMA)
183 | array = dag.pa.get_column(source, "col1")
184 |
185 | dag.execute()
186 | assert array.get_value() == pa.chunked_array([pa.array([], pa.int32())])
187 |
188 | source.set_stream(SIMPLE_TABLE)
189 | dag.execute()
190 | assert array.get_value() == SIMPLE_TABLE["col1"]
191 |
192 | dag.execute()
193 | assert array.get_value() == pa.chunked_array([pa.array([], pa.int32())])
194 |
195 | source.set_stream(SIMPLE_TABLE_2)
196 | dag.execute()
197 | assert array.get_value() == SIMPLE_TABLE_2["col1"]
198 |
199 |
200 | def test_get_column_bad():
201 | dag = Dag()
202 |
203 | with pytest.raises(
204 | AttributeError, match=r"'str' object has no attribute '_get_empty'"
205 | ):
206 | dag.pa.get_column("Not a node", "col1")
207 | with pytest.raises(TypeError, match=r"Argument should be a Node\[pa.Table\]"):
208 | dag.pa.get_column(dag.source_stream(), "col1")
209 | with pytest.raises(TypeError, match=r"Argument should be a stream Node"):
210 | dag.pa.get_column(dag.state(lambda: None).map(), "col1")
211 |
212 | source = dag.pa.source_table(SIMPLE_SCHEMA)
213 |
214 | with pytest.raises(TypeError, match="123"):
215 | dag.pa.get_column(source, 123)
216 | with pytest.raises(
217 | TypeError, match=r"field colz no in schema: \['col1', 'col2', 'col3'\]"
218 | ):
219 | dag.pa.get_column(source, "colz")
220 |
221 |
222 | def test_concat_arrays_ok():
223 | dag = Dag()
224 | left = dag.source_stream(empty=pa.array([], pa.string()))
225 | right = dag.source_stream(empty=pa.array([], pa.string()))
226 | both = dag.pa.concat_arrays(left, right)
227 |
228 | dag.execute()
229 | assert both.get_value() == pa.chunked_array([], pa.string())
230 |
231 | left.set_stream(pa.array(["a", "b"]))
232 | right.set_stream(pa.array(["c"]))
233 | dag.execute()
234 | assert both.get_value() == pa.chunked_array(["a", "b", "c"], pa.string())
235 |
236 | dag.execute()
237 | assert both.get_value() == pa.chunked_array([], pa.string())
238 |
239 |
240 | def test_concat_arrays_bad():
241 | dag = Dag()
242 |
243 | with pytest.raises(ValueError, match=r"Must pass at least one array"):
244 | dag.pa.concat_arrays()
245 | with pytest.raises(TypeError, match=r"Argument should be a stream Node"):
246 | dag.pa.concat_arrays(dag.state(lambda: None).map())
247 | with pytest.raises(TypeError, match=r"Argument should be a Node\[pa\.Array\]"):
248 | dag.pa.concat_arrays(dag.source_stream())
249 | with pytest.raises(TypeError, match=r"Array type mismatch string vs int32"):
250 | dag.pa.concat_arrays(
251 | dag.source_stream(empty=pa.array([], pa.string())),
252 | dag.source_stream(empty=pa.array([], pa.int32())),
253 | )
254 |
255 |
256 | def test_concat_arrow_arrays_mixed():
257 | assert _concat_arrow_arrays(
258 | [
259 | pa.array([], pa.string()),
260 | pa.chunked_array(pa.array([], pa.string())),
261 | ]
262 | ) == pa.chunked_array([], pa.string())
263 |
264 |
265 | def test_concat_arrow_arrays_bad():
266 | with pytest.raises(TypeError, match="123"):
267 | _concat_arrow_arrays([123])
268 |
--------------------------------------------------------------------------------
/tests/test_replay.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | from operator import attrgetter
3 |
4 | import pandas as pd
5 | import pytest
6 |
7 | from beavers.dag import UTC_MAX, Dag
8 | from beavers.replay import (
9 | DataSource,
10 | IteratorDataSourceAdapter,
11 | NoOpDataSinkProvider,
12 | ReplayContext,
13 | ReplayDriver,
14 | T,
15 | _create_sinks,
16 | _create_sources,
17 | )
18 | from tests.test_util import ListDataSink, ListDataSource
19 |
20 |
21 | @dataclasses.dataclass(frozen=True)
22 | class Word:
23 | timestamp: pd.Timestamp
24 | value: str
25 |
26 |
27 | @pytest.fixture
28 | def replay_context() -> ReplayContext:
29 | return ReplayContext(
30 | pd.to_datetime("2023-01-01", utc=True),
31 | pd.to_datetime("2023-01-02", utc=True),
32 | pd.to_timedelta("1min"),
33 | )
34 |
35 |
36 | def create_data_source(context: ReplayContext):
37 | return ListDataSource(
38 | [Word(context.start + pd.Timedelta(minutes=i), "hello") for i in range(10)],
39 | attrgetter("timestamp"),
40 | )
41 |
42 |
43 | def test_create_sources_mismatch(replay_context: ReplayContext):
44 | with pytest.raises(
45 | ValueError,
46 | match=r"Source node and DataSource names don't match: \[\] vs \['words'\]",
47 | ):
48 | _create_sources(Dag(), replay_context, {"words": create_data_source})
49 |
50 |
51 | def test_create_sources_match(replay_context: ReplayContext):
52 | dag = Dag()
53 | node = dag.source_stream(empty=[], name="words")
54 |
55 | results = _create_sources(dag, replay_context, {"words": create_data_source})
56 | assert len(results) == 1
57 | assert results[0].name == "words"
58 | assert results[0].node == node
59 | assert isinstance(results[0].data_source, ListDataSource)
60 |
61 |
62 | def test_create_sinks_mismatch(replay_context: ReplayContext):
63 | sink = ListDataSink()
64 | with pytest.raises(
65 | ValueError,
66 | match=r"Sink node and DataSink names don't match: \[\] vs \['words'\]",
67 | ):
68 | _create_sinks(Dag(), replay_context, {"words": lambda _: sink})
69 |
70 |
71 | def test_create_sinks_match(replay_context: ReplayContext):
72 | sink = ListDataSink()
73 | dag = Dag()
74 | source_node = dag.source_stream(empty=[], name="words")
75 | sink_node = dag.sink("words", source_node)
76 | results = _create_sinks(dag, replay_context, {"words": lambda _: sink})
77 | assert len(results) == 1
78 | assert results[0].name == "words"
79 | assert results[0].nodes == [sink_node]
80 | assert results[0].data_sink is sink
81 |
82 |
83 | def test_pass_through_replay(replay_context: ReplayContext):
84 | source = create_data_source(replay_context)
85 | sink = ListDataSink()
86 | dag = Dag()
87 | source_node = dag.source_stream(empty=[], name="words")
88 | dag.sink("words", source_node)
89 |
90 | driver = ReplayDriver.create(
91 | dag,
92 | replay_context,
93 | {"words": lambda _: source},
94 | {"words": lambda _: sink},
95 | )
96 | driver.run()
97 | assert sink._data == source._data
98 |
99 |
100 | def test_no_op_through_replay(replay_context: ReplayContext):
101 | """
102 |     Test a corner case of the driver where a sink did not update during a cycle
103 | """
104 | sink = ListDataSink()
105 | dag = Dag()
106 | dag.source_stream(empty=[], name="words_1")
107 | source_2 = dag.source_stream(empty=[], name="words_2")
108 | dag.sink("words", source_2)
109 |
110 | driver = ReplayDriver.create(
111 | dag,
112 | replay_context,
113 | {
114 | "words_1": create_data_source,
115 | "words_2": lambda _: ListDataSource([], attrgetter("timestamp")),
116 | },
117 | {"words": lambda _: sink},
118 | )
119 | driver.run()
120 | assert sink._data == []
121 |
122 |
123 | def create_data_groups() -> list[list[Word]]:
124 | timestamp = pd.to_datetime("2022-01-01", utc=True)
125 | return [
126 | [
127 | Word(timestamp + pd.Timedelta(minutes=0), "hello"),
128 | Word(timestamp + pd.Timedelta(minutes=1), "world"),
129 | ],
130 | [
131 | Word(timestamp + pd.Timedelta(minutes=2), "hello"),
132 | Word(timestamp + pd.Timedelta(minutes=2), "world"),
133 | ],
134 | [
135 | Word(timestamp + pd.Timedelta(minutes=3), "hello"),
136 | Word(timestamp + pd.Timedelta(minutes=3), "world"),
137 | Word(timestamp + pd.Timedelta(minutes=3), "world"),
138 | Word(timestamp + pd.Timedelta(minutes=4), "world"),
139 | ],
140 | [],
141 | [
142 | Word(timestamp + pd.Timedelta(minutes=5), "hello"),
143 | Word(timestamp + pd.Timedelta(minutes=5), "world"),
144 | ],
145 | ]
146 |
147 |
148 | def create_adapter(data_groups: list[list[Word]]) -> DataSource[list[Word]]:
149 | return IteratorDataSourceAdapter(
150 | (
151 | ListDataSource(data_group, attrgetter("timestamp"))
152 | for data_group in data_groups
153 | ),
154 | [],
155 | lambda left, right: left + right,
156 | )
157 |
158 |
159 | def test_iterator_data_source_adapter_run_all():
160 | data_groups = create_data_groups()
161 | adapter = create_adapter(data_groups)
162 | assert adapter.read_to(UTC_MAX) == [
163 | word for data_group in data_groups for word in data_group
164 | ]
165 | assert adapter.read_to(UTC_MAX) == []
166 |
167 |
168 | def test_iterator_data_source_adapter_run_one_by_one():
169 | timestamp = pd.to_datetime("2022-01-01", utc=True)
170 | data_groups = create_data_groups()
171 | adapter = create_adapter(data_groups)
172 | assert adapter.get_next() == timestamp
173 | assert adapter.read_to(timestamp) == [data_groups[0][0]]
174 | assert adapter.read_to(timestamp) == []
175 | assert adapter.read_to(timestamp + pd.Timedelta(minutes=1)) == [data_groups[0][1]]
176 | assert adapter.read_to(timestamp + pd.Timedelta(minutes=1)) == []
177 | assert (
178 | adapter.read_to(timestamp + pd.Timedelta(minutes=3))
179 | == data_groups[1] + data_groups[2][:-1]
180 | )
181 | assert adapter.read_to(timestamp + pd.Timedelta(minutes=4)) == data_groups[2][-1:]
182 | assert adapter.read_to(timestamp + pd.Timedelta(minutes=5)) == data_groups[4]
183 | assert adapter.read_to(timestamp + pd.Timedelta(minutes=6)) == []
184 | assert adapter.read_to(UTC_MAX) == []
185 |
186 |
187 | def test_iterator_data_source_empty():
188 | adapter = create_adapter([])
189 | assert adapter.get_next() == UTC_MAX
190 | assert adapter.read_to(UTC_MAX) == []
191 | assert adapter.get_next() == UTC_MAX
192 | assert adapter.read_to(UTC_MAX) == []
193 |
194 |
195 | def test_iterator_data_source_all_empty():
196 | adapter = create_adapter([[], []])
197 | assert adapter.get_next() == UTC_MAX
198 | assert adapter.read_to(UTC_MAX) == []
199 | assert adapter.get_next() == UTC_MAX
200 | assert adapter.read_to(UTC_MAX) == []
201 |
202 |
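203 | # A DataSource that advertises one pending timestamp but never returns data, used to exercise the adapter's cutoff handling.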
203 | class CornerCaseTester(DataSource[list[Word]]):
204 | def __init__(self, timestamp: pd.Timestamp):
205 | self._timestamp = timestamp
206 | self._read = False
207 |
208 | def read_to(self, timestamp: pd.Timestamp) -> list[T]:
209 | self._read = True
210 | return []
211 |
212 | def get_next(self) -> pd.Timestamp:
213 | if self._read:
214 | return UTC_MAX
215 | else:
216 | return self._timestamp
217 |
218 |
219 | def test_iterator_data_source_cutoff():
220 | """
221 |     Test a tricky corner case where the underlying DataSource of
222 | IteratorDataSourceAdapter doesn't behave as expected.
223 | """
224 | timestamp = pd.to_datetime("2022-01-01", utc=True)
225 | adapter = IteratorDataSourceAdapter(
226 | (
227 | source
228 | for source in [
229 | CornerCaseTester(timestamp + pd.Timedelta(minutes=1)),
230 | ListDataSource(
231 | [Word(timestamp + pd.Timedelta(minutes=2), "hello")],
232 | attrgetter("timestamp"),
233 | ),
234 | ]
235 | ),
236 | [],
237 | lambda left, right: left + right,
238 | )
239 |
240 | assert adapter.read_to(UTC_MAX) == [
241 | Word(
242 | timestamp=pd.Timestamp("2022-01-01 00:02:00+0000", tz="UTC"), value="hello"
243 | )
244 | ]
245 |
246 |
247 | def test_replay_read_sources():
248 | source = ListDataSource(
249 | [
250 | Word(pd.to_datetime("2023-01-01 00:01:00Z"), "1"),
251 | Word(pd.to_datetime("2023-01-01 00:02:00Z"), "2"),
252 | Word(pd.to_datetime("2023-01-01 12:01:00Z"), "3"),
253 | Word(pd.to_datetime("2023-01-01 12:04:00Z"), "4"),
254 | ],
255 | attrgetter("timestamp"),
256 | )
257 |
258 | dag = Dag()
259 | dag.source_stream([], name="hello")
260 | driver = ReplayDriver.create(
261 | dag=dag,
262 | replay_context=ReplayContext(
263 | pd.to_datetime("2023-01-01", utc=True),
264 | pd.to_datetime("2023-01-02", utc=True) - pd.to_timedelta("1ns"),
265 | pd.to_timedelta("12h"),
266 | ),
267 | data_source_providers={"hello": lambda x: source},
268 | data_sink_providers={},
269 | )
270 |
271 | records, timestamp = driver.read_sources()
272 | assert timestamp == pd.to_datetime("2023-01-01 00:01:00Z", utc=True)
273 | assert records == 0
274 |
275 |
276 | def test_replay_run_cycle():
277 | source = ListDataSource(
278 | [
279 | Word(pd.to_datetime("2023-01-01 00:01:00Z"), "1"),
280 | Word(pd.to_datetime("2023-01-01 00:02:00Z"), "2"),
281 | Word(pd.to_datetime("2023-01-01 12:01:00Z"), "3"),
282 | Word(pd.to_datetime("2023-01-01 12:04:00Z"), "4"),
283 | ],
284 | attrgetter("timestamp"),
285 | )
286 |
287 | dag = Dag()
288 | dag.source_stream([], name="hello")
289 | driver = ReplayDriver.create(
290 | dag=dag,
291 | replay_context=ReplayContext(
292 | pd.to_datetime("2023-01-01", utc=True),
293 | pd.to_datetime("2023-01-02", utc=True) - pd.to_timedelta("1ns"),
294 | pd.to_timedelta("12h"),
295 | ),
296 | data_source_providers={"hello": lambda x: source},
297 | data_sink_providers={},
298 | )
299 |
300 | metrics = driver.run_cycle()
301 | assert metrics is None
302 | assert driver.current_time == pd.to_datetime("2023-01-01 12:00:00Z")
303 |
304 | metrics = driver.run_cycle()
305 | assert metrics.timestamp == pd.to_datetime("2023-01-01 12:00:00Z")
306 | assert metrics.source_records == 2
307 | assert metrics.sink_records == 0
308 | assert metrics.cycle_time_ns > 0
309 | assert metrics.warp_ratio > 0.0
310 | assert driver.current_time == pd.to_datetime("2023-01-02 00:00:00Z")
311 |
312 | metrics = driver.run_cycle()
313 | assert metrics.timestamp == pd.to_datetime("2023-01-01 23:59:59.999999999Z")
314 | assert metrics.source_records == 2
315 | assert metrics.sink_records == 0
316 | assert metrics.cycle_time_ns > 0
317 | assert metrics.warp_ratio > 0.0
318 | assert driver.current_time == pd.to_datetime("2023-01-02 12:00:00Z")
319 | assert driver.is_done()
320 |
321 |
322 | def test_no_op():
323 | provider = NoOpDataSinkProvider()
324 | data_sink = provider(ReplayContext(UTC_MAX, UTC_MAX, pd.to_timedelta("1s")))
325 | data_sink.append(UTC_MAX, None)
326 | data_sink.close()
327 |
--------------------------------------------------------------------------------
/tests/test_util.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import dataclasses
3 | import random
4 | from typing import Callable, Dict, Generic, TypeVar
5 |
6 | import pandas as pd
7 | import pyarrow as pa
8 |
9 | from beavers.dag import UTC_MAX, Dag, TimerManager
10 | from beavers.replay import DataSink, DataSource
11 |
12 | T = TypeVar("T")
13 |
14 | TEST_TABLE = pa.table(
15 | {
16 | "timestamp": [
17 | pd.to_datetime("2023-01-01T00:00:00Z"),
18 | pd.to_datetime("2023-01-02T00:00:00Z"),
19 | ],
20 | "value": [1, 2],
21 | }
22 | )
23 |
24 |
25 | class GetLatest(Generic[T]):
26 | def __init__(self, default: T):
27 | self._value = default
28 |
29 | def __call__(self, values: list[T]) -> T:
30 | if values:
31 | self._value = values[-1]
32 | return self._value
33 |
34 |
35 | def add(left, right):
36 | return left + right
37 |
38 |
39 | def add_with_noise(left, right):
40 | return left + right + random.randint(0, 1000) # nosec
41 |
42 |
43 | def add_no_42(left, right):
44 | results = add(left, right)
45 | if results == 42:
46 | raise ValueError(f"{left} + {right} == 42")
47 | else:
48 | return results
49 |
50 |
51 | class AddOther:
52 | def __init__(self, other):
53 | self._other = other
54 |
55 | def set_other(self, other):
56 | self._other = other
57 |
58 | def __call__(self, value):
59 | return self._other + value
60 |
61 |
62 | def select(key, **values):
63 | return values[key]
64 |
65 |
66 | class WordCount:
67 | def __init__(self):
68 | self._counts = collections.defaultdict(lambda: 0)
69 |
70 | def __call__(self, words: list[str]) -> dict[str, int]:
71 | for word in words:
72 | self._counts[word] += 1
73 |
74 | return self._counts
75 |
76 |
77 | def join_counts(**kwargs: Dict[str, int]) -> pd.DataFrame:
78 | return pd.concat(
79 | [pd.Series(value, name=key) for key, value in kwargs.items()], axis=1
80 | ).fillna(0)
81 |
82 |
83 | @dataclasses.dataclass(frozen=True)
84 | class TimerEntry:
85 | timestamp: pd.Timestamp
86 | values: list[int]
87 |
88 |
89 | class SetATimer:
90 | def __init__(self):
91 | self._entry = None
92 |
93 | def __call__(
94 | self, entries: list[TimerEntry], now: pd.Timestamp, timer_manager: TimerManager
95 | ) -> list[int]:
96 | if entries:
97 | self._entry = entries[-1]
98 | timer_manager.set_next_timer(self._entry.timestamp)
99 | if self._entry is not None and now >= self._entry.timestamp:
100 | results = self._entry.values
101 | self._entry = None
102 | return results
103 | else:
104 | return []
105 |
106 |
107 | def create_word_count_dag() -> tuple[Dag, WordCount]:
108 | dag = Dag()
109 | messages_stream = dag.source_stream([], name="messages")
110 | word_count = WordCount()
111 | state = dag.state(word_count).map(messages_stream)
112 | changed_key = dag.stream(lambda x: sorted(set(x)), []).map(messages_stream)
113 | records = dag.stream(lambda x, y: {v: y[v] for v in x}, {}).map(changed_key, state)
114 | dag.sink("results", records)
115 | return dag, word_count
116 |
117 |
118 | class ListDataSource(DataSource[list[T]]):
119 | def __init__(self, data: list[T], extractor: Callable[[T], pd.Timestamp]):
120 | self._data = data
121 | self._extractor = extractor
122 | self._position = 0
123 |
124 | def read_to(self, timestamp: pd.Timestamp) -> list[T]:
125 | results = []
126 | while (
127 | self._position < len(self._data)
128 | and self._extractor(self._data[self._position]) <= timestamp
129 | ):
130 | results.append(self._data[self._position])
131 | self._position += 1
132 | return results
133 |
134 | def get_next(self) -> pd.Timestamp:
135 | if self._position >= len(self._data):
136 | return UTC_MAX
137 | else:
138 | return self._extractor(self._data[self._position])
139 |
140 |
141 | class ListDataSink(DataSink[list[T]]):
142 | def __init__(self):
143 | self._data = []
144 |
145 | def append(self, timestamp: pd.Timestamp, data: list[T]):
146 | self._data.extend(data)
147 |
148 | def close(self):
149 | pass
150 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | isolated_build = True
3 | envlist =
4 | py310,
5 | py311,
6 | py312,
7 | py313,
8 | linting,
9 |
10 | [testenv]
11 | allowlist_externals = poetry
12 | commands_pre =
13 | poetry install --no-root --sync --extras pyarrow --extras confluent_kafka --extras polars
14 | changedir = {envtmpdir}
15 | commands =
16 | poetry run coverage run --source=beavers --rcfile={toxinidir}/pyproject.toml --branch -m pytest {toxinidir}/tests
17 | poetry run python {toxinidir}/examples/advanced_concepts.py
18 | poetry run python {toxinidir}/examples/dag_concepts.py
19 | poetry run python {toxinidir}/examples/etfs.py
20 | poetry run python {toxinidir}/examples/pandas_concepts.py
21 | poetry run python {toxinidir}/examples/polars_concepts.py
22 | poetry run python {toxinidir}/examples/pyarrow_concepts.py
23 | poetry run python {toxinidir}/examples/replay_concepts.py
24 | poetry run coverage report --rcfile={toxinidir}/pyproject.toml -m --fail-under 95
25 | poetry run coverage xml --rcfile={toxinidir}/pyproject.toml -o {toxinidir}/coverage.xml
26 |
27 | [testenv:linting]
28 | deps = pre-commit
29 | commands = pre-commit run --all-files --show-diff-on-failure
30 |
31 | [gh-actions]
32 | python =
33 | 3.10: py310, linting
34 | 3.11: py311
35 | 3.12: py312
36 | 3.13: py313
37 |
--------------------------------------------------------------------------------