├── .bumpversion.cfg ├── .flake8 ├── .github └── workflows │ └── actions.yml ├── .gitignore ├── .gitlinks ├── .tarignore ├── CONTRIBUTING.md ├── LICENSE ├── LICENSE-NOSELLCLAUSE ├── MANIFEST.in ├── Makefile ├── NOTICE ├── README.rst ├── SECURITY.md ├── docker-compose.yml ├── minibatch ├── VERSION ├── __init__.py ├── __main__.py ├── _version.py ├── contrib │ ├── __init__.py │ ├── apps │ │ ├── __init__.py │ │ └── omegaml.py │ ├── celery.py │ ├── kafka.py │ ├── mongodb.py │ ├── mqtt.py │ └── omegaml.py ├── emitter │ ├── __init__.py │ └── base.py ├── example │ ├── __init__.py │ ├── basic │ │ ├── __init__.py │ │ └── __main__.py │ ├── kafka │ │ └── __init__.py │ ├── mongodb │ │ ├── __init__.py │ │ └── __main__.py │ ├── mqtt │ │ ├── __init__.py │ │ └── __main__.py │ └── util.py ├── marshaller.py ├── models.py ├── tests │ ├── __init__.py │ ├── test_celery.py │ ├── test_kafka.py │ ├── test_minibatch.py │ ├── test_mongodb.py │ ├── test_mqtt.py │ ├── test_omegaml.py │ └── util.py └── window.py ├── requirements.txt ├── resources ├── docker-compose-kafka.yml ├── docker-compose-mqtt.yml ├── enabled_plugins ├── rabbitmq.conf └── rabbitmq.config └── setup.py /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.5.2 3 | commit = True 4 | tag = False 5 | parse = (?P\d+)\.(?P\d+)\.(?P\d+)([-](?P(dev|rc))+(?P\d+))? 6 | serialize = 7 | {major}.{minor}.{patch}-{release}{build} 8 | {major}.{minor}.{patch} 9 | 10 | [bumpversion:part:release] 11 | first_value = dev 12 | optional_value = ga 13 | values = 14 | dev 15 | rc 16 | ga 17 | 18 | [bumpversion:part:build] 19 | first_value = 1 20 | 21 | [bumpversion:file:./minibatch/VERSION] 22 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E226,E302,E41,E501 3 | max-line-length = 119 4 | exclude = tests/*, build/* 5 | max-complexity = 10 6 | -------------------------------------------------------------------------------- /.github/workflows/actions.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [ push ] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: [ "3.10", "3.11" ] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v1 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install gil 23 | gil clone 24 | pip install -r requirements.txt 25 | - name: Lint with flake8 26 | run: | 27 | pip install flake8 28 | # stop the build if there are Python syntax errors or undefined names 29 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 30 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 31 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 32 | - name: Test with pytest 33 | run: | 34 | make test 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | 3 | # Created by https://www.gitignore.io/api/python 4 | # Edit at https://www.gitignore.io/?templates=python 5 | 6 | ### Python ### 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | pip-wheel-metadata/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # pipenv 76 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 77 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 78 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 79 | # install all needed dependencies. 80 | #Pipfile.lock 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # Spyder project settings 89 | .spyderproject 90 | .spyproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | 95 | # Mr Developer 96 | .mr.developer.cfg 97 | .project 98 | .pydevproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | .dmypy.json 106 | dmypy.json 107 | 108 | # Pyre type checker 109 | .pyre/ 110 | 111 | # End of https://www.gitignore.io/api/python 112 | /config.yml 113 | /config_hub.yml 114 | -------------------------------------------------------------------------------- /.gitlinks: -------------------------------------------------------------------------------- 1 | # dependent repositories pulled in by initlocal --install 2 | # or use gil clone 3 | # see https://github.com/chronoxor/gil 4 | # format: 5 | # name relpath git-repo-url branch 6 | omegaml ../omegaml-ce https://github.com/omegaml/omegaml master 7 | -------------------------------------------------------------------------------- /.tarignore: -------------------------------------------------------------------------------- 1 | .tox 2 | .venv 3 | .git 4 | 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to minibatch 2 | 3 | We love your input! 
We want to make contributing to this project as easy and transparent as possible, whether it's: 4 | 5 | - Reporting a bug 6 | - Discussing the current state of the code 7 | - Submitting a fix 8 | - Proposing new features 9 | - Becoming a maintainer 10 | 11 | ## We Develop with Github 12 | We use github to host code, to track issues and feature requests, as well as accept pull requests. 13 | 14 | ## We Use [Github Flow](https://guides.github.com/introduction/flow/index.html), So All Code Changes Happen Through Pull Requests 15 | Pull requests are the best way to propose changes to the codebase (we use [Github Flow](https://guides.github.com/introduction/flow/index.html)). We actively welcome your pull requests: 16 | 17 | 1. Fork the repo and create your branch from `master`. 18 | 2. If you've added code that should be tested, add tests (in `tests/test_.py`). 19 | 3. If you've changed APIs, update the documentation. 20 | 4. Ensure the test suite passes (run `$ make test`). 21 | 5. Make sure your code lints (run `$ make lint`). 22 | 6. Issue that pull request! 23 | 24 | ## Any contributions you make will be under the Apache 2.0 Software License 25 | In short, when you submit code changes, your submissions are understood to be under the same [Appache 2.0](https://choosealicense.com/licenses/apache-2.0/) + "No Sell, Consulting Yes" License Condition that covers the project. Feel free to contact the maintainers if that's a concern. 26 | 27 | ## Report bugs using Github's [issues](https://github.com/omegaml/minibatch/issues) 28 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/omegaml/minibatch/issues/new); it's that easy! 29 | 30 | ## Write bug reports with detail, background, and sample code 31 | 32 | When submitting a bug report, please use this format 33 | 34 | * Summary - in just one sentence describe what this issue is about 35 | * Actual results/bug - what was the observed result 36 | * Expected results - what was the expected result 37 | * Steps to reproduce - a list of steps to reproduce the problem 38 | * Software versions - include the output of `pip freeze | grep -E "minibatch|mongo|kafka|mqtt|omega"` 39 | 40 | **Great Bug Reports** should include: 41 | 42 | - A quick summary and/or background 43 | - Steps to reproduce 44 | - Be specific! 45 | - Give sample code if you can 46 | - What you expected would happen 47 | - What actually happens 48 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work) 49 | 50 | People *love* thorough bug reports. I'm not even kidding. Read more about [why you should care to write a good bug report](https://medium.com/pitch-perfect/how-to-write-the-perfect-bug-report-6430f5a45cd) by Haje Jan Kamps. 51 | 52 | 53 | ## Coding Style 54 | 55 | In general we're following [pep8](https://www.python.org/dev/peps/pep-0008/) and use flake8 to achieve consistency. 56 | Run the following command to see if your code matches this project's standard. 
57 | 58 | ``` 59 | $ make lint 60 | CONGRATULATIONS all is OK 61 | ``` 62 | 63 | ## License 64 | By contributing, you agree that your contributions will be licensed under the project's license as detailed in the LICENSE and LICENSE-NOSELLCLAUSE files 65 | 66 | ## References 67 | This document was adapted from the open-source contribution guidelines for [Facebook's Draft](https://github.com/facebook/draft-js/blob/a9316a723f9e918afde44dea68b5f9f39b7d9b00/CONTRIBUTING.md) 68 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LICENSE-NOSELLCLAUSE: -------------------------------------------------------------------------------- 1 | “No Sell, Consulting Yes” License Condition v1.1 2 | 3 | The Software is provided to you by the Licensor under the License, as defined below, subject to the following condition. 4 | 5 | Without limiting other conditions in the License, the grant of rights under the License will not include, 6 | and the License does not grant to you, right to Sell the Software. 
7 | 8 | For purposes of the foregoing, “Sell” means practicing any or all of the rights granted to you under the License 9 | to provide to third parties, for a fee or other consideration (including without limitation fees for hosting Software), 10 | a product or service whose value derives, entirely or substantially, from the functionality of the Software. 11 | 12 | Any license notice or attribution required by the License must also include this Commons Cause License Condition notice. 13 | 14 | Software: minibatch 15 | License: Apache 2.0 16 | Licensor: one2seven GmbH, Hagenholzstr. 83b, Zuerich, Switzerland 17 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include docs 2 | recursive-include docs/source *.rst *.css *.py *.png *.gif *.svg *.jpg 3 | recursive-include omegaml/tests/features *.feature 4 | include LICENSE 5 | include NOTICE 6 | include THIRDPARTY 7 | include THIRDPARTY-LICENSES 8 | include README.rst 9 | include minibatch/VERSION 10 | include Procfile 11 | include docker-compose.yml 12 | include Dockerfile 13 | include requirements.txt 14 | include conda-requirements.txt -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: dist 2 | 3 | dist: 4 | mkdir -p dist 5 | rm -rf dist/* 6 | python setup.py sdist bdist_wheel 7 | 8 | pypi-test: dist 9 | twine check dist/* 10 | twine upload --repository testpypi-omegaml dist/* 11 | @echo now test your package! 12 | 13 | pypi-prod: lint test dist 14 | twine check dist/* 15 | twine upload --repository pypi-omegaml dist/* 16 | 17 | pypitest: 18 | # run this in a new conda env 19 | pip install -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ minibatch[all] 20 | 21 | clean: 22 | rm -rf ./dist 23 | rm -rf ./build 24 | 25 | lint: 26 | flake8 --exclude "build/*,dist/*" && echo CONGRATULATIONS all is OK 27 | 28 | test: 29 | docker compose up -d 30 | pytest -s -v 31 | 32 | bumppatch: 33 | bumpversion patch 34 | 35 | bumpminor: 36 | bumpversion minor 37 | 38 | bumpbuild: 39 | bumpversion build 40 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright 2014-2018 one2seven GmbH, Zurich 2 | Copyright 2014-2018 Patrick Senti, Zurich 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License 6 | as per the included LICENSE file. 7 | 8 | Unless required by applicable law or agreed to in writing, software 9 | distributed under the License is distributed on an "AS IS" BASIS, 10 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | See the License for the specific language governing permissions and 12 | limitations under the License. 13 | 14 | Third Party Packages are used by this Work as listed in the THIRDPARTY 15 | file. By using this Work you agree to adhere to the respective license 16 | of any and all third party packages. Runtime Dependencies are listed 17 | below. The licenses as known to us at the time of writing are listed 18 | in THIRDPARTY-LICENSES along with the download URL for each package. 
19 | 20 | Runtime Dependencies are not included in this Work and shall be obtained 21 | and used by You in agreement with the respective third party in the name 22 | of and at the risk and cost of You or your Legal Entity. 23 | 24 | Noteable runtime dependencies are: 25 | 26 | RabbitMQ - https://www.rabbitmq.com/ (Mozilla Public License) 27 | MongoDB - https://www.mongodb.com/ (Server Side Public License, GNU AGPL v3.0) 28 | Docker - https://www.docker.com/ (Apache 2.0 license) [1] 29 | 30 | [1] is recommended, not required 31 | 32 | 33 | Software & Hardware requirements: 34 | 35 | * Single node / single user deployment 36 | 37 | CPU: at least dual core 64-bit 2.4 GHz 38 | RAM: at least 8GB, 16GB recommended 39 | Disk: at least 100GB free disk space, SSD recommended 40 | Operating System: Linux Ubuntu, Mint, Debian 41 | 42 | * Multi node / multi user 43 | 44 | Effectively operating a cluster of a scalable data science 45 | platform is an engineering challenge. We have solved this 46 | challenge and provide the omega|ml enterprise edition for 47 | this purpose. Request more information at https://omegaml.io 48 | or by sending an email to info@omegaml.io 49 | 50 | for every worker node 51 | CPU: at least dual core 64-bit 2.4 GHz 52 | RAM: at least 8GB, 16GB recommended 53 | Disk: at least 100GB free disk space, SSD recommended 54 | Operating System: Linux Ubuntu, Mint, Debian 55 | 56 | Windows is not currently supported - if you have a Windows requirements 57 | please let us know at info@omegaml.io 58 | 59 | for every MongoDB node 60 | CPU: at least dual core 64-bit 2.4 GHz 61 | RAM: at least 16GB, 32GB recommended 62 | Disk: at least 500GB free disk space, SSD 1TB free disk space recommended 63 | Operating System: Linux Ubuntu, Mint, Debian (recommended) 64 | 65 | see https://docs.mongodb.com/manual/administration/production-notes/#hardware-considerations 66 | 67 | for every RabbitMQ node 68 | CPU: at least dual core 64-bit 2.4 GHz 69 | RAM: at least 4GB 70 | Disk: at least 10GB free disk space 71 | Operating System: Linux Ubuntu, Mint, Debian (recommended) 72 | 73 | see https://www.rabbitmq.com/production-checklist.html 74 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | minibatch - Python Stream Processing for humans 2 | =============================================== 3 | 4 | |build badge| 5 | 6 | .. |build badge| image:: https://github.com/omegaml/minibatch/workflows/Python%20package/badge.svg 7 | .. _CONTRIBUTING.md: https://github.com/omegaml/minibatch/blog/master/CONTRIBUTING.md 8 | 9 | Dependencies: 10 | * a running MongoDB accessible to minibatch 11 | * Python 3.x 12 | * see extras & optional dependencies below for specific requirements 13 | 14 | minibatch provides a straight-forward, Python-native approach to mini-batch streaming and complex-event 15 | processing that is easily scalable. Streaming primarily consists of 16 | 17 | * a producer, which is some function inserting data into the stream 18 | * a consumer, which is some function retrieving data from the stream 19 | * transform and windowing functions to process the data in small batches and in parallel 20 | 21 | minibatch is an integral part of `omega|ml `_, however also works independently. omega|ml is the Python DataOps and MLOps 22 | platform for humans. 
23 | 24 | Features 25 | -------- 26 | 27 | * native Python producers and consumers 28 | * includes three basic Window strategies: CountWindow, FixedTimeWindow, RelaxedTimeWindow 29 | * extensible Window strategies by subclassing and overriding a few methods 30 | * scalable, persistent streams - parallel inserts, parallel processing of windows 31 | 32 | A few hightlights 33 | 34 | * creating a stream and appending data is just 2 lines of code 35 | * producer and consumer stream code runs anywhere 36 | * no dependencies other than mongoengine, pymongo 37 | * extensible sources and sinks (already available: Kafka, MQTT, MongoDB collections, omega|ml datasets) 38 | * a fully functional streaming web app can be built in less than 15 lines of code (using Flask) 39 | 40 | Why is it called minibatch? Because it focuses on getting things done by using existing 41 | technology, and making it easy to use this techonlogy. It may be minimalistic in approach, but maximises results. 42 | 43 | Quick start 44 | ----------- 45 | 46 | 1. Install and setup 47 | 48 | .. code:: python 49 | 50 | $ pip install minibatch 51 | $ docker run -d -p 27017:27017 mongo 52 | 53 | See extras & optional dependencies below to select specific packages according 54 | to your deployment needs, e.g. for MQTT, Kafka, omega|ml 55 | 56 | 2. Create a stream producer or attach to a source 57 | 58 | .. code:: python 59 | 60 | import minibatch as mb 61 | stream = mb.stream('test') 62 | for i in range(100): 63 | stream.append({'date': datetime.datetime.utcnow().isoformat()}) 64 | sleep(.5) 65 | 66 | Currently there is support for Kafka and MQTT sources. However 67 | arbitrary other sources can be added. 68 | 69 | .. code:: python 70 | 71 | from minibatch.contrib.kafka import KafkaSource 72 | source = KafkaSource('topic', urls=['kafka:port']) 73 | stream.attach(source) 74 | 75 | 76 | 3. Consume the stream 77 | 78 | .. code:: python 79 | 80 | from minibatch import streaming 81 | @streaming('test', size=2, keep=True) 82 | def myprocess(window): 83 | print(window.data) 84 | return window 85 | 86 | => 87 | [{'date': '2018-04-30T20:18:22.918060'}, {'date': '2018-04-30T20:18:23.481320'}] 88 | [{'date': '2018-04-30T20:18:24.041337'}, {'date': '2018-04-30T20:18:24.593545'} 89 | ... 90 | 91 | `myprocess` is called for every N-tuple of items (`size=2`) appended to the stream by the producer(s). 92 | The frequency is determined by the emitter strategy. This can be configured or changed for a custom 93 | emitter strategy, as shown in the next step. 94 | 95 | 4. Configure the emitter strategy 96 | 97 | Note the `@streaming` decorator. It implements a blocking consumer that delivers batches 98 | of data according to some strategy implemented by a WindowEmitter. Currently `@streaming` 99 | provides the following interface: 100 | 101 | * `size=N` - uses the :code:`CountWindow` emitter 102 | * `interval=SECONDS` - uses the :code:`RelaxedTimeWindow` emitter 103 | * `interval=SECONDS, relaxed=False` - uses the :code:`FixedTimeWindow` emitter 104 | * `emitter=CLASS:WindowEmitter` - uses the given subclass of a :code:`WindowEmitter` 105 | * `workers=N` - set the number of workers to process the decorated function, defaults to number of CPUs 106 | * `executor=CLASS:Executor` - the asynchronous executor to use, defaults to :code:`concurrent.futures.ProcessPoolExecutor` 107 | 108 | 109 | 5. Write a flask app as a streaming source 110 | 111 | This is a simple helloworld-style streaming application that is fully 112 | functional and distributable. 113 | 114 | .. 
code:: python 115 | 116 | # app.py 117 | def consumer(url): 118 | @streaming('test-stream', url=url) 119 | def processing(window): 120 | ... # whatever processing you need to do 121 | 122 | if __name__ == '__main__': 123 | app = StreamingApp() 124 | app.start_streaming(consumer) 125 | app.run() 126 | 127 | # run the app (check status at http://localhost:5000/status) 128 | $ python app.py 129 | 130 | # in another process, stream data 131 | $ python 132 | [] import minibatch as mb 133 | stream = mb.stream('test-stream') 134 | stream.append(dict(data='foobar')) 135 | 136 | Note there is no UI in this example; the data is still processed as 137 | it comes in. To add a UI, register routes with @app.route as in any Flask app, and 138 | write the processed data into a sink that the UI can access. For a 139 | full example see help(minibatch.contrib.apps.omegaml.StreamingApp) 140 | 141 | 142 | 143 | Stream sources 144 | -------------- 145 | 146 | Currently provided in :code:`minibatch.contrib`: 147 | 148 | * KafkaSource - attach a stream to an Apache Kafka topic 149 | * MQTTSource - attach to an MQTT broker 150 | * MongoSource - attach to a MongoDB collection 151 | * DatasetSource - attach to an omega|ml dataset 152 | * CeleryEventSource - attach to a Celery app event dispatcher 153 | 154 | Stream sources are arbitrary objects that support the :code:`stream()` 155 | method, as follows. 156 | 157 | .. code:: python 158 | 159 | class SomeSource: 160 | ... 161 | def stream(self, stream): 162 | for data in source: 163 | stream.append(data) 164 | 165 | 166 | Stream Sinks 167 | ------------ 168 | 169 | The result of a stream can be forwarded to a sink. Currently 170 | provided sinks in :code:`minibatch.contrib` are: 171 | 172 | * KafkaSink - forward messages to an Apache Kafka topic 173 | * MQTTSink - forward messages to an MQTT broker 174 | * MongoSink - forward messages to a MongoDB collection 175 | * DatasetSink - write to an omega|ml dataset 176 | 177 | Stream sinks are arbitrary objects that support the :code:`put()` 178 | method, as follows. 179 | 180 | .. code:: python 181 | 182 | class SomeSink: 183 | ... 184 | def put(self, message): 185 | sink.send(message) 186 | 187 | 188 | Window emitters 189 | --------------- 190 | 191 | minibatch provides the following window emitters out of the box: 192 | 193 | * :code:`CountWindow` - emit fixed-sized windows. Waits until at least *n* messages are 194 | available before emitting a new window 195 | * :code:`FixedTimeWindow` - emit all messages retrieved within specific, time-fixed windows of 196 | a given interval of *n* seconds. This guarantees that messages were received in the specific 197 | window. 198 | * :code:`RelaxedTimeWindow` - every interval of *n* seconds emit all messages retrieved since 199 | the last window was created. This does not guarantee that messages were received in a given 200 | window. 201 | 202 | 203 | Implementing a custom WindowEmitter 204 | ----------------------------------- 205 | 206 | Custom emitter strategies are implemented as a subclass of :code:`WindowEmitter`. The main methods 207 | to implement are 208 | 209 | * :code:`window_ready` - returns the tuple :code:`(ready, data)`, where ready is True if there is data 210 | to emit 211 | * :code:`query` - returns the data for the new window. This function retrieves the :code:`data` part 212 | of the return value of :code:`window_ready` 213 | 214 | See the API reference for more details. 215 | 216 |
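A custom emitter is attached through the :code:`emitter=` argument of :code:`@streaming` (see step 4 of the Quick start). The snippet below is a minimal sketch only, assuming the :code:`SortedWindow` emitter defined in the example that follows; as implemented by :code:`make_emitter` in minibatch/__init__.py, the :code:`size=5` argument is passed to the emitter as its :code:`interval`.

.. code:: python

    from minibatch import streaming

    # sketch: plug a custom WindowEmitter subclass into @streaming
    # SortedWindow is the example emitter defined below
    @streaming('test', emitter=SortedWindow, size=5, keep=True)
    def myprocess(window):
        # window.data holds the batch emitted by SortedWindow
        print(window.data)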
.. code:: python 217 | 218 | class SortedWindow(WindowEmitter): 219 | """ 220 | sort all data by value and output only multiples of 2 in batches of interval size 221 | """ 222 | def window_ready(self): 223 | qs = Buffer.objects.no_cache().filter(processed=False) 224 | data = [] 225 | for obj in sorted(qs, key=lambda obj: obj.data['value']): 226 | if obj.data['value'] % 2 == 0: 227 | data.append(obj) 228 | if len(data) >= self.interval: 229 | break 230 | self._data = data 231 | return len(self._data) == self.interval, () 232 | 233 | def query(self, *args): 234 | return self._data 235 | 236 | 237 | What is streaming and how does minibatch implement it? 238 | ------------------------------------------------------ 239 | 240 | *Concepts* 241 | 242 | Instead of directly connecting producers and consumers, a producer sends messages to a stream. Think 243 | of a stream as an endless buffer, or a pipeline, that takes input from many producers on one end, and 244 | outputs messages to a consumer on the other end. This transfer of messages happens asynchronously: the 245 | producer can send messages to the stream regardless of whether the consumer is ready to receive, and the consumer can take messages from the stream regardless of whether the producer is ready to send. 246 | 247 | Unlike usual asynchronous messaging, however, we want the consumer to receive messages in small batches to optimize throughput. That is, we want the pipeline to *emit* messages only subject to some criteria 248 | of grouping messages, where each group is called a *mini-batch*. The function that determines whether the 249 | batching criterion is met (e.g. time elapsed, number of messages in the pipeline) is called the *emitter strategy*, 250 | and the output it produces is called a *window*. 251 | 252 | Thus, in order to connect producers and consumers, we need the following parts in our streaming system: 253 | 254 | * a :code:`Stream`, keeping metadata for the stream such as its name and when it was created, last read etc. 255 | * a :code:`Buffer` acting as the buffer where messages sent by producers are stored until they are emitted 256 | * a :code:`WindowEmitter` implementing the emitter strategy 257 | * a :code:`Window` representing the output produced by the emitter strategy 258 | 259 | .. note:: 260 | 261 | The producer accepts input from some external system, say an MQTT endpoint. The producer's responsibility is to enter the data into the streaming buffer. 262 | The consumer uses an emitter strategy to produce a Window of data that is then forwarded to the user's processing code. 263 | 264 | *Implementation* 265 | 266 | minibatch uses MongoDB to implement Streams, Buffers and Windows. Specifically, the following collections are used: 267 | 268 | * `stream` - represents instances of `Stream`, each document is a stream with a unique name 269 | * `buffer` - a virtually endless buffer for all streams in the system, each document contains one message of a stream 270 | * `window` - each document represents the data as emitted by the particular emitter strategy 271 | 272 | By default, messages go through the following states: 273 | 274 | 1. upon append by a producer: message is inserted into `buffer`, with flag `processed = False` 275 | 2. upon being seen by an emitter: message is marked as `processed = True` 276 | 3. upon being emitted: message is copied to `window`, marked `processed = False` (in Window) 277 | 4.
upon emit success (no exceptions raised by the emit function): message is deleted from `buffer` 278 | and marked `processed = True` in `window` 279 | 280 | Notes: 281 | 282 | * emitters typically act on a collection of messages, that is steps 2 - 4 are applied to more 283 | than one message at a time 284 | 285 | * to avoid deleting messages from the buffer, pass `@streaming(..., keep=True)` 286 | 287 | * custom emitters can modify the behavior of both creating windows and handling the buffer by 288 | overriding the `process()`, `emit()` and `commit()` methods for each of the above steps 289 | 2/3/4, respectively. 290 | 291 | Extras & optional dependencies 292 | ------------------------------ 293 | 294 | minibatch provides the following pip install extras, which come with some 295 | additional dependencies. Extras are installed by running 296 | 297 | .. code:: bash 298 | 299 | $ pip install minibatch[|all] 300 | 301 | Available extras are: 302 | 303 | * :code:`apps` - adds StreamingApp for easy development & deployment of producers & consumers 304 | * :code:`kafka` - to work with Kafka as a source or a sink 305 | * :code:`mqtt` - to work with an MQTT broker as a source or a sink 306 | * :code:`mongodb` - to work with MongoDB as a source or a sink 307 | * :code:`omegaml` - to work with omega|ml datasets as a source or a sink 308 | * :code:`all` - all of the above 309 | * :code:`dev` - all of the above plus a few development packages 310 | 311 | 312 | Further development 313 | ------------------- 314 | 315 | Here are a couple of ideas to extend minibatch. Contributions are welcome. 316 | 317 | * more examples, following typical streaming examples like word count, filtering 318 | * more emitter strategies, e.g. for sliding windows 319 | * performance testing, benchmarking 320 | * distributed processing of windows via distributed framework such as celery, ray, dask 321 | * extend emitters by typical stream operations e.g. to support operations like count, filter, map, groupby, merge, join 322 | * add other storage backends (e.g. Redis, or some Python-native in-memory db that provides network access and an easy to use ORM layer, like mongoengine does for MongoDB) 323 | 324 | Contributing 325 | ------------ 326 | 327 | We welcome any contributions - examples, issues, bug reports, documentation, code. Please see `CONTRIBUTING.md`_ 328 | for details. 329 | 330 | License 331 | ------- 332 | 333 | Apache 2.0 licensed with "No Sell, Consulting Yes" clause. 334 | See LICENSE and LICENSE-NOSELLCLAUSE files. 335 | 336 | 337 | 338 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | Use this section to tell people about which versions of your project are 6 | currently being supported with security updates. 
7 | 8 | | Version | Supported | 9 | | ------- | ------------------ | 10 | | 0.5.2 | :heavy_check_mark: | 11 | | 0.5.1 | :x: | 12 | | < 0.5.0 | :x: | 13 | 14 | ## Reporting a Vulnerability 15 | 16 | Report any observed or suspected vulnerability to security@omegaml.io 17 | For urgent matters please follow the steps found in https://www.omegaml.io/sirp 18 | 19 | Attending to urgent matters in your setup is subject to our license and support terms of services 20 | as specified at https://www.omegaml.io 21 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | mongodb: 4 | image: mongo:7 5 | ports: 6 | - "27017:27017" 7 | 8 | -------------------------------------------------------------------------------- /minibatch/VERSION: -------------------------------------------------------------------------------- 1 | 0.5.2 2 | -------------------------------------------------------------------------------- /minibatch/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import types 4 | from mongoengine import get_connection, ConnectionFailure 5 | from pymongo.errors import AutoReconnect 6 | from time import sleep 7 | 8 | from minibatch._version import version # noqa 9 | from minibatch.models import Stream, Buffer, Window # noqa 10 | 11 | logger = logging.getLogger(__name__) 12 | mongo_pid = None 13 | 14 | 15 | def streaming(name, fn=None, interval=None, size=None, emitter=None, 16 | relaxed=True, keep=False, url=None, sink=None, 17 | queue=None, source=None, blocking=True, **kwargs): 18 | """ 19 | make and call a streaming function 20 | 21 | Usage: 22 | # fixed-size stream 23 | @stream(name, size=n) 24 | def myproc(window): 25 | # process window.data 26 | 27 | # time-based stream 28 | @stream(name, interval=seconds) 29 | def myproc(window): 30 | # process window.data 31 | 32 | # arbitrary WindowEmitter subclass 33 | @stream(name, emitter=MyWindowEmitter): 34 | def myproc(window): 35 | # process window.data 36 | 37 | If interval is given, a RelaxedTimeWindow into the stream is created. A 38 | RelaxedTimeWindow will call the decorated function with a window of data 39 | since the last time it did so. To get a FixedTimeWindow, specify 40 | relaxed=False. 41 | 42 | If size is given, a CountWindow into the stream is created. A CountWindow 43 | will call the decorated function with a window of exactly #size of 44 | objects in data. 45 | 46 | If a WindowEmitter subclass is given, an instance of that emitter is 47 | created and passed any optional kwargs and it's run() method is called. 48 | This emitter may process the buffered data in any arbitrary way it chooses. 49 | 50 | Args: 51 | name: the stream name 52 | interval: interval in seconds 53 | size: interval in count of buffered, unprocessed objects in stream 54 | emitter: optional, a WindowEmitter subclass (advanced) 55 | relaxed: optional, defaults to True. chooses between Relaxed and 56 | keep: optional, keep Buffer and Stream data. 
defaults to False 57 | url: the mongo db url 58 | **kwargs: kwargs passed to emitter class 59 | """ 60 | 61 | def make(fn): 62 | return make_emitter(name, fn, interval=interval, size=size, 63 | emitter=emitter, relaxed=relaxed, keep=keep, 64 | url=url, sink=sink, queue=queue, source=source, 65 | **kwargs) 66 | 67 | def inner(fn): 68 | if not hasattr(inner, '_em'): 69 | inner._em = make(fn) 70 | 71 | inner._em.run(blocking=blocking) 72 | 73 | inner.apply = lambda fn: inner(fn) 74 | inner.make = lambda fn: make(fn) 75 | return inner if fn is None else inner.apply(fn) 76 | 77 | 78 | def stream(name, fn=None, url=None, ssl=False, **kwargs): 79 | if callable(fn): 80 | return streaming(name, url=url, ssl=ssl, **kwargs)(fn) 81 | kwargs.update(url=url, ssl=ssl) 82 | return Stream.get_or_create(name, **kwargs) 83 | 84 | 85 | class IntegrityError(Exception): 86 | pass 87 | 88 | 89 | def make_emitter(name, emitfn, interval=None, size=None, relaxed=False, 90 | url=None, sink=None, emitter=None, keep=False, queue=None, 91 | source=None, cnx_kwargs=None, **kwargs): 92 | from minibatch.window import RelaxedTimeWindow, FixedTimeWindow, CountWindow 93 | 94 | size = 1 if size is None and interval is None else size 95 | forwardfn = sink.put if sink else None 96 | if isinstance(emitfn, types.BuiltinFunctionType): 97 | orig_emitfn = emitfn 98 | emitfn = lambda *args, **kwargs: orig_emitfn(*args, **kwargs) # noqa 99 | emitfn._count = 0 100 | cnx_kwargs = cnx_kwargs or {} 101 | cnx_kwargs.update(url=url) if url else None 102 | stream = Stream.get_or_create(name, interval=interval or size, **cnx_kwargs) 103 | kwargs.update(stream=stream, emitfn=emitfn, forwardfn=forwardfn, queue=queue) 104 | if interval and emitter is None: 105 | if relaxed: 106 | em = RelaxedTimeWindow(name, interval=interval, **kwargs) 107 | else: 108 | em = FixedTimeWindow(name, interval=interval, **kwargs) 109 | elif size and emitter is None: 110 | em = CountWindow(name, interval=size, **kwargs) 111 | elif emitter is not None: 112 | em = emitter(name, emitfn=emitfn, 113 | interval=interval or size, 114 | **kwargs) 115 | else: 116 | raise ValueError("need either interval=, size= or emitter=") 117 | em.persist(keep) 118 | if source: 119 | # starts a background thread that inserts source messages into the Buffer 120 | stream.attach(source, background=True) 121 | return em 122 | 123 | 124 | def reset_mongoengine(): 125 | # this is to avoid mongoengine's MongoClient instances in subprocesses 126 | # resulting in "MongoClient opened before fork" warning 127 | # the source of the problem is that mongoengine stores MongoClients in 128 | # a module-global dictionary. 
here we simply clear that dictionary before 129 | # the connections are re-created in a forked process 130 | # note this doesn't actually disconnect it just deletes the MongoClient 131 | # see https://stackoverflow.com/a/49404748/890242 132 | # https://github.com/MongoEngine/mongoengine/issues/1599#issuecomment-374901186 133 | from mongoengine import connection 134 | # There is a fix in mongoengine 18.0 that is supposed to introduce the same 135 | # behavior using disconnection_all(), however in some cases this is the 136 | # actual source of the warning due to calling connection.close() 137 | # see https://github.com/MongoEngine/mongoengine/pull/2038 138 | # -- the implemented solution simply ensures MongoClients get recreated 139 | # whenever needed 140 | 141 | def clean(d): 142 | if 'minibatch' in d: 143 | del d['minibatch'] 144 | if 'default' in d: 145 | del d['default'] 146 | 147 | clean(connection._connection_settings) 148 | clean(connection._connections) 149 | clean(connection._dbs) 150 | Window._collection = None 151 | Buffer._collection = None 152 | Stream._collection = None 153 | 154 | 155 | def authenticated_url(mongo_url, authSource='admin'): 156 | if '+srv' in str(mongo_url): 157 | # all configuration is provided by the DNS 158 | # https://docs.mongodb.com/manual/reference/connection-string/#std-label-connections-dns-seedlist 159 | return mongo_url 160 | if mongo_url and 'authSource' not in str(mongo_url): 161 | joiner = '&' if '?' in mongo_url else '?' 162 | mongo_url = '{}{}authSource={}'.format(mongo_url, joiner, authSource) 163 | return mongo_url 164 | 165 | 166 | def connectdb(url=None, dbname=None, alias=None, authSource='admin', 167 | fast_insert=True, **kwargs): 168 | from mongoengine import connect 169 | from mongoengine.connection import get_db 170 | 171 | url = url or os.environ.get('MINIBATCH_MONGO_URL') or os.environ.get('MONGO_URL') 172 | url = authenticated_url(url, authSource=authSource) if authSource else url 173 | alias = alias or 'minibatch' 174 | reset_mongoengine() 175 | if False and fast_insert: 176 | # set writeConcern options 177 | # https://pymongo.readthedocs.io/en/stable/api/pymongo/mongo_client.html?highlight=mongoclient 178 | # w = 0 - disable write ack 179 | # journal = False - do not wait for write 180 | kwargs['w'] = 0 181 | kwargs['journal'] = False 182 | kwargs.setdefault('uuidRepresentation', 'standard') 183 | connect(alias=alias, db=dbname, host=url, connect=False, **kwargs) 184 | waitForConnection(get_connection(alias)) 185 | return get_db(alias=alias) 186 | 187 | 188 | def waitForConnection(client): 189 | _exc = None 190 | command = client.admin.command 191 | for i in range(100): 192 | try: 193 | # The ping command is cheap and does not require auth. 
194 | command('ping') 195 | except (ConnectionFailure, AutoReconnect) as e: 196 | sleep(0.01) 197 | _exc = e 198 | else: 199 | _exc = None 200 | break 201 | if _exc is not None: 202 | raise _exc 203 | -------------------------------------------------------------------------------- /minibatch/__main__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omegaml/minibatch/82775bebef485d91da343f587b5484c6858424dd/minibatch/__main__.py -------------------------------------------------------------------------------- /minibatch/_version.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | base_path = os.path.join(os.path.dirname(__file__), 'VERSION') 4 | with open(base_path) as rin: 5 | version = rin.read().split('\n')[0] 6 | -------------------------------------------------------------------------------- /minibatch/contrib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omegaml/minibatch/82775bebef485d91da343f587b5484c6858424dd/minibatch/contrib/__init__.py -------------------------------------------------------------------------------- /minibatch/contrib/apps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omegaml/minibatch/82775bebef485d91da343f587b5484c6858424dd/minibatch/contrib/apps/__init__.py -------------------------------------------------------------------------------- /minibatch/contrib/apps/omegaml.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process 2 | from threading import Thread 3 | 4 | from flask import Blueprint, Flask 5 | from time import sleep 6 | 7 | 8 | class StreamingApp(Blueprint): 9 | """ 10 | A streaming app using omegaml as its data storage 11 | 12 | This is a simple Flask app that runs a streaming processing function, 13 | receiving and processing data written to a given minibatch stream. Optionally 14 | the app can have its own UI. 15 | 16 | Syntax: 17 | app = StreamingApp() 18 | app.start_streaming(consumer, on=om) 19 | app.run() 20 | 21 | Where consumer is a function that creates a @streaming processor, i.e. 22 | 23 | def consumer(url=None): 24 | @streaming('stream-name', url=url) 25 | def processing(window): 26 | ... 27 | 28 | Notes: 29 | - the consumer(url=) argument receives the omegaml's data store url. If 30 | not given it defaults to env $MINIBATCH_MONGO_URL, reverting to $MONGO_URL. 31 | Typically specify as om.datasets.mongo_url 32 | - the start_streaming(..., on=om) argument receives the omegaml instance. 
33 | this is optional, if not given, import omegaml as om is used to create 34 | an instance from the environment 35 | 36 | Usage: 37 | A working example 38 | 39 | # in your app.py 40 | def create_app(server=None, uri=None, **kwargs): 41 | # this creates the flask app as the serverless app to run the stream 42 | app = StreamingApp(server=server, uri=uri) 43 | 44 | # add any route you like to have, if any 45 | @app.route('/') 46 | def index(): 47 | import omegaml as om 48 | return '{}'.format(om.datasets.get('test-stream')) 49 | 50 | # start it up 51 | app.server.register_blueprint(app) 52 | app.start_streaming(consumer) 53 | return app 54 | 55 | def consumer(url=None): 56 | # this creates the processing function attached to the 'test' stream 57 | # note consumer will be run on a sep process, processing will be run on a pool 58 | @streaming('test', url=url) 59 | def processing(window): 60 | import omegaml as om 61 | om.datasets.put(window.data, 'test-stream', raw=True) 62 | 63 | # to test locally 64 | if __name__ == '__main__': 65 | app = create_app(server=True) 66 | app.run() 67 | app.stop() 68 | 69 | # in some other process send data to the stream 70 | $ python 71 | [] import minibatch as mb 72 | import omegaml as om 73 | 74 | stream = mb.stream('test', url=om.defaults.OMEGA_MONGO_URL) 75 | stream.append(dict(data='foo')) 76 | 77 | # by refreshing on http://localhost:5000 you will see the data written to the stream 78 | """ 79 | 80 | def __init__(self, consumer=None, name=None, import_name=None, uri=None, server=None, **kwargs): 81 | name = name or 'streaming' 82 | import_name = import_name or __name__ 83 | super().__init__(name, import_name, url_prefix=uri, **kwargs) 84 | self.setup(server, import_name, consumer) 85 | 86 | def setup(self, server, import_name, consumer): 87 | if server is None or server is True: 88 | server = Flask(import_name) 89 | self.server = server 90 | self.consumer = consumer 91 | self.proc = None 92 | self.register_routes(self) 93 | 94 | def register_routes(self, app): 95 | @app.route('/status') 96 | def streaming_app_status(): 97 | return "running minibatch" 98 | 99 | def _start_consumer(self, consumer=None, on=None, debug=False): 100 | from minibatch import authenticated_url 101 | import omegaml as om 102 | 103 | consumer = consumer or self.consumer 104 | assert consumer is not None, "you must specify a consumer=function(url) to create the @streaming processor" 105 | 106 | om = on or om 107 | consumer_kwargs = dict(url=authenticated_url(om.defaults.OMEGA_MONGO_URL)) 108 | if debug: 109 | consumer(**consumer_kwargs) 110 | elif self.proc is None: 111 | self.proc = Process(target=consumer, 112 | kwargs=consumer_kwargs) 113 | self.proc.start() 114 | 115 | watcher = Thread(target=self.watcher) 116 | watcher.start() 117 | 118 | def watcher(self): 119 | while self.proc.is_alive(): 120 | sleep(.1) 121 | print("WARNING streaming processor has stopped, exitcode {}".format(self.proc.exitcode)) 122 | 123 | def _stop_consumer(self): 124 | if self.proc: 125 | self.proc.terminate() 126 | self.proc.join() 127 | 128 | def run_server(self, *args, **kwargs): 129 | return self.server.run(*args, **kwargs) 130 | 131 | # convenience 132 | run = run_server 133 | start = _start_consumer 134 | stop = _stop_consumer 135 | start_streaming = _start_consumer 136 | stop_streaming = _stop_consumer 137 | -------------------------------------------------------------------------------- /minibatch/contrib/celery.py: -------------------------------------------------------------------------------- 1 
| class CeleryEventSource: 2 | """ A CeleryEventSource 3 | 4 | This implements a Celery event listener that forwards task 5 | information to a minibatch stream 6 | 7 | Usage: 8 | # start consuming from celery 9 | celeryapp = celery.current_app # 10 | stream = mb.stream('test') 11 | source = CeleryEventSource(celeryapp) 12 | 13 | # stream consumer 14 | streaming('test')(lambda v: print(v)) 15 | 16 | The information forwarded is the dict returned by the 17 | CeleryEventSource.task_info method: 18 | 19 | { 20 | 'task_name': task.name, # the task's anme 21 | 'task_id': task.uuid, # the task's uuid 22 | 'task_info': task.info(), # the task info (kwargs, args, return value) 23 | 'task_state': task.state, # state 24 | 'task_runtime': task.runtime, # total runtime 25 | } 26 | 27 | Args: 28 | celeryapp (Celery.app): the celery application 29 | events (list): optional, list of events to process, defaults to the 30 | self.default_events 31 | 32 | See Also 33 | https://docs.celeryproject.org/en/latest/userguide/monitoring.html#real-time-processing 34 | """ 35 | default_events = ('task-succeeded', 'task-failed') 36 | 37 | def __init__(self, celeryapp, events=None): 38 | self.celeryapp = celeryapp 39 | self._stream = None 40 | self._state = None 41 | self._events = events if events is not None else self.default_events 42 | self._recv = None 43 | 44 | @property 45 | def handlers(self): 46 | state = self.state 47 | handlers = { 48 | '*': state.event 49 | } 50 | for ev in self._events: 51 | handlers[ev] = self._append 52 | return handlers 53 | 54 | def stream(self, stream): 55 | # adopted from https://docs.celeryproject.org/en/stable/userguide/monitoring.html#real-time-processing 56 | app = self.celeryapp 57 | self._stream = stream 58 | with app.connection() as connection: 59 | recv = self._recv = app.events.Receiver(connection, handlers=self.handlers) 60 | recv.capture(limit=None, timeout=None, wakeup=True) 61 | 62 | @property 63 | def state(self): 64 | if self._state is None: 65 | self._state = self.celeryapp.events.State() 66 | return self._state 67 | 68 | @property 69 | def recv(self): 70 | return self._recv 71 | 72 | def task_info(self, task, event): 73 | return { 74 | 'task_event': event, 75 | 'task_name': task.name, 76 | 'task_id': getattr(task, 'uuid'), 77 | 'task_info': task.info(), 78 | 'task_state': task.state, 79 | 'task_runtime': task.runtime, 80 | } 81 | 82 | def _append(self, event): 83 | state = self.state 84 | # process latest event 85 | state.event(event) 86 | if 'uuid' in event: 87 | # get task info 88 | task = state.tasks.get(event['uuid']) 89 | event = event.get('type') 90 | # append to stream 91 | self._stream.append(self.task_info(task, event)) 92 | else: 93 | self._stream.append(event) 94 | 95 | def cancel(self): 96 | recv = self.recv 97 | if recv is not None: 98 | recv.should_stop = True 99 | -------------------------------------------------------------------------------- /minibatch/contrib/kafka.py: -------------------------------------------------------------------------------- 1 | from json import loads, dumps 2 | 3 | from kafka import KafkaConsumer, KafkaProducer 4 | 5 | 6 | class KafkaSource: 7 | """ 8 | A kafka topic source 9 | 10 | Usage: 11 | # start consuming from Kafka 12 | stream = mb.stream('test') 13 | source = KafkaSource('kafka-topic', urls=['kafka:9092']) 14 | stream.attach(source) 15 | 16 | # stream to a python callable 17 | streaming('test')(lambda v: print(v)) 18 | 19 | Args: 20 | topic (str): the kafka topic 21 | urls (list): the kafka broker urls, defaults to 
localhost:9092 22 | **configs: the keyword parameters to use on the kafka.KafkaConsumer 23 | defaults to dict(bootstrap_servers=urls or ['localhost:9092'], 24 | auto_offset_reset='earliest', enable_auto_commit=True, 25 | group_id='group', 26 | value_deserializer=lambda x: loads(x.decode('utf-8')) 27 | """ 28 | 29 | def __init__(self, topic, urls=None, **configs): 30 | self.topic = topic 31 | if isinstance(urls, str): 32 | urls = [urls] 33 | self.configs = dict( 34 | bootstrap_servers=urls or ['localhost:9092'], 35 | auto_offset_reset='earliest', 36 | enable_auto_commit=True, 37 | group_id='group', 38 | value_deserializer=lambda x: loads(x.decode('utf-8')) 39 | ) 40 | self.configs.update(configs) 41 | self._consumer = None 42 | self._cancel = False 43 | 44 | @property 45 | def consumer(self): 46 | if self._consumer is None: 47 | self._consumer = KafkaConsumer(self.topic, **self.configs) 48 | return self._consumer 49 | 50 | def stream(self, stream): 51 | self._cancel = False 52 | for message in self.consumer: 53 | if self._cancel: 54 | break 55 | stream.append(message.value) 56 | 57 | def cancel(self): 58 | self._cancel = True 59 | 60 | 61 | class KafkaSink: 62 | """ 63 | A Kafka topic sink 64 | """ 65 | 66 | def __init__(self, topic, urls=None, expand=True, **configs): 67 | """ 68 | 69 | Args: 70 | topic: 71 | urls: 72 | expand: if True will send each input message seperately, if False 73 | will send all input messages as provided. 74 | **configs: 75 | """ 76 | self.topic = topic 77 | if isinstance(urls, str): 78 | urls = [urls] 79 | self.configs = dict( 80 | bootstrap_servers=urls or ['localhost:9092'], 81 | value_serializer=lambda x: dumps(x).encode('utf-8') 82 | ) 83 | self.configs.update(configs) 84 | self._producer = None 85 | self.expand = expand 86 | 87 | @property 88 | def producer(self): 89 | if self._producer is None: 90 | self._producer = KafkaProducer(**self.configs) 91 | return self._producer 92 | 93 | def put(self, message, topic=None): 94 | topic = topic or self.topic 95 | if self.expand: 96 | if not isinstance(message, (tuple, list)): 97 | message = [message] 98 | result = [self.producer.send(topic, value=m) for m in message] 99 | else: 100 | result = self.producer.send(topic, value=message) 101 | return result 102 | -------------------------------------------------------------------------------- /minibatch/contrib/mongodb.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | from time import sleep 3 | 4 | from minibatch import logger 5 | 6 | 7 | class MongoSource: 8 | """ 9 | A mongodb collection source 10 | 11 | Usage: 12 | # start consuming from mongo collection 13 | stream = mb.stream('test') 14 | source = MongoSource(collection) 15 | stream.attach(source) 16 | 17 | # stream to a python callable 18 | streaming('test')(lambda v: print(v)) 19 | 20 | Args: 21 | collection (pymongo.Collection): a mongo collection 22 | size (int): the number of new documents to fetch for each stream.append 23 | defaults to 1 24 | idcol (str): the name of the id column, defaults to _id 25 | delay (float): the wait time in seconds between change queries, defaults 26 | to .1 27 | 28 | Notes: 29 | * the collection must have a key column that is naturally ordered (ascending), 30 | that is a new records' key must be compare greater as any previous key 31 | 32 | * by default MongoSources uses the object id (_id column) as the sort order, 33 | because it is naturally increasing for each new inserted object. 
Specify 34 | as idcol='column name' 35 | 36 | * MongoSource implements a polling change observer that is executed on 37 | once every delay seconds, for every query it retrieves at most N=size 38 | messages. All messages are appended to the stream one by one. 39 | If you know that new messages arrive more or less frequently change 40 | either size or delay to optimize polling behavior. For example if 41 | messages arrive more frequently than .1 seconds but processing them 42 | in steps of > .1 seconds is ok, specify size=number of messages in each 43 | interval. If messages arrive less frequently than every .1 seconds, 44 | considering specifying a delay > .1 seconds to reduce the polling load 45 | on the database. 46 | 47 | * For a Mongo replicaset, use the MongoReplicasetSource instead. It uses 48 | the MongoDB native change stream instead of a polling change observer 49 | which is more efficient. 50 | """ 51 | 52 | def __init__(self, collection, size=1, idcol=None, delay=.1): 53 | self.collection = collection 54 | self._cancel = None 55 | self._lastid = None 56 | self._size = size 57 | self._idcol = '_id' 58 | self._delay = delay 59 | 60 | def changes(self, N=1): 61 | latest_id = None 62 | while not self._cancel: 63 | sortkey = { 64 | 'sort': [('_id', pymongo.ASCENDING)], 65 | } 66 | query = {} 67 | if latest_id is not None: 68 | query[self._idcol] = { 69 | '$gt': latest_id 70 | } 71 | docs = self.collection.find(query, **sortkey).limit(N) 72 | for doc in docs: 73 | latest_id = doc[self._idcol] 74 | yield doc 75 | sleep(self._delay) 76 | 77 | def stream(self, stream): 78 | self._cancel = False 79 | while not self._cancel: 80 | for doc in self.changes(N=self._size): 81 | if self._cancel: 82 | break 83 | stream.append(doc) 84 | logger.debug("stream done") 85 | 86 | def cancel(self): 87 | self._cancel = True 88 | 89 | 90 | class MongoSink: 91 | """ 92 | A mongodb collection sink 93 | """ 94 | def __init__(self, collection): 95 | self.collection = collection 96 | 97 | def put(self, messages): 98 | if isinstance(messages, dict): 99 | messages = [messages] 100 | return self.collection.insert_many(messages) 101 | 102 | 103 | class MongoReplicasetSource(MongoSource): 104 | def changes(self, N=1): 105 | criteria = [ 106 | {'$match': { 107 | 'operationType': 'insert' 108 | }} 109 | ] 110 | self._cancel = False 111 | docs = [] 112 | with self.collection.watch(criteria) as changes: 113 | for doc in changes: 114 | if self._cancel: 115 | break 116 | docs.append(doc) 117 | if len(docs) >= N: 118 | yield docs 119 | docs = [] 120 | -------------------------------------------------------------------------------- /minibatch/contrib/mqtt.py: -------------------------------------------------------------------------------- 1 | import json 2 | from uuid import uuid4 3 | 4 | import socket 5 | 6 | import paho.mqtt.client as mqtt 7 | from urllib.parse import urlparse 8 | 9 | 10 | class MQTTNode: 11 | # based on https://pypi.org/project/paho-mqtt/ 12 | mqtt_kind = 'node' 13 | 14 | def __init__(self, url, topic, **kwargs): 15 | self.topic = topic 16 | self.url = url 17 | self._stream = [] 18 | self._client = None 19 | self._client_kwargs = dict(kwargs) 20 | if 'client_id' not in kwargs: 21 | client_id = '{}.{}.{}'.format(socket.gethostname(), 22 | self.mqtt_kind, uuid4().hex) 23 | self._client_kwargs['client_id'] = client_id 24 | if 'clean_session' not in kwargs: 25 | self._client_kwargs['clean_session'] = True 26 | 27 | @property 28 | def client(self): 29 | if self._client is None: 30 | client = self._client = 
mqtt.Client(**self._client_kwargs) 31 | client.on_connect = self.on_connect 32 | client.on_message = self.on_message 33 | return self._client 34 | 35 | def on_connect(self, client, userdata, flags, rc): 36 | pass 37 | 38 | def on_message(self, client, userdata, message): 39 | message.payload = json.loads(message.payload) 40 | 41 | def connect(self): 42 | client = self.client 43 | parsed = urlparse(self.url) 44 | client.username_pw_set(parsed.username, parsed.password) 45 | client.connect(parsed.hostname, port=parsed.port or 1883) 46 | 47 | def disconnect(self): 48 | self.client.disconnect() 49 | 50 | def publish(self, message): 51 | self.connect() 52 | self.client.publish(self.topic, json.dumps(message)) 53 | 54 | 55 | class MQTTSource(MQTTNode): 56 | mqtt_kind = 'source' 57 | 58 | def on_connect(self, client, userdata, flags, rc): 59 | client.subscribe(self.topic) 60 | 61 | def on_message(self, client, userdata, message): 62 | super().on_message(client, userdata, message) 63 | stream_message = { 64 | 'topic': message.topic, 65 | 'payload': message.payload, 66 | 'qos': message.qos, 67 | 'retain': message.retain 68 | } 69 | self._stream.append(stream_message) 70 | 71 | def stream(self, stream): 72 | self._stream = stream 73 | self.connect() 74 | self.client.loop_forever() 75 | 76 | def cancel(self): 77 | self.client.disconnect() 78 | 79 | 80 | class MQTTSink(MQTTNode): 81 | mqtt_kind = 'sink' 82 | 83 | def put(self, message): 84 | self.publish(message) 85 | -------------------------------------------------------------------------------- /minibatch/contrib/omegaml.py: -------------------------------------------------------------------------------- 1 | from minibatch.contrib.mongodb import MongoSource, MongoSink 2 | 3 | 4 | class DatasetSource: 5 | """ 6 | A omegaml dataset source 7 | 8 | Usage: 9 | # start consuming from dataset 10 | stream = mb.stream('test') 11 | source = DatasetSource(om, 'mydataset') 12 | stream.attach(source) 13 | 14 | # stream to a python callable 15 | streaming('test')(lambda v: print(v)) 16 | 17 | Args: 18 | om (Omega): the omega instance 19 | dataset (str): the dataset name 20 | **source_kwargs: kwargs to the MongoSource 21 | 22 | Notes: 23 | 24 | * om.datasets is used as the interface to the omegaml analytics store, 25 | backed by MongoDB. Thus the actual source is MongoSource 26 | 27 | * if the dataset does not exist it will be created as a MongoDB 28 | native collection 29 | """ 30 | 31 | def __init__(self, om, dataset, **source_kwargs): 32 | self.om = om 33 | self._dataset_name = dataset 34 | self._source = None 35 | self._source_kwargs = source_kwargs 36 | # ensure the source dataset exists 37 | assert self.source is not None 38 | 39 | @property 40 | def source(self): 41 | om = self.om 42 | if self._source is None: 43 | coll = om.datasets.collection(self._dataset_name) 44 | if om.datasets.metadata(self._dataset_name) is None: 45 | # create a collection-linked dataset if it does not exist yet 46 | om.datasets.put(coll, self._dataset_name) 47 | self._source = MongoSource(coll, **self._source_kwargs) 48 | return self._source 49 | 50 | def stream(self, stream): 51 | self.source.stream(stream) 52 | 53 | def cancel(self): 54 | self.source.cancel() 55 | 56 | 57 | class DatasetSink: 58 | """ 59 | A omegaml dataset sink 60 | 61 | Usage: 62 | sink = DatasetSink(om, 'mydataset') 63 | @streaming('stream', sink=ink) 64 | def process(window): 65 | ... 
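            # a hedged sketch of the processing body: window.data holds the
            # buffered messages as a list of dicts; results forwarded by the
            # emitter are written to 'mydataset' via the underlying MongoSink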
66 | 67 | Args: 68 | om (Omega): the Omega instance 69 | dataset (str): the name of the dataset 70 | **sink_kwargs (dict): the kwargs to the MongoSink 71 | 72 | Notes: 73 | 74 | * om.datasets is used as the interface to the omegaml analytics store, 75 | backed by MongoDB. Thus the actual sink is MongoSink 76 | 77 | * if the dataset does not exist it will be created as a MongoDB native 78 | collection 79 | """ 80 | 81 | def __init__(self, om, dataset, **sink_kwargs): 82 | self.om = om 83 | self._dataset_name = dataset 84 | self._sink = None 85 | self._sink_kwargs = sink_kwargs 86 | 87 | @property 88 | def sink(self): 89 | om = self.om 90 | if self._sink is None: 91 | coll = om.datasets.collection(self._dataset_name) 92 | if om.datasets.metadata(self._dataset_name) is None: 93 | # create a collection-linked dataset if it does not exist yet 94 | om.datasets.put(coll, self._dataset_name) 95 | self._sink = MongoSink(coll) 96 | return self._sink 97 | 98 | def put(self, messages): 99 | self.sink.put(messages) 100 | -------------------------------------------------------------------------------- /minibatch/emitter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omegaml/minibatch/82775bebef485d91da343f587b5484c6858424dd/minibatch/emitter/__init__.py -------------------------------------------------------------------------------- /minibatch/emitter/base.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import Future, ProcessPoolExecutor 2 | 3 | import datetime 4 | import logging 5 | 6 | from minibatch import Stream, logger 7 | from minibatch.marshaller import SerializableFunction, MinibatchFuture 8 | from minibatch.models import Buffer 9 | 10 | 11 | class Emitter(object): 12 | """ 13 | the basic emitter 14 | 15 | Emitter.run() implements the generic emitter protocol as follows: 16 | 17 | 1. determine if a window is ready to be processed 18 | 2. retrieve the data from the buffer to create a Window 19 | 3. process the data (i.e. mark the buffered data processed) 20 | 4. run the emit function on the window 21 | 5. commit (emit function successful) or undo (exception raised) 22 | 23 | Note that run() is blocking. Between running the protocol, 24 | it will sleep to conserve resources. 25 | 26 | Each time run() wakes up, it will call the following methods in turn: 27 | 28 | ready() - called to determine if the buffer contains new data 29 | query() - return the Buffer objects to process 30 | process() - process the data 31 | emit() - emit a Window 32 | timestamp() - timestamp the stream for the next processing 33 | commit() - commit processed data back to the buffer. by 34 | default this means removing the objects from the 35 | buffer and deleting the window. 36 | sleep() - sleep until the next round 37 | 38 | Use timestamp() to mark the stream (or the buffer data) for the next 39 | round. Use sleep() to set the amount of time to sleep. Depending on 40 | the emitter's semantics this may be a e.g. a fixed interval or some 41 | function of the data. 42 | 43 | Emitter implements several defaults: 44 | 45 | process() - mark all data returned by query() as processed 46 | sleep() - sleep self.interval / 2 seconds 47 | undo() - called if the emit function raises an exception. 
marks 48 | the data returned by query() as not processed and deletes 49 | the window 50 | 51 | For examples of how to implement a custom emitter see TimeWindow, 52 | CountWindow and SampleFunctionWindow. 53 | 54 | Note there should only be one WindowEmitter per stream. This is a 55 | a limitation of the Buffer's way of marking documentes as processed 56 | (a boolean flag). This decision was made in favor of performance and 57 | simplicity. Supporting concurrent emitters would mean each Buffer object 58 | needs to keep track of which emitter has processed its data and make 59 | sure Window objects are processed by exactly one emitter. 60 | """ 61 | 62 | def __init__(self, stream_name, processfn=None, 63 | emitfn=None, emit_empty=False, executor=None, 64 | max_workers=None, stream=None, stream_url=None, 65 | forwardfn=None): 66 | self.stream_name = stream_name 67 | self.emit_empty = emit_empty 68 | self.emitfn = emitfn 69 | self.processfn = processfn 70 | self.executor = (executor or ProcessPoolExecutor(max_workers=max_workers)) 71 | self._stream = stream 72 | self._stream_url = stream_url 73 | self._delete_on_commit = True 74 | self._forwardfn = forwardfn 75 | self._stop = False 76 | 77 | def query(self, *args): 78 | now, last_read = args 79 | last_read, max_read = args 80 | fltkwargs = dict(stream=self.stream_name, 81 | created__gte=last_read, created__lte=now) 82 | return Buffer.objects.no_cache().filter(**fltkwargs) 83 | 84 | def ready(self): 85 | """ return a tuple of (ready, qargs) """ 86 | stream = self.stream 87 | last_read = stream.last_read 88 | now = datetime.datetime.utcnow() 89 | return now > last_read, (now, last_read) 90 | 91 | def timestamp(self, query_args): 92 | self.stream.modify(query={}, last_read=datetime.datetime.utcnow()) 93 | 94 | @property 95 | def stream(self): 96 | if self._stream: 97 | return self._stream 98 | self._stream = Stream.get_or_create(self.stream_name, 99 | url=self._stream_url) 100 | return self._stream 101 | 102 | def process(self, qs): 103 | if self.processfn: 104 | return self.processfn(qs) 105 | data = [] 106 | for obj in qs: 107 | obj.modify(processed=True) 108 | data.append(obj) 109 | return data 110 | 111 | def undo(self, qs, data): 112 | for obj in qs: 113 | obj.modify(processed=False) 114 | return qs 115 | 116 | def persist(self, flag=True): 117 | self._delete_on_commit = not flag 118 | 119 | def commit(self, qs, data): 120 | if self._delete_on_commit: 121 | for obj in qs: 122 | obj.delete() 123 | 124 | def emit(self, qs): 125 | data = list(doc for doc in qs) 126 | return self._run_emitfn(data) 127 | 128 | def _run_emitfn(self, data): 129 | if self.emitfn: 130 | logging.debug("calling emitfn") 131 | try: 132 | sjob = SerializableFunction(self.emitfn, data) 133 | future = self.executor.submit(sjob) 134 | except Exception: 135 | raise 136 | else: 137 | future = Future() 138 | future.set_result(data) 139 | future = MinibatchFuture(future, data=data) 140 | return future 141 | 142 | def forward(self, window): 143 | if self._forwardfn: 144 | self._forwardfn(window.data) 145 | 146 | def sleep(self): 147 | import time 148 | time.sleep(.01) 149 | 150 | def stop(self): 151 | self._stop = True 152 | 153 | def run(self): 154 | while not self._stop: 155 | logger.debug("testing window ready") 156 | ready, query_args = self.ready() 157 | if ready: 158 | logger.debug("data ready") 159 | qs = self.query(*query_args) 160 | qs = self.process(qs) 161 | # note self.emit is usin an async executor 162 | # that returns a future 163 | if qs or self.emit_empty: 164 | 
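                    # an empty window is emitted only when emit_empty=True;
                    # qs and query_args are attached to the future below so
                    # that emit_done() can commit, undo and timestamp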
logger.debug("Emitting") 165 | future = self.emit(qs) 166 | logger.debug("got future {}".format(future)) 167 | future['qs'] = qs 168 | future['query_args'] = query_args 169 | 170 | def emit_done(future): 171 | # this is called once upon future resolves 172 | future = MinibatchFuture(future) 173 | logger.debug("emit done {}".format(future)) 174 | qs = future.qs 175 | data = future.data 176 | query_args = future.query_args 177 | try: 178 | data = future.result() or data 179 | except Exception: 180 | self.undo(qs, data) 181 | else: 182 | self.commit(qs, data) 183 | self.forward(data) 184 | finally: 185 | self.timestamp(*query_args) 186 | self.sleep() 187 | 188 | future.add_done_callback(emit_done) 189 | logger.debug("sleeping") 190 | self.sleep() 191 | logger.debug("awoke") 192 | self.executor.shutdown(wait=True) 193 | -------------------------------------------------------------------------------- /minibatch/example/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omegaml/minibatch/82775bebef485d91da343f587b5484c6858424dd/minibatch/example/__init__.py -------------------------------------------------------------------------------- /minibatch/example/basic/__init__.py: -------------------------------------------------------------------------------- 1 | # some worker's process function 2 | 3 | from minibatch import connectdb, Stream, streaming 4 | from minibatch.example.util import clean 5 | 6 | 7 | def consumer(): 8 | # process window.data. maybe split processing in parallel... whatever 9 | # @stream('test', size=2, emitter=SampleFunctionWindow) 10 | # @stream('test', interval=5) 11 | # @stream('test', interval=5, relaxed=False, keep=True) 12 | @streaming('test', size=5, keep=True) 13 | def myprocess(window): 14 | try: 15 | db = connectdb(alias='consumer') 16 | print("consuming ... {}".format(window.data)) 17 | db.processed.insert_one({'data': window.data or {}}) 18 | except Exception as e: 19 | print(e) 20 | return window 21 | 22 | 23 | # some producer 24 | def producer(data): 25 | import os 26 | import time 27 | import random 28 | # sleep to simulate multiple time windows 29 | time.sleep(random.randrange(0, 1, 1) / 10.0) 30 | data.update({'pid': os.getpid()}) 31 | connectdb(alias='producer') 32 | stream_name = 'test' 33 | stream = Stream.get_or_create(stream_name) 34 | print("producing ... 
{}".format(data)) 35 | stream.append(data) 36 | 37 | 38 | def main(): 39 | from multiprocessing import Pool, Process 40 | import time 41 | 42 | clean() 43 | emitp = Process(target=consumer) 44 | emitp.start() 45 | pool = Pool(4) 46 | data = [{'value': i} for i in range(0, 100)] 47 | pool.map(producer, data, 1) 48 | time.sleep(5) 49 | emitp.terminate() 50 | db = connectdb() 51 | print("processed items:") 52 | print(list(doc for doc in db.processed.find())) 53 | -------------------------------------------------------------------------------- /minibatch/example/basic/__main__.py: -------------------------------------------------------------------------------- 1 | from minibatch.example.basic import main 2 | 3 | if __name__ == '__main__': 4 | main() 5 | -------------------------------------------------------------------------------- /minibatch/example/kafka/__init__.py: -------------------------------------------------------------------------------- 1 | from minibatch.contrib.kafka import KafkaSink, KafkaSource 2 | 3 | 4 | def producer(url): 5 | sink = KafkaSink('test', urls=url) 6 | sink.put(dict(foo='bar')) 7 | 8 | def consumer(url): 9 | source = KafkaSource('test-result', urls=url) 10 | for data in source.consumer: 11 | print(data) 12 | -------------------------------------------------------------------------------- /minibatch/example/mongodb/__init__.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from multiprocessing import Process 3 | 4 | from time import sleep 5 | 6 | from minibatch import connectdb, streaming, stream 7 | from minibatch.contrib.mongodb import MongoSource, MongoSink 8 | from minibatch.example.util import clean 9 | 10 | 11 | def consumer(): 12 | @streaming('test', size=1, keep=True) 13 | def myprocess(window): 14 | try: 15 | db = connectdb(alias='consumer') 16 | print("consuming ... 
{}".format(window.data)) 17 | db.processed.insert_one({'data': window.data or {}}) 18 | except Exception as e: 19 | print(e) 20 | return window 21 | 22 | 23 | def main(): 24 | print("setting up") 25 | clean() 26 | # setup mqtt source and producer 27 | url = 'mongodb://localhost/test' 28 | db = connectdb(url=url) 29 | source_coll = db['source'] 30 | sink_coll = db['processed'] 31 | source = MongoSource(source_coll) 32 | producer = MongoSink(sink_coll) 33 | # attach to the stream 34 | s = stream('test') 35 | s.attach(source) 36 | # set up a streaming function 37 | emitp = Process(target=consumer) 38 | emitp.start() 39 | # publish some messages 40 | print("publishing messages") 41 | for i in range(10): 42 | producer.put(dict(foo='bar', time=datetime.utcnow().isoformat())) 43 | sleep(.1) 44 | # check we got the messages 45 | print("wait to receive all messages") 46 | sleep(3) 47 | docs = list(doc for doc in sink_coll.find()) 48 | print("processed items:", len(docs)) 49 | print(docs) 50 | emitp.terminate() 51 | -------------------------------------------------------------------------------- /minibatch/example/mongodb/__main__.py: -------------------------------------------------------------------------------- 1 | from minibatch.example.mongodb import main 2 | 3 | if __name__ == '__main__': 4 | main() 5 | -------------------------------------------------------------------------------- /minibatch/example/mqtt/__init__.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process 2 | 3 | from datetime import datetime 4 | from time import sleep 5 | 6 | from minibatch import connectdb, streaming, stream 7 | from minibatch.contrib.mqtt import MQTTSource, MQTTSink 8 | from minibatch.example.util import clean 9 | 10 | 11 | def consumer(): 12 | @streaming('test', size=1, keep=True) 13 | def myprocess(window): 14 | try: 15 | db = connectdb(alias='consumer') 16 | print("consuming ... 
{}".format(window.data)) 17 | db.processed.insert_one({'data': window.data or {}}) 18 | except Exception as e: 19 | print(e) 20 | return window 21 | 22 | 23 | def main(): 24 | print("setting up") 25 | clean() 26 | # setup mqtt source and producer 27 | mqtt_broker = 'mqtt://rabbitmq:rabbitmq@localhost' 28 | topic = 'TEST/MESSAGE' 29 | source = MQTTSource(mqtt_broker, topic) 30 | producer = MQTTSink(mqtt_broker, topic) 31 | # attach to the stream 32 | s = stream('test') 33 | s.attach(source) 34 | # set up a streaming function 35 | emitp = Process(target=consumer) 36 | emitp.start() 37 | # publish some messages 38 | print("publishing messages") 39 | for i in range(10): 40 | producer.put(dict(foo='bar', time=datetime.utcnow().isoformat())) 41 | sleep(.1) 42 | # check we got the messages 43 | print("wait to receive all messages") 44 | sleep(3) 45 | db = connectdb() 46 | docs = list(doc for doc in db.processed.find()) 47 | print("processed items:", len(docs)) 48 | print(docs) 49 | emitp.terminate() 50 | source.disconnect() 51 | producer.disconnect() 52 | -------------------------------------------------------------------------------- /minibatch/example/mqtt/__main__.py: -------------------------------------------------------------------------------- 1 | from minibatch.example.mqtt import main 2 | 3 | if __name__ == '__main__': 4 | main() 5 | -------------------------------------------------------------------------------- /minibatch/example/util.py: -------------------------------------------------------------------------------- 1 | from mongoengine import disconnect 2 | 3 | from minibatch import connectdb 4 | 5 | 6 | def clean(): 7 | db = connectdb() 8 | db.drop_collection('buffer') 9 | db.drop_collection('stream') 10 | db.drop_collection('window') 11 | db.drop_collection('processed') 12 | disconnect('minibatch') 13 | -------------------------------------------------------------------------------- /minibatch/marshaller.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import base64 3 | 4 | import dill 5 | 6 | 7 | class FunctionMarshaller(object): 8 | """ 9 | Serialize/deserialize any function or BatchCallable 10 | * serialize any function and its arguments for transport, called a job 11 | * deserialize a job back to a function and its arguments 12 | inspired by https://medium.com/@emlynoregan/serialising-all-the-functions-in-python-cd880a63b591 # noqa 13 | """ 14 | 15 | def _serialize_anyfunc(self, func, args, kwargs): 16 | # create serialized functions 17 | sfunc = dill.dumps(func, protocol=2) 18 | kind = 'object' 19 | # serialize arguments 20 | sargs = dill.dumps(args, protocol=2) 21 | skwargs = dill.dumps(kwargs, protocol=2) 22 | # create transportable representation 23 | sfunc = base64.encodebytes(sfunc).decode('latin1') 24 | sargs = base64.encodebytes(sargs).decode('latin1') 25 | skwargs = base64.encodebytes(skwargs).decode('latin1') 26 | return kind, sfunc, sargs, skwargs 27 | 28 | def serialize(self, func, *args, **kwargs): 29 | """ 30 | serialize a function object into transportable format 31 | """ 32 | if hasattr(func, 'items'): 33 | func.items = [self._serialize_anyfunc( 34 | ifunc, args, kwargs) for ifunc, args, kwargs in func.items] 35 | funcjob = self._serialize_anyfunc(func, None, None) 36 | else: 37 | funcjob = self._serialize_anyfunc(func, args, kwargs) 38 | return funcjob 39 | 40 | def _deserialize_anyfunc(self, kind, sfunc, sargs, skwargs): 41 | # unwrap 42 | sfunc = 
base64.decodebytes(sfunc.encode('latin1')) 43 | sargs = base64.decodebytes(sargs.encode('latin1')) 44 | skwargs = base64.decodebytes(skwargs.encode('latin1')) 45 | func = dill.loads(sfunc) 46 | args = dill.loads(sargs) 47 | kwargs = dill.loads(skwargs) 48 | return func, args, kwargs 49 | 50 | def deserialize(self, sjob): 51 | # decode serialized arguments to get back a function 52 | kind, sfunc, sargs, skwargs = sjob 53 | func, args, kwargs = self._deserialize_anyfunc( 54 | kind, sfunc, sargs, skwargs) 55 | # resolve functions 56 | if hasattr(func, 'items'): 57 | func.items = [ 58 | self._deserialize_anyfunc(kind, sfunc, sargs, skwargs) 59 | for kind, sfunc, sargs, skwargs in func.items] 60 | if args: 61 | func = partial(func, *args) 62 | if kwargs: 63 | func = partial(func, **kwargs) 64 | return func 65 | 66 | 67 | class SerializableFunction: 68 | # a function object that is fully serializable by pickle 69 | # and callable by a remote process 70 | def __init__(self, fn, *args, **kwargs): 71 | from minibatch.marshaller import FunctionMarshaller 72 | self.sjob = FunctionMarshaller().serialize(fn, *args, **kwargs) 73 | 74 | def __call__(self): 75 | from minibatch.marshaller import FunctionMarshaller 76 | 77 | func = FunctionMarshaller().deserialize(self.sjob) 78 | return func() 79 | 80 | 81 | class MinibatchFuture: 82 | # a type-safe future with custom attributes 83 | # we could just as well store future._mb_data directly 84 | # but this is nicer code-wise 85 | def __init__(self, future, **data): 86 | self.future = future 87 | if hasattr(future, '_mb_data'): 88 | # we have seen this future before, reuse data dict 89 | self.data = future._mb_data 90 | else: 91 | # new future, add our data dict 92 | self.data = dict(data) 93 | self.future._mb_data = self.data 94 | 95 | def __getattr__(self, k): 96 | # if we know of the attribute as a key in our data, use it 97 | # else return the future's attribute 98 | if k in self.data: 99 | return self.data[k] 100 | return getattr(self.future, k) 101 | 102 | def __setitem__(self, k, v): 103 | # update our data 104 | self.data[k] = v 105 | -------------------------------------------------------------------------------- /minibatch/models.py: -------------------------------------------------------------------------------- 1 | from logging import warning 2 | 3 | import datetime 4 | import logging 5 | import threading 6 | from mongoengine import Document 7 | from mongoengine.errors import NotUniqueError 8 | from mongoengine.fields import (StringField, IntField, DateTimeField, 9 | ListField, DictField, BooleanField) 10 | from threading import Thread 11 | from uuid import uuid4 12 | 13 | STATUS_INIT = 'initialize' 14 | STATUS_OPEN = 'open' 15 | STATUS_CLOSED = 'closed' 16 | STATUS_PROCESSED = 'processed' 17 | STATUS_FAILED = 'failed' 18 | STATUS_CHOICES = (STATUS_OPEN, STATUS_CLOSED, STATUS_FAILED) 19 | 20 | # we don't propagate this logger to avoid logging housekeeping messages unless requested 21 | hk_logger = logging.getLogger(__name__ + '.housekeeping') 22 | hk_logger.propagate = False 23 | 24 | 25 | class Batcher: 26 | """ A batching list-like 27 | 28 | This will batch up to batchsize items in an internal array of objects. To see if 29 | is full, check batcher.is_full. Note the internal array is only constrained by memory. 30 | 31 | To increase performance, Batcher will allocate an empty array of batchsize elements 32 | at creation time, or when setting .batchsize. If the array becomes too small, it will 33 | be increased by 10%. 
Note it will not be shrunk unless you call .clear(reset=True) 34 | 35 | Usage: 36 | batch = Batcher(batchsize=N) 37 | 38 | while True: 39 | batch.add(doc) 40 | if batch.is_full: 41 | process items 42 | batch.clear() 43 | """ 44 | 45 | def __init__(self, batchsize=1): 46 | self._batch = [] 47 | self.head = 0 48 | self.batchsize = batchsize 49 | 50 | @property 51 | def is_full(self): 52 | return self.head > self.batchsize + 1 53 | 54 | def add(self, doc): 55 | # protect against buffer overflow 56 | # update batch 57 | self._batch[self.head] = dict(doc) 58 | self.head += 1 59 | if len(self._batch) <= self.head: 60 | self._batch.extend([None] * int(self.batchsize * .1)) 61 | 62 | def clear(self, reset=False): 63 | # reset batch 64 | self.head = 0 65 | if reset: 66 | self.batchsize = self.batchsize 67 | 68 | @property 69 | def batchsize(self): 70 | return self._batchsize 71 | 72 | @batchsize.setter 73 | def batchsize(self, v): 74 | # pre-allocate batch array 75 | self._batch = [None] * (v + 10) # avoid eager extension on first fill 76 | self._batchsize = v 77 | self.clear() 78 | 79 | @property 80 | def batch(self): 81 | return self._batch[0:self.head] 82 | 83 | 84 | class ImmediateWriter: 85 | @classmethod 86 | def write(cls, doc, batcher=None): 87 | """ 88 | this does a fast, unchecked insert_one(), or insert_many() for batched 89 | 90 | No validation is done whatsoever. Only use this if you know what you are doing. This is 91 | 250x times faster than Document.save() at the cost of not validating the documents. 92 | 93 | Args: 94 | doc (dict): the actual mongodb document to be written 95 | 96 | Returns: 97 | 98 | """ 99 | cls: (ImmediateWriter, Document) 100 | if batcher is None: 101 | cls._get_collection().insert_one(doc) 102 | else: 103 | batcher.add(doc) 104 | if batcher.is_full: 105 | cls.flush(batcher) 106 | 107 | @classmethod 108 | def flush(cls, batcher=None): 109 | cls: (ImmediateWriter, Document) 110 | if batcher is not None: 111 | # the iterable is to avoid duplicate objects (batch op errors) 112 | cls._get_collection().insert_many(dict(d) for d in batcher.batch) 113 | batcher.clear() 114 | 115 | 116 | class Window(ImmediateWriter, Document): 117 | """ 118 | A Window is the data collected from a stream according 119 | to the WindowEmitter strategy. 120 | """ 121 | stream = StringField(required=True) 122 | created = DateTimeField(default=datetime.datetime.utcnow) 123 | data = ListField(default=[]) 124 | processed = BooleanField(default=False) 125 | query = ListField(default=[]) 126 | meta = { 127 | 'db_alias': 'minibatch', 128 | 'strict': False, # support previous releases 129 | 'indexes': [ 130 | 'created', 131 | 'stream', 132 | ] 133 | } 134 | 135 | def __unicode__(self): 136 | return u"Window [%s] %s" % (self.created, self.data) 137 | 138 | 139 | class Buffer(ImmediateWriter, Document): 140 | stream = StringField(required=True) 141 | created = DateTimeField(default=datetime.datetime.utcnow) 142 | data = DictField(required=True) 143 | processed = BooleanField(default=False) 144 | meta = { 145 | 'db_alias': 'minibatch', 146 | 'strict': False, # support previous releases 147 | 'indexes': [ 148 | 'created', 149 | 'stream', 150 | ] 151 | } 152 | 153 | def __unicode__(self): 154 | return u"Buffer created=[%s] processed=%s data=%s" % (self.created, self.processed, self.data) 155 | 156 | 157 | class Stream(Document): 158 | """ 159 | Stream provides meta data for a streaming buffer 160 | 161 | Streams are synchronized among multiple Stream clients using last_read. 
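    Usage (a minimal sketch; the url and stream name are placeholders):

        stream = Stream.get_or_create('sensor', url='mongodb://localhost/test')
        stream.append({'value': 42})   # write a message to the Buffer
        stream.buffer()                # query this stream's buffered messages
        stream.stop()                  # stop source and housekeeping threads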
162 | """ 163 | name = StringField(default=lambda: uuid4().hex, required=True) 164 | status = StringField(choices=STATUS_CHOICES, default=STATUS_INIT) 165 | created = DateTimeField(default=datetime.datetime.utcnow) 166 | closed = DateTimeField(default=None) 167 | # interval in seconds or count in #documents 168 | interval = IntField(default=10) 169 | last_read = DateTimeField(default=datetime.datetime.utcnow) 170 | meta = { 171 | 'db_alias': 'minibatch', 172 | 'strict': False, # support previous releases 173 | 'indexes': [ 174 | 'created', # most recent is last, i.e. [-1] 175 | {'fields': ['name'], 176 | 'unique': True 177 | } 178 | ] 179 | } 180 | 181 | def __init__(self, *args, batchsize=1, max_age=None, **kwargs): 182 | super().__init__(*args, **kwargs) 183 | self._batcher = None 184 | self._url = None 185 | self._cnx_kwargs = None 186 | self._stream_source = None 187 | self.batchsize = batchsize 188 | self._max_age = max_age 189 | self.ensure_initialized() 190 | self.autoclear(max_age) 191 | 192 | def ensure_initialized(self): 193 | if self.status == STATUS_INIT: 194 | self.modify({'status': STATUS_INIT}, 195 | status=STATUS_OPEN) 196 | 197 | @property 198 | def batchsize(self): 199 | return self._batcher.batchsize if self._batcher else 1 200 | 201 | @batchsize.setter 202 | def batchsize(self, size): 203 | if size > 1: 204 | self._batcher = self._batcher or Batcher(batchsize=size) 205 | self._batcher.batchsize = size 206 | else: 207 | self._batcher = None 208 | 209 | def append(self, data): 210 | t = datetime.datetime.utcnow() 211 | Buffer.write(dict(stream=self.name, data=data or {}, processed=False, created=t), batcher=self._batcher) 212 | 213 | def flush(self): 214 | Buffer.flush(self._batcher) 215 | 216 | def clear(self): 217 | Buffer.objects.no_cache().filter(**{'stream': self.name}).delete() 218 | 219 | def attach(self, source, background=True): 220 | """ 221 | use an external producer to start streaming 222 | """ 223 | self._stream_source = source 224 | if not background: 225 | return source.stream(self) 226 | self._start_source(source) 227 | 228 | def stop(self): 229 | """ stop the stream 230 | 231 | Stops the source and housekeeping threads (if any). 232 | 233 | Returns: 234 | None 235 | """ 236 | self._stop_source() 237 | self._stop_housekeeping() 238 | 239 | @classmethod 240 | def get_or_create(cls, name, url=None, interval=None, batchsize=1, max_age=None, 241 | **kwargs): 242 | """ get or create a stream 243 | 244 | Args: 245 | name (str): the name of the stream 246 | url (str): the database url 247 | interval (float): the interval in seconds, or a fraction thereof, 248 | DEPRECATED 249 | batchsize (int): the batch size, DEPRECATED 250 | max_age (int|dict): the maximum age of the data in seconds, or as a dict 251 | to datetime.timedelta(**kwargs). Specifies the interval to run the 252 | housekeeping thread and the maximum age of data in the buffer, relative 253 | to the created timestamp. If None, the housekeeping thread is stopped. 
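                Example (illustrative values): max_age=60 runs housekeeping
                roughly every 60 seconds and deletes buffered data created
                more than 60 seconds ago; max_age=dict(minutes=5) is passed
                as datetime.timedelta(**max_age).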
254 | 255 | Returns: 256 | Stream: the stream object 257 | """ 258 | # critical section 259 | # this may fail in concurrency situations 260 | from minibatch import connectdb 261 | try: 262 | connectdb(alias='minibatch', url=url, **kwargs) 263 | except Exception as e: 264 | warning("Stream setup resulted in {} {}".format(type(e), str(e))) 265 | try: 266 | stream = Stream.objects(name=name).no_cache().get() 267 | except Stream.DoesNotExist: 268 | try: 269 | stream = Stream(name=name or uuid4().hex, 270 | interval=interval, 271 | status=STATUS_OPEN).save() 272 | except NotUniqueError: 273 | pass 274 | stream = Stream.objects(name=name).no_cache().get() 275 | stream._url = url 276 | stream._cnx_kwargs = kwargs 277 | stream.batchsize = batchsize 278 | stream._max_age = max_age 279 | stream.autoclear(max_age) 280 | return stream 281 | 282 | def buffer(self, **kwargs): 283 | self.flush() 284 | return Buffer.objects.no_cache().filter(**{'stream': self.name, **kwargs}) 285 | 286 | def window(self, **kwargs): 287 | self.flush() 288 | return Window.objects.no_cache().filter(**{'stream': self.name, **kwargs}) 289 | 290 | def streaming(self, fn=None, **kwargs): 291 | """ returns a streaming function 292 | 293 | Args: 294 | fn (callable): optional, a window function. If not 295 | specified the streaming function is returned as a decorator 296 | to an actual window function. 297 | **kwargs: kwargs passed to minibatch.streaming() 298 | 299 | Returns: 300 | fn (callable): the streaming function 301 | """ 302 | from minibatch import streaming as _base_streaming 303 | return _base_streaming(self.name, fn=fn, url=self._url, cnx_kwargs=self._cnx_kwargs, **kwargs) 304 | 305 | @property 306 | def source(self): 307 | return self._stream_source 308 | 309 | def autoclear(self, max_age=None): 310 | # specify max_age in seconds or as a dict to timedelta(**kwargs) 311 | # None means never clear 312 | self._max_age = max_age if max_age is not None else self._max_age 313 | self._start_housekeeping() 314 | 315 | def _housekeeping(self): 316 | while self._max_age: 317 | max_age = self._max_age 318 | if not isinstance(max_age, dict): 319 | max_age = dict(seconds=max_age) 320 | earliest = datetime.datetime.utcnow() - datetime.timedelta(**max_age) 321 | try: 322 | count = Buffer.objects.no_cache().filter(**{'stream': self.name, 'created__lte': earliest}).delete() 323 | except Exception as e: 324 | hk_logger.warning(f"housekeeping for stream {self.name} failed: {e}") 325 | else: 326 | hk_logger.info(f"housekeeping for stream {self.name}: deleted {count} objects earlier than {earliest}") 327 | # effectively we keep at most 2x _max_age periods of data 328 | # -- example: max_age = 10 329 | # t: ---------+---------+---------+ 330 | # 0 10 20 30 331 | # dddddddddd (d = data) 332 | # * _housekeeping runs, deletes before t0 333 | # dddddddddddddddddddd 334 | # * _housekeeping runs, deletes before t10 335 | # xxxxxxxxxxdddddddddd (x = deleted) 336 | # dddddddddddddddddddd 337 | # * _housekeeping runs, deletes before t20 338 | # xxxxxxxxxxddddddddddd (x = deleted) 339 | 340 | # we use this instead of sleep() to allow for a quick stop() 341 | # -- using sleep() means the thread waits up to max_age time (which could be very long, days, months, etc) 342 | # -- using Event.wait() means the thread waits up to max_age time, but can be stopped immediately by 343 | # setting the event 344 | # -- see https://stackoverflow.com/a/42710697/890242 345 | # TODO refactor this into a context manager or decorator so we can easily reuse it 346 | if 
self._housekeeping_stop_ev.wait(timeout=datetime.timedelta(**max_age).total_seconds()): 347 | break 348 | hk_logger.debug(f"housekeeping for stream {self.name} stopped") 349 | 350 | def _start_source(self, source): 351 | try: 352 | self._source_thread = t = Thread(target=source.stream, 353 | args=(self,)) 354 | t.start() 355 | except (KeyboardInterrupt, SystemExit): 356 | self.stop() 357 | 358 | def _stop_source(self): 359 | # stop source 360 | source = getattr(self, '_stream_source', None) 361 | if source: 362 | source.cancel() 363 | 364 | def _start_housekeeping(self): 365 | try: 366 | self._housekeeping_stop_ev = threading.Event() 367 | self._housekeeping_thread = t = Thread(target=self._housekeeping) 368 | t.start() 369 | except (KeyboardInterrupt, SystemExit): 370 | self._stop_housekeeping() 371 | 372 | def _stop_housekeeping(self): 373 | self._max_age = None 374 | self._housekeeping_stop_ev.set() 375 | -------------------------------------------------------------------------------- /minibatch/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omegaml/minibatch/82775bebef485d91da343f587b5484c6858424dd/minibatch/tests/__init__.py -------------------------------------------------------------------------------- /minibatch/tests/test_celery.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from uuid import uuid4 3 | 4 | from contextlib import contextmanager 5 | from unittest import TestCase 6 | 7 | from unittest.mock import MagicMock 8 | 9 | from minibatch import connectdb, stream 10 | from minibatch.contrib.celery import CeleryEventSource 11 | from minibatch.tests.util import delete_database 12 | 13 | 14 | class CeleryEventSourceTests(TestCase): 15 | def setUp(self): 16 | self.url = 'mongodb://localhost/test' 17 | delete_database(url=self.url) 18 | self.db = connectdb(url=self.url) 19 | 20 | def test_source(self): 21 | celeryapp = DummyCeleryApp() 22 | source = CeleryEventSource(celeryapp) 23 | s = stream('test', url=self.url) 24 | # mock stream append because sut is CeleryEventSource, not append 25 | s.append = MagicMock() 26 | # mock event source 27 | event = { 28 | 'name': 'test', 29 | 'uuid': '12345', 30 | 'state': 'SUCCESS', 31 | 'runtime': 1.0, 32 | } 33 | celeryapp.source = source 34 | celeryapp.dummy_events = [event] 35 | s.attach(source) 36 | source.stream(s) 37 | s.append.assert_called() 38 | s.stop() 39 | 40 | def test_source_non_task_events(self): 41 | celeryapp = DummyCeleryApp() 42 | source = CeleryEventSource(celeryapp) 43 | s = stream('test', url=self.url) 44 | # mock stream append because sut is CeleryEventSource, not append 45 | s.append = MagicMock() 46 | # mock event source 47 | # note there is no task uuid 48 | event = { 49 | 'name': 'test', 50 | 'state': 'SUCCESS', 51 | 'runtime': 1.0, 52 | } 53 | celeryapp.source = source 54 | celeryapp.dummy_events = [event] 55 | s.attach(source) 56 | source.stream(s) 57 | s.append.assert_called() 58 | s.stop() 59 | 60 | 61 | class attrdict(dict): 62 | def __init__(self, *args, **kwargs): 63 | super().__init__(*args, **kwargs) 64 | self.__dict__ = self 65 | 66 | 67 | class DummyCeleryApp: 68 | def __init__(self): 69 | self._tasks = defaultdict(dict) 70 | 71 | @contextmanager 72 | def connection(self): 73 | yield self 74 | 75 | @property 76 | def events(self): 77 | return self 78 | 79 | def Receiver(self, *args, **kwargs): 80 | return self 81 | 82 | def State(self): 83 | 
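        # the dummy app doubles as its own events State and Receiver,
        # so the CeleryEventSource under test talks only to this object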
return self 84 | 85 | def event(self, state): 86 | ev_uuid = state.get('uuid', uuid4().hex) 87 | self._tasks[ev_uuid].update(state) 88 | v = attrdict(self._tasks[ev_uuid]) 89 | v.info = lambda: {k: v for k, v in v.__dict__.items() if k != 'info'} 90 | self._tasks[ev_uuid] = v 91 | return self 92 | 93 | @property 94 | def tasks(self): 95 | return self._tasks 96 | 97 | def capture(self, *args, **kwargs): 98 | for event in self.dummy_events: 99 | for eventkey, handler in self.source.handlers.items(): 100 | handler(event) 101 | -------------------------------------------------------------------------------- /minibatch/tests/test_kafka.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Queue 2 | from unittest import TestCase 3 | 4 | from threading import Thread 5 | from time import sleep 6 | from unittest.mock import MagicMock 7 | 8 | from minibatch import connectdb, stream, streaming, make_emitter 9 | from minibatch.contrib.kafka import KafkaSource, KafkaSink 10 | from minibatch.tests.util import delete_database, LocalExecutor 11 | 12 | 13 | class KafkaTests(TestCase): 14 | def setUp(self): 15 | self.url = 'mongodb://localhost/test' 16 | delete_database(url=self.url) 17 | self.db = connectdb(url=self.url) 18 | 19 | def test_consumer(self): 20 | # we simply inject a mock KafkaConsumer into the KafkaSource 21 | # as we don't want to test KafkaConsumer but KafkaSource 22 | message = MagicMock() 23 | message.value = dict(foo='bar') 24 | source = KafkaSource('topic') 25 | consumer = MagicMock() 26 | consumer.__iter__.return_value = [message] 27 | source._consumer = consumer 28 | s = stream('test', url=self.url) 29 | s.attach(source) 30 | 31 | def consumer(q): 32 | url = str(self.url) 33 | 34 | @streaming('test', executor=LocalExecutor(), url=url, queue=q) 35 | def process(window): 36 | db = connectdb(url=url) 37 | db.processed.insert_many(window.data) 38 | 39 | q = Queue() 40 | p = Process(target=consumer, args=(q,)) 41 | p.start() 42 | sleep(5) 43 | q.put(True) 44 | p.join() 45 | 46 | docs = list(self.db.processed.find()) 47 | self.assertEqual(len(docs), 1) 48 | 49 | def test_sink(self): 50 | # we simply inject a mock KafkaProducer into the KafkaSink 51 | s = stream('test', url=self.url) 52 | s.append(dict(foo='baz')) 53 | sink = KafkaSink('test') 54 | producer = MagicMock() 55 | sink._producer = producer 56 | # create a threaded emitter that we can stop 57 | em = make_emitter('test', url=self.url, sink=sink, emitfn=lambda v: v) 58 | t = Thread(target=em.run) 59 | t.start() 60 | sleep(1) 61 | em._stop = True 62 | # check the sink got called and forward to the mock KafkaProducer 63 | producer.send.assert_called_with('test', value={'foo': 'baz'}) 64 | -------------------------------------------------------------------------------- /minibatch/tests/test_minibatch.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Queue 2 | from unittest import TestCase 3 | 4 | import multiprocessing 5 | import sys 6 | import time 7 | 8 | from minibatch import Stream, Buffer, connectdb, reset_mongoengine 9 | from minibatch.tests.util import delete_database 10 | from minibatch.window import CountWindow 11 | 12 | # use this for debugging subprocesses 13 | logger = multiprocessing.log_to_stderr() 14 | logger.setLevel('INFO') 15 | 16 | 17 | def sleepdot(seconds=1): 18 | mult = 10 # ensure integers 19 | iterations = int(seconds * mult) 20 | for _ in range(0, iterations, 
mult): 21 | time.sleep(1) 22 | sys.stdout.write('.') 23 | sys.stdout.flush() 24 | 25 | 26 | class MiniBatchTests(TestCase): 27 | def setUp(self): 28 | self.url = 'mongodb://localhost/test' 29 | delete_database(url=self.url) 30 | self.db = connectdb(url=self.url) 31 | 32 | def tearDown(self): 33 | reset_mongoengine() 34 | 35 | def sleep(self, seconds): 36 | sleepdot(seconds) 37 | 38 | def test_stream(self): 39 | """ 40 | Test a stream writes to a buffer 41 | """ 42 | stream = Stream.get_or_create('test', url=self.url) 43 | stream.append({'foo': 'bar1'}) 44 | stream.append({'foo': 'bar2'}) 45 | count = len(list(doc for doc in Buffer.objects.all())) 46 | self.assertEqual(count, 2) 47 | 48 | def test_fixed_size(self): 49 | """ 50 | Test batch windows of fixed sizes work ok 51 | """ 52 | from minibatch import streaming 53 | 54 | def consumer(q): 55 | logger.debug("starting consumer on {self.url}".format(**locals())) 56 | url = str(self.url) 57 | 58 | # note the stream decorator blocks the consumer and runs the decorated 59 | # function asynchronously upon the window criteria is satisfied 60 | @streaming('test', size=2, keep=True, url=self.url, queue=q) 61 | def myprocess(window): 62 | logger.debug("*** processing") 63 | try: 64 | db = connectdb(url) 65 | db.processed.insert_one({'data': window.data or {}}) 66 | except Exception as e: 67 | print(e) 68 | raise 69 | 70 | # start stream consumer 71 | q = Queue() 72 | stream = Stream.get_or_create('test', url=self.url) 73 | proc = Process(target=consumer, args=(q,)) 74 | proc.start() 75 | # fill stream 76 | for i in range(10): 77 | stream.append({'index': i}) 78 | # give it some time to process 79 | logger.debug("waiting") 80 | self.sleep(10) 81 | q.put(True) # stop @streaming 82 | proc.join() 83 | # expect 5 entries, each of length 2 84 | data = list(doc for doc in self.db.processed.find()) 85 | count = len(data) 86 | self.assertEqual(count, 5) 87 | self.assertTrue(all(len(w) == 2 for w in data)) 88 | 89 | def test_timed_window(self): 90 | """ 91 | Test timed windows work ok 92 | """ 93 | from minibatch import streaming 94 | 95 | def consumer(q): 96 | # note the stream decorator blocks the consumer and runs the decorated 97 | # function asynchronously upon the window criteria is satisfied 98 | url = str(self.url) 99 | 100 | @streaming('test', interval=1, relaxed=False, keep=True, queue=q, url=self.url) 101 | def myprocess(window): 102 | try: 103 | db = connectdb(url=url) 104 | db.processed.insert_one({'data': window.data or {}}) 105 | except Exception as e: 106 | print(e) 107 | raise 108 | # return window 109 | 110 | # start stream consumer 111 | q = Queue() 112 | stream = Stream.get_or_create('test', url=self.url) 113 | proc = Process(target=consumer, args=(q,)) 114 | proc.start() 115 | # fill stream 116 | for i in range(10): 117 | stream.append({'index': i}) 118 | self.sleep(.5) 119 | # give it some time to process 120 | self.sleep(10) 121 | q.put(True) 122 | proc.join() 123 | # expect at least 5 entries (10 x .5 = 5 seconds), each of length 1-2 124 | data = list(doc for doc in self.db.processed.find()) 125 | count = len(data) 126 | self.assertGreater(count, 5) 127 | self.assertTrue(all(len(w) >= 2 for w in data)) 128 | 129 | def test_timed_window_relaxed(self): 130 | """ 131 | Test relaxed timed windows work ok 132 | """ 133 | from minibatch import streaming 134 | 135 | def consumer(q): 136 | # note the stream decorator blocks the consumer and runs the decorated 137 | # function asynchronously upon the window criteria is satisfied 138 | url = 
str(self.url) 139 | 140 | @streaming('test', interval=1, relaxed=True, keep=True, queue=q, url=url) 141 | def myprocess(window): 142 | try: 143 | db = connectdb(url) 144 | db.processed.insert_one({'data': window.data or {}}) 145 | except Exception as e: 146 | print(e) 147 | return window 148 | 149 | # start stream consumer 150 | q = Queue() 151 | stream = Stream.get_or_create('test', url=self.url) 152 | proc = Process(target=consumer, args=(q,)) 153 | proc.start() 154 | # fill stream 155 | for i in range(10): 156 | stream.append({'index': i}) 157 | self.sleep(.5) 158 | # give it some time to process 159 | self.sleep(5) 160 | q.put(True) 161 | proc.join() 162 | # expect at least 5 entries (10 x .5 = 5 seconds), each of length 1-2 163 | data = list(doc for doc in self.db.processed.find()) 164 | count = len(data) 165 | self.assertGreater(count, 5) 166 | self.assertTrue(all(len(w) >= 2 for w in data)) 167 | 168 | def _do_test_slow_emitfn(self, workers=None, expect_fail=None, timeout=None): 169 | """ 170 | Test slow batch windows work properly using {workers} workers 171 | """ 172 | from minibatch import streaming 173 | 174 | MiniBatchTests._do_test_slow_emitfn.__doc__ = MiniBatchTests._do_test_slow_emitfn.__doc__.format( 175 | workers=workers) 176 | 177 | def consumer(workers, q): 178 | logger.debug("starting consumer on={self.url} workers={workers}".format(**locals())) 179 | url = str(self.url) 180 | 181 | # note the stream decorator blocks the consumer and runs the decorated 182 | # function asynchronously upon the window criteria is satisfied 183 | @streaming('test', size=2, keep=True, url=self.url, max_workers=workers, queue=q) 184 | def myprocess(window): 185 | logger.debug("*** processing {}".format(window.data)) 186 | from minibatch import connectdb 187 | try: 188 | sleepdot(5) 189 | db = connectdb(url=url) 190 | db.processed.insert_one({'data': window.data or {}}) 191 | except Exception as e: 192 | logger.error(e) 193 | raise 194 | return window 195 | 196 | def check(): 197 | # expect 5 entries, each of length 2 198 | data = list(doc for doc in self.db.processed.find()) 199 | count = len(data) 200 | logger.debug("data={}".format(data)) 201 | self.assertEqual(count, 5) 202 | self.assertTrue(all(len(w) == 2 for w in data)) 203 | 204 | # start stream consumer 205 | # -- use just one worker, we expect to fail 206 | stream = Stream.get_or_create('test', url=self.url) 207 | q = Queue() 208 | proc = Process(target=consumer, args=(workers, q)) 209 | proc.start() 210 | # fill stream 211 | for i in range(10): 212 | stream.append({'index': i}) 213 | # give it some time to process 214 | logger.debug("waiting") 215 | # note it takes at least 25 seconds using 1 worker (5 windows, 5 seconds) 216 | # so we expect to fail 217 | self.sleep(12) 218 | q.put(True) 219 | if expect_fail: 220 | with self.assertRaises(AssertionError): 221 | check() 222 | else: 223 | check() 224 | # wait for everything to terminate, avoid stream corruption in next test 225 | self.sleep(timeout) 226 | proc.join() 227 | 228 | def test_slow_emitfn_single_worker(self): 229 | self._do_test_slow_emitfn(workers=1, expect_fail=True, timeout=30) 230 | 231 | def test_slow_emitfn_parallel_workers(self): 232 | self._do_test_slow_emitfn(workers=5, expect_fail=False, timeout=12) 233 | 234 | def test_buffer_cleaned(self): 235 | stream = Stream.get_or_create('test', url=self.url) 236 | stream.append({'foo': 'bar1'}) 237 | stream.append({'foo': 'bar2'}) 238 | 239 | em = CountWindow('test') 240 | em._run_once() 241 | em._run_once() 242 | 243 | 
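        # the emitter deletes processed documents on commit,
        # so the buffer should now be empty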
docs = list(Buffer.objects.filter()) 244 | self.assertEqual(len(docs), 0) 245 | 246 | def test_buffer_housekeeping(self): 247 | stream = Stream.get_or_create('test', url=self.url, max_age=.5) 248 | stream.append({'foo': 'bar1'}) 249 | stream.append({'foo': 'bar1'}) 250 | stream.append({'foo': 'bar1'}) 251 | # expect buffer contains all 3 entries 252 | self.assertEqual(stream.buffer().count(), 3) 253 | # wait for housekeeping to take effect 254 | time.sleep(1) 255 | # expect buffer is empty 256 | self.assertEqual(stream.buffer().count(), 0) 257 | stream.stop() 258 | -------------------------------------------------------------------------------- /minibatch/tests/test_mongodb.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from multiprocessing import Process, Queue 3 | from threading import Thread 4 | from unittest import TestCase 5 | 6 | from time import sleep 7 | 8 | from minibatch import connectdb, streaming, stream, make_emitter, Buffer, reset_mongoengine 9 | from minibatch.contrib.mongodb import MongoSource, MongoSink 10 | from minibatch.tests.util import delete_database, LocalExecutor 11 | 12 | 13 | class MongodbTests(TestCase): 14 | def setUp(self): 15 | self.url = 'mongodb://localhost/test' 16 | delete_database(url=self.url) 17 | self.db = connectdb(url=self.url) 18 | 19 | def tearDown(self): 20 | reset_mongoengine() 21 | 22 | def test_source(self): 23 | N = 1 24 | interval = 1 25 | docs = self._run_streaming_test(N, interval, timeout=10) 26 | self.assertEqual(len(docs), N) 27 | 28 | def test_large_dataset_source(self): 29 | N = 100 30 | interval = 10 31 | docs = self._run_streaming_test(N, interval, timeout=15) 32 | self.assertEqual(len(docs), interval) 33 | print(list(d['delta'] for d in docs)) 34 | 35 | def test_xlarge_dataset_source(self): 36 | # assert that large N do not cause the window processing time to grow 37 | N = 1000 38 | interval = 1000 39 | docs = self._run_streaming_test(N, interval, timeout=60) 40 | self.assertEqual(len(docs), N / interval) 41 | t_delta = sum(d['delta'] for d in docs) / len(docs) 42 | # we expect to see 0.5 seconds per batch, average 43 | # t_delta is in microseconds => 1e6 convert to seconds 44 | self.assertTrue(t_delta / 1e6 < 1.0) # t_delta is in microseconds 45 | 46 | def test_sink(self): 47 | # we simply inject a mock KafkaProducer into the KafkaSink 48 | s = stream('test', url=self.url) 49 | s.append(dict(foo='baz')) 50 | db = self.db 51 | sink_coll = db['processed'] 52 | sink = MongoSink(sink_coll) 53 | em = make_emitter('test', url=self.url, sink=sink, emitfn=lambda v: v) 54 | t = Thread(target=em.run) 55 | t.start() 56 | sleep(1) 57 | em._stop = True 58 | docs = list(sink_coll.find()) 59 | self.assertEqual(len(docs), 1) 60 | 61 | def _run_streaming_test(self, N, interval, timeout=10): 62 | # set up a source collection that we want to steram 63 | coll = self.db['test'] 64 | source = MongoSource(coll, size=N) 65 | # attach to the stream 66 | s = stream('test', url=self.url) 67 | s.attach(source) 68 | 69 | # stream consumer 70 | def consumer(q, interval): 71 | url = str(self.url) 72 | 73 | @streaming('test', size=interval, executor=LocalExecutor(), url=url, queue=q) 74 | def process(window): 75 | db = connectdb(url=url) 76 | # calculate average time t_delta it took for documents to be received since insertion 77 | dtnow = datetime.utcnow() 78 | t_delta = sum((dtnow - doc['dt']).microseconds for doc in window.data) / len(window.data) 79 | 
db.processed.insert_one(dict(delta=t_delta)) 80 | 81 | # give it some input 82 | q = Queue() 83 | p = Process(target=consumer, args=(q, interval)) 84 | p.start() 85 | 86 | for x in range(0, N, interval): 87 | docs = [{ 88 | 'foo': 'bar', 89 | 'dt': datetime.utcnow() 90 | } for i in range(interval)] 91 | coll.insert_many(docs) 92 | sleep(1) 93 | 94 | sleep(timeout) 95 | s.stop() 96 | q.put(True) 97 | p.terminate() 98 | 99 | # check buffer is empty 100 | buffered_docs = list(Buffer.objects.filter()) 101 | self.assertEqual(len(buffered_docs), 0) 102 | 103 | # return processed docs (in sink) 104 | docs = list(self.db.processed.find()) 105 | return docs 106 | -------------------------------------------------------------------------------- /minibatch/tests/test_mqtt.py: -------------------------------------------------------------------------------- 1 | import json 2 | from unittest import TestCase 3 | 4 | from time import sleep 5 | from unittest.mock import MagicMock 6 | 7 | from minibatch import connectdb, stream, reset_mongoengine 8 | from minibatch.contrib.mqtt import MQTTSource, MQTTSink 9 | from minibatch.tests.util import delete_database 10 | 11 | 12 | class MQTTTests(TestCase): 13 | def setUp(self): 14 | self.url = 'mongodb://localhost/test' 15 | delete_database(url=self.url) 16 | self.db = connectdb(url=self.url) 17 | 18 | def tearDown(self): 19 | reset_mongoengine() 20 | 21 | def test_source(self): 22 | # we simply inject a mock MQTTClient into the MQTTSource 23 | source = MQTTSource('localhost', 'TEST/#') 24 | client = MagicMock() 25 | client.loop_forever = lambda *args: sleep(10) 26 | source._client = client 27 | s = stream('test', url=self.url) 28 | s.attach(source) 29 | s.append = MagicMock() 30 | message = MagicMock() 31 | message.payload = json.dumps({'foo': 'bar'}).encode('utf-8') 32 | source.on_message(client, {}, message) 33 | s.append.assert_called() 34 | s.stop() 35 | 36 | def test_sink(self): 37 | # we simply inject a mock MQTTClient into the MQTTSource 38 | sink = MQTTSink('localhost', 'TEST/#') 39 | client = MagicMock() 40 | client.loop_forever = lambda *args: sleep(10) 41 | sink._client = client 42 | sink.put({'foo': 'bar'}) 43 | client.publish.assert_called_with('TEST/#', json.dumps(dict(foo='bar'))) 44 | -------------------------------------------------------------------------------- /minibatch/tests/test_omegaml.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | try: 4 | from omegaml import Omega 5 | from time import sleep 6 | 7 | from minibatch import stream, connectdb, reset_mongoengine 8 | from minibatch.contrib.omegaml import DatasetSource, DatasetSink 9 | from minibatch.tests.util import delete_database, LocalExecutor 10 | from minibatch.window import CountWindow 11 | 12 | class OmegamlTests(TestCase): 13 | def setUp(self): 14 | self.url = 'mongodb://localhost/test' 15 | delete_database(url=self.url) 16 | self.om = Omega(mongo_url=self.url) 17 | self.db = connectdb(self.url) 18 | 19 | def tearDown(self): 20 | reset_mongoengine() 21 | 22 | def test_source(self): 23 | om = self.om 24 | db = self.db 25 | url = str(self.url) 26 | 27 | source = DatasetSource(om, 'stream-test') 28 | s = stream('test', url=url) 29 | s.attach(source) 30 | 31 | def emit(window): 32 | # this runs in a sep thread, so reconnect db 33 | db = connectdb(url) 34 | db.processed.insert_many(window.data) 35 | 36 | om.datasets.put({'foo': 'bar'}, 'stream-test') 37 | sleep(2) 38 | 39 | em = CountWindow('test', emitfn=emit, 
executor=LocalExecutor()) 40 | em.run(blocking=False) 41 | sleep(1) 42 | s.stop() 43 | 44 | docs = list(db.processed.find()) 45 | self.assertEqual(len(docs), 1) 46 | 47 | def test_sink(self): 48 | om = self.om 49 | db = self.db # noqa 50 | url = str(self.url) 51 | 52 | source = DatasetSource(om, 'stream-test') 53 | sink = DatasetSink(om, 'stream-sink') 54 | s = stream('test', url=url) 55 | s.attach(source) 56 | 57 | def emit(window): 58 | # this runs in a sep thread, so reconnect db 59 | db = connectdb(url) 60 | db.processed.insert_many(window.data) 61 | 62 | om.datasets.put({'foo': 'bar'}, 'stream-test') 63 | sleep(2) 64 | 65 | em = CountWindow('test', emitfn=emit, forwardfn=sink.put, executor=LocalExecutor()) 66 | # run emitter until the message has arrived in sink 67 | # -- sleep() is not sufficiently stable depending on system load 68 | em.should_stop = lambda *args, **kwargs: om.datasets.collection('stream-sink').count_documents({}) > 0 69 | em.run(blocking=True) 70 | s.stop() 71 | 72 | docs = list(db.processed.find()) 73 | docs = list(om.datasets.collection('stream-sink').find()) 74 | self.assertEqual(len(docs), 1) 75 | 76 | 77 | except Exception as e: # noqa 78 | print("WARNING could not load omegaml dependencies => omegaml dataset source/sink are not supported") 79 | -------------------------------------------------------------------------------- /minibatch/tests/util.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures._base import Executor, Future 2 | 3 | from minibatch import connectdb 4 | 5 | 6 | def delete_database(url=None, dbname='test'): 7 | """ test support """ 8 | db = connectdb(url=url, dbname=dbname) 9 | db.client.drop_database(dbname) 10 | return db 11 | 12 | 13 | class LocalExecutor(Executor): 14 | def submit(self, fn): 15 | result = fn() 16 | future = Future() 17 | future.set_result(result) 18 | return future 19 | -------------------------------------------------------------------------------- /minibatch/window.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import Future, ProcessPoolExecutor 2 | 3 | import datetime 4 | import logging 5 | import time 6 | from queue import Empty 7 | 8 | from minibatch import Buffer, Stream, logger 9 | from minibatch.marshaller import SerializableFunction, MinibatchFuture 10 | from minibatch.models import Window 11 | 12 | 13 | class WindowEmitter(object): 14 | """ 15 | a window into a stream of buffered objects 16 | 17 | WindowEmitter.run() implements the generic emitter protocol as follows: 18 | 19 | 1. determine if a window is ready to be processed 20 | 2. retrieve the data from the buffer to create a Window 21 | 3. process the data (i.e. mark the buffered data processed) 22 | 4. run the emit function on the window 23 | 5. commit (emit function successful) or undo (exception raised) 24 | 25 | Note that run() is blocking. Between running the protocol, 26 | it will sleep to conserve resources. 27 | 28 | Each time run() wakes up, it will call the following methods in turn: 29 | 30 | window_ready() - called to determine if the buffer contains enough 31 | data for a window. 32 | query() - return the Buffer objects to process 33 | process() - optionally process the data. 
By default his just 34 | marks the Buffer objects returned by query() as 35 | processed=True 36 | emit() - emit a Window, this calls the streaming function 37 | which may optionally return a result that can be 38 | forwarded to a sink, see forward() 39 | timestamp() - timestamp the stream for the next processing 40 | commit() - commit processed data back to the buffer. by 41 | default this means removing the objects from the 42 | buffer and deleting the window. 43 | forward() - if a forward function has been defined, call it with 44 | the result of emit() 45 | sleep() - sleep until the next round 46 | 47 | Use timestamp() to mark the stream (or the buffer data) for the next 48 | round. Use sleep() to set the amount of time to sleep. Depending on 49 | the emitter's semantics this may be a e.g. a fixed interval or some 50 | function of the data. 51 | 52 | WindowEmitter implements several defaults: 53 | 54 | process() - mark all data returned by query() as processed 55 | sleep() - sleep self.interval / 2 seconds 56 | undo() - called if the emit function raises an exception. marks 57 | the data returned by query() as not processed and deletes 58 | the window 59 | 60 | For examples of how to implement a custom emitter see TimeWindow, 61 | CountWindow and SampleFunctionWindow. 62 | 63 | Notes: 64 | * there should only be one WindowEmitter per stream. This is a 65 | a limitation of the Buffer's way of marking documentes as processed 66 | (a boolean flag). This decision was made in favor of performance and 67 | simplicity. Supporting concurrent emitters would mean each Buffer object 68 | needs to keep track of which emitter has processed its data and make 69 | sure Window objects are processed by exactly one emitter. 70 | 71 | * to stop the run() method, specify a threading or multiprocessing Queue 72 | and send True 73 | """ 74 | 75 | def __init__(self, stream_name, interval=None, processfn=None, 76 | emitfn=None, emit_empty=False, executor=None, 77 | max_workers=None, stream=None, stream_url=None, 78 | forwardfn=None, queue=None): 79 | self.stream_name = stream_name 80 | self.interval = interval if interval is not None else 1 81 | self.emit_empty = emit_empty 82 | self.emitfn = emitfn 83 | self.processfn = processfn 84 | self.executor = (executor or ProcessPoolExecutor(max_workers=max_workers)) 85 | self._stream = stream 86 | self._stream_url = stream_url 87 | self._delete_on_commit = True 88 | self._forwardfn = forwardfn 89 | self._stop = False 90 | self._queue = queue 91 | 92 | def query(self, *args): 93 | raise NotImplementedError 94 | 95 | def window_ready(self): 96 | """ return a tuple of (ready, qargs) """ 97 | raise NotImplementedError 98 | 99 | def timestamp(self, query_args): 100 | self.stream.modify(last_read=datetime.datetime.utcnow()) 101 | 102 | @property 103 | def stream(self): 104 | if self._stream: 105 | return self._stream 106 | self._stream = Stream.get_or_create(self.stream_name, 107 | url=self._stream_url) 108 | return self._stream 109 | 110 | def process(self, qs): 111 | if self.processfn: 112 | return self.processfn(qs) 113 | data = [] 114 | for obj in qs: 115 | obj.modify(processed=True) 116 | data.append(obj) 117 | return data 118 | 119 | def undo(self, qs, window=None): 120 | for obj in qs: 121 | obj.modify(processed=False) 122 | if window is not None and hasattr(window, 'delete'): 123 | window.delete() 124 | return qs 125 | 126 | def persist(self, flag=True): 127 | self._delete_on_commit = not flag 128 | 129 | def commit(self, qs, window): 130 | if not 
self._delete_on_commit: 131 | window.modify(processed=True) 132 | return 133 | for obj in qs: 134 | obj.delete() 135 | if window is not None and hasattr(window, 'delete'): 136 | window.delete() 137 | 138 | def emit(self, qs, query_args=None): 139 | window = Window(stream=self.stream.name, 140 | query=query_args, 141 | data=[obj.data for obj in qs]).save() 142 | if self.emitfn: 143 | logging.debug("calling emitfn") 144 | try: 145 | sjob = SerializableFunction(self.emitfn, window) 146 | future = self.executor.submit(sjob) 147 | except Exception: 148 | raise 149 | else: 150 | future = Future() 151 | future.set_result(window) 152 | future = MinibatchFuture(future, window=window) 153 | return future 154 | 155 | def forward(self, data): 156 | if self._forwardfn: 157 | self._forwardfn(data) 158 | 159 | def sleep(self): 160 | time.sleep((self.interval or self.stream.interval) / 2.0) 161 | 162 | def should_stop(self): 163 | if self._queue is not None: 164 | try: 165 | stop_message = self._queue.get(block=False) 166 | logger.debug("queue result {}".format(self._stop)) 167 | except Empty: 168 | logger.debug("queue was empty") 169 | else: 170 | if stop_message: 171 | self._stop = True 172 | logger.debug("should stop") 173 | return self._stop 174 | 175 | def run(self, blocking=True): 176 | while not self.should_stop(): 177 | self._run_once() 178 | logger.debug("sleeping") 179 | self.sleep() 180 | logger.debug("awoke") 181 | if not blocking: 182 | break 183 | if blocking: 184 | # if we did not block, keep executor running 185 | try: 186 | self.executor.shutdown(wait=True) 187 | except Exception as e: 188 | logger.debug(e) 189 | logger.debug('stopped running') 190 | 191 | def _run_once(self): 192 | logger.debug("testing window ready") 193 | ready, query_args = self.window_ready() 194 | if ready: 195 | logger.debug("window ready") 196 | qs = self.query(*query_args) 197 | qs = self.process(qs) 198 | self.timestamp(*query_args) 199 | # note self.emit is usin an async executor 200 | # that returns a future 201 | if qs or self.emit_empty: 202 | logger.debug("Emitting") 203 | future = self.emit(qs, query_args) 204 | logger.debug("got future {}".format(future)) 205 | future['qs'] = qs 206 | future['query_args'] = query_args 207 | 208 | def emit_done(future): 209 | # this is called once upon future resolves 210 | future = MinibatchFuture(future) 211 | logger.debug("emit done {}".format(future)) 212 | qs = future.qs 213 | window = future.window 214 | try: 215 | data = future.result() or window 216 | except Exception: 217 | self.undo(qs, window) 218 | else: 219 | self.commit(qs, window) 220 | if isinstance(data, Window): 221 | data = data.data 222 | self.forward(data) 223 | finally: 224 | logger.debug('emit done') 225 | self.sleep() 226 | 227 | future.add_done_callback(emit_done) 228 | 229 | 230 | class FixedTimeWindow(WindowEmitter): 231 | """ 232 | a fixed time-interval window 233 | 234 | Yields windows of all data retrieved in fixed intervals of n 235 | seconds. Note that windows are created in fixed-block sequences, 236 | i.e. in steps of n_seconds since the start of the stream. Empty 237 | windows are also emitted. This guarantees that any window 238 | contains only those documents received in that particular window. 239 | This is useful if you want to count e.g. the number of events 240 | per time-period. 241 | 242 | Usage: 243 | 244 | @stream(name, interval=n_seconds) 245 | def myproc(window): 246 | # ... 
247 | """ 248 | 249 | def __init__(self, *args, **kwargs): 250 | super().__init__(*args, **kwargs) 251 | self.emit_empty = True 252 | 253 | def window_ready(self): 254 | last_read = self.stream.last_read 255 | now = datetime.datetime.utcnow() 256 | max_read = last_read + datetime.timedelta(seconds=self.interval) 257 | return now > max_read, (last_read, max_read) 258 | 259 | def query(self, *args): 260 | last_read, max_read = args 261 | fltkwargs = dict(stream=self.stream_name, 262 | created__gte=last_read, created__lte=max_read) 263 | return Buffer.objects.no_cache().filter(**fltkwargs) 264 | 265 | def timestamp(self, *args): 266 | last_read, max_read = args 267 | self.stream.modify(last_read=max_read) 268 | self.stream.reload() 269 | 270 | def sleep(self): 271 | import time 272 | # sleep slightly longer to make sure the interval is complete 273 | # and all data had a chance to accumulate. if we don't do 274 | # this we might get empty windows on accident, resulting in 275 | # lost data 276 | now = datetime.datetime.utcnow() 277 | if self.stream.last_read > now - datetime.timedelta(seconds=self.interval): 278 | # only sleep if all previous windows were processed 279 | time.sleep(self.interval + 0.01) 280 | 281 | 282 | class RelaxedTimeWindow(FixedTimeWindow): 283 | """ 284 | a relaxed time-interval window 285 | 286 | Every interval n_seconds, yields windows of all data in the buffer 287 | since the last successful retrieval of data. This does _not_ 288 | guarantee the data retrieved is in a specific time range. This is 289 | useful if you want to retrieve data every n_seconds but do not 290 | care when the data was inserted into the buffer. 291 | 292 | Usage: 293 | 294 | @stream(name, interval=n_seconds) 295 | def myproc(window): 296 | # ... 297 | """ 298 | 299 | def query(self, *args): 300 | last_read, max_read = args 301 | fltkwargs = dict(stream=self.stream_name, 302 | created__lte=max_read, processed=False) 303 | return Buffer.objects.no_cache().filter(**fltkwargs) 304 | 305 | 306 | class CountWindow(WindowEmitter): 307 | def window_ready(self): 308 | fltkwargs = dict(stream=self.stream_name, processed=False) 309 | qs = Buffer.objects.no_cache().filter(**fltkwargs).limit(self.interval) 310 | n_docs = qs.count(with_limit_and_skip=True) 311 | self._qs = qs 312 | return n_docs >= self.interval, [] 313 | 314 | def query(self, *args): 315 | return self._qs 316 | 317 | def timestamp(self, *args): 318 | self.stream.modify(last_read=datetime.datetime.utcnow()) 319 | self.stream.reload() 320 | 321 | def sleep(self): 322 | import time 323 | time.sleep(0.1) 324 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -e ../omegaml-ce 2 | -e .[all,dev] 3 | -------------------------------------------------------------------------------- /resources/docker-compose-kafka.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | 3 | services: 4 | zookeeper: 5 | image: 'bitnami/zookeeper:3' 6 | ports: 7 | - '2181:2181' 8 | volumes: 9 | - 'zookeeper_data:/bitnami' 10 | environment: 11 | - ALLOW_ANONYMOUS_LOGIN=yes 12 | kafka: 13 | image: 'bitnami/kafka:2' 14 | ports: 15 | - '9092:9092' 16 | - "29092:29092" 17 | volumes: 18 | - 'kafka_data:/bitnami' 19 | environment: 20 | # https://stackoverflow.com/a/53093678/890242 21 | KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181' 22 | ALLOW_PLAINTEXT_LISTENER: "yes" 23 | 
KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT,NGROK:PLAINTEXT 24 | KAFKA_LISTENERS: PLAINTEXT://:9092,NGROK://:29092 25 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092,NGROK://0.tcp.ngrok.io:15839 26 | depends_on: 27 | - zookeeper 28 | 29 | volumes: 30 | zookeeper_data: 31 | driver: local 32 | kafka_data: 33 | driver: local 34 | -------------------------------------------------------------------------------- /resources/docker-compose-mqtt.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | rabbitmq: 4 | image: rabbitmq:latest 5 | ports: 6 | # 5672 amqp 7 | # 15672 mgmt ui 8 | # 1883 mqtt 9 | - "5672:5672" 10 | - "15672:15672" 11 | - "1883:1883" 12 | volumes: 13 | - "./enabled_plugins:/etc/rabbitmq/enabled_plugins" 14 | - "./rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf:ro" 15 | -------------------------------------------------------------------------------- /resources/enabled_plugins: -------------------------------------------------------------------------------- 1 | [rabbitmq_management, rabbitmq_mqtt, rabbitmq_management_visualiser]. 2 | -------------------------------------------------------------------------------- /resources/rabbitmq.conf: -------------------------------------------------------------------------------- 1 | # https://www.rabbitmq.com/configure.html#config-file 2 | default_vhost = / 3 | # actual user/password required when allow_anonymous is false 4 | default_user = rabbitmq 5 | default_pass = rabbitmq 6 | 7 | # https://www.rabbitmq.com/mqtt.html 8 | # u/p mqtt test 9 | mqtt.listeners.tcp.default = 1883 10 | mqtt.allow_anonymous = false 11 | # used for anonymous connections, must be valid rabbitmq user/password 12 | mqtt.default_user = rabbitmq 13 | mqtt.default_pass = rabbitmq 14 | # map to existing vhost and exchange 15 | mqtt.vhost = / 16 | mqtt.exchange = amq.topic 17 | -------------------------------------------------------------------------------- /resources/rabbitmq.config: -------------------------------------------------------------------------------- 1 | [ 2 | {rabbit, 3 | [ 4 | {default_vhost, <<"/">>}, 5 | {default_user, <<"rabbitmq">>}, 6 | {default_pass, <<"rabbitmq">>}, 7 | {loopback_users, []} 8 | ] 9 | } 10 | ]. 
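The MQTT resources above (docker-compose-mqtt.yml, enabled_plugins, rabbitmq.conf) bring up a RabbitMQ broker with the MQTT plugin listening on port 1883, anonymous access disabled, and default credentials rabbitmq/rabbitmq. A quick way to sanity-check that broker before running the MQTT tests is to publish a single message with paho-mqtt (the client library pinned in setup.py below). This is only a sketch: the topic name and payload are arbitrary, and it assumes the compose stack is running on localhost.

    import json

    from paho.mqtt import publish

    # publish one test message to the local RabbitMQ MQTT listener,
    # using the credentials and port configured in rabbitmq.conf
    publish.single('TEST/minibatch',
                   payload=json.dumps({'foo': 'bar'}),
                   hostname='localhost', port=1883,
                   auth={'username': 'rabbitmq', 'password': 'rabbitmq'})

If the message is accepted, an attached MQTTSource subscribed to 'TEST/#' (as in test_mqtt.py) should receive it and append it to the stream buffer.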
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | from setuptools import setup, find_packages 5 | 6 | basedir = Path(os.path.dirname(__file__)) 7 | README = open(basedir / 'README.rst').read() 8 | version = open(basedir / 'minibatch' / 'VERSION').read() 9 | 10 | dev_deps = ['pytest', 'twine', 'flake8', 'bumpversion'] 11 | app_deps = ['flask', 'dash'] 12 | kafka_deps = ['kafka-python==1.4.7'] 13 | mqtt_deps = ['paho-mqtt==1.5.0'] 14 | mongo_deps = ['pymongo>=3.2.2', 'dnspython'] 15 | omega_deps = ['omegaml[client]>=0.15.3'] 16 | 17 | setup(name='minibatch', 18 | version=version, 19 | description='Python stream processing for humans', 20 | url='http://github.com/omegaml/minibatch', 21 | long_description=README, 22 | long_description_content_type='text/x-rst', 23 | include_package_data=True, 24 | author='Patrick Senti', 25 | author_email='patrick.senti@omegaml.io', 26 | license='Apache 2.0 + "No Sell, Consulting Yes" License Condition', 27 | packages=find_packages(), 28 | zip_safe=False, 29 | install_requires=[ 30 | # Mongo 4.2 requires at least mongoengine 0.19 due to 31 | # https://github.com/MongoEngine/mongoengine/pull/2160/files 32 | 'mongoengine>=0.23.0', 33 | 'dill', 34 | ], 35 | extras_require={ 36 | 'apps': app_deps, 37 | 'kafka': kafka_deps, 38 | 'mqtt': mqtt_deps, 39 | 'mongodb': mongo_deps, 40 | 'omegaml': mongo_deps + omega_deps, 41 | 'all': kafka_deps + mqtt_deps + mongo_deps + app_deps, 42 | 'dev': kafka_deps + mqtt_deps + mongo_deps + omega_deps + dev_deps, 43 | }, 44 | ) 45 | --------------------------------------------------------------------------------
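The extras in setup.py map the optional sources and sinks to their dependencies (for example, the mongodb extra pulls in pymongo and dnspython). As a closing reference, here is a minimal producer/consumer pair modelled directly on the count-window tests earlier in this dump; it is illustrative only and assumes a MongoDB instance reachable at mongodb://localhost/test.

    from multiprocessing import Process, Queue
    from time import sleep

    from minibatch import Stream, streaming

    url = 'mongodb://localhost/test'


    def consumer(q):
        # @streaming blocks until True is put on the queue,
        # emitting a window for every 2 buffered messages
        @streaming('example', size=2, url=url, queue=q)
        def process(window):
            print(window.data)


    if __name__ == '__main__':
        q = Queue()
        proc = Process(target=consumer, args=(q,))
        proc.start()
        # produce some messages into the stream
        stream = Stream.get_or_create('example', url=url)
        for i in range(10):
            stream.append({'index': i})
        sleep(5)      # give the consumer time to drain the buffer
        q.put(True)   # stop the @streaming consumer
        proc.join()

The same pattern applies to the Kafka, MQTT, MongoDB and omega|ml sources and sinks under minibatch/contrib: attach a source to the stream, and pass a sink (or forwardfn) to the emitter to forward processed windows.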