├── .editorconfig ├── .gitignore ├── .readthedocs.yml ├── .travis.yml ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── ROADMAP.md ├── Vagrantfile ├── docs ├── Makefile ├── conf.py ├── index.rst ├── make.bat └── source │ ├── modules.rst │ ├── winton_kafka_streams.processor.rst │ ├── winton_kafka_streams.processor.serde.rst │ ├── winton_kafka_streams.rst │ └── winton_kafka_streams.state.rst ├── examples ├── binning │ ├── README.md │ ├── binning.py │ ├── config.properties │ ├── generator.py │ ├── live-plot.ipynb │ ├── random_prices.py │ └── source.py ├── debug │ ├── README.md │ ├── config.properties │ └── example.py └── wordcount │ ├── README.md │ ├── config.properties │ ├── custom_serde.py │ ├── docker │ ├── docker-compose.yml │ ├── kafka-debug │ │ └── Dockerfile │ ├── source_client │ │ └── Dockerfile │ └── wordcount │ │ ├── Dockerfile │ │ └── config.properties │ ├── example.py │ └── source_client.py ├── requirements_docs.txt ├── setup.cfg ├── setup.py ├── tests ├── processor │ ├── serde │ │ ├── __init__.py │ │ ├── mock_schema_registry.py │ │ ├── test_avro_serde.py │ │ ├── test_instantiation.py │ │ └── test_serialisation.py │ ├── test_base_processor.py │ ├── test_extract_timestamp.py │ ├── test_punctuation_queue.py │ ├── test_sink_processor.py │ ├── test_source_processor.py │ ├── test_stream_task.py │ ├── test_task_id.py │ ├── test_topology.py │ └── test_wallclock_timestamp.py ├── state │ └── test_in_memory_key_value_store.py └── test_kafka_streams.py └── winton_kafka_streams ├── __init__.py ├── errors ├── __init__.py ├── _kafka_error_codes.py ├── kafka_streams_error.py └── task_migrated_error.py ├── kafka_client_supplier.py ├── kafka_config.py ├── kafka_streams.py ├── processor ├── __init__.py ├── _context.py ├── _punctuation_queue.py ├── _record_collector.py ├── _stream_task.py ├── _stream_thread.py ├── _timestamp.py ├── extract_timestamp.py ├── processor.py ├── processor_context.py ├── serialization │ ├── __init__.py │ ├── _avro.py │ ├── _bytes.py │ ├── _double.py │ ├── _float.py │ ├── _integer.py │ ├── _json.py │ ├── _long.py │ ├── _string.py │ ├── deserializer.py │ ├── serde.py │ ├── serdes │ │ ├── __init__.py │ │ ├── _serdes.py │ │ ├── avro_serde.py │ │ ├── bytes_serde.py │ │ ├── double_serde.py │ │ ├── float_serde.py │ │ ├── integer_serde.py │ │ ├── json_serde.py │ │ ├── long_serde.py │ │ ├── string_serde.py │ │ └── wrapper_serde.py │ └── serializer.py ├── task_id.py ├── topology.py └── wallclock_timestamp.py ├── state ├── __init__.py ├── factory │ ├── __init__.py │ ├── base_storage_key_value_store_factory.py │ ├── in_memory_key_value_store_factory.py │ ├── key_value_store_factory.py │ ├── store_factory.py │ └── value_store_factory.py ├── in_memory │ ├── __init__.py │ ├── in_memory_state_store.py │ └── in_memory_state_store_supplier.py ├── key_value_state_store.py ├── logging │ ├── __init__.py │ ├── change_logging_state_store.py │ └── store_change_logger.py ├── state_store.py └── state_store_supplier.py └── version.py /.editorconfig: -------------------------------------------------------------------------------- 1 | ; EditorConfig helps developers define and maintain consistent 2 | ; coding styles between different editors and IDEs. 3 | 4 | ; For more visit http://editorconfig.org. 
5 | root = true 6 | 7 | ; Choose between lf or rf on "end_of_line" property 8 | [*] 9 | indent_style = space 10 | end_of_line = lf 11 | charset = utf-8 12 | trim_trailing_whitespace = true 13 | insert_final_newline = true 14 | 15 | [*.{js,css,scss}] 16 | indent_size = 2 17 | 18 | [*.html] 19 | indent_style = tab 20 | 21 | [*.{py,html,md}] 22 | indent_size = 4 23 | 24 | [*.md] 25 | trim_trailing_whitespace = false 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | .pytest_cache/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # Environments 83 | .env 84 | .venv 85 | env/ 86 | venv/ 87 | ENV/ 88 | 89 | # Spyder project settings 90 | .spyderproject 91 | .spyproject 92 | 93 | # Rope project settings 94 | .ropeproject 95 | 96 | # mkdocs documentation 97 | /site 98 | 99 | # mypy 100 | .mypy_cache/ 101 | 102 | # vim 103 | *.*~ 104 | *.swp 105 | 106 | # PyCharm 107 | .idea/ 108 | 109 | # Vagrant 110 | .vagrant/ 111 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | --- 2 | build: 3 | image: latest 4 | 5 | requirements_file: requirements_docs.txt 6 | 7 | python: 8 | version: 3.6 9 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | dist: trusty # librdkafka support begins at Trusty. 3 | python: 4 | - "3.6" # Advertised support 5 | - "3.6-dev" # Check we'll keep working in future 6 | before_install: 7 | - wget -qO - http://packages.confluent.io/deb/3.2/archive.key | sudo apt-key add - # Use the confluent repository 8 | - sudo add-apt-repository "deb [arch=amd64] http://packages.confluent.io/deb/3.2 stable main" 9 | - sudo apt-get update -qq # Update quietly. 
10 | - sudo apt-get install -y librdkafka-dev librdkafka1 11 | install: "pip install --editable .[develop]" 12 | script: pytest 13 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | One of the easiest ways to contribute is to participate in discussions and comment on issues. You can also contribute by submitting pull requests with code changes. 4 | 5 | ## Filing issues 6 | The best way to get your bug fixed is to be as detailed as you can be about the problem. 7 | Providing a minimal project with steps to reproduce the problem is ideal. 8 | Here are questions you can answer before you file a bug to make sure you're not missing any important information. 9 | 10 | 1. Did you include the snippet of broken code in the issue? 2. What are the *EXACT* steps to reproduce this problem? 12 | 3. What package versions are you using (for example, as reported by `pip freeze`)? 13 | 4. What operating system are you using? 14 | 15 | GitHub supports [markdown](https://help.github.com/articles/github-flavored-markdown/), so when filing bugs make sure you check the formatting before clicking submit. 16 | 17 | ## Contributor License Agreement ("CLA") 18 | 19 | In order to accept your pull request, we need you to submit a CLA. You only need to do this once to work on any of Winton's open source projects. 20 | 21 | The text of the CLA can be seen here: [Winton CLA](https://cla-assistant.io/wintoncode/winton-kafka-streams) 22 | 23 | ## Contributing code and content 24 | Make sure you can build the code and run the tests. Familiarize yourself with the project workflow and our coding conventions. If you don't know what a pull request is, read this article: https://help.github.com/articles/using-pull-requests. 25 | 26 | Before submitting a feature or substantial code contribution, please discuss it with the team and ensure it follows the product roadmap. You might also read these two blog posts on contributing code: [Open Source Contribution Etiquette](http://tirania.org/blog/archive/2010/Dec-31.html) by Miguel de Icaza and [Don't "Push" Your Pull Requests](https://www.igvita.com/2011/12/19/dont-push-your-pull-requests/) by Ilya Grigorik. Note that all code submissions will be rigorously reviewed and tested by the Winton team prior to merging. 27 | 28 | Here are a few things you should always do when making changes to the code base: 29 | 30 | **Engineering guidelines** 31 | 32 | Please follow the existing coding style used in this project. 33 | 34 | **Commit/Pull Request Format** 35 | 36 | ``` 37 | Summary of the changes (Less than 80 chars) 38 | - Detail 1 39 | - Detail 2 40 | 41 | Addresses #bugnumber (in this specific format) 42 | ``` 43 | 44 | **Tests** 45 | 46 | - Tests need to be provided for every bug/feature that is completed. 47 | - If there is a scenario that is far too hard to test, there does not need to be a test for it. 48 | - "Too hard" is determined by the team as a whole. 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions.
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2016 Winton 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Winton Kafka Streams 2 | 3 | [![Build Status](https://travis-ci.org/wintoncode/winton-kafka-streams.svg?branch=master)](https://travis-ci.org/wintoncode/winton-kafka-streams) 4 | 5 | Implementation of [Apache Kafka's Streams API](https://kafka.apache.org/documentation/streams/) in Python. 6 | 7 | ## What and why? 8 | Apache Kafka is an open-source stream processing platform developed 9 | by the Apache Software Foundation, written in Scala and Java. Kafka 10 | includes the Streams API for building stream processing applications 11 | using Apache Kafka. Applications built with Kafka's Streams API do not require any 12 | setup beyond the provision of a Kafka cluster. 13 | 14 | Winton Kafka Streams is a Python implementation of Apache Kafka's 15 | Streams API. It builds on Confluent's librdkafka (a high 16 | performance C library implementing the Kafka protocol) and the 17 | Confluent Python Kafka library to achieve this. 18 | 19 | The power and simplicity of both Python and Kafka's Streams API combined 20 | opens the streaming model to many more people and applications. 21 | 22 | ## Getting started 23 | 24 | ### Dependencies 25 | 26 | The minimum Python version is currently 3.6; a working Kafka 27 | cluster (a single broker is sufficient for testing) is also required. 28 | 29 | You will require [librdkafka](https://github.com/edenhill/librdkafka). On macOS, we recommend installing it via Homebrew and setting `CFLAGS=-I/usr/local/include` and `LDFLAGS=-L/usr/local/lib` 30 | when installing Confluent Python Kafka (see below). 31 | The librdkafka GitHub page lists packages available for Debian and Ubuntu, as well as RPMs. 32 | For Arch Linux it is available via [AUR](https://aur.archlinux.org/packages/librdkafka-git/). 33 | 34 | Confluent Python Kafka is also required; pip will install it 35 | as a dependency. 36 | 37 | ### Installing 38 | 39 | Cloning the Winton Kafka Streams repository from GitHub is 40 | recommended if you want to contribute to the project. 41 | Then, from the root of the repository, use 42 | `pip install --editable .[develop]` 43 | to install it as an editable workspace with the additional dependencies 44 | required for development. 45 | You may need to do this using `sudo` on Linux.
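For orientation before diving into the bundled examples, the sketch below shows the overall shape of an application built with the Processor API: subclass `BaseProcessor`, wire a topology with `TopologyBuilder`, and hand it to `KafkaStreams`. It is a trimmed-down variant of the `examples/debug` and `examples/binning` applications in this repository; the topic names, the processor logic and the `config.properties` path are placeholders, not part of the library.

```python
import time

import winton_kafka_streams.kafka_config as kafka_config
import winton_kafka_streams.kafka_streams as kafka_streams
from winton_kafka_streams.processor import BaseProcessor, TopologyBuilder


class PassThrough(BaseProcessor):
    """Forward every record unchanged to the downstream sink."""

    def process(self, key, value):
        self.context.forward(key, value)


# Override the built-in defaults (bootstrap servers, serdes, ...) from a file.
kafka_config.read_local_config('config.properties')

# Wire source -> processor -> sink, then run the topology.
with TopologyBuilder() as topology_builder:
    topology_builder. \
        source('input', ['example-input-topic']). \
        processor('pass-through', PassThrough, 'input'). \
        sink('output', 'example-output-topic', 'pass-through')

wks = kafka_streams.KafkaStreams(topology_builder, kafka_config)
wks.start()
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    pass
finally:
    wks.close()
```

The `config.properties` files shipped with the examples show the settings such an application typically needs (`bootstrap.servers`, the key/value serdes, and offset/commit behaviour).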
46 | 47 | If you want to install the code and get a feel for it as a user, then 48 | we recommend using `pip install git+https://github.com/wintoncode/winton-kafka-streams`. 49 | 50 | ### Running tests 51 | Tests will run when `pytest` is called in the root of the repository. 52 | 53 | ### Running examples 54 | To run the examples, you must have cloned the code locally from GitHub. 55 | 56 | The debug and wordcount examples will run without additional 57 | requirements. 58 | 59 | The Jupyter notebook in the binning example requires some additional 60 | packages. Install these with the command: 61 | 62 | pip install .[binning_example] 63 | 64 | ## Contributing 65 | Please see the CONTRIBUTING.md document for more details on getting involved. 66 | 67 | ## Contact 68 | - GitHub: https://github.com/wintoncode/ 69 | - Email: opensource@winton.com 70 | - Twitter: @wintoncapital 71 | -------------------------------------------------------------------------------- /ROADMAP.md: -------------------------------------------------------------------------------- 1 | # Roadmap 2 | 3 | The roadmap is a high-level overview of the work we would like to see implemented. For more details and discussion of new features, improvements or bugs, please see the [issue list](https://github.com/wintoncode/winton-kafka-streams/issues) in GitHub. 4 | 5 | * Complete implementation of Kafka's Streams API in Python 6 | * The current code is a good proof of concept but is still under active development. There are a number of key features remaining, in particular a persistent state store and a DSL. There are also many improvements to existing features left to implement - check the issue list for the latest status. 7 | * Implement new features of Kafka's Streams API 8 | * v0.11 of Apache Kafka was released on 28 June 2017 with many important and useful features. 9 | * Investigate a more Pythonic API/DSL 10 | * The current Processor API follows the Java layout very closely. A Python Streams domain-specific language (DSL) should leverage Python's unique language strengths to make writing stream applications as easy and intuitive as possible (a purely illustrative sketch of one possible shape is appended at the end of this document). 11 | * Optimise performance 12 | * Python has many known performance limitations; continue to optimise the code to perform as well as possible. Consider implementing some or all of the application in C.
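As a purely illustrative aid for the "more Pythonic API/DSL" item above: none of the names below exist in this codebase today and nothing here is a committed design; the snippet only hints at the kind of fluent, lambda-based pipeline a Python DSL could offer on top of the existing Processor API.

```python
# Hypothetical sketch only - StreamsBuilder, stream, map, filter and to are
# invented names, not part of winton_kafka_streams.
builder = StreamsBuilder()
(builder.stream('prices')                                  # read a source topic
        .map(lambda key, value: (key, 2 * float(value)))   # transform records
        .filter(lambda key, value: value > 0.0)            # drop unwanted records
        .to('doubled-prices'))                             # write to a sink topic
```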
13 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | Vagrant.configure("2") do |config| 4 | 5 | config.vm.box = "bento/ubuntu-16.04" 6 | 7 | config.vm.network "forwarded_port", guest: 2181, host: 2181 8 | config.vm.network "forwarded_port", guest: 9092, host: 9092 9 | 10 | config.vm.provider "virtualbox" do |v| 11 | v.memory = 2048 12 | v.cpus = 2 13 | end 14 | 15 | config.vm.provision "shell", inline: <<-SHELL 16 | export SCALA_VER=2.11 17 | export KAFKA_VER=1.0.0 18 | export KAFKA_PACKAGE=kafka_${SCALA_VER}-${KAFKA_VER} 19 | 20 | apt-get update 21 | apt-get install -y tmux htop vim wget git 22 | 23 | apt-get install -y build-essential software-properties-common python-software-properties 24 | wget -qO - http://packages.confluent.io/deb/3.3/archive.key | apt-key add - 25 | add-apt-repository "deb [arch=amd64] http://packages.confluent.io/deb/3.3 stable main" 26 | apt-get update 27 | apt-get install -y librdkafka-dev 28 | 29 | wget -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh 30 | sh Miniconda3-latest-Linux-x86_64.sh -b -f -p /home/vagrant/miniconda3 31 | rm -f Miniconda3-latest-Linux-x86_64.sh 32 | /home/vagrant/miniconda3/bin/conda create -q -y -n vagrant python=3.6 33 | echo PATH=/home/vagrant/miniconda3/bin:\$PATH >> /home/vagrant/.profile 34 | echo source activate vagrant >> /home/vagrant/.profile 35 | echo cd /vagrant/ >> /home/vagrant/.profile 36 | 37 | apt-get install -y zookeeperd openjdk-8-jdk kafkacat 38 | wget -q http://mirror.ox.ac.uk/sites/rsync.apache.org/kafka/${KAFKA_VER}/${KAFKA_PACKAGE}.tgz 39 | tar -xzf ${KAFKA_PACKAGE}.tgz 40 | rm -f ${KAFKA_PACKAGE}.tgz 41 | mv ${KAFKA_PACKAGE} /opt/kafka 42 | SHELL 43 | 44 | config.vm.provision "shell", run: "always", inline: <<-SHELL 45 | /home/vagrant/miniconda3/envs/vagrant/bin/pip install -e /vagrant/.[develop] 46 | chown -R vagrant:vagrant /home/vagrant/miniconda3 47 | rm -fr /tmp/kafka* 48 | nohup /opt/kafka/bin/kafka-server-start.sh /opt/kafka/config/server.properties > /tmp/kafka.log 2>&1 & 49 | SHELL 50 | end 51 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = WintonKafkaStreamsPython 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Winton Kafka Streams Python documentation build configuration file, created by 5 | # sphinx-quickstart on Tue May 16 21:00:14 2017. 
6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | 23 | # Get the project root dir 24 | cwd = os.getcwd() 25 | project_root = os.path.dirname(cwd) 26 | 27 | # Insert the project root dir as the first element in the PYTHONPATH. 28 | # This lets us ensure that the source package is imported, and that its 29 | # version is used. 30 | sys.path.insert(0, project_root) 31 | 32 | import winton_kafka_streams 33 | 34 | from mock import MagicMock 35 | 36 | class Mock(MagicMock): 37 | @classmethod 38 | def __getattr__(cls, name): 39 | return MagicMock() 40 | 41 | MOCK_MODULES = ['confluent_kafka', 'confluent_kafka.cimpl', 'confluent_kafka.avro'] 42 | sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) 43 | 44 | # -- General configuration ------------------------------------------------ 45 | 46 | # If your documentation needs a minimal Sphinx version, state it here. 47 | # 48 | # needs_sphinx = '1.0' 49 | 50 | # Add any Sphinx extension module names here, as strings. They can be 51 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 52 | # ones. 53 | extensions = ['sphinx.ext.autodoc'] 54 | 55 | # Add any paths that contain templates here, relative to this directory. 56 | templates_path = ['_templates'] 57 | 58 | # The suffix(es) of source filenames. 59 | # You can specify multiple suffix as a list of string: 60 | # 61 | # source_suffix = ['.rst', '.md'] 62 | source_suffix = '.rst' 63 | 64 | # The master toctree document. 65 | master_doc = 'index' 66 | 67 | # General information about the project. 68 | project = 'Winton Kafka Streams Python' 69 | copyright = '2017, Winton Group' 70 | author = 'Winton Group' 71 | 72 | # The version info for the project you're documenting, acts as replacement for 73 | # |version| and |release|, also used in various other places throughout the 74 | # built documents. 75 | # 76 | 77 | from setuptools_scm import get_version 78 | version = release = get_version(root='..') 79 | 80 | # The language for content autogenerated by Sphinx. Refer to documentation 81 | # for a list of supported languages. 82 | # 83 | # This is also used if you do content translation via gettext catalogs. 84 | # Usually you set "language" from the command line for these cases. 85 | language = None 86 | 87 | # List of patterns, relative to source directory, that match files and 88 | # directories to ignore when looking for source files. 89 | # This patterns also effect to html_static_path and html_extra_path 90 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 91 | 92 | # The name of the Pygments (syntax highlighting) style to use. 93 | pygments_style = 'sphinx' 94 | 95 | # If true, `todo` and `todoList` produce output, else they produce nothing. 96 | todo_include_todos = False 97 | 98 | 99 | # -- Options for HTML output ---------------------------------------------- 100 | 101 | # The theme to use for HTML and HTML Help pages. See the documentation for 102 | # a list of builtin themes. 
103 | # 104 | html_theme = "sphinx_rtd_theme" 105 | 106 | # Theme options are theme-specific and customize the look and feel of a theme 107 | # further. For a list of options available for each theme, see the 108 | # documentation. 109 | # 110 | # html_theme_options = {} 111 | 112 | # Add any paths that contain custom static files (such as style sheets) here, 113 | # relative to this directory. They are copied after the builtin static files, 114 | # so a file named "default.css" will overwrite the builtin "default.css". 115 | html_static_path = ['_static'] 116 | 117 | 118 | # -- Options for HTMLHelp output ------------------------------------------ 119 | 120 | # Output file base name for HTML help builder. 121 | htmlhelp_basename = 'WintonKafkaStreamsPythondoc' 122 | 123 | 124 | # -- Options for LaTeX output --------------------------------------------- 125 | 126 | latex_elements = { 127 | # The paper size ('letterpaper' or 'a4paper'). 128 | # 129 | # 'papersize': 'letterpaper', 130 | 131 | # The font size ('10pt', '11pt' or '12pt'). 132 | # 133 | # 'pointsize': '10pt', 134 | 135 | # Additional stuff for the LaTeX preamble. 136 | # 137 | # 'preamble': '', 138 | 139 | # Latex figure (float) alignment 140 | # 141 | # 'figure_align': 'htbp', 142 | } 143 | 144 | # Grouping the document tree into LaTeX files. List of tuples 145 | # (source start file, target name, title, 146 | # author, documentclass [howto, manual, or own class]). 147 | latex_documents = [ 148 | (master_doc, 'WintonKafkaStreamsPython.tex', 'Winton Kafka Streams Python Documentation', 149 | 'Winton Group', 'manual'), 150 | ] 151 | 152 | 153 | # -- Options for manual page output --------------------------------------- 154 | 155 | # One entry per manual page. List of tuples 156 | # (source start file, name, description, authors, manual section). 157 | man_pages = [ 158 | (master_doc, 'wintonkafkastreamspython', 'Winton Kafka Streams Python Documentation', 159 | [author], 1) 160 | ] 161 | 162 | 163 | # -- Options for Texinfo output ------------------------------------------- 164 | 165 | # Grouping the document tree into Texinfo files. List of tuples 166 | # (source start file, target name, title, author, 167 | # dir menu entry, description, category) 168 | texinfo_documents = [ 169 | (master_doc, 'WintonKafkaStreamsPython', 'Winton Kafka Streams Python Documentation', 170 | author, 'WintonKafkaStreamsPython', 'One line description of project.', 171 | 'Miscellaneous'), 172 | ] 173 | 174 | 175 | 176 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Winton Kafka Streams Python documentation master file, created by 2 | sphinx-quickstart on Tue May 16 21:00:14 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Winton Kafka Streams Python's documentation! 7 | ======================================================= 8 | 9 | .. 
toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=WintonKafkaStreamsPython 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | winton_kafka_streams 2 | ==================== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | winton_kafka_streams 8 | -------------------------------------------------------------------------------- /docs/source/winton_kafka_streams.processor.rst: -------------------------------------------------------------------------------- 1 | winton\_kafka\_streams\.processor package 2 | ========================================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | winton_kafka_streams.processor.serde 10 | 11 | Submodules 12 | ---------- 13 | 14 | winton\_kafka\_streams\.processor\.extract\_timestamp module 15 | ------------------------------------------------------------ 16 | 17 | .. automodule:: winton_kafka_streams.processor.extract_timestamp 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | winton\_kafka\_streams\.processor\.processor module 23 | --------------------------------------------------- 24 | 25 | .. automodule:: winton_kafka_streams.processor.processor 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | winton\_kafka\_streams\.processor\.processor\_context module 31 | ------------------------------------------------------------ 32 | 33 | .. automodule:: winton_kafka_streams.processor.processor_context 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | winton\_kafka\_streams\.processor\.topology module 39 | -------------------------------------------------- 40 | 41 | .. automodule:: winton_kafka_streams.processor.topology 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | winton\_kafka\_streams\.processor\.wallclock\_timestamp module 47 | -------------------------------------------------------------- 48 | 49 | .. automodule:: winton_kafka_streams.processor.wallclock_timestamp 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | 54 | 55 | Module contents 56 | --------------- 57 | 58 | .. 
automodule:: winton_kafka_streams.processor 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | -------------------------------------------------------------------------------- /docs/source/winton_kafka_streams.processor.serde.rst: -------------------------------------------------------------------------------- 1 | winton\_kafka\_streams\.processor\.serde package 2 | ================================================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | winton\_kafka\_streams\.processor\.serde\.identity module 8 | --------------------------------------------------------- 9 | 10 | .. automodule:: winton_kafka_streams.processor.serde.identity 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: winton_kafka_streams.processor.serde 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/source/winton_kafka_streams.rst: -------------------------------------------------------------------------------- 1 | winton\_kafka\_streams package 2 | ============================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | winton_kafka_streams.processor 10 | winton_kafka_streams.state 11 | 12 | Submodules 13 | ---------- 14 | 15 | winton\_kafka\_streams\.kafka\_config module 16 | -------------------------------------------- 17 | 18 | .. automodule:: winton_kafka_streams.kafka_config 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | winton\_kafka\_streams\.kafka\_stream module 24 | -------------------------------------------- 25 | 26 | .. automodule:: winton_kafka_streams.kafka_stream 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | 32 | Module contents 33 | --------------- 34 | 35 | .. automodule:: winton_kafka_streams 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /docs/source/winton_kafka_streams.state.rst: -------------------------------------------------------------------------------- 1 | winton\_kafka\_streams\.state package 2 | ===================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | winton\_kafka\_streams\.state\.simple module 8 | -------------------------------------------- 9 | 10 | .. automodule:: winton_kafka_streams.state.simple 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: winton_kafka_streams.state 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /examples/binning/README.md: -------------------------------------------------------------------------------- 1 | # Binning example 2 | 3 | ## Additional Python Package 4 | 5 | In addition to the packages required by the Winton Kafka Streams package, you 6 | will also need to have `pandas` available. See also `live-plot.ipynb` 7 | for a jupyter notebook visualising this example (it additionally 8 | requires `jupyter` and `bokeh`). 
9 | 10 | ## Prepare Kafka 11 | 12 | Start up Zookeeper and Kafka: 13 | 14 | bin/zookeeper-server-start.sh config/zookeeper.properties 15 | bin/kafka-server-start.sh config/server.properties 16 | 17 | Then create the topics used in this example: 18 | 19 | bin/kafka-topics.sh \ 20 | --create \ 21 | --zookeeper localhost:2181 \ 22 | --replication-factor 1 \ 23 | --partitions 1 \ 24 | --topic prices 25 | 26 | bin/kafka-topics.sh \ 27 | --create \ 28 | --zookeeper localhost:2181 \ 29 | --replication-factor 1 \ 30 | --partitions 1 \ 31 | --topic bin-prices 32 | 33 | ## Run generator 34 | 35 | First generate a log of the full data - see `python generator.py --help` 36 | for details of the options: 37 | 38 | python generator.py \ 39 | -i AAA,0.3,123,100.0,0.01 \ 40 | -i BBB,0.4,456,70.0,0.011 \ 41 | -l 60000 -f 250ms \ 42 | > full_data.log 43 | 44 | then run again, but this time producing 'in real-time' to a Kafka topic 45 | (the generated data is the same as above, as the setup is the same): 46 | 47 | python generator.py \ 48 | -i AAA,0.3,123,100.0,0.01 \ 49 | -i BBB,0.4,456,70.0,0.011 \ 50 | -l 6000 -f 1s \ 51 | -kb localhost:9092 -kt prices \ 52 | -rt 53 | 54 | This will produce the prices to the 'prices' topic, which the binning application below then consumes. 55 | 56 | In both cases the script will terminate once either of the two items has 57 | produced the number of values given by `-l`. 58 | 59 | ## Run binning 60 | 61 | Now run the Winton Kafka Streams application that consumes the 'prices' topic: 62 | 63 | python -u binning.py --config-file config.properties 64 | 65 | Note: when the application is terminated, the last 2 bins are likely to be missing from 66 | the output - this needs improvement. 67 | 68 | ## Inspect output 69 | 70 | A simple way to see the produced results is to run: 71 | 72 | kafkacat -b localhost -t bin-prices -e -J 73 | 74 | which will print a JSON-formatted view of the topic to stdout. 75 | -------------------------------------------------------------------------------- /examples/binning/binning.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python Kafka Streams example script for price binning 3 | """ 4 | 5 | import logging 6 | import time 7 | import pandas as pd 8 | from winton_kafka_streams.processor import BaseProcessor, TopologyBuilder 9 | import winton_kafka_streams.kafka_config as kafka_config 10 | import winton_kafka_streams.kafka_streams as kafka_streams 11 | 12 | LOGGER = logging.getLogger(__name__) 13 | 14 | _VERBOSITY = { 15 | 0: logging.WARN, 16 | 1: logging.INFO, 17 | 2: logging.DEBUG 18 | } 19 | 20 | 21 | class Binning(BaseProcessor): 22 | """ 23 | Implementation of the binning process 24 | 25 | The code will be passed a value from the 'prices' source topic 26 | in Kafka. This processor will search for the final value in the 27 | binning range (1 minute) and output that to the 'bin-prices' 28 | sink topic in Kafka. 29 | 30 | There is a Python generator script provided to generate prices 31 | with normally distributed returns. You can control the frequency 32 | of generation, the mean and standard deviation and the number 33 | of items generated. 34 | 35 | TODO: Later this example should be extended to show partition 36 | support. 37 | """ 38 | 39 | def initialise(self, _name, _context): 40 | super().initialise(_name, _context) 41 | # bins tracks the last time bin and price per symbol 42 | self.bins = {} # TODO: Replace with a state store via self.context.get_store(...) 43 | 44 | def process(self, _, value): 45 | """ 46 | Processes values from the source in search of the last 47 | value in that bin.
48 | 49 | Parameters: 50 | ----------- 51 | _ : object, unused (key) 52 | The key read from the source topic (unused here) 53 | value: object 54 | The value read from the source topic 55 | 56 | Returns: 57 | -------- 58 | None 59 | """ 60 | timestamp, symbol, price = value.split(',') 61 | timestamp = pd.Timestamp(timestamp) 62 | 63 | bin_ts = pd.Timestamp( 64 | year=timestamp.year, month=timestamp.month, day=timestamp.day, 65 | hour=timestamp.hour, minute=timestamp.minute, second=0 66 | ) + pd.Timedelta('1min') 67 | bin_ts_and_price = '{},{}'.format(bin_ts.isoformat(), price) 68 | 69 | last_bin = self.bins.get(symbol) 70 | 71 | if last_bin is not None: 72 | last_bin_ts, last_price = last_bin.split(',') 73 | if last_bin_ts != bin_ts.isoformat(): 74 | key = '{},{}'.format(last_bin_ts, symbol) 75 | LOGGER.debug('Forwarding to sink (%s, %s)', key, last_price) 76 | self.context.forward(key, last_price) 77 | self.context.commit() # TODO: implement auto-commit, remove this 78 | 79 | self.bins[symbol] = bin_ts_and_price 80 | 81 | 82 | def run(config_file=None): 83 | """ 84 | Starts the binning process 85 | 86 | Called here from main() when invoked from command line 87 | but could equally import binning and call 88 | binning.run(config_file) 89 | 90 | """ 91 | if config_file: 92 | kafka_config.read_local_config(config_file) 93 | 94 | with TopologyBuilder() as topology_builder: 95 | topology_builder. \ 96 | source('prices', ['prices']). \ 97 | processor('binner', Binning, 'prices'). \ 98 | sink('result', 'bin-prices', 'binner') 99 | 100 | wks = kafka_streams.KafkaStreams(topology_builder, kafka_config) 101 | wks.start() 102 | try: 103 | while True: 104 | time.sleep(1) 105 | except KeyboardInterrupt: 106 | pass 107 | finally: 108 | wks.close() 109 | 110 | 111 | def _get_parser(): 112 | import argparse 113 | parser = argparse.ArgumentParser(description=__doc__) 114 | parser.add_argument( 115 | '--config-file', '-c', default='config.properties', 116 | help="Local configuration - will override internal defaults" 117 | ) 118 | parser.add_argument( 119 | '-v', dest='verbosity', action='count', default=0, 120 | help='Enable more verbose logging, use once for info, ' 121 | 'twice for debug.' 122 | ) 123 | return parser 124 | 125 | 126 | def main(): 127 | parser = _get_parser() 128 | args = parser.parse_args() 129 | logging.basicConfig(level=_VERBOSITY.get(args.verbosity, logging.DEBUG)) 130 | run(args.config_file) 131 | 132 | 133 | if __name__ == '__main__': 134 | main() 135 | -------------------------------------------------------------------------------- /examples/binning/config.properties: -------------------------------------------------------------------------------- 1 | bootstrap.servers = localhost:9092 2 | auto.offset.reset = earliest 3 | enable.auto.commit = false 4 | value.serde = winton_kafka_streams.processor.serialization.serdes.StringSerde 5 | key.serde = winton_kafka_streams.processor.serialization.serdes.StringSerde 6 | -------------------------------------------------------------------------------- /examples/binning/generator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple script to generate prices values with normally 3 | distributed returns on a Kafka 'prices' topic. 4 | 5 | Run ./generator --help to see the full range of options. 
6 | 7 | """ 8 | 9 | import logging 10 | from collections import namedtuple 11 | import datetime as dt 12 | import time 13 | import pandas as pd 14 | from random_prices import RandomPrices 15 | from source import Source 16 | 17 | ITEM = namedtuple('ITEM', ['name', 'prob', 'seed', 'initial_price', 'sigma']) 18 | 19 | LOGGER = logging.getLogger(__name__) 20 | 21 | _VERBOSITY = { 22 | 0: logging.WARN, 23 | 1: logging.INFO, 24 | 2: logging.DEBUG 25 | } 26 | 27 | 28 | def _get_items(items): 29 | parsed_items = [] 30 | for item in items: 31 | vals = item.split(',') 32 | try: 33 | parsed_item = ITEM( 34 | vals[0], float(vals[1]), 35 | int(vals[2]), float(vals[3]), float(vals[4]) 36 | ) 37 | parsed_items.append(parsed_item) 38 | except Exception: 39 | raise ValueError( 40 | '{} should contain 5 comma separated options: ' 41 | 'name[string],prob[float],seed[int],' 42 | 'initial_price[float],sigma[float]' 43 | ) 44 | return parsed_items 45 | 46 | 47 | def _get_sources(items, limit): 48 | return { 49 | item.name: Source( 50 | item.prob, 51 | RandomPrices( 52 | item.seed, item.initial_price, item.sigma, limit 53 | ), 54 | item.seed 55 | ) 56 | for item in items 57 | } 58 | 59 | 60 | def _run(sources, timestamp, freq, real_time, rt_multiplier, produce): 61 | """ 62 | Start the generation of prices on the 'prices' topic 63 | """ 64 | 65 | stop = False 66 | while not stop: 67 | if real_time: 68 | start_time = dt.datetime.utcnow() 69 | for (name, source) in sources.items(): 70 | try: 71 | price = next(source) 72 | if price is not None: 73 | produce(timestamp, name, price) 74 | LOGGER.info('%s,%s,%s', timestamp, name, price) 75 | except StopIteration: 76 | stop = True 77 | timestamp = timestamp + freq 78 | if real_time: 79 | duration = dt.datetime.utcnow() - start_time 80 | sleep_seconds = (freq.total_seconds() - duration.total_seconds()) / rt_multiplier 81 | if sleep_seconds < 0.0: 82 | LOGGER.warning( 83 | 'Not keeping up, lagging by %ss', -sleep_seconds 84 | ) 85 | else: 86 | LOGGER.debug('Sleeping for %ss', sleep_seconds) 87 | time.sleep(sleep_seconds) 88 | 89 | 90 | def _get_parser(): 91 | import argparse 92 | parser = argparse.ArgumentParser(description=__doc__) 93 | parser.add_argument( 94 | '-i', '--item', required=True, action='append', dest='items', 95 | help='Comma separated list of construction details for random price ' 96 | 'sources, should be name[string],prob[float],seed[int],' 97 | 'initial_price[float],sigma[float]' 98 | ) 99 | parser.add_argument( 100 | '-l', '--limit', dest='limit', type=int, default=1_000_000, 101 | help='Limit of iterations to be performed (default 1M)' 102 | ) 103 | parser.add_argument( 104 | '-s', '--start', dest='start', default='2017-01-01', 105 | help='Date(time) to start the price series from, e.g. ' 106 | '2000-01-01T10:30:12; must be a valid pandas timestamp. ' 107 | '(default 2017-01-01)' 108 | ) 109 | parser.add_argument( 110 | '-f', '--freq', dest='freq', default='250ms', 111 | help='The frequence by which to increment the time, must be a ' 112 | 'valid pandas timedelta, e.g. 30s. (default 250ms)' 113 | ) 114 | parser.add_argument( 115 | '-kb', '--broker-list', dest='broker_list', default=None, 116 | help='Kafka broker list, e.g. kafka-1:9092,kafka-2:9092; also ' 117 | 'requires --topic to be specified. If not provided output ' 118 | 'will be produced to stdout instead of Kafka.' 
119 | ) 120 | parser.add_argument( 121 | '-kt', '--topic', dest='topic', default=None, 122 | help='The Kafka topic to produce to, this will be ignored ' 123 | 'if --broker-list is not specified as well.' 124 | ) 125 | parser.add_argument( 126 | '-rt', '--real-time', dest='real_time', action='store_true', 127 | help='Toggle (approximate) real-time generation of random ' 128 | 'prices. This will output prices in real-time trying ' 129 | 'to match the frequency specified in --freq.' 130 | ) 131 | parser.add_argument( 132 | '-rtm', '--real-time-multiplier', type=float, default=1.0, 133 | help='Speed up real time producer of prices by a factor. ' 134 | 'Default=1.0 (actual time).' 135 | ) 136 | parser.add_argument( 137 | '-v', dest='verbosity', action='count', default=0, 138 | help='Enable more verbose logging (can be specified multiple ' 139 | 'times to increase verbosity)' 140 | ) 141 | return parser 142 | 143 | 144 | def main(): 145 | """Main entry for script""" 146 | parser = _get_parser() 147 | args = parser.parse_args() 148 | sources = _get_sources(_get_items(args.items), args.limit) 149 | timestamp = pd.Timestamp(args.start) 150 | freq = pd.Timedelta(args.freq) 151 | logging.basicConfig(level=_VERBOSITY.get(args.verbosity, logging.DEBUG)) 152 | if args.broker_list is None: 153 | def _produce(timestamp, name, price): 154 | print('{},{},{}'.format(timestamp, name, price)) 155 | 156 | LOGGER.debug('Running in console mode') 157 | _run(sources, timestamp, freq, args.real_time, args.real_time_multiplier, _produce) 158 | else: 159 | if args.topic is None: 160 | raise ValueError('Must specify --topic when using Kafka') 161 | from confluent_kafka import Producer 162 | producer = Producer({'bootstrap.servers': args.broker_list}) 163 | 164 | def _produce(timestamp, name, price): 165 | data = '{},{},{}'.format(timestamp, name, price) 166 | produced = False 167 | while not produced: 168 | try: 169 | producer.produce(args.topic, value=data.encode('utf-8'), key=name) 170 | producer.poll(0) 171 | produced = True 172 | except BufferError: 173 | producer.poll(10) 174 | 175 | LOGGER.debug('Producing to %s on %s', args.topic, args.broker_list) 176 | _run(sources, timestamp, freq, args.real_time, args.real_time_multiplier, _produce) 177 | producer.flush() 178 | 179 | 180 | if __name__ == '__main__': 181 | main() 182 | -------------------------------------------------------------------------------- /examples/binning/live-plot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Plotting binned generated sample prices. \n", 8 | "\n", 9 | "This notebook will display an auto-updating plot of prices as they are being generated by the example binning stream processor. \n", 10 | "\n", 11 | "In addition to the requirements for the Winton Kafka Streams code, these libraries must also be installed:\n", 12 | " * pandas\n", 13 | " * jupyter\n", 14 | " * bokeh\n", 15 | "\n", 16 | "These libraries can be installed manually or using pip: pip install .[binning_example]\n", 17 | "\n", 18 | "Once installed, run these two commands in examples/binning/ :\n", 19 | "\n", 20 | " * The binning stream processor:\n", 21 | " * python binning.py\n", 22 | " * The price generator:\n", 23 | " * python generator.py -i A.N.Other-Corp,0.3,123,100.0,0.01 -l 6000 -f 250ms -kb localhost:9092 -kt prices -rt\n", 24 | "\n", 25 | "The -rt argument will generate the prices in real-time. 
The prices can also be generated faster than realitime with the flag \"-rtm \" or in bulk by omitting the -rt flag" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import datetime\n", 37 | "from pandas import Timestamp\n", 38 | "\n", 39 | "from ipywidgets import interact\n", 40 | "\n", 41 | "from bokeh.models.sources import ColumnDataSource\n", 42 | "from bokeh.plotting import figure\n", 43 | "from bokeh.io import push_notebook, show, output_notebook\n", 44 | "\n", 45 | "from confluent_kafka import Consumer, KafkaError" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "output_notebook()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "# only plot prices for one symbol\n", 66 | "symbol = 'A.N.Other-Corp'" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "consumer = Consumer({'bootstrap.servers': 'localhost:9092', 'group.id': 'test-group',\n", 78 | " 'default.topic.config': {'auto.offset.reset': 'earliest'}})\n", 79 | "\n", 80 | "consumer.subscribe(['prices', 'bin-prices'])" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": { 87 | "scrolled": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "price_figure = figure(title=symbol, plot_height=300, \n", 92 | " plot_width=600, y_range=(90, 110), x_axis_type='datetime')\n", 93 | "price_figure.xaxis.axis_label = 'Time'\n", 94 | "price_figure.yaxis.axis_label = 'Price'\n", 95 | "\n", 96 | "price_data = ColumnDataSource(data=dict(x=[datetime.datetime(2017,1,1)], y=[100]))\n", 97 | "price_line = price_figure.line(x=\"x\", y=\"y\", color=\"blue\", source=price_data, legend='Price')\n", 98 | "\n", 99 | "bin_data = ColumnDataSource(data=dict(x=[], y=[]))\n", 100 | "bin_circle = price_figure.circle(x=\"x\", y=\"y\", color=\"red\", source=bin_data, legend='Binned price')\n", 101 | "\n", 102 | "handle = show(price_figure, notebook_handle=True)\n", 103 | "\n", 104 | "xp, yp= [], []\n", 105 | "updated_price_data = dict(x=xp, y=yp)\n", 106 | "xb, yb= [], []\n", 107 | "updated_bin_data = dict(x=xb, y=yb)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "def process_price(msg, x, y, updated_data, price_data):\n", 117 | " dt, sym, prc = msg.value().decode(\"utf-8\").split(',')\n", 118 | " if sym == symbol:\n", 119 | " dt = Timestamp(dt).to_pydatetime()\n", 120 | " prc = float(prc)\n", 121 | "\n", 122 | " x.append(dt)\n", 123 | " y.append(prc)\n", 124 | "\n", 125 | " updated_data['x'] = x\n", 126 | " updated_data['y'] = y\n", 127 | " price_data.stream(updated_data, len(x))\n", 128 | " \n", 129 | "def process_bin(msg, x, y, updated_data, bin_data):\n", 130 | " prc = float(msg.value().decode(\"utf-8\"))\n", 131 | " dt, sym = msg.key().decode(\"utf-8\").split(',')\n", 132 | " \n", 133 | " if sym == symbol:\n", 134 | " x.append(Timestamp(dt).to_pydatetime())\n", 135 | " y.append(prc)\n", 136 | "\n", 137 | " updated_data['x'] = x\n", 138 | " updated_data['y'] = y\n", 139 | " bin_data.stream(updated_data, len(x))\n", 140 | "\n", 141 | "last_date = None\n", 142 | "running 
= True\n", 143 | "while running:\n", 144 | " msg = consumer.poll()\n", 145 | " if not msg.error():\n", 146 | " #print(f'Received message: {msg.value().decode(\"utf-8\")}')\n", 147 | " if msg.topic() == 'prices':\n", 148 | " process_price(msg, xp, yp, updated_price_data, price_data)\n", 149 | " elif msg.topic() == 'bin-prices':\n", 150 | " process_bin(msg, xb, yb, updated_bin_data, bin_data)\n", 151 | " \n", 152 | " push_notebook(handle=handle)\n", 153 | " elif msg.error().code() != KafkaError._PARTITION_EOF:\n", 154 | " print(msg.error())\n", 155 | " running = False\n", 156 | " \n", 157 | "c.close()" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": { 164 | "collapsed": true 165 | }, 166 | "outputs": [], 167 | "source": [] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": { 173 | "collapsed": true 174 | }, 175 | "outputs": [], 176 | "source": [] 177 | } 178 | ], 179 | "metadata": { 180 | "kernelspec": { 181 | "display_name": "Python 3", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | "codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": "text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.6.1" 196 | } 197 | }, 198 | "nbformat": 4, 199 | "nbformat_minor": 2 200 | } 201 | -------------------------------------------------------------------------------- /examples/binning/random_prices.py: -------------------------------------------------------------------------------- 1 | """Provides simple class to compute (reproducible) random prices""" 2 | 3 | 4 | from random import Random as _Random 5 | from math import fabs as _fabs 6 | from collections import namedtuple as _nt 7 | 8 | _STATE = _nt('_STATE', ['last_price', 'random', 'iter']) 9 | 10 | 11 | class RandomPrices(object): 12 | """\ 13 | Provides iterable class producing random non-negative and non-zero 14 | prices. 15 | 16 | Given an initial price p, the next price will be calculated as 17 | abs(p + p*Gaussian(0.0, sigma)). The price is additionally floored 18 | at 0.1. 19 | 20 | This is not intended as a particularly realistic model of a price 21 | series, but rather as a reproducible source of suitable test data 22 | for handling price series. The reproducibility is handled by the seed 23 | and using a Pseudo RNG (Mersenne twister as provided by Python's random 24 | module). 25 | """ 26 | 27 | def __init__(self, seed=42, initial_price=100.0, sigma=0.01, 28 | max_iter=10_000_000): 29 | """ 30 | :param seed: Random number seed 31 | :param initial_price: The first price from which to start the price 32 | evolution. 33 | :param sigma: The sigma of the Gaussian used to generated the price 34 | movements. 35 | :param max_iter: The total amout of prices to emit before stopping 36 | an iteration of this object. 
37 | """ 38 | super().__init__() 39 | self._seed = seed 40 | self._initial_price = initial_price 41 | self._sigma = sigma 42 | self._max_iter = max_iter 43 | self._state = None 44 | self.reset() 45 | 46 | def __iter__(self): 47 | return self 48 | 49 | def __next__(self): 50 | if self._state.iter >= self._max_iter: 51 | raise StopIteration 52 | return self.next_price() 53 | 54 | def next_price(self): 55 | """Calculate and return a new price; use initial price on first call""" 56 | if self._state.iter == 0: 57 | # On first iteration use initial price 58 | price = self._initial_price 59 | else: 60 | change = self._state.random.gauss(0.0, self._sigma) 61 | price = _fabs( 62 | self._state.last_price + self._state.last_price * change 63 | ) 64 | if price < 0.1: 65 | price = 0.1 66 | self._state = _STATE(price, self._state.random, self._state.iter + 1) 67 | return self._state.last_price 68 | 69 | def reset(self): 70 | """Reset this object back to the initial state""" 71 | self._state = _STATE(self._initial_price, _Random(self._seed), 0) 72 | -------------------------------------------------------------------------------- /examples/binning/source.py: -------------------------------------------------------------------------------- 1 | """Provides a wrapper to randomise whether underlying prices are generated""" 2 | 3 | from random import Random as _Random 4 | 5 | 6 | class Source(object): 7 | """\ 8 | Provides iterable class wrapping a price source and randomly produces a 9 | price or not. 10 | """ 11 | 12 | def __init__(self, prob, prices, seed=123): 13 | self.prob = prob 14 | self.prices = prices 15 | self._rand = _Random(seed) 16 | 17 | def __next__(self): 18 | return self.maybe_next_price() 19 | 20 | def __iter__(self): 21 | return self 22 | 23 | def maybe_next_price(self): 24 | """Based on the probability, return a price or None""" 25 | 26 | if self._rand.uniform(0.0, 1.0) <= self.prob: 27 | return next(self.prices) 28 | return None 29 | -------------------------------------------------------------------------------- /examples/debug/README.md: -------------------------------------------------------------------------------- 1 | # Debug Winton Kafka Streams Example 2 | 3 | ## Running 4 | * Edit the config.properties file if necessary to change where Kafka is running 5 | * Run: python example.py 6 | * Start a console producer writing to the topic 'wks-debug-example-topic-two' 7 | 8 | ## Features 9 | * Listens to the topic 'wks-debug-example-topic-two' and writes output to 'wks-debug-example-output' 10 | * The value on the input topic will be doubled when received 11 | * Every fourth value will cause the four values in the current state to be written to the output topic 12 | * It is possible to stop the application at any time and the application will restart where it left off 13 | -------------------------------------------------------------------------------- /examples/debug/config.properties: -------------------------------------------------------------------------------- 1 | bootstrap.servers = localhost:9092 2 | auto.offset.reset = earliest 3 | enable.auto.commit = false 4 | value.serde = winton_kafka_streams.processor.serialization.serdes.IntegerSerde 5 | -------------------------------------------------------------------------------- /examples/debug/example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Winton Kafka Streams 3 | 4 | Main entrypoints 5 | 6 | """ 7 | 8 | import logging 9 | import time 10 | 11 | from 
winton_kafka_streams.processor import BaseProcessor, TopologyBuilder 12 | import winton_kafka_streams.kafka_config as kafka_config 13 | import winton_kafka_streams.kafka_streams as kafka_streams 14 | import winton_kafka_streams.state as state_stores 15 | 16 | log = logging.getLogger(__name__) 17 | 18 | 19 | class DoubleProcessor(BaseProcessor): 20 |     """ 21 |     Example processor that will double the value passed in 22 | 23 |     """ 24 | 25 |     def initialise(self, name, context): 26 |         super().initialise(name, context) 27 |         self.state = context.get_store('double_store') 28 | 29 |     def process(self, _, value): 30 |         log.debug(f'DoubleProcessor::process({str(value)})') 31 |         doubled = value*2 32 |         items_in_state = len(self.state) 33 |         self.state[items_in_state] = doubled 34 |         if items_in_state >= 4: 35 |             self.punctuate() 36 | 37 |     def punctuate(self): 38 |         for _, value in self.state.items(): 39 |             log.debug(f'Forwarding to sink ({str(value)})') 40 |             self.context.forward(None, value) 41 |         self.state.clear() 42 | 43 | 44 | def _debug_run(config_file): 45 |     kafka_config.read_local_config(config_file) 46 | 47 |     double_store = state_stores.create('double_store'). \ 48 |         with_integer_keys(). \ 49 |         with_integer_values(). \ 50 |         in_memory(). \ 51 |         build() 52 | 53 |     with TopologyBuilder() as topology_builder: 54 |         topology_builder. \ 55 |             source('input-value', ['wks-debug-example-topic-two']). \ 56 |             processor('double', DoubleProcessor, 'input-value'). \ 57 |             state_store(double_store, 'double'). \ 58 |             sink('output-double', 'wks-debug-example-output', 'double') 59 | 60 |     wks = kafka_streams.KafkaStreams(topology_builder, kafka_config) 61 |     wks.start() 62 |     try: 63 |         while True: 64 |             time.sleep(1) 65 |     except KeyboardInterrupt: 66 |         pass 67 |     finally: 68 |         wks.close() 69 | 70 | 71 | if __name__ == '__main__': 72 | 73 |     logging.basicConfig(level=logging.DEBUG) 74 | 75 |     import argparse 76 | 77 |     parser = argparse.ArgumentParser(description="Debug runner for Python Kafka Streams") 78 |     parser.add_argument('--config-file', '-c', help="Local configuration - will override internal defaults", 79 |                         default='config.properties') 80 |     args = parser.parse_args() 81 | 82 |     _debug_run(args.config_file) 83 | -------------------------------------------------------------------------------- /examples/wordcount/README.md: -------------------------------------------------------------------------------- 1 | # Wordcount Winton Kafka Streams Example 2 | 3 | ## Running native 4 | * Edit the config.properties file if necessary to change where Kafka is running 5 | * Run: python example.py 6 | * Start a console producer writing to the topic 'wks-wordcount-example-topic' 7 | 8 | ## Running dockerized 9 | * Install docker and docker-compose 10 | * cd into examples/wordcount/docker 11 | * start the docker services with `docker-compose up -d` 12 | * the kafka-debug service prints the contents of the `wks-wordcount-example-count` topic 13 | * the output should be (after a minute or so): 14 | ``` 15 | $ docker-compose logs kafka-debug 16 | ... 17 | kafka-debug_1 | b 2 18 | kafka-debug_1 | c 1 19 | kafka-debug_1 | a 3 20 | ``` 21 | 22 | ## Features 23 | * Listens to the topic 'wks-wordcount-example-topic' and writes output to 'wks-wordcount-example-count' 24 | * Each string read in will be split by spaces 25 | * The word counts are maintained in an in-memory state store named 'counts'. The store is not persistent, so stopping the example discards the previously accumulated counts. A consumer sketch for checking the output is shown below.
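
## Inspecting the output without Docker

The sketch below tails the output topic so the counts can be checked from a plain Python session. It is illustrative only: it assumes a broker on localhost:9092 and the string serdes configured in config.properties, and the consumer group id 'wordcount-check' is an arbitrary, hypothetical choice.

```
# Illustrative sketch: read back (word, count) pairs from the output topic.
# Assumes a broker on localhost:9092; 'wordcount-check' is an arbitrary group id.
from confluent_kafka import Consumer, KafkaError

consumer = Consumer({'bootstrap.servers': 'localhost:9092',
                     'group.id': 'wordcount-check',
                     'default.topic.config': {'auto.offset.reset': 'earliest'}})
consumer.subscribe(['wks-wordcount-example-count'])

try:
    while True:
        msg = consumer.poll(1.0)
        if msg is None:
            continue
        if msg.error():
            # End-of-partition events are not errors; anything else stops the loop
            if msg.error().code() != KafkaError._PARTITION_EOF:
                print(msg.error())
                break
        else:
            # Keys are words and values are counts, both UTF-8 strings here
            key = msg.key().decode('utf-8') if msg.key() else ''
            print(key, msg.value().decode('utf-8'))
finally:
    consumer.close()
```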
26 | -------------------------------------------------------------------------------- /examples/wordcount/config.properties: -------------------------------------------------------------------------------- 1 | application.id = wordcount-example 2 | bootstrap.servers = localhost:9092 3 | auto.offset.reset = earliest 4 | value.serde = winton_kafka_streams.processor.serialization.serdes.StringSerde 5 | key.serde = winton_kafka_streams.processor.serialization.serdes.StringSerde 6 | -------------------------------------------------------------------------------- /examples/wordcount/custom_serde.py: -------------------------------------------------------------------------------- 1 | from winton_kafka_streams.processor.serialization import IntegerSerializer 2 | from winton_kafka_streams.processor.serialization.serdes.wrapper_serde import WrapperSerde 3 | from winton_kafka_streams.processor.serialization import StringDeserializer 4 | 5 | 6 | class StringIntSerde(WrapperSerde): 7 | def __init__(self): 8 | serializer = IntegerSerializer() 9 | deserializer = StringDeserializer() 10 | super().__init__(serializer, deserializer) 11 | -------------------------------------------------------------------------------- /examples/wordcount/docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | wordcount: 4 | build: 5 | context: ../../../ 6 | dockerfile: examples/wordcount/docker/wordcount/Dockerfile 7 | volumes: 8 | - ../../../:/code 9 | depends_on: 10 | - kafka 11 | source_client: 12 | build: 13 | context: ../../../ 14 | dockerfile: examples/wordcount/docker/source_client/Dockerfile 15 | volumes: 16 | - ../../../:/code 17 | depends_on: 18 | - kafka 19 | kafka: 20 | image: "spotify/kafka" 21 | hostname: kafka 22 | ports: 23 | - 2181:2181 24 | - 9092:9092 25 | - 7203:7203 26 | environment: 27 | - JMX_PORT=7203 28 | - ADVERTISED_HOST=kafka 29 | - ADVERTISED_PORT=9092 30 | kafka-manager: 31 | image: "sheepkiller/kafka-manager" 32 | ports: 33 | - 9000:9000 34 | environment: 35 | - ZK_HOSTS=kafka:2181 36 | - APPLICATION_SECRET=letmein 37 | kafka-debug: 38 | build: 39 | context: ../../../ 40 | dockerfile: examples/wordcount/docker/kafka-debug/Dockerfile 41 | depends_on: 42 | - kafka 43 | -------------------------------------------------------------------------------- /examples/wordcount/docker/kafka-debug/Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | FROM openjdk:8-jre 3 | 4 | ENV SCALA_VERSION 2.11 5 | ENV KAFKA_VERSION 0.10.1.0 6 | ENV KAFKA_HOME /opt/kafka_"$SCALA_VERSION"-"$KAFKA_VERSION" 7 | 8 | # Install Kafka and other needed things 9 | RUN apt-get update && \ 10 | apt-get install -y wget dnsutils && \ 11 | rm -rf /var/lib/apt/lists/* && \ 12 | apt-get clean && \ 13 | wget -q http://apache.mirrors.spacedump.net/kafka/"$KAFKA_VERSION"/kafka_"$SCALA_VERSION"-"$KAFKA_VERSION".tgz -O /tmp/kafka_"$SCALA_VERSION"-"$KAFKA_VERSION".tgz && \ 14 | tar xfz /tmp/kafka_"$SCALA_VERSION"-"$KAFKA_VERSION".tgz -C /opt && \ 15 | rm /tmp/kafka_"$SCALA_VERSION"-"$KAFKA_VERSION".tgz 16 | 17 | 18 | CMD ["/opt/kafka_2.11-0.10.1.0/bin/kafka-console-consumer.sh","--bootstrap-server","kafka:9092","--topic","wks-wordcount-example-count","--from-beginning","--property","print.key=true"] 19 | -------------------------------------------------------------------------------- /examples/wordcount/docker/source_client/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | ADD . /code 3 | 4 | RUN apt-get update 5 | RUN echo "/usr/local/lib" >> /etc/ld.so.conf 6 | RUN git clone https://github.com/edenhill/librdkafka.git /tmp/librdkafka 7 | RUN ls /tmp/ && cd /tmp/librdkafka && ./configure && make && make install && ldconfig 8 | 9 | WORKDIR /code/examples/wordcount/ 10 | RUN pip --version 11 | #RUN pip install -e git+https://github.com/confluentinc/confluent-kafka-python.git#egg=confluent-kafka 12 | RUN pip install -e ../../ 13 | 14 | CMD ["python", "source_client.py"] 15 | -------------------------------------------------------------------------------- /examples/wordcount/docker/wordcount/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | ADD . /code 3 | 4 | RUN apt-get update 5 | RUN echo "/usr/local/lib" >> /etc/ld.so.conf 6 | RUN git clone https://github.com/edenhill/librdkafka.git /tmp/librdkafka 7 | RUN ls /tmp/ && cd /tmp/librdkafka && ./configure && make && make install && ldconfig 8 | 9 | WORKDIR /code/examples/wordcount/ 10 | RUN pip --version 11 | #RUN pip install -e git+https://github.com/confluentinc/confluent-kafka-python.git#egg=confluent-kafka 12 | RUN pip install -e ../../ 13 | 14 | CMD ["python", "example.py", "-c", "docker/wordcount/config.properties"] 15 | -------------------------------------------------------------------------------- /examples/wordcount/docker/wordcount/config.properties: -------------------------------------------------------------------------------- 1 | bootstrap.servers = kafka:9092 2 | auto.offset.reset = earliest 3 | -------------------------------------------------------------------------------- /examples/wordcount/example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Winton Kafka Streams 3 | 4 | Main entrypoints 5 | 6 | """ 7 | 8 | import logging 9 | import sys 10 | import time 11 | 12 | import winton_kafka_streams.kafka_config as kafka_config 13 | import winton_kafka_streams.kafka_streams as kafka_streams 14 | from winton_kafka_streams.processor import BaseProcessor, TopologyBuilder 15 | import winton_kafka_streams.state as state_stores 16 | 17 | log = logging.getLogger(__name__) 18 | 19 | 20 | # An example implementation of word count, 21 | # showing where punctuate can be useful 22 | class WordCount(BaseProcessor): 23 | 24 | def initialise(self, _name, _context): 25 | super().initialise(_name, _context) 26 | self.word_count_store = _context.get_store('counts') 27 | # dirty_words tracks what words have changed since the last punctuate 28 | self.dirty_words = set() 29 | # output updated counts every 10 seconds 30 | self.context.schedule(10.) 31 | 32 | def process(self, key, value): 33 | words = value.split() 34 | log.debug(f'words list ({words})') 35 | for word in words: 36 | count = self.word_count_store.get(word, 0) 37 | self.word_count_store[word] = count + 1 38 | self.dirty_words |= set(words) 39 | 40 | def punctuate(self, timestamp): 41 | for word in self.dirty_words: 42 | count = str(self.word_count_store[word]) 43 | log.debug(f'Forwarding to sink ({word}, {count})') 44 | self.context.forward(word, count) 45 | self.dirty_words = set() 46 | 47 | 48 | def run(config_file, binary_output): 49 | kafka_config.read_local_config(config_file) 50 | if binary_output: 51 | kafka_config.VALUE_SERDE = 'examples.wordcount.custom_serde.StringIntSerde' 52 | 53 | count_store = state_stores.create('counts'). 
\ 53 |         with_string_keys(). \ 54 |         with_integer_values(). \ 55 |         in_memory(). \ 56 |         build() 57 | 58 |     with TopologyBuilder() as topology_builder: 59 |         topology_builder. \ 60 |             source('input-value', ['wks-wordcount-example-topic']). \ 61 |             processor('count', WordCount, 'input-value'). \ 62 |             state_store(count_store, 'count'). \ 63 |             sink('output-count', 'wks-wordcount-example-count', 'count') 64 | 65 |     wks = kafka_streams.KafkaStreams(topology_builder, kafka_config) 66 |     wks.start() 67 |     try: 68 |         while True: 69 |             time.sleep(1) 70 |     except KeyboardInterrupt: 71 |         pass 72 |     finally: 73 |         wks.close() 74 | 75 | 76 | if __name__ == '__main__': 77 |     import argparse 78 | 79 |     parser = argparse.ArgumentParser(description="Wordcount example for Python Kafka Streams") 80 |     parser.add_argument('--config-file', '-c', 81 |                         help="Local configuration - will override internal defaults", 82 |                         default='config.properties') 83 |     parser.add_argument('--binary-output', 84 |                         help="Output topic will contain 4-byte integers", 85 |                         action='store_true') 86 |     parser.add_argument('--verbose', '-v', 87 |                         help="Increase verbosity (repeat to increase level)", 88 |                         action='count', default=0) 89 |     args = parser.parse_args() 90 | 91 |     levels = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG} 92 |     level = levels.get(args.verbose, logging.DEBUG) 93 |     logging.basicConfig(stream=sys.stdout, level=level) 94 |     run(args.config_file, binary_output=args.binary_output) 95 | -------------------------------------------------------------------------------- /examples/wordcount/source_client.py: -------------------------------------------------------------------------------- 1 | from confluent_kafka import Producer 2 | 3 | 4 | p = Producer({'bootstrap.servers': 'localhost:9092'}) 5 | topic = 'wks-wordcount-example-topic' 6 | some_data_source = ["a b c", "a b", "a"] 7 | for data in some_data_source: 8 |     print("producing {} to {}".format(data, topic)) 9 |     p.produce(topic, data.encode('utf-8')) 10 | p.flush() 11 | -------------------------------------------------------------------------------- /requirements_docs.txt: -------------------------------------------------------------------------------- 1 | javaproperties 2 | requests 3 | avro-python3 4 | setuptools_scm 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | addopts = -rsxX -q 3 | testpaths = tests 4 | python_files = test_* 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | setup for Winton Kafka Streams package 6 | 7 | """ 8 | 9 | from setuptools import setup 10 | 11 | with open('README.md') as readme_file: 12 |     readme = readme_file.read() 13 | 14 | requirements = [ 15 |     'javaproperties', 16 |     'confluent-kafka>=0.11.4', 17 |     'requests', 18 |     'avro-python3' 19 | ] 20 | 21 | test_requirements = [ 22 |     'pytest' 23 | ] 24 | 25 | setup( 26 |     name='Winton Kafka Streams', 27 |     use_scm_version=True, 28 |     setup_requires=['setuptools_scm'], 29 |     description="Apache Kafka's Streams API for Python", 30 |     long_description=readme, 31 |     author="Winton Group", 32 |     author_email='opensource@winton.com', 33 |     url='https://github.com/wintoncode/winton_kafka_streams', 34 |     packages=[ 35 |         'winton_kafka_streams', 36 |     ], 37 |     include_package_data=True, 38 |
install_requires=requirements, 39 | license="Apache Software License 2.0", 40 | zip_safe=True, 41 | keywords='streams kafka winton', 42 | classifiers=[ 43 | 'Development Status :: 2 - Pre-Alpha', 44 | 'Intended Audience :: Developers', 45 | 'License :: OSI Approved :: Apache Software License', 46 | 'Natural Language :: English', 47 | 'Programming Language :: Python :: 3.6', 48 | ], 49 | test_suite='tests', 50 | tests_require=test_requirements, 51 | extras_require={ 52 | 'develop': ['pytest', 'sphinx_rtd_theme'], 53 | 'binning_example': ['jupyter', 'pandas', 'bokeh'], 54 | } 55 | ) 56 | -------------------------------------------------------------------------------- /tests/processor/serde/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wintoncode/winton-kafka-streams/5867a1c42fc80bba07173fd1d004b2849b429fdf/tests/processor/serde/__init__.py -------------------------------------------------------------------------------- /tests/processor/serde/mock_schema_registry.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright 2016 Confluent Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | # 20 | # derived from https://github.com/verisign/python-confluent-schemaregistry.git 21 | # 22 | 23 | from confluent_kafka.avro import ClientError 24 | 25 | 26 | class MockSchemaRegistryClient(object): 27 | """ 28 | A client that acts as a schema registry locally. 29 | 30 | Compatibiity related methods are not implemented at this time. 
31 | """ 32 | 33 | def __init__(self, max_schemas_per_subject=1000): 34 | self.max_schemas_per_subject = max_schemas_per_subject 35 | # subj => { schema => id } 36 | self.subject_to_schema_ids = {} 37 | # id => avro_schema 38 | self.id_to_schema = {} 39 | # subj => { schema => version } 40 | self.subject_to_schema_versions = {} 41 | 42 | self.subject_to_latest_schema = {} 43 | 44 | # counters 45 | self.next_id = 1 46 | self.schema_to_id = {} 47 | 48 | def _get_next_id(self, schema): 49 | if schema in self.schema_to_id: 50 | return self.schema_to_id[schema] 51 | result = self.next_id 52 | self.next_id += 1 53 | self.schema_to_id[schema] = result 54 | return result 55 | 56 | def _get_next_version(self, subject): 57 | if subject not in self.subject_to_schema_versions: 58 | self.subject_to_schema_versions[subject] = {} 59 | return len(self.subject_to_schema_versions[subject]) 60 | 61 | def _get_all_versions(self, subject): 62 | versions = self.subject_to_schema_versions.get(subject, {}) 63 | return sorted(versions) 64 | 65 | def _add_to_cache(self, cache, subject, schema, value): 66 | if subject not in cache: 67 | cache[subject] = {} 68 | sub_cache = cache[subject] 69 | sub_cache[schema] = value 70 | 71 | def _cache_schema(self, schema, schema_id, subject, version): 72 | # don't overwrite anything 73 | if schema_id in self.id_to_schema: 74 | schema = self.id_to_schema[schema_id] 75 | else: 76 | self.id_to_schema[schema_id] = schema 77 | 78 | self._add_to_cache(self.subject_to_schema_ids, 79 | subject, schema, schema_id) 80 | 81 | self._add_to_cache(self.subject_to_schema_versions, 82 | subject, schema, version) 83 | 84 | if subject in self.subject_to_latest_schema: 85 | si, s, v = self.subject_to_latest_schema[subject] 86 | if v > version: 87 | return 88 | self.subject_to_latest_schema[subject] = (schema_id, schema, version) 89 | 90 | def register(self, subject, avro_schema): 91 | """ 92 | Register a schema with the registry under the given subject 93 | and receive a schema id. 94 | 95 | avro_schema must be a parsed schema from the python avro library 96 | 97 | Multiple instances of the same schema will result in inconsistencies. 98 | """ 99 | schemas_to_id = self.subject_to_schema_ids.get(subject, {}) 100 | schema_id = schemas_to_id.get(avro_schema, -1) 101 | if schema_id != -1: 102 | return schema_id 103 | 104 | # add it 105 | version = self._get_next_version(subject) 106 | schema_id = self._get_next_id(avro_schema) 107 | 108 | # cache it 109 | self._cache_schema(avro_schema, schema_id, subject, version) 110 | return schema_id 111 | 112 | def get_by_id(self, schema_id): 113 | """Retrieve a parsed avro schema by id or None if not found""" 114 | return self.id_to_schema.get(schema_id, None) 115 | 116 | def get_latest_schema(self, subject): 117 | """ 118 | Return the latest 3-tuple of: 119 | (the schema id, the parsed avro schema, the schema version) 120 | for a particular subject. 121 | 122 | If the subject is not found, (None,None,None) is returned. 123 | """ 124 | return self.subject_to_latest_schema.get(subject, (None, None, None)) 125 | 126 | def get_version(self, subject, avro_schema): 127 | """ 128 | Get the version of a schema for a given subject. 129 | 130 | Returns -1 if not found. 
131 | """ 132 | schemas_to_version = self.subject_to_schema_versions.get(subject, {}) 133 | return schemas_to_version.get(avro_schema, -1) 134 | 135 | def get_id_for_schema(self, subject, avro_schema): 136 | """ 137 | Get the ID of a parsed schema 138 | """ 139 | schemas_to_id = self.subject_to_schema_ids.get(subject, {}) 140 | return schemas_to_id.get(avro_schema, -1) 141 | 142 | def test_compatibility(self, subject, avro_schema, version='latest'): 143 | raise ClientError("not implemented") 144 | 145 | def update_compatibility(self, level, subject=None): 146 | raise ClientError("not implemented") 147 | 148 | def get_compatibility(self, subject=None): 149 | raise ClientError("not implemented") 150 | -------------------------------------------------------------------------------- /tests/processor/serde/test_avro_serde.py: -------------------------------------------------------------------------------- 1 | import io 2 | import struct 3 | from confluent_kafka.avro import loads as avro_loads 4 | from .mock_schema_registry import MockSchemaRegistryClient 5 | from winton_kafka_streams.processor.serialization.serdes import AvroSerde 6 | import winton_kafka_streams.kafka_config as config 7 | 8 | string_avro = '{"type": "string"}' 9 | 10 | 11 | def create_serde(registry, schema): 12 | serde = AvroSerde() 13 | config.AVRO_SCHEMA_REGISTRY = 'nowhere' 14 | config.KEY_AVRO_SCHEMA = schema 15 | 16 | serde.configure(config, True) 17 | serde.serializer._avro_helper._set_serializer(registry) 18 | serde.deserializer._avro_helper._set_serializer(registry) 19 | 20 | serde.test_registry = registry 21 | return serde 22 | 23 | 24 | def test_serialize_avro(): 25 | registry = MockSchemaRegistryClient() 26 | serde = create_serde(registry, string_avro) 27 | 28 | message = serde.serializer.serialize('topic', 'data') 29 | message_io = io.BytesIO(message) 30 | magic, schema_id, length, string = struct.unpack('>bIb4s', message_io.read(10)) 31 | assert(0 == magic) 32 | assert(schema_id in registry.id_to_schema) 33 | assert(8 == length) # (==4) uses variable-length zig-zag encoding 34 | assert(b'data' == string) 35 | message_io.close() 36 | 37 | 38 | def test_deserialize_avro(): 39 | registry = MockSchemaRegistryClient() 40 | serde = create_serde(registry, string_avro) 41 | schema_id = registry.register('topic-value', avro_loads(string_avro)) 42 | 43 | serialized = b'\0' + schema_id.to_bytes(4, 'big') + b'\x08data' 44 | message = serde.deserializer.deserialize('ignored', serialized) 45 | assert('data' == message) 46 | -------------------------------------------------------------------------------- /tests/processor/serde/test_instantiation.py: -------------------------------------------------------------------------------- 1 | import winton_kafka_streams.processor.serialization.serdes as serdes 2 | 3 | 4 | def test_serde_instance_to_string(): 5 | serde = serdes.BytesSerde() 6 | serde_str = serdes.serde_as_string(serde) 7 | assert 'winton_kafka_streams.processor.serialization.serdes.bytes_serde.BytesSerde' == serde_str 8 | 9 | 10 | def test_serde_class_to_string(): 11 | serde = serdes.BytesSerde 12 | serde_str = serdes.serde_as_string(serde) 13 | assert 'winton_kafka_streams.processor.serialization.serdes.bytes_serde.BytesSerde' == serde_str 14 | 15 | 16 | def test_string_to_serde(): 17 | serde_str = 'winton_kafka_streams.processor.serialization.serdes.StringSerde' 18 | serde = serdes.serde_from_string(serde_str) 19 | byte_str = serde.serializer.serialize('topic', 'abc123') 20 | assert b'abc123' == byte_str 21 | 
-------------------------------------------------------------------------------- /tests/processor/serde/test_serialisation.py: -------------------------------------------------------------------------------- 1 | from winton_kafka_streams.processor.serialization.serdes import * 2 | 3 | 4 | def test_bytes_serde(): 5 | bytes_serde = BytesSerde() 6 | assert bytes_serde.serializer.serialize('topic', b'hello') == b'hello' 7 | assert bytes_serde.deserializer.deserialize('topic', b'hello') == b'hello' 8 | 9 | 10 | def test_string_serde(): 11 | string_serde = StringSerde() 12 | assert string_serde.serializer.serialize('topic', 'hello') == b'hello' 13 | assert string_serde.deserializer.deserialize('topic', b'hello') == 'hello' 14 | 15 | 16 | def test_integer_serde(): 17 | int_serde = IntegerSerde() 18 | assert int_serde.serializer.serialize('topic', -2132) == b'\xac\xf7\xff\xff' 19 | assert int_serde.deserializer.deserialize('topic', b'\xac\xf7\xff\xff') == -2132 20 | 21 | 22 | def test_long_serde(): 23 | int_serde = LongSerde() 24 | assert int_serde.serializer.serialize('topic', -2132) == b'\xac\xf7\xff\xff\xff\xff\xff\xff' 25 | assert int_serde.deserializer.deserialize('topic', b'\xac\xf7\xff\xff\xff\xff\xff\xff') == -2132 26 | 27 | 28 | def test_float_serde(): 29 | float_serde = FloatSerde() 30 | assert float_serde.serializer.serialize('topic', -18.125) == b'\x00\x00\x91\xc1' 31 | assert float_serde.deserializer.deserialize('topic', b'\x00\x00\x91\xc1') == -18.125 32 | 33 | 34 | def test_double_serde(): 35 | double_serde = DoubleSerde() 36 | assert double_serde.serializer.serialize('topic', 123.25) == b'\x00\x00\x00\x00\x00\xd0^@' 37 | assert double_serde.deserializer.deserialize('topic', b'\x00\x00\x00\x00\x00\xd0^@') == 123.25 38 | 39 | 40 | def test_json_serde(): 41 | json_serde = JsonSerde() 42 | test_dict = {'key1': 'val1', 'key2': ["val21", "val22"]} 43 | test_bytes = b'{"key1": "val1", "key2": ["val21", "val22"]}' 44 | assert json_serde.serializer.serialize('topic', test_dict) == test_bytes 45 | assert json_serde.deserializer.deserialize('topic', test_bytes) == test_dict 46 | -------------------------------------------------------------------------------- /tests/processor/test_base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the base processor - base class to all 3 | custom processor implementations 4 | """ 5 | 6 | import unittest.mock as mock 7 | 8 | import winton_kafka_streams.processor as wks_processor 9 | from winton_kafka_streams.processor.processor_context import ProcessorContext 10 | from winton_kafka_streams.processor.task_id import TaskId 11 | 12 | 13 | def test_createBaseProcessor(): 14 | wks_processor.BaseProcessor() 15 | 16 | 17 | def test_initialiseBaseProcessor(): 18 | mock_task = mock.Mock() 19 | mock_task.application_id = 'test_id' 20 | mock_task_id = TaskId('test_group', 0) 21 | mock_context = ProcessorContext(mock_task_id, mock_task, None, None, {}) 22 | bp = wks_processor.BaseProcessor() 23 | bp.initialise('my-name', mock_context) 24 | 25 | assert bp.name == 'my-name' 26 | assert isinstance(bp.context, ProcessorContext) 27 | -------------------------------------------------------------------------------- /tests/processor/test_extract_timestamp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests of using wall clock time from a message 3 | 4 | """ 5 | 6 | import unittest.mock as mock 7 | import pytest 8 | 9 | import winton_kafka_streams.processor as 
wks_processor 10 | 11 | expected_time = 1496735099.23712 12 | error_time_offset = 1000 13 | timestamp_create_time = 1 14 | 15 | 16 | class TestRecordTimeStampExtractorImpl(wks_processor.RecordTimeStampExtractor): 17 | def on_error(self, record, timestamp, previous_timestamp): 18 | return timestamp - 1000 19 | 20 | 21 | class MockRecord: 22 | def __init__(self, time): 23 | self.time = time 24 | 25 | def timestamp(self): 26 | return (timestamp_create_time, self.time) 27 | 28 | 29 | def test_RecordTimeStampExtractorNoImpl(): 30 | pytest.raises(TypeError, wks_processor.RecordTimeStampExtractor) 31 | 32 | 33 | def test_RecordTimeStampExtractor(): 34 | rtse = TestRecordTimeStampExtractorImpl() 35 | assert rtse.extract(MockRecord(expected_time), expected_time-12345) == expected_time 36 | 37 | 38 | def test_InvalidRecordTimeStampExtractorNoImpl(): 39 | rtse = TestRecordTimeStampExtractorImpl() 40 | assert rtse.extract(MockRecord(-1), expected_time-12345) == -1 - error_time_offset 41 | -------------------------------------------------------------------------------- /tests/processor/test_punctuation_queue.py: -------------------------------------------------------------------------------- 1 | import winton_kafka_streams.processor._punctuation_queue as punctuation_queue 2 | 3 | 4 | def test_punctuation_queue(): 5 | punctuations = [] 6 | pq = punctuation_queue.PunctuationQueue(lambda ts, node: punctuations.append((ts, node))) 7 | pq.schedule('node', 100) 8 | now = -100 9 | 10 | pq.may_punctuate(now) 11 | assert len(punctuations) == 0 12 | 13 | pq.may_punctuate(now + 99) 14 | assert len(punctuations) == 0 15 | 16 | pq.may_punctuate(now + 100) 17 | assert len(punctuations) == 1 18 | 19 | pq.may_punctuate(now + 199) 20 | assert len(punctuations) == 1 21 | 22 | pq.may_punctuate(now + 200) 23 | assert len(punctuations) == 2 24 | 25 | assert punctuations == [('node', 0), ('node', 100)] 26 | 27 | 28 | def test_punctuation_schedule_can_compare_entires_with_same_timestamp(): 29 | schedule1 = punctuation_queue.PunctuationSchedule(123, {}, 100) 30 | schedule2 = punctuation_queue.PunctuationSchedule(123, {}, 100) 31 | 32 | assert not schedule1 < schedule2 33 | -------------------------------------------------------------------------------- /tests/processor/test_sink_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test of sink processor behaviour 3 | """ 4 | 5 | import unittest.mock as mock 6 | 7 | import winton_kafka_streams.processor as wks_processor 8 | from winton_kafka_streams.processor.task_id import TaskId 9 | 10 | _expected_timestamp = 1234567890 11 | 12 | 13 | def test_createSinkProcessorObject(): 14 | wks_processor.SinkProcessor('topic1') 15 | 16 | 17 | def test_sinkProcessorTopic(): 18 | sink = wks_processor.SinkProcessor('topic1') 19 | assert sink.topic == 'topic1' 20 | 21 | 22 | def test_sinkProcessorProcess(): 23 | 24 | with mock.patch('winton_kafka_streams.processor.ProcessorContext.timestamp', new_callable=mock.PropertyMock) as mock_timestamp: 25 | mock_timestamp.return_value = _expected_timestamp 26 | mock_task = mock.Mock() 27 | mock_task.application_id = 'test_id' 28 | mock_task_id = TaskId('test_group', 0) 29 | processor_context = wks_processor.ProcessorContext(mock_task_id, mock_task, None, None, {}) 30 | processor_context.record_collector = mock.MagicMock() 31 | 32 | sink = wks_processor.SinkProcessor('topic1') 33 | sink.initialise('test-sink', processor_context) 34 | assert sink.name == 'test-sink' 35 | 36 | test_key, test_value = 
'test-key', 'test-value' 37 | sink.process(test_key, test_value) 38 | assert processor_context.record_collector.called_with(test_key, test_value, _expected_timestamp) 39 | -------------------------------------------------------------------------------- /tests/processor/test_source_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test of source processor behaviour 3 | """ 4 | 5 | import winton_kafka_streams.processor as wks_processor 6 | 7 | 8 | def test_createSourceProcessorObject(): 9 | wks_processor.SourceProcessor(['test-topic-name']) 10 | 11 | 12 | def test_sourceProcessorTopic(): 13 | sp1 = wks_processor.SourceProcessor(('topic1',)) 14 | assert sp1.topics == ('topic1',) 15 | sp2 = wks_processor.SourceProcessor(('topic1', 'topic2')) 16 | assert sp2.topics == ('topic1', 'topic2') 17 | -------------------------------------------------------------------------------- /tests/processor/test_stream_task.py: -------------------------------------------------------------------------------- 1 | """ 2 | StreamTask tests 3 | """ 4 | 5 | from unittest.mock import Mock, patch 6 | 7 | import pytest 8 | from confluent_kafka.cimpl import KafkaError, KafkaException 9 | 10 | from winton_kafka_streams import kafka_config 11 | from winton_kafka_streams.errors.task_migrated_error import TaskMigratedError 12 | from winton_kafka_streams.processor import TopologyBuilder 13 | from winton_kafka_streams.processor._stream_task import StreamTask 14 | from winton_kafka_streams.processor.task_id import TaskId 15 | 16 | taskMigratedErrorCodes = [KafkaError.ILLEGAL_GENERATION, 17 | KafkaError.UNKNOWN_MEMBER_ID, 18 | KafkaError.REBALANCE_IN_PROGRESS, 19 | 47 # INVALID_PRODUCER_EPOCH - not supported in all versions for Conluent Kafka so just use the explicit code in this test 20 | ] 21 | 22 | 23 | @pytest.mark.parametrize("error_code", taskMigratedErrorCodes) 24 | def test__given__commit__when__consumer_commit_fails_as_task_migrated__then__throw_task_migrated_error(error_code): 25 | kafka_error_attrs = {'code.return_value': error_code} 26 | kafka_error = Mock(**kafka_error_attrs) 27 | 28 | with patch.object(KafkaException, 'args', [kafka_error]): 29 | consumer_attrs = {'commit.side_effect': KafkaException()} 30 | consumer = Mock(**consumer_attrs) 31 | producer = Mock() 32 | processor_attrs = {'process.return_value': None} 33 | processor = Mock(**processor_attrs) 34 | 35 | topology_builder = TopologyBuilder() 36 | 37 | topology_builder.source('my-source', ['my-input-topic-1']) 38 | topology_builder.processor('my-processor', processor, 'my-source') 39 | topology_builder.sink('my-sink', 'my-output-topic-1', 'my-processor') 40 | 41 | task = StreamTask(TaskId('testgroup', 0), "myapp", [0], topology_builder, consumer, producer, kafka_config) 42 | 43 | record_attrs = {'topic.return_value': 'my-input-topic-1', 44 | 'offset.return_value': 1, 45 | 'partition.return_value': 0} 46 | record = Mock(**record_attrs) 47 | 48 | task.add_records([record]) 49 | 50 | task.process() 51 | 52 | with pytest.raises(TaskMigratedError, message='StreamTask:testgroup_0 migrated.'): 53 | task.commit() 54 | -------------------------------------------------------------------------------- /tests/processor/test_task_id.py: -------------------------------------------------------------------------------- 1 | from winton_kafka_streams.processor.task_id import TaskId 2 | 3 | 4 | def test_taskId(): 5 | task_id = TaskId('group1', 0) 6 | 7 | assert task_id == TaskId('group1', 0) 8 | assert not (task_id != 
TaskId('group1', 0)) 9 | assert task_id != TaskId('group1', 1) 10 | assert task_id != TaskId('group2', 0) 11 | 12 | assert repr(task_id) == 'group1_0' 13 | 14 | assert hash(task_id) == hash(TaskId('group1', 0)) 15 | -------------------------------------------------------------------------------- /tests/processor/test_topology.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test of topology creation 3 | 4 | Low level connection of processor units 5 | """ 6 | 7 | 8 | import unittest 9 | import winton_kafka_streams.processor as wks_processor 10 | 11 | def test_createTopologyBuilder(): 12 | wks_processor.topology.TopologyBuilder() 13 | 14 | 15 | class MyTestProcessor(wks_processor.processor.BaseProcessor): 16 | pass 17 | 18 | 19 | class TestTopology(unittest.TestCase): 20 | def setUp(self): 21 | self.topology = wks_processor.topology.TopologyBuilder() 22 | 23 | def test_source(self): 24 | self.topology.source('my-source', ['my-input-topic-1']) 25 | 26 | def test_processor(self): 27 | self.topology.source('my-source', ['my-input-topic-1']) 28 | self.topology.processor('my-processor', MyTestProcessor, 'my-source') 29 | 30 | self.topology = self.topology.build() 31 | 32 | assert len(self.topology.nodes) == 2 33 | assert 'my-source' in self.topology.nodes.keys() 34 | assert 'my-processor' in self.topology.nodes.keys() 35 | 36 | def test_sink(self): 37 | self.topology.source('my-source', ['my-input-topic-1']) 38 | self.topology.processor('my-processor', MyTestProcessor, 'my-source') 39 | self.topology.sink('my-sink', 'my-output-topic-1', 'my-processor') 40 | 41 | self.topology = self.topology.build() 42 | 43 | assert len(self.topology.nodes) == 3 44 | assert 'my-source' in self.topology.nodes.keys() 45 | assert 'my-processor' in self.topology.nodes.keys() 46 | assert 'my-sink' in self.topology.nodes.keys() 47 | -------------------------------------------------------------------------------- /tests/processor/test_wallclock_timestamp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests of using wall clock time from a message 3 | 4 | """ 5 | 6 | import unittest.mock as mock 7 | 8 | import winton_kafka_streams.processor.wallclock_timestamp as wallclock_timestamp 9 | 10 | expected_time = 1496735099.23712 11 | 12 | 13 | def test_WallClockTimeStampExtractor(): 14 | with mock.patch('time.time', return_value=expected_time): 15 | assert wallclock_timestamp.WallClockTimeStampExtractor().extract(None, expected_time-1) == expected_time 16 | -------------------------------------------------------------------------------- /tests/state/test_in_memory_key_value_store.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from winton_kafka_streams.processor.serialization.serdes import BytesSerde 4 | from winton_kafka_streams.state.in_memory.in_memory_state_store import InMemoryStateStore 5 | 6 | 7 | def test_inMemoryKeyValueStore(): 8 | store = InMemoryStateStore('teststore', BytesSerde(), BytesSerde(), False) 9 | kv_store = store.get_key_value_store() 10 | 11 | kv_store['a'] = 1 12 | assert kv_store['a'] == 1 13 | 14 | kv_store['a'] = 2 15 | assert kv_store['a'] == 2 16 | 17 | del kv_store['a'] 18 | assert kv_store.get('a') is None 19 | with pytest.raises(KeyError): 20 | _ = kv_store['a'] 21 | -------------------------------------------------------------------------------- /tests/test_kafka_streams.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Test the top-level Kafka Streams class 3 | """ 4 | 5 | 6 | import pytest 7 | import unittest.mock as mock 8 | 9 | from winton_kafka_streams import kafka_config 10 | from winton_kafka_streams.errors.kafka_streams_error import KafkaStreamsError 11 | from winton_kafka_streams.kafka_streams import KafkaStreams 12 | from winton_kafka_streams.processor.processor import BaseProcessor 13 | from winton_kafka_streams.processor.topology import TopologyBuilder 14 | 15 | 16 | class MyTestProcessor(BaseProcessor): 17 | pass 18 | 19 | 20 | def test__given__stream_already_started__when__call_start_again__then__raise_error(): 21 | kafka_config.NUM_STREAM_THREADS = 0 22 | topology_builder = TopologyBuilder() 23 | 24 | topology_builder.source('my-source', ['my-input-topic-1']) 25 | topology_builder.processor('my-processor', MyTestProcessor, 'my-source') 26 | topology_builder.sink('my-sink', 'my-output-topic-1', 'my-processor') 27 | 28 | topology = topology_builder.build() 29 | 30 | kafka_streams = KafkaStreams(topology, kafka_config) 31 | kafka_streams.start() 32 | 33 | with pytest.raises(KafkaStreamsError, message='KafkaStreams already started.'): 34 | kafka_streams.start() 35 | 36 | 37 | def test__two__processes__with__two__topic__partitions(): 38 | NUM_STREAM_PROCESSES = 2 39 | kafka_config.NUM_STREAM_THREADS = 1 40 | 41 | consumer = mock.Mock() 42 | producer = mock.Mock() 43 | 44 | processor_attrs = {'process.return_value': None} 45 | processor = mock.Mock(**processor_attrs) 46 | 47 | kafka_client_supplier_attrs = {'consumer.return_value': consumer, 48 | 'producer.return_value': producer} 49 | kafka_client_supplier = mock.Mock(**kafka_client_supplier_attrs) 50 | 51 | topology_builder = TopologyBuilder() 52 | 53 | topology_builder.source('my-source', ['my-input-topic-1']) 54 | topology_builder.processor('my-processor', processor, 'my-source') 55 | topology_builder.sink('my-sink', 'my-output-topic-1', 'my-processor') 56 | 57 | with mock.patch('winton_kafka_streams.kafka_client_supplier.KafkaClientSupplier', return_value=kafka_client_supplier): 58 | for partition in range(NUM_STREAM_PROCESSES): 59 | kafka_stream_process = KafkaStreams(topology_builder, kafka_config) 60 | 61 | topic_partition_attrs = {'topic': 'testtopic', 62 | 'partition': partition} 63 | topic_partition = mock.Mock(**topic_partition_attrs) 64 | 65 | kafka_stream_process.stream_threads[0].add_stream_tasks([topic_partition]) 66 | 67 | record_attrs = {'topic.return_value': 'my-input-topic-1', 68 | 'offset.return_value': 1, 69 | 'partition.return_value': partition} 70 | record = mock.Mock(**record_attrs) 71 | 72 | kafka_stream_process.stream_threads[0].add_records_to_tasks([record]) 73 | -------------------------------------------------------------------------------- /winton_kafka_streams/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wintoncode/winton-kafka-streams/5867a1c42fc80bba07173fd1d004b2849b429fdf/winton_kafka_streams/__init__.py -------------------------------------------------------------------------------- /winton_kafka_streams/errors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | import errors will import all common errors 3 | """ 4 | 5 | from .kafka_streams_error import KafkaStreamsError 6 | -------------------------------------------------------------------------------- 
/winton_kafka_streams/errors/_kafka_error_codes.py: -------------------------------------------------------------------------------- 1 | from confluent_kafka.cimpl import KafkaError 2 | 3 | 4 | def _get_invalid_producer_epoch_code(): 5 | """Some versions of confluent-kafka-python do not explicitly support this error code""" 6 | try: 7 | return KafkaError.INVALID_PRODUCER_EPOCH 8 | except AttributeError: 9 | return 47 10 | -------------------------------------------------------------------------------- /winton_kafka_streams/errors/kafka_streams_error.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run time exception thrown by winton kafka streams on error 3 | 4 | """ 5 | 6 | 7 | class KafkaStreamsError(RuntimeError): 8 | pass 9 | -------------------------------------------------------------------------------- /winton_kafka_streams/errors/task_migrated_error.py: -------------------------------------------------------------------------------- 1 | from .kafka_streams_error import KafkaStreamsError 2 | 3 | 4 | class TaskMigratedError(KafkaStreamsError): 5 | """ 6 | Indicates that a task got migrated to another thread. 7 | Thus, the task raising this exception can be cleaned up and closed as "zombie". 8 | """ 9 | pass 10 | -------------------------------------------------------------------------------- /winton_kafka_streams/kafka_client_supplier.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pprint 3 | 4 | import confluent_kafka as kafka 5 | 6 | log = logging.getLogger(__name__) 7 | 8 | 9 | class KafkaClientSupplier: 10 | def __init__(self, _config): 11 | self.config = _config 12 | 13 | def consumer(self): 14 | log.debug('Starting consumer...') 15 | # TODO: Must set all config values applicable to a consumer 16 | consumer_args = {'bootstrap.servers': self.config.BOOTSTRAP_SERVERS, 17 | 'group.id': self.config.APPLICATION_ID, 18 | 'default.topic.config': {'auto.offset.reset': 19 | self.config.AUTO_OFFSET_RESET}, 20 | 'enable.auto.commit': self.config.ENABLE_AUTO_COMMIT} 21 | 22 | log.debug('Consumer Arguments: %s', pprint.PrettyPrinter().pformat(consumer_args)) 23 | 24 | return kafka.Consumer(consumer_args) 25 | 26 | def producer(self): 27 | # TODO: Must set all config values applicable to a producer 28 | return kafka.Producer({'bootstrap.servers': self.config.BOOTSTRAP_SERVERS}) 29 | -------------------------------------------------------------------------------- /winton_kafka_streams/kafka_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration values that may be set to control behaviour of Winton Kafka Streams 3 | 4 | Configuration may either be set inline in your application using: 5 | 6 | import kafka_config 7 | kafka_config.BOOTSTRAP_SERVERS = 'localhost:9092' 8 | 9 | or as a file in java properties format. The property names are identical to 10 | those used in the Java implementation for ease of sharing between both. 
11 | 12 | External files can be loaded using: 13 | 14 | import kafka_config 15 | kafka_config.read_local_config('path/to/kafka.streams.config') 16 | 17 | 18 | """ 19 | 20 | import logging 21 | import os 22 | import sys 23 | 24 | import javaproperties 25 | from typing import List 26 | 27 | from .processor.serialization.serdes import BytesSerde, serde_as_string 28 | from .errors.kafka_streams_error import KafkaStreamsError 29 | 30 | log = logging.getLogger(__name__) 31 | 32 | #### - Required options - #### 33 | 34 | """ 35 | A list of host/port pairs to use for establishing the 36 | initial connection to the Kafka cluster 37 | 38 | """ 39 | BOOTSTRAP_SERVERS = "localhost:9092" 40 | 41 | """ 42 | An identifier for the stream processing application. 43 | Must be unique within the Kafka cluster. 44 | It is used as: 45 | 1) the default client-id prefix 46 | 2) the group-id for membership management 47 | 3) the changelog topic prefix. 48 | 49 | """ 50 | APPLICATION_ID = "wkstream.application.id" 51 | 52 | #### - Optional Options - #### 53 | 54 | """ 55 | The replication factor for changelog topics and repartition topics created by the application 56 | Default: 1 57 | Importance: Low 58 | """ 59 | REPLICATION_FACTOR = 1 60 | 61 | """ 62 | Directory location for state stores 63 | Default: /var/lib/kafka-streams 64 | Importance: Low 65 | """ 66 | STATE_DIR = "/var/lib/kafka-streams" 67 | 68 | """ 69 | Maximum number of memory bytes to be used for record caches across all threads 70 | Default: 10485760 (bytes) 71 | Importance: Medium 72 | """ 73 | CACHE_MAX_BYTES_BUFFERING = 10485760 74 | 75 | """ 76 | The number of standby replicas for each task 77 | Default: 0 78 | Importance: Medium 79 | """ 80 | NUM_STANDBY_REPLICAS = 0 81 | 82 | """ 83 | The number of threads to execute stream processing 84 | Default: 1 85 | Importance: Medium 86 | """ 87 | NUM_STREAM_THREADS = 1 88 | 89 | """ 90 | Timestamp extractor class that implements the TimestampExtractor interface 91 | Default: see Timestamp Extractor 92 | Importance: Medium 93 | """ 94 | TIMESTAMP_EXTRACTOR = None #  TODO 95 | 96 | """ 97 | A host:port pair pointing to an embedded user defined endpoint that can be used for discovering the locations of state stores within a single Winton Kafka Streams application. The value of this must be different for each instance of the application. 98 | Default "" 99 | Importance: Low 100 | """ 101 | APPLICATION_SERVER = "" 102 | 103 | """ 104 | The maximum number of records to buffer per partition 105 | Default: 1000 106 | Importance: Low 107 | """ 108 | BUFFERED_RECORDS_PER_PARTITION = 1000 109 | 110 | """ 111 | An id string to pass to the server when making requests. (This setting is passed to the consumer/producer clients used internally by Winton Kafka Streams.) 112 | Default: "" 113 | Importance: Low 114 | """ 115 | CLIENT_ID = "" 116 | 117 | """ 118 | The frequency with which to save the position (offsets in source topics) of tasks 119 | Default: 30000 (millisecs) 120 | Importance: Low 121 | """ 122 | COMMIT_INTERVAL_MS = 30_000 123 | 124 | """ 125 | A list of classes to use as metrics reporters 126 | Default: [] 127 | Importance: Low 128 | """ 129 | METRIC_REPORTERS: List[str] = [] 130 | 131 | """ 132 | The number of samples maintained to compute metrics. 133 | Default: 2 134 | Importance: Low 135 | """ 136 | METRICS_NUM_SAMPLES = 2 137 | 138 | """ 139 | The highest recording level for metrics. 
140 | Default: info 141 | Importance: Low 142 | """ 143 | METRICS_RECORDING_LEVEL = 'info' 144 | 145 | """ 146 | The window of time a metrics sample is computed over. 147 | Default: 30000 (millisecs) 148 | Importance: Low 149 | """ 150 | METRICS_SAMPLE_WINDOW_MS = 30_000 151 | 152 | """ 153 | Partition grouper class that implements the PartitionGrouper interface 154 | Default: see Partition Grouper 155 | Importance: Low 156 | """ 157 | PARITION_GROUPER = None # DEBUG 158 | 159 | """ 160 | The amount of time in milliseconds to block waiting for input 161 | Default: 100 (millisecs) 162 | Importance: Low 163 | """ 164 | POLL_MS = 100 165 | 166 | """ 167 | The amount of time in milliseconds to wait before deleting state when a partition has migrated 168 | Default: 60000 (millisecs) 169 | Importance: Low 170 | """ 171 | STATE_CLEANUP_DELAY_MS = 60_000 172 | 173 | """ 174 | Added to a window's maintainMs to ensure data is not deleted from the log prematurely. Allows for clock drift. 175 | Default: 86400000 (milliseconds) = 1 day 176 | Importance: Low 177 | """ 178 | WINDOWSTORE_CHANGELOG_ADDITIONAL_RETENTION_MS = 86_400_000 179 | 180 | #### - Non streams configuration parameters - #### 181 | 182 | """ 183 | linger.ms (low) Producer 184 | Default: 100 185 | Importance: low 186 | """ 187 | LINGER_MS = 100 188 | 189 | """ 190 | Producer 191 | Default: 10 192 | Importance: low 193 | """ 194 | RETRIES = 10 195 | 196 | """ 197 | Consumer 198 | Default: earliest 199 | Importance: low 200 | """ 201 | AUTO_OFFSET_RESET = 'earliest' 202 | 203 | """ 204 | Consumer 205 | Default: false, see Consumer Auto Commit 206 | Importance: low 207 | """ 208 | ENABLE_AUTO_COMMIT = 'false' 209 | 210 | """ 211 | Consumer 212 | Default: Integer.MAX_VALUE 213 | Importance: low 214 | """ 215 | MAX_POLL_INTERVAL_MS = sys.maxsize # TODO: No max for Python, this is word size - is that correct for Java?
216 | 217 | """ 218 | Consumer 219 | Default: 1000 220 | Importance: low 221 | """ 222 | MAX_POLL_RECORDS = 1000 223 | 224 | #### - Serdes Configuration - #### 225 | 226 | """ 227 | Default serializer/deserializer class for record values, implements the Serde interface (see also key.serdes) 228 | Default: winton_kafka_streams.processor.serialization.serdes.BytesSerde 229 | Importance: Medium 230 | """ 231 | VALUE_SERDE = serde_as_string(BytesSerde) 232 | 233 | """ 234 | Default serializer/deserializer class for record keys, implements the Serde interface (see also value.serdes) 235 | Default: winton_kafka_streams.processor.serialization.serdes.BytesSerde 236 | Importance: Medium 237 | """ 238 | KEY_SERDE = serde_as_string(BytesSerde) 239 | 240 | # StringSerde - encoding 241 | SERIALIZER_ENCODING = 'utf-8' 242 | DESERIALIZER_ENCODING = 'utf-8' 243 | KEY_SERIALIZER_ENCODING = None 244 | KEY_DESERIALIZER_ENCODING = None 245 | VALUE_SERIALIZER_ENCODING = None 246 | VALUE_DESERIALIZER_ENCODING = None 247 | 248 | # StringSerde - error mode 249 | SERIALIZER_ERROR = 'strict' 250 | DESERIALIZER_ERROR = 'strict' 251 | KEY_SERIALIZER_ERROR = None 252 | KEY_DESERIALIZER_ERROR = None 253 | VALUE_SERIALIZER_ERROR = None 254 | VALUE_DESERIALIZER_ERROR = None 255 | 256 | # IntegerSerde/LongSerde - byte order 257 | SERIALIZER_BYTEORDER = 'little' 258 | DESERIALIZER_BYTEORDER = 'little' 259 | KEY_SERIALIZER_BYTEORDER = None 260 | KEY_DESERIALIZER_BYTEORDER = None 261 | VALUE_SERIALIZER_BYTEORDER = None 262 | VALUE_DESERIALIZER_BYTEORDER = None 263 | 264 | # IntegerSerde/LongSerde - signed integer 265 | SERIALIZER_SIGNED = 'true' 266 | DESERIALIZER_SIGNED = 'true' 267 | KEY_SERIALIZER_SIGNED = None 268 | KEY_DESERIALIZER_SIGNED = None 269 | VALUE_SERIALIZER_SIGNED = None 270 | VALUE_DESERIALIZER_SIGNED = None 271 | 272 | # AvroSerde 273 | AVRO_SCHEMA_REGISTRY = None 274 | AVRO_SCHEMA = None 275 | KEY_AVRO_SCHEMA_REGISTRY = None 276 | KEY_AVRO_SCHEMA = None 277 | VALUE_AVRO_SCHEMA_REGISTRY = None 278 | VALUE_AVRO_SCHEMA = None 279 | 280 | 281 | def read_local_config(config_file): 282 | if not os.path.exists(config_file): 283 | raise KafkaStreamsError(f'Config file {config_file} does not exist') 284 | 285 | with open(config_file, 'r') as cf: 286 | props = javaproperties.load(cf) 287 | 288 | for k, v in props.items(): 289 | ku = k.upper().replace('.', '_') 290 | if ku not in globals().keys(): 291 | raise KafkaStreamsError(f'Unrecognised property {k} read from config file {config_file}') 292 | globals()[ku] = v 293 | 294 | log.debug('Config from "%s": %s = %s', config_file, k, v) 295 | -------------------------------------------------------------------------------- /winton_kafka_streams/kafka_streams.py: -------------------------------------------------------------------------------- 1 | """ 2 | Primary entrypoint for applications wishing to implement Python Kafka Streams 3 | 4 | """ 5 | 6 | import logging 7 | import threading 8 | from enum import Enum 9 | 10 | from .errors.kafka_streams_error import KafkaStreamsError 11 | from .kafka_client_supplier import KafkaClientSupplier 12 | from .processor import StreamThread 13 | 14 | log = logging.getLogger(__name__) 15 | 16 | 17 | class KafkaStreams: 18 | """ 19 | Encapsulates stream graph processing units 20 | 21 | """ 22 | 23 | """ 24 | Kafka Streams states are the possible state that a Kafka Streams instance can be in. 25 | An instance must only be in one state at a time. 
26 | Note this instance will be in the "Rebalancing" state if any of its threads is rebalancing. 27 | The expected state transitions between the defined states are: 28 |
 30 |                       +--------------+
 31 |               +<----- | Created      |
 32 |               |       +-----+--------+
 33 |               |             |
 34 |               |             v
 35 |               |       +-----+--------+
 36 |               +<----- | Rebalancing  | <----+
 37 |               |       +--------------+      |
 38 |               |                             |
 39 |               |                             |
 40 |               |       +--------------+      |
 41 |               +-----> | Running      | ---->+
 42 |               |       +-----+--------+
 43 |               |             |
 44 |               |             v
 45 |               |       +-----+--------+
 46 |               +-----> | Pending      |
 47 |                       | Shutdown     |
 48 |                       +-----+--------+
 49 |                             |
 50 |                             v
 51 |                       +-----+--------+
 52 |                       | Not Running  |
 53 |                       +--------------+
 54 |       
55 | """ 56 | class State(Enum): 57 | CREATED = 0 58 | RUNNING = 1 59 | REBALANCING = 2 60 | PENDING_SHUTDOWN = 3 61 | NOT_RUNNING = 4 62 | 63 | def valid_transition_to(self, new_state): 64 | if self is self.CREATED: 65 | return new_state in (self.REBALANCING, self.RUNNING, self.PENDING_SHUTDOWN) 66 | elif self is self.RUNNING: 67 | return new_state in (self.REBALANCING, self.PENDING_SHUTDOWN) 68 | elif self is self.REBALANCING: 69 | return new_state in (self.RUNNING, self.REBALANCING, self.PENDING_SHUTDOWN) 70 | elif self is self.PENDING_SHUTDOWN: 71 | return new_state in (self.NOT_RUNNING,) 72 | else: # including NOT_RUNNING 73 | return False 74 | 75 | def is_running(self): 76 | return self in (self.RUNNING, self.REBALANCING) 77 | 78 | def is_created_or_running(self): 79 | return self.is_running() or self == self.CREATED 80 | 81 | def __str__(self): 82 | return self.name 83 | 84 | def __init__(self, topology_builder, kafka_config): 85 | self.kafka_config = kafka_config 86 | 87 | self.state = self.State.CREATED 88 | self.state_lock = threading.Lock() 89 | self.thread_states = {} 90 | 91 | self.consumer = None 92 | 93 | self.stream_threads = [StreamThread(topology_builder, self.kafka_config, KafkaClientSupplier(self.kafka_config)) 94 | for _ in range(int(self.kafka_config.NUM_STREAM_THREADS))] 95 | for stream_thread in self.stream_threads: 96 | stream_thread.set_state_listener(self.on_thread_state_change) 97 | self.thread_states[stream_thread.thread_id()] = stream_thread.state 98 | 99 | def set_state(self, new_state): 100 | old_state = self.state 101 | if not old_state.valid_transition_to(new_state): 102 | log.warn(f'Unexpected state transition from {old_state} to {new_state}.') 103 | else: 104 | log.info(f'State transition from {old_state} to {new_state}.') 105 | self.state = new_state 106 | 107 | def on_thread_state_change(self, stream_thread, old_state, new_state): 108 | with self.state_lock: 109 | self.thread_states[stream_thread.thread_id()] = new_state 110 | if new_state in (StreamThread.State.ASSIGNING_PARTITIONS, StreamThread.State.PARTITIONS_REVOKED): 111 | self.set_state(self.State.REBALANCING) 112 | elif set(self.thread_states.values()) == set([StreamThread.State.RUNNING]): 113 | self.set_state(self.State.RUNNING) 114 | 115 | def start(self): 116 | log.debug('Starting Kafka Streams process') 117 | if self.state == self.State.CREATED: 118 | self.set_state(self.State.RUNNING) 119 | for stream_thread in self.stream_threads: 120 | stream_thread.start() 121 | else: 122 | raise KafkaStreamsError('KafkaStreams already started.') 123 | 124 | def close(self): 125 | if self.state.is_created_or_running(): 126 | self.set_state(self.State.PENDING_SHUTDOWN) 127 | for stream_thread in self.stream_threads: 128 | stream_thread.set_state_listener(None) 129 | stream_thread.close() 130 | self.set_state(self.State.NOT_RUNNING) 131 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Processor generating functions 3 | 4 | """ 5 | 6 | from .topology import TopologyBuilder 7 | from .processor import BaseProcessor, SourceProcessor, SinkProcessor 8 | from .processor_context import ProcessorContext 9 | 10 | from ._stream_thread import StreamThread 11 | 12 | # time extractors 13 | from .wallclock_timestamp import WallClockTimeStampExtractor 14 | from .extract_timestamp import RecordTimeStampExtractor 15 | 
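Taken together, an application wires these pieces up by populating a TopologyBuilder (exported above, defined in processor/topology.py further down), handing it to KafkaStreams together with the kafka_config module, and calling start(). The sketch below is illustrative only: the topic names 'prices-in' and 'prices-out', the file name 'config.properties' and the DoublerProcessor class are assumptions made for the example, while the builder, processor and streams calls are the ones defined in this package.

import winton_kafka_streams.kafka_config as kafka_config
from winton_kafka_streams.kafka_streams import KafkaStreams
from winton_kafka_streams.processor import BaseProcessor, TopologyBuilder


class DoublerProcessor(BaseProcessor):
    """Illustrative processor: forwards every record to its children twice."""

    def process(self, key, value):
        self.context.forward(key, value)
        self.context.forward(key, value)

    def punctuate(self, timestamp):
        pass


kafka_config.read_local_config('config.properties')   # or set attributes directly

with TopologyBuilder() as builder:
    builder.source('input', ['prices-in'])
    builder.processor('doubler', DoublerProcessor, 'input')
    builder.sink('output', 'prices-out', 'doubler')

streams = KafkaStreams(builder, kafka_config)
streams.start()
# ... run until the application decides to stop ...
streams.close()

KafkaStreams then creates NUM_STREAM_THREADS StreamThread instances, each of which subscribes to the builder's topics and drives the StreamTask processing loop shown in _stream_thread.py below.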
-------------------------------------------------------------------------------- /winton_kafka_streams/processor/_context.py: -------------------------------------------------------------------------------- 1 | """ 2 | Processor context is the link to kafka from individual processor objects 3 | 4 | """ 5 | 6 | import functools 7 | import logging 8 | from typing import Any, Callable 9 | 10 | from winton_kafka_streams.state.key_value_state_store import KeyValueStateStore 11 | from ..errors.kafka_streams_error import KafkaStreamsError 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | 16 | def _raise_if_null_record(fn: Callable[..., Any]) -> Callable[..., Any]: 17 | @functools.wraps(fn) 18 | def _inner(*args, **kwargs): 19 | if args[0].current_record is None: 20 | raise KafkaStreamsError(f"Record cannot be unset when retrieving {fn.__name__}") 21 | return fn(*args, **kwargs) 22 | return _inner 23 | 24 | 25 | class Context: 26 | """ 27 | Processor context object 28 | 29 | """ 30 | 31 | def __init__(self, _state_record_collector, _state_stores): 32 | self.current_node = None 33 | self.current_record = None 34 | self.state_record_collector = _state_record_collector 35 | self._state_stores = _state_stores 36 | 37 | def send(self, topic, key, obj): 38 | """ 39 | Send the key value-pair to a Kafka topic 40 | 41 | """ 42 | print(f"Send {obj} to {topic}") 43 | pass 44 | 45 | def schedule(self, timestamp): 46 | """ 47 | Schedule the punctuation function call 48 | 49 | """ 50 | 51 | pass 52 | 53 | @property # type: ignore # waiting on https://github.com/python/mypy/issues/1362 54 | @_raise_if_null_record 55 | def offset(self): 56 | return self.current_record.offset() 57 | 58 | @property # type: ignore 59 | @_raise_if_null_record 60 | def partition(self): 61 | return self.current_record.partition() 62 | 63 | @property # type: ignore 64 | @_raise_if_null_record 65 | def timestamp(self): 66 | return self.current_record.timestamp() 67 | 68 | @property # type: ignore 69 | @_raise_if_null_record 70 | def topic(self): 71 | return self.current_record.topic() 72 | 73 | def get_store(self, name) -> KeyValueStateStore: 74 | if not self.current_node: 75 | raise KafkaStreamsError("Access of state from unknown node") 76 | 77 | # TODO: Need to check for a global state here 78 | # This is the reason that processors access store through context 79 | 80 | if name not in self.current_node.state_stores: 81 | raise KafkaStreamsError(f"Processor {self.current_node.name} does not have access to store {name}") 82 | if name not in self._state_stores: 83 | raise KafkaStreamsError(f"Store {name} is not found") 84 | 85 | return self._state_stores[name].get_key_value_store() 86 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/_punctuation_queue.py: -------------------------------------------------------------------------------- 1 | from queue import PriorityQueue 2 | from collections import namedtuple 3 | 4 | 5 | class PunctuationSchedule(namedtuple('PunctuationSchedule', ['timestamp', 'node', 'interval'])): 6 | def __lt__(self, other): 7 | return self.timestamp < other.timestamp 8 | 9 | 10 | class PunctuationQueue: 11 | 12 | def __init__(self, punctuate): 13 | self.pq = PriorityQueue() 14 | self.punctuate = punctuate 15 | 16 | def schedule(self, node, interval): 17 | self.pq.put(PunctuationSchedule(0, node, interval)) 18 | 19 | def may_punctuate(self, timestamp): 20 | punctuated = False 21 | while not self.pq.empty(): 22 | top = self.pq.get() 23 | if 
top.timestamp <= timestamp: 24 | self.punctuate(top.node, timestamp) 25 | punctuated = True 26 | next_timestamp = top.interval + (timestamp if top.timestamp == 0 else top.timestamp) 27 | self.pq.put(PunctuationSchedule(next_timestamp, top.node, top.interval)) 28 | else: 29 | self.pq.put(top) 30 | break 31 | return punctuated 32 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/_record_collector.py: -------------------------------------------------------------------------------- 1 | """ 2 | Record collector sends produced results to kafka topic 3 | 4 | """ 5 | 6 | import logging 7 | 8 | from ..errors.kafka_streams_error import KafkaStreamsError 9 | 10 | log = logging.getLogger(__name__) 11 | 12 | # When producing a message with partition = UA rdkafka will run a partitioner for us 13 | RD_KAFKA_PARTITION_UA = -1 14 | 15 | 16 | class RecordCollector: 17 | """ 18 | Collects records to be output to Kafka topics after 19 | they have been processed by the topology 20 | 21 | """ 22 | 23 | def __init__(self, _producer, _key_serde, _value_serde): 24 | self.producer = _producer 25 | self.key_serde = _key_serde 26 | self.value_serde = _value_serde 27 | 28 | def send(self, topic, key, value, timestamp, 29 | *, partition=RD_KAFKA_PARTITION_UA, partitioner=None): 30 | ser_key = self.key_serde.serializer.serialize(topic, key) 31 | ser_value = self.value_serde.serializer.serialize(topic, value) 32 | produced = False 33 | 34 | log.debug("Sending to partition %d of topic %s : (%s, %s, %s)", partition, topic, ser_key, ser_value, timestamp) 35 | 36 | while not produced: 37 | try: 38 | self.producer.produce(topic, ser_value, ser_key, partition, self.on_delivery, partitioner, timestamp) 39 | self.producer.poll(0) # Ensure previous message's delivery reports are served 40 | produced = True 41 | except BufferError as be: 42 | log.exception(be) 43 | self.producer.poll(10) # Wait a bit longer to give buffer more time to flush 44 | except NotImplementedError as nie: 45 | log.exception(nie) 46 | produced = True # should not enter infinite loop 47 | 48 | def on_delivery(self, err, msg): 49 | """ 50 | Callback function after a value is output to a source. 51 | 52 | Will raise an exception if an error is detected. 53 | 54 | TODO: Decide if an error should be raised or if this should be demoted? 55 | Can an error be raised if a broker fails? Should we simply warn 56 | and continue to poll and retry in this case? 57 | """ 58 | 59 | # TODO: Is err correct? Should we check if msg has error? 
60 | if err: 61 | raise KafkaStreamsError(f'Error on delivery of message {msg}') 62 | 63 | def flush(self): 64 | """ 65 | Flush all pending items in the queue to the output topic on Kafka 66 | 67 | """ 68 | log.debug('Flushing producer') 69 | self.producer.flush() 70 | 71 | def close(self): 72 | log.debug('Closing producer') 73 | self.producer.close() 74 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/_stream_task.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import queue 3 | 4 | from confluent_kafka import TopicPartition 5 | from confluent_kafka.cimpl import KafkaException, KafkaError 6 | 7 | from winton_kafka_streams.processor.serialization.serdes import BytesSerde 8 | from ..errors._kafka_error_codes import _get_invalid_producer_epoch_code 9 | from ._punctuation_queue import PunctuationQueue 10 | from ._record_collector import RecordCollector 11 | from .processor_context import ProcessorContext 12 | from .wallclock_timestamp import WallClockTimeStampExtractor 13 | from ..errors.task_migrated_error import TaskMigratedError 14 | from ..processor.serialization.serdes import serde_from_string 15 | 16 | 17 | class DummyRecord: 18 | """ 19 | Dummy implementation of Record that provides the minimum needed 20 | to supply a timestamp to Context during punctuate. 21 | """ 22 | 23 | def __init__(self, timestamp): 24 | self._timestamp = timestamp 25 | 26 | def topic(self): 27 | return '__null_topic__' 28 | 29 | def partition(self): 30 | return -1 31 | 32 | def offset(self): 33 | return -1 34 | 35 | def timestamp(self): 36 | return self._timestamp 37 | 38 | 39 | 40 | 41 | _taskMigratedErrorCodes = [KafkaError.ILLEGAL_GENERATION, 42 | KafkaError.REBALANCE_IN_PROGRESS, 43 | KafkaError.UNKNOWN_MEMBER_ID, 44 | _get_invalid_producer_epoch_code()] 45 | 46 | 47 | class StreamTask: 48 | """ 49 | Stream tasks are associated with a partition group(s) 50 | and are responsible for passing values from that partition 51 | to an instance of the topology for processing. 52 | 53 | """ 54 | 55 | def __init__(self, _task_id, _application_id, _partitions, _topology_builder, _consumer, _producer, _config): 56 | self.log = logging.getLogger(__name__ + '(' + str(_task_id) + ')') 57 | self.task_id = _task_id 58 | self.application_id = _application_id 59 | self.partitions = _partitions 60 | self.topology = _topology_builder.build() 61 | self.state_stores = {name: store.get() for name, store in self.topology.state_stores.items()} 62 | self.consumer = _consumer 63 | self.producer = _producer 64 | self.config = _config 65 | 66 | self.key_serde = serde_from_string(self.config.KEY_SERDE) 67 | self.key_serde.configure(self.config, True) 68 | self.value_serde = serde_from_string(self.config.VALUE_SERDE) 69 | self.value_serde.configure(self.config, False) 70 | 71 | self.record_collector = RecordCollector(self.producer, self.key_serde, self.value_serde) 72 | self.state_record_collector = RecordCollector(self.producer, BytesSerde(), BytesSerde()) 73 | 74 | self.queue = queue.Queue() 75 | self.context = ProcessorContext(self.task_id, self, self.record_collector, 76 | self.state_record_collector, self.state_stores) 77 | 78 | self.punctuation_queue = PunctuationQueue(self.punctuate) 79 | # TODO: use the configured timestamp extractor. 
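# For now the wall-clock extractor below is used unconditionally (see the TODO above);
# RecordTimeStampExtractor in processor/extract_timestamp.py is the record-timestamp
# alternative provided by this package.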
80 | self.timestamp_extractor = WallClockTimeStampExtractor() 81 | self.current_timestamp = None 82 | 83 | self.commitRequested = False 84 | self.commitOffsetNeeded = False 85 | self.consumedOffsets = {} 86 | 87 | self._init_state_stores() 88 | self._init_topology(self.context) 89 | 90 | def _init_state_stores(self): 91 | self.log.debug(f'Initialising state stores') 92 | for store in self.state_stores.values(): 93 | store.initialize(self.context, store) 94 | 95 | def _init_topology(self, context): 96 | for node in self.topology.nodes.values(): 97 | try: 98 | context.current_node = node 99 | node.initialise(context) 100 | finally: 101 | context.current_node = None 102 | context.current_record = None 103 | 104 | def add_records(self, records): 105 | for record in records: 106 | self.queue.put(record) 107 | 108 | def process(self): 109 | if self.queue.empty(): 110 | return False 111 | 112 | record = self.queue.get() 113 | self.context.current_record = record 114 | self.current_timestamp = self.timestamp_extractor.extract(record, self.current_timestamp) 115 | 116 | topic = record.topic() 117 | raw_key = record.key() 118 | key = None if raw_key is None else self.key_serde.deserializer.deserialize(topic, record.key()) 119 | value = self.value_serde.deserializer.deserialize(topic, record.value()) 120 | 121 | self.context.current_node = self.topology.sources[topic] 122 | self.topology.sources[topic].process(key, value) 123 | 124 | self.consumedOffsets[(topic, record.partition())] = record.offset() 125 | self.commitOffsetNeeded = True 126 | 127 | self.context.current_record = None 128 | self.context.current_node = None 129 | 130 | return True 131 | 132 | def maybe_punctuate(self): 133 | timestamp = self.current_timestamp 134 | 135 | if timestamp is None: 136 | return False 137 | 138 | return self.punctuation_queue.may_punctuate(timestamp) 139 | 140 | def punctuate(self, node, timestamp): 141 | self.log.debug(f'Punctuating processor {node} at {timestamp}') 142 | self.context.current_record = DummyRecord(timestamp) 143 | self.context.current_node = node 144 | node.punctuate(timestamp) 145 | self.context.current_record = None 146 | self.context.current_node = None 147 | 148 | def commit(self): 149 | try: 150 | self.record_collector.flush() 151 | self.commit_offsets() 152 | self.commitRequested = False 153 | except Exception as e: 154 | self.log.exception(e) 155 | raise 156 | 157 | def commit_offsets(self): 158 | """ Commit consumed offsets if needed """ 159 | 160 | # may be asked to commit on rebalance or shutdown but 161 | # should only commit if the processor has requested. 
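# Kafka convention: the committed offset is the position of the next record to be
# consumed, hence the `o + 1` when building the TopicPartition list below.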
162 | try: 163 | if self.commitOffsetNeeded: 164 | offsets_to_commit = [TopicPartition(t, p, o + 1) for ((t, p), o) in self.consumedOffsets.items()] 165 | self.consumer.commit(offsets=offsets_to_commit, asynchronous=False) 166 | self.consumedOffsets.clear() 167 | self.commitOffsetNeeded = False 168 | 169 | except KafkaException as ke: 170 | kafka_error = ke.args[0].code() 171 | 172 | if kafka_error in _taskMigratedErrorCodes: 173 | raise TaskMigratedError(f'{self} migrated.') 174 | else: 175 | raise 176 | 177 | def commit_needed(self): 178 | return self.commitRequested 179 | 180 | def need_commit(self): 181 | self.commitRequested = True 182 | 183 | def schedule(self, interval): 184 | self.punctuation_queue.schedule(self.context.current_node, interval) 185 | 186 | def __repr__(self): 187 | return self.__class__.__name__ + f":{self.task_id}" 188 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/_stream_thread.py: -------------------------------------------------------------------------------- 1 | """ 2 | Kafka consumer poll thread 3 | 4 | """ 5 | 6 | import logging 7 | import threading 8 | from enum import Enum 9 | from itertools import zip_longest 10 | 11 | from confluent_kafka import KafkaError 12 | 13 | from ..errors.task_migrated_error import TaskMigratedError 14 | from .task_id import TaskId 15 | from ._stream_task import StreamTask 16 | 17 | 18 | class StreamThread: 19 | """ 20 | Stream thread states are the possible states that a stream thread can be in. 21 | A thread must only be in one state at a time 22 | The expected state transitions with the following defined states is: 23 | 24 |
 25 |                      +-------------+
 26 |                      | Not Running | <-------+
 27 |                      +-----+-------+         |
 28 |                            |                 |
 29 |                            v                 |
 30 |                      +-----+-------+         |
 31 |                +<--- | Running     | <----+  |
 32 |                |     +-----+-------+      |  |
 33 |                |           |              |  |
 34 |                |           v              |  |
 35 |                |     +-----+-------+      |  |
 36 |                +<--- | Partitions  |      |  |
 37 |                |     | Revoked     |      |  |
 38 |                |     +-----+-------+      |  |
 39 |                |           |              |  |
 40 |                |           v              |  |
 41 |                |     +-----+-------+      |  |
 42 |                |     | Assigning   |      |  |
 43 |                |     | Partitions  | ---->+  |
 44 |                |     +-----+-------+         |
 45 |                |           |                 |
 46 |                |           v                 |
 47 |                |     +-----+-------+         |
 48 |                +---> | Pending     | ------->+
 49 |                      | Shutdown    |
 50 |                      +-------------+
 51 |       
52 | """ 53 | 54 | class State(Enum): 55 | NOT_RUNNING = 0 56 | RUNNING = 1 57 | PARTITIONS_REVOKED = 2 58 | ASSIGNING_PARTITIONS = 3 59 | PENDING_SHUTDOWN = 4 60 | 61 | def valid_transition_to(self, new_state): 62 | if self is self.NOT_RUNNING: 63 | return new_state in (self.RUNNING,) 64 | elif self is self.RUNNING: 65 | return new_state in (self.PARTITIONS_REVOKED, self.PENDING_SHUTDOWN) 66 | elif self is self.PARTITIONS_REVOKED: 67 | return new_state in (self.PENDING_SHUTDOWN, self.ASSIGNING_PARTITIONS) 68 | elif self is self.ASSIGNING_PARTITIONS: 69 | return new_state in (self.RUNNING, self.PENDING_SHUTDOWN) 70 | elif self is self.PENDING_SHUTDOWN: 71 | return new_state in (self.NOT_RUNNING,) 72 | else: 73 | return False 74 | 75 | def is_running(self): 76 | return self not in (self.NOT_RUNNING, self.PENDING_SHUTDOWN) 77 | 78 | def __str__(self): 79 | return self.name 80 | 81 | def __init__(self, _topology_builder, _config, _kafka_supplier): 82 | super().__init__() 83 | self.topology_builder = _topology_builder 84 | self.config = _config 85 | self.kafka_supplier = _kafka_supplier 86 | 87 | self.tasks = [] 88 | self.tasks_by_partition = {} 89 | self.state = self.State.NOT_RUNNING 90 | 91 | self.topics = _topology_builder.topics 92 | 93 | self.thread = threading.Thread(target=self.run) 94 | self.log = logging.getLogger(__name__ + '(' + self.thread.name + ')') 95 | 96 | self.log.info('Topics for consumer are: %s', self.topics) 97 | self.consumer = self.kafka_supplier.consumer() 98 | 99 | self.state_listener = None 100 | self.set_state(self.State.RUNNING) 101 | 102 | def thread_id(self): 103 | return self.thread.ident 104 | 105 | def set_state(self, new_state): 106 | old_state = self.state 107 | if not old_state.valid_transition_to(new_state): 108 | self.log.warning(f'Unexpected state transition from {old_state} to {new_state}.') 109 | else: 110 | self.log.info(f'State transition from {old_state} to {new_state}.') 111 | self.state = new_state 112 | if self.state_listener: 113 | self.state_listener(self, old_state, new_state) 114 | 115 | def set_state_when_not_in_pending_shutdown(self, new_state): 116 | if self.state is not self.State.PENDING_SHUTDOWN: 117 | self.set_state(new_state) 118 | 119 | def set_state_listener(self, listener): 120 | """ For internal use only. """ 121 | self.state_listener = listener 122 | 123 | def still_running(self): 124 | return self.state.is_running() 125 | 126 | def start(self): 127 | self.thread.start() 128 | 129 | def run(self): 130 | self.log.debug('Running stream thread...') 131 | try: 132 | self.consumer.subscribe(self.topics, on_assign=self.on_assign, on_revoke=self.on_revoke) 133 | 134 | while self.still_running(): 135 | try: 136 | records = self.poll_requests(0.1) 137 | if records: 138 | self.log.debug(f'Processing {len(records)} record(s)') 139 | self.add_records_to_tasks(records) 140 | self.process_and_punctuate() 141 | except TaskMigratedError as error: 142 | self.log.warning(f"Detected a task that got migrated to another thread. " + 143 | "This implies that this thread missed a rebalance and dropped out of the " 144 | "consumer group. " + 145 | "Trying to rejoin the consumer group now. 
%s", error) 146 | 147 | self.log.debug('Ending stream thread...') 148 | finally: 149 | self.commit_all() 150 | self.shutdown() 151 | 152 | def poll_requests(self, poll_timeout): 153 | """ Get the next batch of records """ 154 | 155 | # The current python kafka client gives us messages one by one, 156 | # but for better throughput we want to process many records at once. 157 | # Keep polling until we get no more records out. 158 | records = [] 159 | record = self.consumer.poll(poll_timeout) 160 | while record is not None: 161 | if not record.error(): 162 | self.log.debug('Received message at offset: %d', record.offset()) 163 | records.append(record) 164 | record = self.consumer.poll(0.) 165 | elif record.error().code() == KafkaError._PARTITION_EOF: 166 | record = self.consumer.poll(0.) 167 | elif record.error(): 168 | self.log.error('Record error received: %s', record.error()) 169 | 170 | return records 171 | 172 | def add_records_to_tasks(self, records): 173 | for record in records: 174 | self.tasks_by_partition[record.partition()].add_records([record]) 175 | 176 | def process_and_punctuate(self): 177 | while True: 178 | total_processed_each_round = 0 179 | 180 | for task in self.tasks: 181 | if task.process(): 182 | total_processed_each_round += 1 183 | 184 | if total_processed_each_round == 0: 185 | break 186 | 187 | for task in self.tasks: 188 | task.maybe_punctuate() 189 | if task.commit_needed(): 190 | self.commit(task) 191 | 192 | def commit(self, task): 193 | self.log.debug('Commit task "%s"', task) 194 | task.commit() 195 | 196 | def commit_all(self): 197 | for task in self.tasks: 198 | self.commit(task) 199 | 200 | def shutdown(self): 201 | self.set_state(self.State.NOT_RUNNING) 202 | 203 | def add_stream_tasks(self, assignment): 204 | # simplistic, but good enough for now. should take co-locating topics etc. 
into account in the future 205 | grouped_tasks = {TaskId(topic_partition.topic, topic_partition.partition): {topic_partition} 206 | for topic_partition in assignment} 207 | self.tasks = [StreamTask(task_id, self.config.APPLICATION_ID, 208 | partitions, self.topology_builder, self.consumer, 209 | self.kafka_supplier.producer(), self.config) 210 | for (task_id, partitions) 211 | in grouped_tasks.items()] 212 | 213 | for task in self.tasks: 214 | self.tasks_by_partition.update( 215 | zip_longest((topic_partition.partition for topic_partition in task.partitions), [], fillvalue=task)) 216 | 217 | def on_assign(self, consumer, partitions): 218 | self.log.debug('Assigning partitions %s', partitions) 219 | 220 | self.set_state_when_not_in_pending_shutdown(self.State.ASSIGNING_PARTITIONS) 221 | self.add_stream_tasks(partitions) 222 | self.set_state_when_not_in_pending_shutdown(self.State.RUNNING) 223 | 224 | def on_revoke(self, consumer, partitions): 225 | self.log.debug('Revoking partitions %s', partitions) 226 | self.commit_all() 227 | self.set_state_when_not_in_pending_shutdown(self.State.PARTITIONS_REVOKED) 228 | self.tasks = [] 229 | self.tasks_by_partition = {} 230 | 231 | def close(self): 232 | self.log.debug('Closing stream thread and consumer') 233 | self.set_state(self.State.PENDING_SHUTDOWN) 234 | self.consumer.close() 235 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/_timestamp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for all timestamp extractors 3 | 4 | """ 5 | 6 | import abc 7 | 8 | 9 | class TimeStampExtractor(metaclass=abc.ABCMeta): 10 | @abc.abstractmethod 11 | def extract(self, record, previous_timestamp): 12 | pass 13 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/extract_timestamp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Time extractor from the message being processed 3 | 4 | """ 5 | 6 | import abc 7 | 8 | from ._timestamp import TimeStampExtractor 9 | 10 | 11 | class RecordTimeStampExtractor(TimeStampExtractor): 12 | """ 13 | Time stamp extractor that returns a time taken from the message itself 14 | 15 | This is an abstract class, the on_error function must be implemented to 16 | use this extractor. 
17 | """ 18 | 19 | def extract(self, record, previous_timestamp): 20 | """ 21 | Returns kafka timestamp for message 22 | 23 | Parameters: 24 | ----------- 25 | record : Kafka record 26 | New record from which time should be assigned 27 | previous_timestamp : long 28 | Last extracted timestamp (seconds since the epoch) 29 | 30 | Returns: 31 | -------- 32 | time : long 33 | Time in seconds since the epoch 34 | """ 35 | (timestamp_type, timestamp) = record.timestamp() 36 | if timestamp < 0: 37 | return self.on_error(record, timestamp, previous_timestamp) 38 | 39 | return timestamp 40 | 41 | @abc.abstractmethod 42 | def on_error(self, record, timestamp, previous_timestamp): 43 | """ 44 | Called when an invalid timestamp is found in a record 45 | 46 | Parameters: 47 | record : Kafka record 48 | The current record being processed 49 | timestamp : long 50 | The (invalid) timestamp that was processed 51 | previous_timestamp : long 52 | Last extracted timestamp (seconds since the epoch) 53 | """ 54 | pass 55 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base definitions for all processors 3 | 4 | """ 5 | 6 | import logging 7 | 8 | log = logging.getLogger(__name__) 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | super().__init__() 14 | 15 | self.name = None 16 | self.context = None 17 | 18 | def initialise(self, _name, _context): 19 | self.name = _name 20 | self.context = _context 21 | 22 | 23 | class SourceProcessor(BaseProcessor): 24 | """ 25 | Fetches values from a kafka topic(s)and forwards 26 | them to child node for processing 27 | 28 | """ 29 | 30 | def __init__(self, topics): 31 | super().__init__() 32 | self.topics = topics 33 | 34 | def process(self, key, value): 35 | self.context.forward(key, value) 36 | 37 | def punctuate(self, timestamp): 38 | pass 39 | 40 | 41 | class SinkProcessor(BaseProcessor): 42 | """ 43 | Forward values from processor nodes to the record collector 44 | from where they will be written to a Kafka topic 45 | 46 | """ 47 | 48 | def __init__(self, _topic): 49 | super().__init__() 50 | self.topic = _topic 51 | 52 | def process(self, key, value): 53 | self._send(key, value, self.context.timestamp) 54 | 55 | def punctuate(self, timestamp): 56 | pass 57 | 58 | def _send(self, key, value, timestamp): 59 | self.context.record_collector.send(self.topic, key, value, timestamp) 60 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/processor_context.py: -------------------------------------------------------------------------------- 1 | """ 2 | Default context passed to every processor 3 | 4 | """ 5 | import logging 6 | 7 | from . import _context 8 | 9 | log = logging.getLogger(__name__) 10 | 11 | 12 | class ProcessorContext(_context.Context): 13 | """ 14 | The same processor context is shared betwen all nodes in 15 | a single topology instance. It takes care of forwarding 16 | values to downstream processors. 
17 | 18 | """ 19 | def __init__(self, _task_id, _task, _record_collector, _state_record_collector, _state_stores): 20 | 21 | super().__init__(_state_record_collector, _state_stores) 22 | 23 | self.application_id = _task.application_id 24 | self.task_id = _task_id 25 | self.task = _task 26 | self.record_collector = _record_collector 27 | 28 | def commit(self): 29 | """ 30 | Request a commit 31 | 32 | Returns: 33 | -------- 34 | - None 35 | 36 | """ 37 | 38 | self.task.need_commit() 39 | 40 | def forward(self, key, value): 41 | """ 42 | Forward the key/value to the next node in the topology 43 | 44 | """ 45 | previous_node = self.current_node 46 | try: 47 | for child in self.current_node.children: 48 | self.current_node = child 49 | child.process(key, value) 50 | finally: 51 | self.current_node = previous_node 52 | 53 | def schedule(self, timestamp): 54 | """ 55 | Schedule the punctuation function call 56 | 57 | """ 58 | self.task.schedule(timestamp) 59 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/__init__.py: -------------------------------------------------------------------------------- 1 | from .serde import Serde 2 | from .serializer import Serializer 3 | from .deserializer import Deserializer 4 | 5 | from ._avro import AvroDeserializer, AvroSerializer 6 | from ._bytes import BytesDeserializer, BytesSerializer 7 | from ._double import DoubleDeserializer, DoubleSerializer 8 | from ._float import FloatDeserializer, FloatSerializer 9 | from ._integer import IntegerDeserializer, IntegerSerializer 10 | from ._json import JsonDeserializer, JsonSerializer 11 | from ._long import LongDeserializer, LongSerializer 12 | from ._string import StringDeserializer, StringSerializer 13 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/_avro.py: -------------------------------------------------------------------------------- 1 | from confluent_kafka.avro import CachedSchemaRegistryClient, MessageSerializer 2 | from confluent_kafka.avro import loads as avro_loads 3 | 4 | from .serde import extract_config_property 5 | from .deserializer import Deserializer 6 | from .serializer import Serializer 7 | 8 | 9 | class AvroHelper: 10 | def __init__(self): 11 | self._is_key = False 12 | self._schema_registry = None 13 | self._serializer = None 14 | self._schema = None 15 | 16 | def _set_serializer(self, schema_registry): 17 | self._schema_registry = schema_registry 18 | self._serializer = MessageSerializer(registry_client=self._schema_registry) 19 | 20 | def configure(self, configs, is_key): 21 | self._is_key = is_key 22 | schema_registry_url = extract_config_property(configs, is_key, 'AVRO_SCHEMA_REGISTRY') 23 | schema = extract_config_property(configs, is_key, 'AVRO_SCHEMA') 24 | 25 | if schema_registry_url is None: 26 | raise Exception("Missing Avro Schema Registry Url") 27 | else: 28 | self._set_serializer(CachedSchemaRegistryClient(url=schema_registry_url)) 29 | 30 | if schema: 31 | self._schema = avro_loads(schema) 32 | 33 | def serialize(self, topic, data): 34 | if self._schema is None: 35 | raise Exception("Missing Avro Schema") 36 | 37 | return self._serializer.encode_record_with_schema(topic, self._schema, data, is_key=self._is_key) 38 | 39 | def deserialize(self, topic, data): 40 | return self._serializer.decode_message(data) 41 | 42 | 43 | class AvroSerializer(Serializer): 44 | def __init__(self): 45 | self._avro_helper = AvroHelper() 46 | 47 
| def serialize(self, topic, data): 48 | return self._avro_helper.serialize(topic, data) 49 | 50 | def configure(self, configs, is_key): 51 | self._avro_helper.configure(configs, is_key) 52 | 53 | def close(self): 54 | pass 55 | 56 | 57 | class AvroDeserializer(Deserializer): 58 | def __init__(self): 59 | self._avro_helper = AvroHelper() 60 | 61 | def deserialize(self, topic, data): 62 | return self._avro_helper.deserialize(topic, data) 63 | 64 | def configure(self, configs, is_key): 65 | self._avro_helper.configure(configs, is_key) 66 | 67 | def close(self): 68 | pass 69 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/_bytes.py: -------------------------------------------------------------------------------- 1 | from .deserializer import Deserializer 2 | from .serializer import Serializer 3 | 4 | 5 | class BytesSerializer(Serializer[bytes]): 6 | def serialize(self, topic: str, data: bytes) -> bytes: 7 | return data 8 | 9 | def configure(self, configs, is_key): 10 | pass 11 | 12 | def close(self): 13 | pass 14 | 15 | 16 | class BytesDeserializer(Deserializer[bytes]): 17 | def deserialize(self, topic: str, data: bytes) -> bytes: 18 | return data 19 | 20 | def configure(self, configs, is_key): 21 | pass 22 | 23 | def close(self): 24 | pass 25 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/_double.py: -------------------------------------------------------------------------------- 1 | from .deserializer import Deserializer 2 | from .serializer import Serializer 3 | import struct 4 | 5 | 6 | class DoubleSerializer(Serializer[float]): 7 | def serialize(self, topic: str, data: float) -> bytes: 8 | return struct.pack('d', data) 9 | 10 | def configure(self, configs, is_key): 11 | pass 12 | 13 | def close(self): 14 | pass 15 | 16 | 17 | class DoubleDeserializer(Deserializer[float]): 18 | def deserialize(self, topic: str, data: bytes) -> float: 19 | return struct.unpack('d', data)[0] 20 | 21 | def configure(self, configs, is_key): 22 | pass 23 | 24 | def close(self): 25 | pass 26 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/_float.py: -------------------------------------------------------------------------------- 1 | from .deserializer import Deserializer 2 | from .serializer import Serializer 3 | import struct 4 | 5 | 6 | class FloatSerializer(Serializer[float]): 7 | def serialize(self, topic: str, data: float) -> bytes: 8 | return struct.pack('f', data) 9 | 10 | def configure(self, configs, is_key): 11 | pass 12 | 13 | def close(self): 14 | pass 15 | 16 | 17 | class FloatDeserializer(Deserializer[float]): 18 | def deserialize(self, topic: str, data: bytes) -> float: 19 | return struct.unpack('f', data)[0] 20 | 21 | def configure(self, configs, is_key): 22 | pass 23 | 24 | def close(self): 25 | pass 26 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/_integer.py: -------------------------------------------------------------------------------- 1 | from .serde import extract_config_property 2 | from .deserializer import Deserializer 3 | from .serializer import Serializer 4 | 5 | 6 | class IntegerSerializer(Serializer[int]): 7 | def __init__(self): 8 | self.byte_order = 'little' 9 | self.signed = True 10 | self.int_size = 4 11 | 12 | def serialize(self, topic: str, data: int) -> 
bytes: 13 | return int(data).to_bytes(length=self.int_size, byteorder=self.byte_order, signed=self.signed) 14 | 15 | def configure(self, configs, is_key): 16 | self.byte_order = extract_config_property(configs, is_key, 'SERIALIZER_BYTEORDER', self.byte_order) 17 | self.signed = extract_config_property(configs, is_key, 'SERIALIZER_SIGNED', str(self.signed)).lower() == 'true' 18 | 19 | def close(self): 20 | pass 21 | 22 | 23 | class IntegerDeserializer(Deserializer[int]): 24 | def __init__(self): 25 | self.byte_order = 'little' 26 | self.signed = True 27 | 28 | def deserialize(self, topic: str, data: bytes) -> int: 29 | return int.from_bytes(bytes=data, byteorder=self.byte_order, signed=self.signed) 30 | 31 | def configure(self, configs, is_key): 32 | self.byte_order = extract_config_property(configs, is_key, 'DESERIALIZER_BYTEORDER', self.byte_order) 33 | self.signed = extract_config_property(configs, is_key, 'DESERIALIZER_SIGNED', str(self.signed)).lower() == 'true' 34 | 35 | def close(self): 36 | pass 37 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/_json.py: -------------------------------------------------------------------------------- 1 | from ._string import StringSerializer, StringDeserializer 2 | from .deserializer import Deserializer 3 | from .serializer import Serializer 4 | import json 5 | 6 | 7 | class JsonSerializer(Serializer): 8 | def __init__(self): 9 | self.string_serializer = StringSerializer() 10 | 11 | def serialize(self, topic, data): 12 | string_form = json.dumps(data) 13 | return self.string_serializer.serialize(topic, string_form) 14 | 15 | def configure(self, configs, is_key): 16 | self.string_serializer.configure(configs, is_key) 17 | 18 | def close(self): 19 | pass 20 | 21 | 22 | class JsonDeserializer(Deserializer): 23 | def __init__(self): 24 | self.string_deserializer = StringDeserializer() 25 | 26 | def deserialize(self, topic, data): 27 | string_form = self.string_deserializer.deserialize(topic, data) 28 | return json.loads(string_form) 29 | 30 | def configure(self, configs, is_key): 31 | self.string_deserializer.configure(configs, is_key) 32 | 33 | def close(self): 34 | pass 35 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/_long.py: -------------------------------------------------------------------------------- 1 | from ._integer import IntegerSerializer, IntegerDeserializer 2 | 3 | 4 | class LongSerializer(IntegerSerializer): 5 | def __init__(self): 6 | super(LongSerializer, self).__init__() 7 | self.int_size = 8 8 | 9 | 10 | class LongDeserializer(IntegerDeserializer): 11 | def __init__(self): 12 | super(LongDeserializer, self).__init__() 13 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/_string.py: -------------------------------------------------------------------------------- 1 | from .serde import extract_config_property 2 | from .deserializer import Deserializer 3 | from .serializer import Serializer 4 | 5 | 6 | class StringSerializer(Serializer[str]): 7 | def __init__(self): 8 | self.encoding = 'utf-8' 9 | self.on_error = 'strict' 10 | 11 | def serialize(self, topic: str, data: str) -> bytes: 12 | return str(data).encode(self.encoding, self.on_error) 13 | 14 | def configure(self, configs, is_key): 15 | self.encoding = extract_config_property(configs, is_key, 'SERIALIZER_ENCODING', self.encoding) 16 | 
self.on_error = extract_config_property(configs, is_key, 'SERIALIZER_ERROR', self.on_error) 17 | 18 | def close(self): 19 | pass 20 | 21 | 22 | class StringDeserializer(Deserializer[str]): 23 | def __init__(self): 24 | self.encoding = 'utf-8' 25 | self.on_error = 'strict' 26 | 27 | def deserialize(self, topic: str, data: bytes) -> str: 28 | return data.decode(self.encoding, self.on_error) 29 | 30 | def configure(self, configs, is_key): 31 | self.encoding = extract_config_property(configs, is_key, 'DESERIALIZER_ENCODING', self.encoding) 32 | self.on_error = extract_config_property(configs, is_key, 'DESERIALIZER_ERROR', self.on_error) 33 | 34 | def close(self): 35 | pass 36 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/deserializer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for deserializer implementations 3 | 4 | """ 5 | 6 | import abc 7 | 8 | from typing import TypeVar, Generic 9 | 10 | T = TypeVar('T') 11 | 12 | 13 | class Deserializer(Generic[T], metaclass=abc.ABCMeta): 14 | """ 15 | Configure this deserializer. 16 | 17 | Parameters: 18 | ----------- 19 | configs : dict 20 | configs in key/value pairs 21 | is_key : bool 22 | whether is for key or value 23 | """ 24 | @abc.abstractmethod 25 | def configure(self, configs, is_key): 26 | pass 27 | 28 | """ 29 | Convert a bytes into typed data. 30 | 31 | Parameters: 32 | ----------- 33 | topic : string 34 | data : bytes 35 | 36 | Returns: 37 | -------- 38 | deserialized_data : typed data 39 | """ 40 | @abc.abstractmethod 41 | def deserialize(self, topic: str, data: bytes) -> T: 42 | pass 43 | 44 | """ 45 | Close this deserializer. 46 | This method has to be idempotent because it might be called multiple times. 
47 | """ 48 | @abc.abstractmethod 49 | def close(self): 50 | pass 51 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serde.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for deserializer implementations 3 | 4 | """ 5 | 6 | import abc 7 | 8 | from typing import TypeVar, Generic 9 | 10 | from .deserializer import Deserializer 11 | from .serializer import Serializer 12 | 13 | T = TypeVar('T') 14 | TSer = TypeVar('TSer') 15 | TDe = TypeVar('TDe') 16 | 17 | 18 | def extract_config_property(configs, is_key, property_name, default_value=None): 19 | overridden_property_name = ('KEY_%s' % property_name) if is_key else ('VALUE_%s' % property_name) 20 | prop_value = getattr(configs, overridden_property_name, None) 21 | if prop_value is None: 22 | prop_value = getattr(configs, property_name, default_value) 23 | return prop_value 24 | 25 | 26 | class AsymmetricSerde(Generic[TSer, TDe], metaclass=abc.ABCMeta): 27 | @property 28 | @abc.abstractmethod 29 | def serializer(self) -> Serializer[TSer]: 30 | pass 31 | 32 | @property 33 | @abc.abstractmethod 34 | def deserializer(self) -> Deserializer[TDe]: 35 | pass 36 | 37 | @abc.abstractmethod 38 | def configure(self, configs, is_key): 39 | pass 40 | 41 | @abc.abstractmethod 42 | def close(self): 43 | pass 44 | 45 | 46 | class Serde(AsymmetricSerde[T, T]): 47 | """ 48 | Get Serializer 49 | 50 | Returns: 51 | -------- 52 | serializer : Serializer 53 | """ 54 | 55 | @property 56 | @abc.abstractmethod 57 | def serializer(self) -> Serializer[T]: 58 | pass 59 | 60 | """ 61 | Get Deserializer 62 | 63 | Returns: 64 | -------- 65 | deserializer : Deserializer 66 | """ 67 | 68 | @property 69 | @abc.abstractmethod 70 | def deserializer(self) -> Deserializer[T]: 71 | pass 72 | 73 | """ 74 | Configure this class, which will configure the underlying serializer and deserializer. 75 | 76 | Parameters: 77 | ----------- 78 | configs : dict 79 | configs in key/value pairs 80 | is_key : bool 81 | whether is for key or value 82 | """ 83 | 84 | @abc.abstractmethod 85 | def configure(self, configs, is_key): 86 | pass 87 | 88 | """ 89 | Close this serde class, which will close the underlying serializer and deserializer. 90 | This method has to be idempotent because it might be called multiple times. 
91 | """ 92 | 93 | @abc.abstractmethod 94 | def close(self): 95 | pass 96 | 97 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/__init__.py: -------------------------------------------------------------------------------- 1 | from .bytes_serde import BytesSerde 2 | from .float_serde import FloatSerde 3 | from .double_serde import DoubleSerde 4 | from .integer_serde import IntegerSerde 5 | from .long_serde import LongSerde 6 | from .string_serde import StringSerde 7 | from .json_serde import JsonSerde 8 | from .avro_serde import AvroSerde 9 | 10 | from ._serdes import serde_from_string 11 | from ._serdes import serde_as_string 12 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/_serdes.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import inspect 3 | 4 | 5 | def serde_from_string(serde_name): 6 | module_name, class_name = serde_name.rsplit(".", 1) 7 | module = importlib.import_module(module_name) 8 | SerdeClass = getattr(module, class_name) 9 | return SerdeClass() 10 | 11 | 12 | def serde_as_string(serde): 13 | module_name = serde.__module__ 14 | class_name = serde.__name__ if inspect.isclass(serde) else serde.__class__.__name__ 15 | return module_name + "." + class_name 16 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/avro_serde.py: -------------------------------------------------------------------------------- 1 | """ 2 | Avro Serde 3 | 4 | """ 5 | from .._avro import AvroSerializer, AvroDeserializer 6 | from .wrapper_serde import WrapperSerde 7 | 8 | 9 | class AvroSerde(WrapperSerde): 10 | """ 11 | Avro Serde that will use Avro and a schema registry 12 | for serialization and deserialization 13 | """ 14 | 15 | def __init__(self): 16 | serializer = AvroSerializer() 17 | deserializer = AvroDeserializer() 18 | super().__init__(serializer, deserializer) 19 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/bytes_serde.py: -------------------------------------------------------------------------------- 1 | """ 2 | Bytes Serde (default) 3 | 4 | """ 5 | from .._bytes import BytesSerializer, BytesDeserializer 6 | from .wrapper_serde import WrapperSerde 7 | 8 | 9 | class BytesSerde(WrapperSerde[bytes]): 10 | """ 11 | Bytes Serde that makes no changes to values 12 | during serialization or deserialization 13 | """ 14 | 15 | def __init__(self): 16 | serializer = BytesSerializer() 17 | deserializer = BytesDeserializer() 18 | super().__init__(serializer, deserializer) 19 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/double_serde.py: -------------------------------------------------------------------------------- 1 | """ 2 | Float Serde 3 | 4 | """ 5 | from .._double import DoubleDeserializer, DoubleSerializer 6 | from .wrapper_serde import WrapperSerde 7 | 8 | 9 | class DoubleSerde(WrapperSerde[float]): 10 | def __init__(self): 11 | serializer = DoubleSerializer() 12 | deserializer = DoubleDeserializer() 13 | super().__init__(serializer, deserializer) 14 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/float_serde.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Float Serde 3 | 4 | """ 5 | from .._float import FloatDeserializer, FloatSerializer 6 | from .wrapper_serde import WrapperSerde 7 | 8 | 9 | class FloatSerde(WrapperSerde[float]): 10 | def __init__(self): 11 | serializer = FloatSerializer() 12 | deserializer = FloatDeserializer() 13 | super().__init__(serializer, deserializer) 14 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/integer_serde.py: -------------------------------------------------------------------------------- 1 | """ 2 | Integer Serde 3 | 4 | """ 5 | from .._integer import IntegerDeserializer, IntegerSerializer 6 | from .wrapper_serde import WrapperSerde 7 | 8 | 9 | class IntegerSerde(WrapperSerde[int]): 10 | def __init__(self): 11 | serializer = IntegerSerializer() 12 | deserializer = IntegerDeserializer() 13 | super().__init__(serializer, deserializer) 14 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/json_serde.py: -------------------------------------------------------------------------------- 1 | """ 2 | Json Serde 3 | 4 | """ 5 | from .._json import JsonSerializer, JsonDeserializer 6 | from .wrapper_serde import WrapperSerde 7 | 8 | 9 | class JsonSerde(WrapperSerde): 10 | def __init__(self): 11 | serializer = JsonSerializer() 12 | deserializer = JsonDeserializer() 13 | super().__init__(serializer, deserializer) 14 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/long_serde.py: -------------------------------------------------------------------------------- 1 | """ 2 | Long Serde 3 | 4 | """ 5 | from .._long import LongDeserializer, LongSerializer 6 | from .wrapper_serde import WrapperSerde 7 | 8 | 9 | class LongSerde(WrapperSerde[int]): 10 | def __init__(self): 11 | serializer = LongSerializer() 12 | deserializer = LongDeserializer() 13 | super().__init__(serializer, deserializer) 14 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/string_serde.py: -------------------------------------------------------------------------------- 1 | """ 2 | String Serde 3 | 4 | """ 5 | from .._string import StringSerializer, StringDeserializer 6 | from .wrapper_serde import WrapperSerde 7 | 8 | 9 | class StringSerde(WrapperSerde[str]): 10 | def __init__(self): 11 | serializer = StringSerializer() 12 | deserializer = StringDeserializer() 13 | super().__init__(serializer, deserializer) 14 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/wrapper_serde.py: -------------------------------------------------------------------------------- 1 | """ 2 | Serde from a Serializer and Deserializer 3 | 4 | """ 5 | from typing import TypeVar 6 | 7 | from ..deserializer import Deserializer 8 | from ..serializer import Serializer 9 | from ..serde import AsymmetricSerde, Serde 10 | 11 | TSer = TypeVar('TSer') 12 | TDe = TypeVar('TDe') 13 | 14 | 15 | class AsymmetricWrapperSerde(AsymmetricSerde[TSer, TDe]): 16 | def __init__(self, serializer: Serializer[TSer], deserializer: Deserializer[TDe]) -> None: 17 | self._serializer = serializer 18 | self._deserializer = deserializer 19 | 20 | @property 21 | def serializer(self) -> 
Serializer[TSer]: 22 | return self._serializer 23 | 24 | @property 25 | def deserializer(self) -> Deserializer[TDe]: 26 | return self._deserializer 27 | 28 | def configure(self, configs, is_key) -> None: 29 | self.serializer.configure(configs, is_key) 30 | self.deserializer.configure(configs, is_key) 31 | 32 | def close(self) -> None: 33 | self.serializer.close() 34 | self.deserializer.close() 35 | 36 | 37 | T = TypeVar('T') 38 | 39 | 40 | class WrapperSerde(Serde[T]): 41 | def __init__(self, serializer: Serializer[T], deserializer: Deserializer[T]) -> None: 42 | self._serializer = serializer 43 | self._deserializer = deserializer 44 | 45 | @property 46 | def serializer(self) -> Serializer[T]: 47 | return self._serializer 48 | 49 | @property 50 | def deserializer(self) -> Deserializer[T]: 51 | return self._deserializer 52 | 53 | def configure(self, configs, is_key) -> None: 54 | self.serializer.configure(configs, is_key) 55 | self.deserializer.configure(configs, is_key) 56 | 57 | def close(self) -> None: 58 | self.serializer.close() 59 | self.deserializer.close() 60 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serializer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for serializer implementations 3 | 4 | """ 5 | 6 | import abc 7 | 8 | from typing import TypeVar, Generic 9 | 10 | T = TypeVar('T') 11 | 12 | 13 | class Serializer(Generic[T], metaclass=abc.ABCMeta): 14 | """ 15 | Configure this serializer. 16 | 17 | Parameters: 18 | ----------- 19 | configs : dict 20 | configs in key/value pairs 21 | is_key : bool 22 | whether is for key or value 23 | """ 24 | @abc.abstractmethod 25 | def configure(self, configs, is_key): 26 | pass 27 | 28 | """ 29 | Convert typed data into a bytes. 30 | 31 | Parameters: 32 | ----------- 33 | topic : string 34 | data : typed data 35 | 36 | Returns: 37 | -------- 38 | serialized_bytearray : bytes 39 | """ 40 | @abc.abstractmethod 41 | def serialize(self, topic: str, data: T) -> bytes: 42 | pass 43 | 44 | """ 45 | Close this serializer. 46 | This method has to be idempotent because it might be called multiple times. 
47 | """ 48 | @abc.abstractmethod 49 | def close(self): 50 | pass 51 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/task_id.py: -------------------------------------------------------------------------------- 1 | class TaskId: 2 | def __init__(self, topic_group_id, partition): 3 | self.topic_group_id = topic_group_id 4 | self.partition = partition 5 | 6 | def __repr__(self): 7 | return f"{self.topic_group_id}_{self.partition}" 8 | 9 | def __eq__(self, other): 10 | if other.__class__ is self.__class__: 11 | return (self.topic_group_id, self.partition) == (other.topic_group_id, other.partition) 12 | return False 13 | 14 | def __ne__(self, other): 15 | return not self.__eq__(other) 16 | 17 | def __hash__(self): 18 | return hash((self.topic_group_id, self.partition)) 19 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/topology.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classes for building a graph topology comprising processor derived nodes 3 | 4 | """ 5 | 6 | import logging 7 | 8 | from ..errors.kafka_streams_error import KafkaStreamsError 9 | from .processor import SourceProcessor, SinkProcessor 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | 14 | class ProcessorNode: 15 | def __init__(self, name, processor): 16 | self.name = name 17 | self.processor = processor 18 | self.children = [] 19 | self.state_stores = set() 20 | 21 | def initialise(self, _context): 22 | self.processor.initialise(self.name, _context) 23 | 24 | def process(self, key, value): 25 | self.processor.process(key, value) 26 | 27 | def punctuate(self, timestamp): 28 | self.processor.punctuate(timestamp) 29 | 30 | def __repr__(self): 31 | return self.__class__.__name__ + f"({self.processor.__class__}({self.name}))" 32 | 33 | 34 | class Topology: 35 | """ 36 | A realised instance of a topology 37 | 38 | """ 39 | def __init__(self, sources, processors, sinks, store_suppliers): 40 | self.nodes = {} 41 | self.sources = {} 42 | sources_list = [source_builder(self) for source_builder in sources] 43 | for source_node in sources_list: 44 | for topic in source_node.processor.topics: 45 | if topic in self.sources: 46 | raise KafkaStreamsError(f'Topic {topic} associated with more than one Source Processor') 47 | self.sources[topic] = source_node 48 | 49 | self.processors = [processor_builder(self) for processor_builder in processors] 50 | self.sinks = [sink_builder(self) for sink_builder in sinks] 51 | 52 | self.state_stores = {} 53 | for store_supplier, store_processors in store_suppliers.items(): 54 | self.state_stores[store_supplier.name] = store_supplier 55 | for p in store_processors: 56 | self.nodes[p].state_stores.add(store_supplier.name) 57 | 58 | def _add_node(self, name, processor, inputs): 59 | if name in self.nodes: 60 | raise KafkaStreamsError(f"A processor with the name '{name}' already added to this topology") 61 | self.nodes[name] = processor 62 | 63 | node_inputs = list(self.nodes[i] for i in inputs) 64 | 65 | if any(n.name == name for n in node_inputs): 66 | raise KafkaStreamsError("A processor cannot have itself as an input") 67 | if any(n.name not in self.nodes for n in node_inputs): 68 | raise KafkaStreamsError("Input(s) {} to processor {} do not yet exist".format( 69 | (set(inputs) - set(n.name for n in node_inputs)), name)) 70 | 71 | for i in inputs: 72 | self.nodes[i].children.append(processor) 73 | 74 | 75 | class 
76 |     """
77 |     Convenience class for building a graph topology
78 |     """
79 |     def __init__(self):
80 |         self._sources = []
81 |         self._processors = []
82 |         self._sinks = []
83 |         self._store_suppliers = {}
84 |         self.topics = []
85 | 
86 |     def __enter__(self):
87 |         return self
88 | 
89 |     def __exit__(self, exc_type, exc_val, exc_tb):
90 |         pass
91 | 
92 |     @property
93 |     def sources(self):
94 |         return self._sources
95 | 
96 |     @property
97 |     def sinks(self):
98 |         return self._sinks
99 | 
100 |     @property
101 |     def state_stores(self):
102 |         return self._store_suppliers
103 | 
104 |     def state_store(self, store_supplier, *processors):
105 |         """
106 |         Add a store and connect it to processors
107 | 
108 |         Parameters:
109 |         -----------
110 |         store_supplier : winton_kafka_streams.state.state_store_supplier.StateStoreSupplier
111 |         *processors : names of the processors to which the store should be attached
112 | 
113 |         Raises:
114 |         KafkaStreamsError
115 |             * If store_supplier is None
116 |             * If a store with the same name already exists
117 |         """
118 |         if store_supplier is None:
119 |             raise KafkaStreamsError("store_supplier cannot be None")
120 | 
121 |         if any(store_supplier.name == s.name for s in self._store_suppliers):
122 |             raise KafkaStreamsError(f"Store with name {store_supplier.name} already exists")
123 | 
124 |         self._store_suppliers[store_supplier] = processors
125 |         return self
126 | 
127 |     def source(self, name, topics):
128 |         """
129 |         Add a source to the topology
130 | 
131 |         Parameters:
132 |         -----------
133 |         name : str
134 |             The name of the node
135 |         topics : list of str
136 |             The source topics to subscribe to
137 | 
138 |         Returns:
139 |         --------
140 |         topology : TopologyBuilder
141 | 
142 |         Raises:
143 |         KafkaStreamsError
144 |             * If a node with the same name already exists
145 |         """
146 | 
147 |         def build_source(topology):
148 |             log.debug(f'TopologyBuilder is building source {name}')
149 |             source = ProcessorNode(name, SourceProcessor(topics))
150 |             topology._add_node(name, source, [])
151 |             return source
152 | 
153 |         self.topics.extend(topics)
154 |         self._sources.append(build_source)
155 |         return self
156 | 
157 |     def processor(self, name, processor_type, *parents):
158 |         """
159 |         Add a processor to the topology
160 | 
161 |         Parameters:
162 |         -----------
163 |         name : str
164 |             The name of the node
165 |         processor_type : class
166 |             BaseProcessor subclass; instantiated to process each (key, value) pair passed
167 |         *parents:
168 |             Parent nodes supplying inputs
169 | 
170 |         Returns:
171 |         --------
172 |         topology : TopologyBuilder
173 | 
174 |         Raises:
175 |         KafkaStreamsError
176 |             * If no inputs are specified
177 |         """
178 |         if not parents:
179 |             raise KafkaStreamsError(f"Processor '{name}' must have a minimum of 1 input")
180 | 
181 |         def build_processor(topology):
182 |             log.debug(f'TopologyBuilder is building processor {name}')
183 |             processor_node = ProcessorNode(name, processor_type())
184 |             topology._add_node(name, processor_node, parents)
185 |             return processor_node
186 | 
187 |         self._processors.append(build_processor)
188 |         return self
189 | 
190 |     def sink(self, name, topic, *parents):
191 |         def build_sink(topology):
192 |             log.debug(f'TopologyBuilder is building sink {name}')
193 |             sink = ProcessorNode(name, SinkProcessor(topic))
194 |             topology._add_node(name, sink, parents)
195 |             return sink
196 |         self._sinks.append(build_sink)
197 |         return self
198 | 
199 |     def build(self):
200 |         return Topology(self._sources, self._processors, self._sinks, self._store_suppliers)
201 | 
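A minimal usage sketch of the fluent TopologyBuilder API above (not part of topology.py). It assumes that BaseProcessor is re-exported from winton_kafka_streams.processor, that instances expose self.context after initialisation, and that the context provides forward(key, value); the WordSplitter class and the topic names are hypothetical placeholders.

from winton_kafka_streams.processor import BaseProcessor  # assumed re-export, see note above
from winton_kafka_streams.processor.topology import TopologyBuilder


class WordSplitter(BaseProcessor):
    # Hypothetical processor: split each value into words and pass them downstream.
    def process(self, key, value):
        for word in value.split():
            self.context.forward(key, word)  # assumes context.forward(key, value) is available


with TopologyBuilder() as builder:
    builder. \
        source('split-input', ['words-input-topic']). \
        processor('split', WordSplitter, 'split-input'). \
        sink('split-output', 'words-output-topic', 'split')
    topology = builder.build()  # realises the graph; raises KafkaStreamsError on inconsistent wiring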
--------------------------------------------------------------------------------
/winton_kafka_streams/processor/wallclock_timestamp.py:
--------------------------------------------------------------------------------
1 | """
2 | Wall clock time extractor
3 | 
4 | """
5 | 
6 | import time
7 | 
8 | from ._timestamp import TimeStampExtractor
9 | 
10 | 
11 | class WallClockTimeStampExtractor(TimeStampExtractor):
12 |     """
13 |     Time stamp extractor that returns wall clock time at the point
14 |     a record is processed
15 |     """
16 | 
17 |     def extract(self, record, previous_timestamp):
18 |         """
19 |         Returns wall clock time for every message
20 | 
21 |         Parameters:
22 |         -----------
23 |         record : Kafka record
24 |             New record from which time should be assigned
25 |         previous_timestamp : float
26 |             Last extracted timestamp (seconds since the epoch)
27 | 
28 |         Returns:
29 |         --------
30 |         time : float
31 |             Wall clock time in seconds since the epoch, as returned by time.time()
32 |         """
33 |         return time.time()
34 | 
--------------------------------------------------------------------------------
/winton_kafka_streams/state/__init__.py:
--------------------------------------------------------------------------------
1 | from winton_kafka_streams.state.factory.store_factory import StoreFactory
2 | 
3 | 
4 | def create(name: str) -> StoreFactory:
5 |     # TODO replace this Java-esque factory with a Pythonic DSL as part of the other work on a Streams DSL
6 |     return StoreFactory(name)
7 | 
--------------------------------------------------------------------------------
/winton_kafka_streams/state/factory/__init__.py:
--------------------------------------------------------------------------------
1 | # TODO replace this Java-esque factory with a Pythonic DSL as part of the other work on a Streams DSL
2 | 
--------------------------------------------------------------------------------
/winton_kafka_streams/state/factory/base_storage_key_value_store_factory.py:
--------------------------------------------------------------------------------
1 | from typing import Generic, TypeVar
2 | 
3 | from winton_kafka_streams.processor.serialization import Serde
4 | from abc import ABC, abstractmethod
5 | 
6 | from winton_kafka_streams.state.state_store_supplier import StateStoreSupplier
7 | 
8 | KT = TypeVar('KT')  # Key type.
9 | VT = TypeVar('VT')  # Value type.
10 | 11 | 12 | class BaseStorageKeyValueStoreFactory(ABC, Generic[KT, VT]): 13 | def __init__(self, name: str, key_serde: Serde[KT], value_serde: Serde[VT]) -> None: 14 | self.name: str = name 15 | self.key_serde: Serde[KT] = key_serde 16 | self.value_serde: Serde[VT] = value_serde 17 | self.logging_enabled: bool = True 18 | 19 | def enable_logging(self, config_map): 20 | # TODO changelog extra config gets handled here 21 | self.logging_enabled = True 22 | return self 23 | 24 | def disable_logging(self): 25 | self.logging_enabled = False 26 | return self 27 | 28 | @abstractmethod 29 | def build(self) -> StateStoreSupplier[KT, VT]: 30 | pass 31 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/factory/in_memory_key_value_store_factory.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar 2 | 3 | from winton_kafka_streams.processor.serialization import Serde 4 | from winton_kafka_streams.state.factory.base_storage_key_value_store_factory import BaseStorageKeyValueStoreFactory 5 | from winton_kafka_streams.state.in_memory.in_memory_state_store_supplier import InMemoryStateStoreSupplier 6 | 7 | KT = TypeVar('KT') # Key type. 8 | VT = TypeVar('VT') # Value type. 9 | 10 | 11 | class InMemoryKeyValueStoreFactory(BaseStorageKeyValueStoreFactory[KT, VT]): 12 | def __init__(self, name: str, key_serde: Serde[KT], value_serde: Serde[VT]) -> None: 13 | super(InMemoryKeyValueStoreFactory, self).__init__(name, key_serde, value_serde) 14 | 15 | def build(self) -> InMemoryStateStoreSupplier: 16 | return InMemoryStateStoreSupplier(self.name, self.key_serde, self.value_serde, self.logging_enabled) 17 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/factory/key_value_store_factory.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar, Generic 2 | 3 | from winton_kafka_streams.processor.serialization import Serde 4 | from winton_kafka_streams.state.factory.in_memory_key_value_store_factory import InMemoryKeyValueStoreFactory 5 | 6 | KT = TypeVar('KT') # Key type. 7 | VT = TypeVar('VT') # Value type. 8 | 9 | 10 | class KeyValueStoreFactory(Generic[KT, VT]): 11 | def __init__(self, name: str, key_serde: Serde[KT], value_serde: Serde[VT]) -> None: 12 | self.name: str = name 13 | self.key_serde: Serde[KT] = key_serde 14 | self.value_serde: Serde[VT] = value_serde 15 | 16 | def in_memory(self) -> InMemoryKeyValueStoreFactory[KT, VT]: 17 | return InMemoryKeyValueStoreFactory[KT, VT](self.name, self.key_serde, self.value_serde) 18 | 19 | def persistent(self): 20 | raise NotImplementedError("Persistent State Store not implemented") 21 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/factory/store_factory.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar 2 | 3 | from winton_kafka_streams.processor.serialization import Serde 4 | from winton_kafka_streams.processor.serialization.serdes import * 5 | from winton_kafka_streams.state.factory.value_store_factory import ValueStoreFactory 6 | 7 | KT = TypeVar('KT') # Key type. 
8 | 9 | 10 | class StoreFactory: 11 | def __init__(self, name: str) -> None: 12 | self.name: str = name 13 | 14 | def _with_key_serde(self, serde: Serde[KT]) -> ValueStoreFactory[KT]: 15 | key_serde: Serde[KT] = serde 16 | configs = None # TODO 17 | is_key = True 18 | key_serde.configure(configs, is_key) 19 | return ValueStoreFactory[KT](self.name, key_serde) 20 | 21 | def with_string_keys(self) -> ValueStoreFactory[str]: 22 | return self._with_key_serde(StringSerde()) 23 | 24 | def with_integer_keys(self) -> ValueStoreFactory[int]: 25 | return self._with_key_serde(IntegerSerde()) 26 | 27 | def with_long_keys(self) -> ValueStoreFactory[int]: 28 | return self._with_key_serde(LongSerde()) 29 | 30 | def with_double_keys(self) -> ValueStoreFactory[float]: 31 | return self._with_key_serde(DoubleSerde()) 32 | 33 | def with_bytes_keys(self) -> ValueStoreFactory[bytes]: 34 | return self._with_key_serde(BytesSerde()) 35 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/factory/value_store_factory.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar, Generic 2 | 3 | from winton_kafka_streams.processor.serialization import Serde 4 | from winton_kafka_streams.processor.serialization.serdes import * 5 | from .key_value_store_factory import KeyValueStoreFactory 6 | 7 | KT = TypeVar('KT') # Key type. 8 | VT = TypeVar('VT') # Value type. 9 | 10 | 11 | class ValueStoreFactory(Generic[KT]): 12 | def __init__(self, name: str, key_serde: Serde[KT]) -> None: 13 | self.name: str = name 14 | self.key_serde: Serde[KT] = key_serde 15 | 16 | def _with_value_serde(self, serde: Serde[VT]) -> KeyValueStoreFactory[KT, VT]: 17 | value_serde: Serde[VT] = serde 18 | configs = None 19 | is_key = False 20 | value_serde.configure(configs, is_key) 21 | return KeyValueStoreFactory[KT, VT](self.name, self.key_serde, value_serde) 22 | 23 | def with_string_values(self) -> KeyValueStoreFactory[KT, str]: 24 | return self._with_value_serde(StringSerde()) 25 | 26 | def with_integer_values(self) -> KeyValueStoreFactory[KT, int]: 27 | return self._with_value_serde(IntegerSerde()) 28 | 29 | def with_long_values(self) -> KeyValueStoreFactory[KT, int]: 30 | return self._with_value_serde(LongSerde()) 31 | 32 | def with_double_values(self) -> KeyValueStoreFactory[KT, float]: 33 | return self._with_value_serde(DoubleSerde()) 34 | 35 | def with_bytes_values(self) -> KeyValueStoreFactory[KT, bytes]: 36 | return self._with_value_serde(BytesSerde()) 37 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/in_memory/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wintoncode/winton-kafka-streams/5867a1c42fc80bba07173fd1d004b2849b429fdf/winton_kafka_streams/state/in_memory/__init__.py -------------------------------------------------------------------------------- /winton_kafka_streams/state/in_memory/in_memory_state_store.py: -------------------------------------------------------------------------------- 1 | from typing import Iterator, TypeVar, MutableMapping 2 | 3 | from winton_kafka_streams.processor.serialization import Serde 4 | from ..key_value_state_store import KeyValueStateStore 5 | from ..state_store import StateStore 6 | 7 | KT = TypeVar('KT') # Key type. 8 | VT = TypeVar('VT') # Value type. 
9 | 10 | 11 | class InMemoryStateStore(StateStore[KT, VT]): 12 | def __init__(self, name: str, key_serde: Serde[KT], value_serde: Serde[VT], logging_enabled: bool) -> None: 13 | super().__init__(name, key_serde, value_serde, logging_enabled) 14 | self.dict: MutableMapping[KT, VT] = {} 15 | 16 | def initialize(self, context, root) -> None: 17 | pass 18 | 19 | def get_key_value_store(self) -> KeyValueStateStore[KT, VT]: 20 | parent = self 21 | 22 | class InMemoryKeyValueStateStore(KeyValueStateStore[KT, VT]): 23 | def __setitem__(self, k: KT, v: VT) -> None: 24 | parent.dict[k] = v 25 | 26 | def __delitem__(self, v: KT) -> None: 27 | del parent.dict[v] 28 | 29 | def __getitem__(self, k: KT) -> VT: 30 | return parent.dict[k] 31 | 32 | def __len__(self) -> int: 33 | return len(parent.dict) 34 | 35 | def __iter__(self) -> Iterator[KT]: 36 | return parent.dict.__iter__() 37 | 38 | return InMemoryKeyValueStateStore() 39 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/in_memory/in_memory_state_store_supplier.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar 2 | 3 | from winton_kafka_streams.processor.serialization import Serde 4 | from .in_memory_state_store import InMemoryStateStore 5 | from ..state_store import StateStore 6 | from ..state_store_supplier import StateStoreSupplier 7 | 8 | KT = TypeVar('KT') # Key type. 9 | VT = TypeVar('VT') # Value type. 10 | 11 | 12 | class InMemoryStateStoreSupplier(StateStoreSupplier): 13 | def __init__(self, name: str, key_serde: Serde[KT], value_serde: Serde[VT], logging_enabled: bool) -> None: 14 | super().__init__(name, key_serde, value_serde, logging_enabled) 15 | 16 | def _build_state_store(self) -> StateStore: 17 | return InMemoryStateStore(self.name, self._key_serde, self._value_serde, self.logging_enabled) 18 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/key_value_state_store.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar, Iterator, MutableMapping 2 | 3 | 4 | from abc import abstractmethod 5 | 6 | KT = TypeVar('KT') # Key type. 7 | VT = TypeVar('VT') # Value type. 
8 | 9 | 10 | class KeyValueStateStore(MutableMapping[KT, VT]): 11 | """ 12 | Dict-like class is injected into a processors to provide an interface to the underlying StateStore 13 | """ 14 | @abstractmethod 15 | def __setitem__(self, k: KT, v: VT) -> None: 16 | pass 17 | 18 | @abstractmethod 19 | def __delitem__(self, v: KT) -> None: 20 | pass 21 | 22 | @abstractmethod 23 | def __getitem__(self, k: KT) -> VT: 24 | pass 25 | 26 | @abstractmethod 27 | def __len__(self) -> int: 28 | pass 29 | 30 | @abstractmethod 31 | def __iter__(self) -> Iterator[KT]: 32 | pass 33 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/logging/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wintoncode/winton-kafka-streams/5867a1c42fc80bba07173fd1d004b2849b429fdf/winton_kafka_streams/state/logging/__init__.py -------------------------------------------------------------------------------- /winton_kafka_streams/state/logging/change_logging_state_store.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar, Iterator 2 | 3 | from winton_kafka_streams.processor.serialization import Serde 4 | from ..key_value_state_store import KeyValueStateStore 5 | from ..state_store import StateStore 6 | from .store_change_logger import StoreChangeLogger 7 | 8 | KT = TypeVar('KT') # Key type. 9 | VT = TypeVar('VT') # Value type. 10 | 11 | 12 | class ChangeLoggingStateStore(StateStore[KT, VT]): 13 | def __init__(self, name: str, key_serde: Serde[KT], value_serde: Serde[VT], logging_enabled: bool, 14 | inner_state_store: StateStore[KT, VT]) -> None: 15 | super().__init__(name, key_serde, value_serde, logging_enabled) 16 | self.inner_state_store = inner_state_store 17 | self.change_logger = None 18 | 19 | def initialize(self, context, root): 20 | self.inner_state_store.initialize(context, root) 21 | self.change_logger = StoreChangeLogger(self.inner_state_store.name, context) 22 | # TODO rebuild state into inner here 23 | 24 | def get_key_value_store(self) -> KeyValueStateStore[KT, VT]: 25 | parent = self 26 | 27 | class ChangeLoggingKeyValueStore(KeyValueStateStore[KT, VT]): 28 | # TODO : add write buffer 29 | # TODO : use topic compaction to optimise state-rebuilding 30 | 31 | def __init__(self, change_logger: StoreChangeLogger) -> None: 32 | super(ChangeLoggingKeyValueStore, self).__init__() 33 | self.change_logger: StoreChangeLogger = change_logger 34 | self.inner_kv_store: KeyValueStateStore[KT, VT] = parent.inner_state_store.get_key_value_store() 35 | 36 | def __len__(self) -> int: 37 | return len(self.inner_kv_store) 38 | 39 | def __iter__(self) -> Iterator[KT]: 40 | return self.inner_kv_store.__iter__() 41 | 42 | def __setitem__(self, key: KT, value: VT): 43 | key_bytes = parent.serialize_key(key) 44 | value_bytes = parent.serialize_value(value) 45 | self.inner_kv_store.__setitem__(key, value) 46 | self.change_logger.log_change(key_bytes, value_bytes) 47 | 48 | def __getitem__(self, key: KT) -> VT: 49 | return self.inner_kv_store.__getitem__(key) 50 | 51 | def __delitem__(self, key: KT): 52 | key_bytes = parent.serialize_key(key) 53 | self.inner_kv_store.__delitem__(key) 54 | self.change_logger.log_change(key_bytes, b'') 55 | 56 | return ChangeLoggingKeyValueStore(self.change_logger) 57 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/logging/store_change_logger.py: 
-------------------------------------------------------------------------------- 1 | class StoreChangeLogger: 2 | def __init__(self, store_name, context) -> None: 3 | self.topic = f'{context.application_id}-{store_name}-changelog' 4 | self.context = context 5 | self.partition = context.task_id.partition 6 | self.record_collector = context.state_record_collector 7 | 8 | def log_change(self, key: bytes, value: bytes) -> None: 9 | if self.record_collector: 10 | self.record_collector.send(self.topic, key, value, self.context.timestamp, partition=self.partition) 11 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/state_store.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import TypeVar, Generic 3 | 4 | from ..processor.serialization import Serde 5 | from .key_value_state_store import KeyValueStateStore 6 | 7 | KT = TypeVar('KT') # Key type. 8 | VT = TypeVar('VT') # Value type. 9 | 10 | 11 | class StateStore(ABC, Generic[KT, VT]): 12 | """ 13 | StateStores are created by Suppliers for use in StreamTasks 14 | """ 15 | def __init__(self, name: str, key_serde: Serde[KT], value_serde: Serde[VT], logging_enabled: bool) -> None: 16 | self.logging_enabled: bool = logging_enabled 17 | self._value_serde: Serde[VT] = value_serde 18 | self._key_serde: Serde[KT] = key_serde 19 | self._name: str = name 20 | 21 | @property 22 | def name(self) -> str: 23 | return self._name 24 | 25 | def serialize_key(self, key: KT) -> bytes: 26 | return self._key_serde.serializer.serialize("", key) 27 | 28 | def deserialize_key(self, data: bytes) -> KT: 29 | return self._key_serde.deserializer.deserialize("", data) 30 | 31 | def serialize_value(self, value: VT) -> bytes: 32 | return self._value_serde.serializer.serialize("", value) 33 | 34 | def deserialize_value(self, data: bytes) -> VT: 35 | return self._value_serde.deserializer.deserialize("", data) 36 | 37 | @abstractmethod 38 | def initialize(self, context, root): 39 | """ 40 | Initialize is called within a StreamTask once partitions are assigned, before processing starts. 41 | State is rebuilt from the change log at this point. 42 | :param context: 43 | :param root: 44 | :return: None 45 | """ 46 | pass 47 | 48 | @abstractmethod 49 | def get_key_value_store(self) -> KeyValueStateStore[KT, VT]: 50 | pass 51 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/state_store_supplier.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | from typing import TypeVar, Generic 4 | 5 | from winton_kafka_streams.state.logging.change_logging_state_store import ChangeLoggingStateStore 6 | from .state_store import StateStore 7 | from ..processor.serialization import Serde 8 | 9 | KT = TypeVar('KT') # Key type. 10 | VT = TypeVar('VT') # Value type. 
11 | 12 | 13 | class StateStoreSupplier(ABC, Generic[KT, VT]): 14 | """ 15 | StateStoreSuppliers are added to a topology and are accessible from each StreamThread 16 | 17 | """ 18 | 19 | def __init__(self, name: str, key_serde: Serde[KT], value_serde: Serde[VT], logging_enabled: bool) -> None: 20 | self.logging_enabled: bool = logging_enabled 21 | self._value_serde: Serde[VT] = value_serde 22 | self._key_serde: Serde[KT] = key_serde 23 | self._name: str = name 24 | 25 | @property 26 | def name(self) -> str: 27 | return self._name 28 | 29 | @abstractmethod 30 | def _build_state_store(self) -> StateStore[KT, VT]: 31 | pass 32 | 33 | def get(self) -> StateStore[KT, VT]: 34 | """Create a StateStore for each StreamTask. *These StateStores may exist in different threads.*""" 35 | inner = self._build_state_store() 36 | if self.logging_enabled: 37 | return ChangeLoggingStateStore(self.name, self._key_serde, self._value_serde, self.logging_enabled, inner) 38 | else: 39 | return inner 40 | -------------------------------------------------------------------------------- /winton_kafka_streams/version.py: -------------------------------------------------------------------------------- 1 | from setuptools_scm import get_version 2 | version = get_version() 3 | --------------------------------------------------------------------------------
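To close, a short usage sketch (not part of the repository) showing how the state-store factory chain above fits together: StoreFactory, ValueStoreFactory, KeyValueStoreFactory and InMemoryKeyValueStoreFactory build an InMemoryStateStoreSupplier, which is then attached to a processor via TopologyBuilder.state_store. The store name 'counts', the processor name 'count' and the omitted builder wiring are hypothetical placeholders.

import winton_kafka_streams.state as state_stores
from winton_kafka_streams.processor.topology import TopologyBuilder

# Build a supplier for an in-memory key/value store with string keys and integer values.
# logging_enabled defaults to True in BaseStorageKeyValueStoreFactory, so
# StateStoreSupplier.get() will wrap the InMemoryStateStore in a ChangeLoggingStateStore.
count_store_supplier = state_stores.create('counts') \
    .with_string_keys() \
    .with_integer_values() \
    .in_memory() \
    .build()

builder = TopologyBuilder()
# ... a source and a processor named 'count' would be added to the builder here ...
builder.state_store(count_store_supplier, 'count')

# Once the framework has created and initialised the store for a stream task, the
# processor accesses it through the dict-like KeyValueStateStore returned by
# get_key_value_store().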