├── .editorconfig ├── .gitignore ├── .readthedocs.yml ├── .travis.yml ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── ROADMAP.md ├── Vagrantfile ├── docs ├── Makefile ├── conf.py ├── index.rst ├── make.bat └── source │ ├── modules.rst │ ├── winton_kafka_streams.processor.rst │ ├── winton_kafka_streams.processor.serde.rst │ ├── winton_kafka_streams.rst │ └── winton_kafka_streams.state.rst ├── examples ├── binning │ ├── README.md │ ├── binning.py │ ├── config.properties │ ├── generator.py │ ├── live-plot.ipynb │ ├── random_prices.py │ └── source.py ├── debug │ ├── README.md │ ├── config.properties │ └── example.py └── wordcount │ ├── README.md │ ├── config.properties │ ├── custom_serde.py │ ├── docker │ ├── docker-compose.yml │ ├── kafka-debug │ │ └── Dockerfile │ ├── source_client │ │ └── Dockerfile │ └── wordcount │ │ ├── Dockerfile │ │ └── config.properties │ ├── example.py │ └── source_client.py ├── requirements_docs.txt ├── setup.cfg ├── setup.py ├── tests ├── processor │ ├── serde │ │ ├── __init__.py │ │ ├── mock_schema_registry.py │ │ ├── test_avro_serde.py │ │ ├── test_instantiation.py │ │ └── test_serialisation.py │ ├── test_base_processor.py │ ├── test_extract_timestamp.py │ ├── test_punctuation_queue.py │ ├── test_sink_processor.py │ ├── test_source_processor.py │ ├── test_stream_task.py │ ├── test_task_id.py │ ├── test_topology.py │ └── test_wallclock_timestamp.py ├── state │ └── test_in_memory_key_value_store.py └── test_kafka_streams.py └── winton_kafka_streams ├── __init__.py ├── errors ├── __init__.py ├── _kafka_error_codes.py ├── kafka_streams_error.py └── task_migrated_error.py ├── kafka_client_supplier.py ├── kafka_config.py ├── kafka_streams.py ├── processor ├── __init__.py ├── _context.py ├── _punctuation_queue.py ├── _record_collector.py ├── _stream_task.py ├── _stream_thread.py ├── _timestamp.py ├── extract_timestamp.py ├── processor.py ├── processor_context.py ├── serialization │ ├── __init__.py │ ├── _avro.py │ ├── _bytes.py │ ├── _double.py │ ├── _float.py │ ├── _integer.py │ ├── _json.py │ ├── _long.py │ ├── _string.py │ ├── deserializer.py │ ├── serde.py │ ├── serdes │ │ ├── __init__.py │ │ ├── _serdes.py │ │ ├── avro_serde.py │ │ ├── bytes_serde.py │ │ ├── double_serde.py │ │ ├── float_serde.py │ │ ├── integer_serde.py │ │ ├── json_serde.py │ │ ├── long_serde.py │ │ ├── string_serde.py │ │ └── wrapper_serde.py │ └── serializer.py ├── task_id.py ├── topology.py └── wallclock_timestamp.py ├── state ├── __init__.py ├── factory │ ├── __init__.py │ ├── base_storage_key_value_store_factory.py │ ├── in_memory_key_value_store_factory.py │ ├── key_value_store_factory.py │ ├── store_factory.py │ └── value_store_factory.py ├── in_memory │ ├── __init__.py │ ├── in_memory_state_store.py │ └── in_memory_state_store_supplier.py ├── key_value_state_store.py ├── logging │ ├── __init__.py │ ├── change_logging_state_store.py │ └── store_change_logger.py ├── state_store.py └── state_store_supplier.py └── version.py /.editorconfig: -------------------------------------------------------------------------------- 1 | ; EditorConfig helps developers define and maintain consistent 2 | ; coding styles between different editors and IDEs. 3 | 4 | ; For more visit http://editorconfig.org. 
5 | root = true 6 | 7 | ; Choose between lf or rf on "end_of_line" property 8 | [*] 9 | indent_style = space 10 | end_of_line = lf 11 | charset = utf-8 12 | trim_trailing_whitespace = true 13 | insert_final_newline = true 14 | 15 | [*.{js,css,scss}] 16 | indent_size = 2 17 | 18 | [*.html] 19 | indent_style = tab 20 | 21 | [*.{py,html,md}] 22 | indent_size = 4 23 | 24 | [*.md] 25 | trim_trailing_whitespace = false 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | .pytest_cache/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # Environments 83 | .env 84 | .venv 85 | env/ 86 | venv/ 87 | ENV/ 88 | 89 | # Spyder project settings 90 | .spyderproject 91 | .spyproject 92 | 93 | # Rope project settings 94 | .ropeproject 95 | 96 | # mkdocs documentation 97 | /site 98 | 99 | # mypy 100 | .mypy_cache/ 101 | 102 | # vim 103 | *.*~ 104 | *.swp 105 | 106 | # PyCharm 107 | .idea/ 108 | 109 | # Vagrant 110 | .vagrant/ 111 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | --- 2 | build: 3 | image: latest 4 | 5 | requirements_file: requirements_docs.txt 6 | 7 | python: 8 | version: 3.6 9 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | dist: trusty # librdkafka support begins at Trusty. 3 | python: 4 | - "3.6" # Advertised support 5 | - "3.6-dev" # Check we'll keep working in future 6 | before_install: 7 | - wget -qO - http://packages.confluent.io/deb/3.2/archive.key | sudo apt-key add - # Use the confluent repository 8 | - sudo add-apt-repository "deb [arch=amd64] http://packages.confluent.io/deb/3.2 stable main" 9 | - sudo apt-get update -qq # Update quietly. 
10 | - sudo apt-get install -y librdkafka-dev librdkafka1 11 | install: "pip install --editable .[develop]" 12 | script: pytest 13 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | One of the easiest ways to contribute is to participate in discussions and comment on issues. You can also contribute by submitting pull requests with code changes. 4 | 5 | ## Filing issues 6 | The best way to get your bug fixed is to be as detailed as you can be about the problem. 7 | Providing a minimal project with steps to reproduce the problem is ideal. 8 | Here are questions you can answer before you file a bug to make sure you're not missing any important information. 9 | 10 | 1. Did you include the snippet of broken code in the issue? 2. What are the *EXACT* steps to reproduce this problem? 12 | 3. What package versions are you using (for example, as reported by `pip freeze`)? 13 | 4. What operating system are you using? 14 | 15 | GitHub supports [markdown](https://help.github.com/articles/github-flavored-markdown/), so when filing bugs make sure you check the formatting before clicking submit. 16 | 17 | ## Contributor License Agreement ("CLA") 18 | 19 | In order to accept your pull request, we need you to submit a CLA. You only need to do this once to work on any of Winton's open source projects. 20 | 21 | The text of the CLA can be seen here: [Winton CLA](https://cla-assistant.io/wintoncode/winton-kafka-streams) 22 | 23 | ## Contributing code and content 24 | Make sure you can build the code and run the tests. Familiarize yourself with the project workflow and our coding conventions. If you don't know what a pull request is, read this article: https://help.github.com/articles/using-pull-requests. 25 | 26 | Before submitting a feature or substantial code contribution, please discuss it with the team and ensure it follows the product roadmap. You might also read these two blog posts on contributing code: [Open Source Contribution Etiquette](http://tirania.org/blog/archive/2010/Dec-31.html) by Miguel de Icaza and [Don't "Push" Your Pull Requests](https://www.igvita.com/2011/12/19/dont-push-your-pull-requests/) by Ilya Grigorik. Note that all code submissions will be rigorously reviewed and tested by the Winton team prior to merging. 27 | 28 | Here are a few things you should always do when making changes to the code base: 29 | 30 | **Engineering guidelines** 31 | 32 | Please follow the existing coding style used in this project. 33 | 34 | **Commit/Pull Request Format** 35 | 36 | ``` 37 | Summary of the changes (Less than 80 chars) 38 | - Detail 1 39 | - Detail 2 40 | 41 | Addresses #bugnumber (in this specific format) 42 | ``` 43 | 44 | **Tests** 45 | 46 | - Tests need to be provided for every bug/feature that is completed. 47 | - If there is a scenario that is far too hard to test, there does not need to be a test for it. 48 | - "Too hard" is determined by the team as a whole. 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions.
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2016 Winton 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Winton Kafka Streams 2 | 3 | [![Build Status](https://travis-ci.org/wintoncode/winton-kafka-streams.svg?branch=master)](https://travis-ci.org/wintoncode/winton-kafka-streams) 4 | 5 | Implementation of [Apache Kafka's Streams API](https://kafka.apache.org/documentation/streams/) in Python. 6 | 7 | ## What and why? 8 | Apache Kafka is an open-source stream processing platform developed 9 | by the Apache Software Foundation, written in Scala and Java. Kafka 10 | includes the Streams API for building stream processing applications 11 | using Apache Kafka. Applications built with Kafka's Streams API do not require any 12 | setup beyond the provision of a Kafka cluster. 13 | 14 | Winton Kafka Streams is a Python implementation of Apache Kafka's 15 | Streams API. It builds on Confluent's librdkafka (a high 16 | performance C library implementing the Kafka protocol) and the 17 | Confluent Python Kafka library to achieve this. 18 | 19 | The power and simplicity of both Python and Kafka's Streams API combined 20 | opens the streaming model to many more people and applications. 21 | 22 | ## Getting started 23 | 24 | ### Dependencies 25 | 26 | The minimum Python version is currently 3.6; a working Kafka 27 | cluster (a single broker is sufficient for testing) is also required. 28 | 29 | You will require [librdkafka](https://github.com/edenhill/librdkafka). On macOS, we recommend installing it via Homebrew and setting `CFLAGS=-I/usr/local/include` and `LDFLAGS=-L/usr/local/lib` 30 | when installing Confluent Python Kafka (see below). 31 | The librdkafka GitHub page lists packages available for Debian and Ubuntu, as well as RPMs. 32 | For Arch Linux it is available via [AUR](https://aur.archlinux.org/packages/librdkafka-git/). 33 | 34 | Confluent Python Kafka is also required; pip will install it 35 | as a dependency. 36 | 37 | ### Installing 38 | 39 | Cloning the Winton Kafka Streams repository from GitHub is 40 | recommended if you want to contribute to the project. 41 | Then, from the root of the repository, use 42 | `pip install --editable .[develop]` 43 | to install it as an editable workspace with the additional dependencies 44 | required for development. 45 | You may need to do this using `sudo` on Linux.
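For orientation before diving into the bundled examples, the sketch below shows the overall shape of an application built with the Processor API: subclass `BaseProcessor`, wire a topology with `TopologyBuilder`, and hand it to `KafkaStreams`. It is a trimmed-down variant of the `examples/debug` and `examples/binning` applications in this repository; the topic names, the processor logic and the `config.properties` path are placeholders, not part of the library.

```python
import time

import winton_kafka_streams.kafka_config as kafka_config
import winton_kafka_streams.kafka_streams as kafka_streams
from winton_kafka_streams.processor import BaseProcessor, TopologyBuilder


class PassThrough(BaseProcessor):
    """Forward every record unchanged to the downstream sink."""

    def process(self, key, value):
        self.context.forward(key, value)


# Override the built-in defaults (bootstrap servers, serdes, ...) from a file.
kafka_config.read_local_config('config.properties')

# Wire source -> processor -> sink, then run the topology.
with TopologyBuilder() as topology_builder:
    topology_builder. \
        source('input', ['example-input-topic']). \
        processor('pass-through', PassThrough, 'input'). \
        sink('output', 'example-output-topic', 'pass-through')

wks = kafka_streams.KafkaStreams(topology_builder, kafka_config)
wks.start()
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    pass
finally:
    wks.close()
```

The `config.properties` files shipped with the examples show the settings such an application typically needs (`bootstrap.servers`, the key/value serdes, and offset/commit behaviour).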
46 | 47 | If you want to install the code and get a feel for it as a user, then 48 | we recommend using `pip install git+https://github.com/wintoncode/winton-kafka-streams`. 49 | 50 | ### Running tests 51 | Tests will run when `pytest` is called in the root of the repository. 52 | 53 | ### Running examples 54 | To run the examples, you must have cloned the code locally from GitHub. 55 | 56 | The debug and wordcount examples will run without additional 57 | requirements. 58 | 59 | The Jupyter notebook in the binning example requires some additional 60 | packages. Install these with the command: 61 | 62 | pip install .[binning_example] 63 | 64 | ## Contributing 65 | Please see the CONTRIBUTING.md document for more details on getting involved. 66 | 67 | ## Contact 68 | - GitHub: https://github.com/wintoncode/ 69 | - Email: opensource@winton.com 70 | - Twitter: @wintoncapital 71 | -------------------------------------------------------------------------------- /ROADMAP.md: -------------------------------------------------------------------------------- 1 | # Roadmap 2 | 3 | The roadmap is a high-level overview of the work we would like to see implemented. For more details and discussion of new features, improvements or bugs, please see the [issue list](https://github.com/wintoncode/winton-kafka-streams/issues) in GitHub. 4 | 5 | * Complete implementation of Kafka's Streams API in Python 6 | * The current code is a good proof of concept but is still under active development. There are a number of key features remaining, in particular a persistent state store and a DSL. There are also many improvements to existing features left to implement - check the issue list for the latest status. 7 | * Implement new features of Kafka's Streams API 8 | * v0.11 of Apache Kafka was released on 28 June 2017 with many important and useful features. 9 | * Investigate a more Pythonic API/DSL 10 | * The current Processor API follows the Java layout very closely. A Python Streams domain-specific language (DSL) should leverage Python's unique language strengths to make writing stream applications as easy and intuitive as possible (a purely illustrative sketch of one possible shape is appended at the end of this document). 11 | * Optimise performance 12 | * Python has many known performance limitations; continue to optimise the code to perform as well as possible. Consider implementing some or all of the application in C.
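As a purely illustrative aid for the "more Pythonic API/DSL" item above: none of the names below exist in this codebase today and nothing here is a committed design; the snippet only hints at the kind of fluent, lambda-based pipeline a Python DSL could offer on top of the existing Processor API.

```python
# Hypothetical sketch only - StreamsBuilder, stream, map, filter and to are
# invented names, not part of winton_kafka_streams.
builder = StreamsBuilder()
(builder.stream('prices')                                  # read a source topic
        .map(lambda key, value: (key, 2 * float(value)))   # transform records
        .filter(lambda key, value: value > 0.0)            # drop unwanted records
        .to('doubled-prices'))                             # write to a sink topic
```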
13 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | Vagrant.configure("2") do |config| 4 | 5 | config.vm.box = "bento/ubuntu-16.04" 6 | 7 | config.vm.network "forwarded_port", guest: 2181, host: 2181 8 | config.vm.network "forwarded_port", guest: 9092, host: 9092 9 | 10 | config.vm.provider "virtualbox" do |v| 11 | v.memory = 2048 12 | v.cpus = 2 13 | end 14 | 15 | config.vm.provision "shell", inline: <<-SHELL 16 | export SCALA_VER=2.11 17 | export KAFKA_VER=1.0.0 18 | export KAFKA_PACKAGE=kafka_${SCALA_VER}-${KAFKA_VER} 19 | 20 | apt-get update 21 | apt-get install -y tmux htop vim wget git 22 | 23 | apt-get install -y build-essential software-properties-common python-software-properties 24 | wget -qO - http://packages.confluent.io/deb/3.3/archive.key | apt-key add - 25 | add-apt-repository "deb [arch=amd64] http://packages.confluent.io/deb/3.3 stable main" 26 | apt-get update 27 | apt-get install -y librdkafka-dev 28 | 29 | wget -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh 30 | sh Miniconda3-latest-Linux-x86_64.sh -b -f -p /home/vagrant/miniconda3 31 | rm -f Miniconda3-latest-Linux-x86_64.sh 32 | /home/vagrant/miniconda3/bin/conda create -q -y -n vagrant python=3.6 33 | echo PATH=/home/vagrant/miniconda3/bin:\$PATH >> /home/vagrant/.profile 34 | echo source activate vagrant >> /home/vagrant/.profile 35 | echo cd /vagrant/ >> /home/vagrant/.profile 36 | 37 | apt-get install -y zookeeperd openjdk-8-jdk kafkacat 38 | wget -q http://mirror.ox.ac.uk/sites/rsync.apache.org/kafka/${KAFKA_VER}/${KAFKA_PACKAGE}.tgz 39 | tar -xzf ${KAFKA_PACKAGE}.tgz 40 | rm -f ${KAFKA_PACKAGE}.tgz 41 | mv ${KAFKA_PACKAGE} /opt/kafka 42 | SHELL 43 | 44 | config.vm.provision "shell", run: "always", inline: <<-SHELL 45 | /home/vagrant/miniconda3/envs/vagrant/bin/pip install -e /vagrant/.[develop] 46 | chown -R vagrant:vagrant /home/vagrant/miniconda3 47 | rm -fr /tmp/kafka* 48 | nohup /opt/kafka/bin/kafka-server-start.sh /opt/kafka/config/server.properties > /tmp/kafka.log 2>&1 & 49 | SHELL 50 | end 51 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = WintonKafkaStreamsPython 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Winton Kafka Streams Python documentation build configuration file, created by 5 | # sphinx-quickstart on Tue May 16 21:00:14 2017. 
6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | 23 | # Get the project root dir 24 | cwd = os.getcwd() 25 | project_root = os.path.dirname(cwd) 26 | 27 | # Insert the project root dir as the first element in the PYTHONPATH. 28 | # This lets us ensure that the source package is imported, and that its 29 | # version is used. 30 | sys.path.insert(0, project_root) 31 | 32 | import winton_kafka_streams 33 | 34 | from mock import MagicMock 35 | 36 | class Mock(MagicMock): 37 | @classmethod 38 | def __getattr__(cls, name): 39 | return MagicMock() 40 | 41 | MOCK_MODULES = ['confluent_kafka', 'confluent_kafka.cimpl', 'confluent_kafka.avro'] 42 | sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) 43 | 44 | # -- General configuration ------------------------------------------------ 45 | 46 | # If your documentation needs a minimal Sphinx version, state it here. 47 | # 48 | # needs_sphinx = '1.0' 49 | 50 | # Add any Sphinx extension module names here, as strings. They can be 51 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 52 | # ones. 53 | extensions = ['sphinx.ext.autodoc'] 54 | 55 | # Add any paths that contain templates here, relative to this directory. 56 | templates_path = ['_templates'] 57 | 58 | # The suffix(es) of source filenames. 59 | # You can specify multiple suffix as a list of string: 60 | # 61 | # source_suffix = ['.rst', '.md'] 62 | source_suffix = '.rst' 63 | 64 | # The master toctree document. 65 | master_doc = 'index' 66 | 67 | # General information about the project. 68 | project = 'Winton Kafka Streams Python' 69 | copyright = '2017, Winton Group' 70 | author = 'Winton Group' 71 | 72 | # The version info for the project you're documenting, acts as replacement for 73 | # |version| and |release|, also used in various other places throughout the 74 | # built documents. 75 | # 76 | 77 | from setuptools_scm import get_version 78 | version = release = get_version(root='..') 79 | 80 | # The language for content autogenerated by Sphinx. Refer to documentation 81 | # for a list of supported languages. 82 | # 83 | # This is also used if you do content translation via gettext catalogs. 84 | # Usually you set "language" from the command line for these cases. 85 | language = None 86 | 87 | # List of patterns, relative to source directory, that match files and 88 | # directories to ignore when looking for source files. 89 | # This patterns also effect to html_static_path and html_extra_path 90 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 91 | 92 | # The name of the Pygments (syntax highlighting) style to use. 93 | pygments_style = 'sphinx' 94 | 95 | # If true, `todo` and `todoList` produce output, else they produce nothing. 96 | todo_include_todos = False 97 | 98 | 99 | # -- Options for HTML output ---------------------------------------------- 100 | 101 | # The theme to use for HTML and HTML Help pages. See the documentation for 102 | # a list of builtin themes. 
103 | # 104 | html_theme = "sphinx_rtd_theme" 105 | 106 | # Theme options are theme-specific and customize the look and feel of a theme 107 | # further. For a list of options available for each theme, see the 108 | # documentation. 109 | # 110 | # html_theme_options = {} 111 | 112 | # Add any paths that contain custom static files (such as style sheets) here, 113 | # relative to this directory. They are copied after the builtin static files, 114 | # so a file named "default.css" will overwrite the builtin "default.css". 115 | html_static_path = ['_static'] 116 | 117 | 118 | # -- Options for HTMLHelp output ------------------------------------------ 119 | 120 | # Output file base name for HTML help builder. 121 | htmlhelp_basename = 'WintonKafkaStreamsPythondoc' 122 | 123 | 124 | # -- Options for LaTeX output --------------------------------------------- 125 | 126 | latex_elements = { 127 | # The paper size ('letterpaper' or 'a4paper'). 128 | # 129 | # 'papersize': 'letterpaper', 130 | 131 | # The font size ('10pt', '11pt' or '12pt'). 132 | # 133 | # 'pointsize': '10pt', 134 | 135 | # Additional stuff for the LaTeX preamble. 136 | # 137 | # 'preamble': '', 138 | 139 | # Latex figure (float) alignment 140 | # 141 | # 'figure_align': 'htbp', 142 | } 143 | 144 | # Grouping the document tree into LaTeX files. List of tuples 145 | # (source start file, target name, title, 146 | # author, documentclass [howto, manual, or own class]). 147 | latex_documents = [ 148 | (master_doc, 'WintonKafkaStreamsPython.tex', 'Winton Kafka Streams Python Documentation', 149 | 'Winton Group', 'manual'), 150 | ] 151 | 152 | 153 | # -- Options for manual page output --------------------------------------- 154 | 155 | # One entry per manual page. List of tuples 156 | # (source start file, name, description, authors, manual section). 157 | man_pages = [ 158 | (master_doc, 'wintonkafkastreamspython', 'Winton Kafka Streams Python Documentation', 159 | [author], 1) 160 | ] 161 | 162 | 163 | # -- Options for Texinfo output ------------------------------------------- 164 | 165 | # Grouping the document tree into Texinfo files. List of tuples 166 | # (source start file, target name, title, author, 167 | # dir menu entry, description, category) 168 | texinfo_documents = [ 169 | (master_doc, 'WintonKafkaStreamsPython', 'Winton Kafka Streams Python Documentation', 170 | author, 'WintonKafkaStreamsPython', 'One line description of project.', 171 | 'Miscellaneous'), 172 | ] 173 | 174 | 175 | 176 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Winton Kafka Streams Python documentation master file, created by 2 | sphinx-quickstart on Tue May 16 21:00:14 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Winton Kafka Streams Python's documentation! 7 | ======================================================= 8 | 9 | .. 
toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=WintonKafkaStreamsPython 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | winton_kafka_streams 2 | ==================== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | winton_kafka_streams 8 | -------------------------------------------------------------------------------- /docs/source/winton_kafka_streams.processor.rst: -------------------------------------------------------------------------------- 1 | winton\_kafka\_streams\.processor package 2 | ========================================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | winton_kafka_streams.processor.serde 10 | 11 | Submodules 12 | ---------- 13 | 14 | winton\_kafka\_streams\.processor\.extract\_timestamp module 15 | ------------------------------------------------------------ 16 | 17 | .. automodule:: winton_kafka_streams.processor.extract_timestamp 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | winton\_kafka\_streams\.processor\.processor module 23 | --------------------------------------------------- 24 | 25 | .. automodule:: winton_kafka_streams.processor.processor 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | winton\_kafka\_streams\.processor\.processor\_context module 31 | ------------------------------------------------------------ 32 | 33 | .. automodule:: winton_kafka_streams.processor.processor_context 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | winton\_kafka\_streams\.processor\.topology module 39 | -------------------------------------------------- 40 | 41 | .. automodule:: winton_kafka_streams.processor.topology 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | winton\_kafka\_streams\.processor\.wallclock\_timestamp module 47 | -------------------------------------------------------------- 48 | 49 | .. automodule:: winton_kafka_streams.processor.wallclock_timestamp 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | 54 | 55 | Module contents 56 | --------------- 57 | 58 | .. 
automodule:: winton_kafka_streams.processor 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | -------------------------------------------------------------------------------- /docs/source/winton_kafka_streams.processor.serde.rst: -------------------------------------------------------------------------------- 1 | winton\_kafka\_streams\.processor\.serde package 2 | ================================================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | winton\_kafka\_streams\.processor\.serde\.identity module 8 | --------------------------------------------------------- 9 | 10 | .. automodule:: winton_kafka_streams.processor.serde.identity 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: winton_kafka_streams.processor.serde 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/source/winton_kafka_streams.rst: -------------------------------------------------------------------------------- 1 | winton\_kafka\_streams package 2 | ============================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | winton_kafka_streams.processor 10 | winton_kafka_streams.state 11 | 12 | Submodules 13 | ---------- 14 | 15 | winton\_kafka\_streams\.kafka\_config module 16 | -------------------------------------------- 17 | 18 | .. automodule:: winton_kafka_streams.kafka_config 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | winton\_kafka\_streams\.kafka\_stream module 24 | -------------------------------------------- 25 | 26 | .. automodule:: winton_kafka_streams.kafka_stream 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | 32 | Module contents 33 | --------------- 34 | 35 | .. automodule:: winton_kafka_streams 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /docs/source/winton_kafka_streams.state.rst: -------------------------------------------------------------------------------- 1 | winton\_kafka\_streams\.state package 2 | ===================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | winton\_kafka\_streams\.state\.simple module 8 | -------------------------------------------- 9 | 10 | .. automodule:: winton_kafka_streams.state.simple 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: winton_kafka_streams.state 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /examples/binning/README.md: -------------------------------------------------------------------------------- 1 | # Binning example 2 | 3 | ## Additional Python Package 4 | 5 | In addition to the packages required by the Winton Kafka Streams package, you 6 | will also need to have `pandas` available. See also `live-plot.ipynb` 7 | for a jupyter notebook visualising this example (it additionally 8 | requires `jupyter` and `bokeh`). 
9 | 10 | ## Prepare Kafka 11 | 12 | Start up Zookeeper and Kafka: 13 | 14 | bin/zookeeper-server-start.sh config/zookeeper.properties 15 | bin/kafka-server-start.sh config/server.properties 16 | 17 | Then create the topics used in this example: 18 | 19 | bin/kafka-topics.sh \ 20 | --create \ 21 | --zookeeper localhost:2181 \ 22 | --replication-factor 1 \ 23 | --partitions 1 \ 24 | --topic prices 25 | 26 | bin/kafka-topics.sh \ 27 | --create \ 28 | --zookeeper localhost:2181 \ 29 | --replication-factor 1 \ 30 | --partitions 1 \ 31 | --topic bin-prices 32 | 33 | ## Run generator 34 | 35 | First generate a log of the full data - see `python generator.py --help` 36 | for details of the options: 37 | 38 | python generator.py \ 39 | -i AAA,0.3,123,100.0,0.01 \ 40 | -i BBB,0.4,456,70.0,0.011 \ 41 | -l 60000 -f 250ms \ 42 | > full_data.log 43 | 44 | then run again, but this time producing 'in real-time' to a Kafka topic 45 | (the generated data is the same as above, as the setup is the same): 46 | 47 | python generator.py \ 48 | -i AAA,0.3,123,100.0,0.01 \ 49 | -i BBB,0.4,456,70.0,0.011 \ 50 | -l 6000 -f 1s \ 51 | -kb localhost:9092 -kt prices \ 52 | -rt 53 | 54 | This will produce the prices to the 'prices' topic, which the binning application below then consumes. 55 | 56 | In both cases the script will terminate once either of the two items has 57 | produced the number of values given by `-l`. 58 | 59 | ## Run binning 60 | 61 | Now run the Winton Kafka Streams application that consumes the 'prices' topic: 62 | 63 | python -u binning.py --config-file config.properties 64 | 65 | Note: when the application is terminated, the last 2 bins are likely to be missing from 66 | the output - this needs improvement. 67 | 68 | ## Inspect output 69 | 70 | A simple way to see the produced results is to run: 71 | 72 | kafkacat -b localhost -t bin-prices -e -J 73 | 74 | which will print a JSON-formatted view of the topic to stdout. 75 | -------------------------------------------------------------------------------- /examples/binning/binning.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python Kafka Streams example script for price binning 3 | """ 4 | 5 | import logging 6 | import time 7 | import pandas as pd 8 | from winton_kafka_streams.processor import BaseProcessor, TopologyBuilder 9 | import winton_kafka_streams.kafka_config as kafka_config 10 | import winton_kafka_streams.kafka_streams as kafka_streams 11 | 12 | LOGGER = logging.getLogger(__name__) 13 | 14 | _VERBOSITY = { 15 | 0: logging.WARN, 16 | 1: logging.INFO, 17 | 2: logging.DEBUG 18 | } 19 | 20 | 21 | class Binning(BaseProcessor): 22 | """ 23 | Implementation of the binning process 24 | 25 | The code will be passed a value from the 'prices' source topic 26 | in Kafka. This processor will search for the final value in the 27 | binning range (1 minute) and output that to the 'bin-prices' 28 | sink topic in Kafka. 29 | 30 | There is a Python generator script provided to generate prices 31 | with normally distributed returns. You can control the frequency 32 | of generation, the mean and standard deviation and the number 33 | of items generated. 34 | 35 | TODO: Later this example should be extended to show partition 36 | support. 37 | """ 38 | 39 | def initialise(self, _name, _context): 40 | super().initialise(_name, _context) 41 | # bins tracks the last time bin and price per symbol 42 | self.bins = {} # TODO: Replace with a state store via self.context.get_store(...) 43 | 44 | def process(self, _, value): 45 | """ 46 | Processes values from the source in search of the last 47 | value in that bin.
48 | 49 | Parameters: 50 | ----------- 51 | _ : object, unused (key) 52 | The key read from the source topic (unused here) 53 | value: object 54 | The value read from the source topic 55 | 56 | Returns: 57 | -------- 58 | None 59 | """ 60 | timestamp, symbol, price = value.split(',') 61 | timestamp = pd.Timestamp(timestamp) 62 | 63 | bin_ts = pd.Timestamp( 64 | year=timestamp.year, month=timestamp.month, day=timestamp.day, 65 | hour=timestamp.hour, minute=timestamp.minute, second=0 66 | ) + pd.Timedelta('1min') 67 | bin_ts_and_price = '{},{}'.format(bin_ts.isoformat(), price) 68 | 69 | last_bin = self.bins.get(symbol) 70 | 71 | if last_bin is not None: 72 | last_bin_ts, last_price = last_bin.split(',') 73 | if last_bin_ts != bin_ts.isoformat(): 74 | key = '{},{}'.format(last_bin_ts, symbol) 75 | LOGGER.debug('Forwarding to sink (%s, %s)', key, last_price) 76 | self.context.forward(key, last_price) 77 | self.context.commit() # TODO: implement auto-commit, remove this 78 | 79 | self.bins[symbol] = bin_ts_and_price 80 | 81 | 82 | def run(config_file=None): 83 | """ 84 | Starts the binning process 85 | 86 | Called here from main() when invoked from command line 87 | but could equally import binning and call 88 | binning.run(config_file) 89 | 90 | """ 91 | if config_file: 92 | kafka_config.read_local_config(config_file) 93 | 94 | with TopologyBuilder() as topology_builder: 95 | topology_builder. \ 96 | source('prices', ['prices']). \ 97 | processor('binner', Binning, 'prices'). \ 98 | sink('result', 'bin-prices', 'binner') 99 | 100 | wks = kafka_streams.KafkaStreams(topology_builder, kafka_config) 101 | wks.start() 102 | try: 103 | while True: 104 | time.sleep(1) 105 | except KeyboardInterrupt: 106 | pass 107 | finally: 108 | wks.close() 109 | 110 | 111 | def _get_parser(): 112 | import argparse 113 | parser = argparse.ArgumentParser(description=__doc__) 114 | parser.add_argument( 115 | '--config-file', '-c', default='config.properties', 116 | help="Local configuration - will override internal defaults" 117 | ) 118 | parser.add_argument( 119 | '-v', dest='verbosity', action='count', default=0, 120 | help='Enable more verbose logging, use once for info, ' 121 | 'twice for debug.' 122 | ) 123 | return parser 124 | 125 | 126 | def main(): 127 | parser = _get_parser() 128 | args = parser.parse_args() 129 | logging.basicConfig(level=_VERBOSITY.get(args.verbosity, logging.DEBUG)) 130 | run(args.config_file) 131 | 132 | 133 | if __name__ == '__main__': 134 | main() 135 | -------------------------------------------------------------------------------- /examples/binning/config.properties: -------------------------------------------------------------------------------- 1 | bootstrap.servers = localhost:9092 2 | auto.offset.reset = earliest 3 | enable.auto.commit = false 4 | value.serde = winton_kafka_streams.processor.serialization.serdes.StringSerde 5 | key.serde = winton_kafka_streams.processor.serialization.serdes.StringSerde 6 | -------------------------------------------------------------------------------- /examples/binning/generator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple script to generate prices values with normally 3 | distributed returns on a Kafka 'prices' topic. 4 | 5 | Run ./generator --help to see the full range of options. 
6 | 7 | """ 8 | 9 | import logging 10 | from collections import namedtuple 11 | import datetime as dt 12 | import time 13 | import pandas as pd 14 | from random_prices import RandomPrices 15 | from source import Source 16 | 17 | ITEM = namedtuple('ITEM', ['name', 'prob', 'seed', 'initial_price', 'sigma']) 18 | 19 | LOGGER = logging.getLogger(__name__) 20 | 21 | _VERBOSITY = { 22 | 0: logging.WARN, 23 | 1: logging.INFO, 24 | 2: logging.DEBUG 25 | } 26 | 27 | 28 | def _get_items(items): 29 | parsed_items = [] 30 | for item in items: 31 | vals = item.split(',') 32 | try: 33 | parsed_item = ITEM( 34 | vals[0], float(vals[1]), 35 | int(vals[2]), float(vals[3]), float(vals[4]) 36 | ) 37 | parsed_items.append(parsed_item) 38 | except Exception: 39 | raise ValueError( 40 | '{} should contain 5 comma separated options: ' 41 | 'name[string],prob[float],seed[int],' 42 | 'initial_price[float],sigma[float]' 43 | ) 44 | return parsed_items 45 | 46 | 47 | def _get_sources(items, limit): 48 | return { 49 | item.name: Source( 50 | item.prob, 51 | RandomPrices( 52 | item.seed, item.initial_price, item.sigma, limit 53 | ), 54 | item.seed 55 | ) 56 | for item in items 57 | } 58 | 59 | 60 | def _run(sources, timestamp, freq, real_time, rt_multiplier, produce): 61 | """ 62 | Start the generation of prices on the 'prices' topic 63 | """ 64 | 65 | stop = False 66 | while not stop: 67 | if real_time: 68 | start_time = dt.datetime.utcnow() 69 | for (name, source) in sources.items(): 70 | try: 71 | price = next(source) 72 | if price is not None: 73 | produce(timestamp, name, price) 74 | LOGGER.info('%s,%s,%s', timestamp, name, price) 75 | except StopIteration: 76 | stop = True 77 | timestamp = timestamp + freq 78 | if real_time: 79 | duration = dt.datetime.utcnow() - start_time 80 | sleep_seconds = (freq.total_seconds() - duration.total_seconds()) / rt_multiplier 81 | if sleep_seconds < 0.0: 82 | LOGGER.warning( 83 | 'Not keeping up, lagging by %ss', -sleep_seconds 84 | ) 85 | else: 86 | LOGGER.debug('Sleeping for %ss', sleep_seconds) 87 | time.sleep(sleep_seconds) 88 | 89 | 90 | def _get_parser(): 91 | import argparse 92 | parser = argparse.ArgumentParser(description=__doc__) 93 | parser.add_argument( 94 | '-i', '--item', required=True, action='append', dest='items', 95 | help='Comma separated list of construction details for random price ' 96 | 'sources, should be name[string],prob[float],seed[int],' 97 | 'initial_price[float],sigma[float]' 98 | ) 99 | parser.add_argument( 100 | '-l', '--limit', dest='limit', type=int, default=1_000_000, 101 | help='Limit of iterations to be performed (default 1M)' 102 | ) 103 | parser.add_argument( 104 | '-s', '--start', dest='start', default='2017-01-01', 105 | help='Date(time) to start the price series from, e.g. ' 106 | '2000-01-01T10:30:12; must be a valid pandas timestamp. ' 107 | '(default 2017-01-01)' 108 | ) 109 | parser.add_argument( 110 | '-f', '--freq', dest='freq', default='250ms', 111 | help='The frequence by which to increment the time, must be a ' 112 | 'valid pandas timedelta, e.g. 30s. (default 250ms)' 113 | ) 114 | parser.add_argument( 115 | '-kb', '--broker-list', dest='broker_list', default=None, 116 | help='Kafka broker list, e.g. kafka-1:9092,kafka-2:9092; also ' 117 | 'requires --topic to be specified. If not provided output ' 118 | 'will be produced to stdout instead of Kafka.' 
119 | ) 120 | parser.add_argument( 121 | '-kt', '--topic', dest='topic', default=None, 122 | help='The Kafka topic to produce to, this will be ignored ' 123 | 'if --broker-list is not specified as well.' 124 | ) 125 | parser.add_argument( 126 | '-rt', '--real-time', dest='real_time', action='store_true', 127 | help='Toggle (approximate) real-time generation of random ' 128 | 'prices. This will output prices in real-time trying ' 129 | 'to match the frequency specified in --freq.' 130 | ) 131 | parser.add_argument( 132 | '-rtm', '--real-time-multiplier', type=float, default=1.0, 133 | help='Speed up real time producer of prices by a factor. ' 134 | 'Default=1.0 (actual time).' 135 | ) 136 | parser.add_argument( 137 | '-v', dest='verbosity', action='count', default=0, 138 | help='Enable more verbose logging (can be specified multiple ' 139 | 'times to increase verbosity)' 140 | ) 141 | return parser 142 | 143 | 144 | def main(): 145 | """Main entry for script""" 146 | parser = _get_parser() 147 | args = parser.parse_args() 148 | sources = _get_sources(_get_items(args.items), args.limit) 149 | timestamp = pd.Timestamp(args.start) 150 | freq = pd.Timedelta(args.freq) 151 | logging.basicConfig(level=_VERBOSITY.get(args.verbosity, logging.DEBUG)) 152 | if args.broker_list is None: 153 | def _produce(timestamp, name, price): 154 | print('{},{},{}'.format(timestamp, name, price)) 155 | 156 | LOGGER.debug('Running in console mode') 157 | _run(sources, timestamp, freq, args.real_time, args.real_time_multiplier, _produce) 158 | else: 159 | if args.topic is None: 160 | raise ValueError('Must specify --topic when using Kafka') 161 | from confluent_kafka import Producer 162 | producer = Producer({'bootstrap.servers': args.broker_list}) 163 | 164 | def _produce(timestamp, name, price): 165 | data = '{},{},{}'.format(timestamp, name, price) 166 | produced = False 167 | while not produced: 168 | try: 169 | producer.produce(args.topic, value=data.encode('utf-8'), key=name) 170 | producer.poll(0) 171 | produced = True 172 | except BufferError: 173 | producer.poll(10) 174 | 175 | LOGGER.debug('Producing to %s on %s', args.topic, args.broker_list) 176 | _run(sources, timestamp, freq, args.real_time, args.real_time_multiplier, _produce) 177 | producer.flush() 178 | 179 | 180 | if __name__ == '__main__': 181 | main() 182 | -------------------------------------------------------------------------------- /examples/binning/live-plot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Plotting binned generated sample prices. \n", 8 | "\n", 9 | "This notebook will display an auto-updating plot of prices as they are being generated by the example binning stream processor. \n", 10 | "\n", 11 | "In addition to the requirements for the Winton Kafka Streams code, these libraries must also be installed:\n", 12 | " * pandas\n", 13 | " * jupyter\n", 14 | " * bokeh\n", 15 | "\n", 16 | "These libraries can be installed manually or using pip: pip install .[binning_example]\n", 17 | "\n", 18 | "Once installed, run these two commands in examples/binning/ :\n", 19 | "\n", 20 | " * The binning stream processor:\n", 21 | " * python binning.py\n", 22 | " * The price generator:\n", 23 | " * python generator.py -i A.N.Other-Corp,0.3,123,100.0,0.01 -l 6000 -f 250ms -kb localhost:9092 -kt prices -rt\n", 24 | "\n", 25 | "The -rt argument will generate the prices in real-time. 
The prices can also be generated faster than realitime with the flag \"-rtm \" or in bulk by omitting the -rt flag" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import datetime\n", 37 | "from pandas import Timestamp\n", 38 | "\n", 39 | "from ipywidgets import interact\n", 40 | "\n", 41 | "from bokeh.models.sources import ColumnDataSource\n", 42 | "from bokeh.plotting import figure\n", 43 | "from bokeh.io import push_notebook, show, output_notebook\n", 44 | "\n", 45 | "from confluent_kafka import Consumer, KafkaError" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "output_notebook()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "# only plot prices for one symbol\n", 66 | "symbol = 'A.N.Other-Corp'" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "consumer = Consumer({'bootstrap.servers': 'localhost:9092', 'group.id': 'test-group',\n", 78 | " 'default.topic.config': {'auto.offset.reset': 'earliest'}})\n", 79 | "\n", 80 | "consumer.subscribe(['prices', 'bin-prices'])" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": { 87 | "scrolled": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "price_figure = figure(title=symbol, plot_height=300, \n", 92 | " plot_width=600, y_range=(90, 110), x_axis_type='datetime')\n", 93 | "price_figure.xaxis.axis_label = 'Time'\n", 94 | "price_figure.yaxis.axis_label = 'Price'\n", 95 | "\n", 96 | "price_data = ColumnDataSource(data=dict(x=[datetime.datetime(2017,1,1)], y=[100]))\n", 97 | "price_line = price_figure.line(x=\"x\", y=\"y\", color=\"blue\", source=price_data, legend='Price')\n", 98 | "\n", 99 | "bin_data = ColumnDataSource(data=dict(x=[], y=[]))\n", 100 | "bin_circle = price_figure.circle(x=\"x\", y=\"y\", color=\"red\", source=bin_data, legend='Binned price')\n", 101 | "\n", 102 | "handle = show(price_figure, notebook_handle=True)\n", 103 | "\n", 104 | "xp, yp= [], []\n", 105 | "updated_price_data = dict(x=xp, y=yp)\n", 106 | "xb, yb= [], []\n", 107 | "updated_bin_data = dict(x=xb, y=yb)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "def process_price(msg, x, y, updated_data, price_data):\n", 117 | " dt, sym, prc = msg.value().decode(\"utf-8\").split(',')\n", 118 | " if sym == symbol:\n", 119 | " dt = Timestamp(dt).to_pydatetime()\n", 120 | " prc = float(prc)\n", 121 | "\n", 122 | " x.append(dt)\n", 123 | " y.append(prc)\n", 124 | "\n", 125 | " updated_data['x'] = x\n", 126 | " updated_data['y'] = y\n", 127 | " price_data.stream(updated_data, len(x))\n", 128 | " \n", 129 | "def process_bin(msg, x, y, updated_data, bin_data):\n", 130 | " prc = float(msg.value().decode(\"utf-8\"))\n", 131 | " dt, sym = msg.key().decode(\"utf-8\").split(',')\n", 132 | " \n", 133 | " if sym == symbol:\n", 134 | " x.append(Timestamp(dt).to_pydatetime())\n", 135 | " y.append(prc)\n", 136 | "\n", 137 | " updated_data['x'] = x\n", 138 | " updated_data['y'] = y\n", 139 | " bin_data.stream(updated_data, len(x))\n", 140 | "\n", 141 | "last_date = None\n", 142 | "running 
= True\n", 143 | "while running:\n", 144 | " msg = consumer.poll()\n", 145 | " if not msg.error():\n", 146 | " #print(f'Received message: {msg.value().decode(\"utf-8\")}')\n", 147 | " if msg.topic() == 'prices':\n", 148 | " process_price(msg, xp, yp, updated_price_data, price_data)\n", 149 | " elif msg.topic() == 'bin-prices':\n", 150 | " process_bin(msg, xb, yb, updated_bin_data, bin_data)\n", 151 | " \n", 152 | " push_notebook(handle=handle)\n", 153 | " elif msg.error().code() != KafkaError._PARTITION_EOF:\n", 154 | " print(msg.error())\n", 155 | " running = False\n", 156 | " \n", 157 | "c.close()" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": { 164 | "collapsed": true 165 | }, 166 | "outputs": [], 167 | "source": [] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": { 173 | "collapsed": true 174 | }, 175 | "outputs": [], 176 | "source": [] 177 | } 178 | ], 179 | "metadata": { 180 | "kernelspec": { 181 | "display_name": "Python 3", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | "codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": "text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.6.1" 196 | } 197 | }, 198 | "nbformat": 4, 199 | "nbformat_minor": 2 200 | } 201 | -------------------------------------------------------------------------------- /examples/binning/random_prices.py: -------------------------------------------------------------------------------- 1 | """Provides simple class to compute (reproducible) random prices""" 2 | 3 | 4 | from random import Random as _Random 5 | from math import fabs as _fabs 6 | from collections import namedtuple as _nt 7 | 8 | _STATE = _nt('_STATE', ['last_price', 'random', 'iter']) 9 | 10 | 11 | class RandomPrices(object): 12 | """\ 13 | Provides iterable class producing random non-negative and non-zero 14 | prices. 15 | 16 | Given an initial price p, the next price will be calculated as 17 | abs(p + p*Gaussian(0.0, sigma)). The price is additionally floored 18 | at 0.1. 19 | 20 | This is not intended as a particularly realistic model of a price 21 | series, but rather as a reproducible source of suitable test data 22 | for handling price series. The reproducibility is handled by the seed 23 | and using a Pseudo RNG (Mersenne twister as provided by Python's random 24 | module). 25 | """ 26 | 27 | def __init__(self, seed=42, initial_price=100.0, sigma=0.01, 28 | max_iter=10_000_000): 29 | """ 30 | :param seed: Random number seed 31 | :param initial_price: The first price from which to start the price 32 | evolution. 33 | :param sigma: The sigma of the Gaussian used to generated the price 34 | movements. 35 | :param max_iter: The total amout of prices to emit before stopping 36 | an iteration of this object. 
37 | """ 38 | super().__init__() 39 | self._seed = seed 40 | self._initial_price = initial_price 41 | self._sigma = sigma 42 | self._max_iter = max_iter 43 | self._state = None 44 | self.reset() 45 | 46 | def __iter__(self): 47 | return self 48 | 49 | def __next__(self): 50 | if self._state.iter >= self._max_iter: 51 | raise StopIteration 52 | return self.next_price() 53 | 54 | def next_price(self): 55 | """Calculate and return a new price; use initial price on first call""" 56 | if self._state.iter == 0: 57 | # On first iteration use initial price 58 | price = self._initial_price 59 | else: 60 | change = self._state.random.gauss(0.0, self._sigma) 61 | price = _fabs( 62 | self._state.last_price + self._state.last_price * change 63 | ) 64 | if price < 0.1: 65 | price = 0.1 66 | self._state = _STATE(price, self._state.random, self._state.iter + 1) 67 | return self._state.last_price 68 | 69 | def reset(self): 70 | """Reset this object back to the initial state""" 71 | self._state = _STATE(self._initial_price, _Random(self._seed), 0) 72 | -------------------------------------------------------------------------------- /examples/binning/source.py: -------------------------------------------------------------------------------- 1 | """Provides a wrapper to randomise whether underlying prices are generated""" 2 | 3 | from random import Random as _Random 4 | 5 | 6 | class Source(object): 7 | """\ 8 | Provides iterable class wrapping a price source and randomly produces a 9 | price or not. 10 | """ 11 | 12 | def __init__(self, prob, prices, seed=123): 13 | self.prob = prob 14 | self.prices = prices 15 | self._rand = _Random(seed) 16 | 17 | def __next__(self): 18 | return self.maybe_next_price() 19 | 20 | def __iter__(self): 21 | return self 22 | 23 | def maybe_next_price(self): 24 | """Based on the probability, return a price or None""" 25 | 26 | if self._rand.uniform(0.0, 1.0) <= self.prob: 27 | return next(self.prices) 28 | return None 29 | -------------------------------------------------------------------------------- /examples/debug/README.md: -------------------------------------------------------------------------------- 1 | # Debug Winton Kafka Streams Example 2 | 3 | ## Running 4 | * Edit the config.properties file if necessary to change where Kafka is running 5 | * Run: python example.py 6 | * Start a console producer writing to the topic 'wks-debug-example-topic-two' 7 | 8 | ## Features 9 | * Listens to the topic 'wks-debug-example-topic-two' and writes output to 'wks-debug-example-output' 10 | * The value on the input topic will be doubled when received 11 | * Every fourth value will cause the four values in the current state to be written to the output topic 12 | * It is possible to stop the application at any time and the application will restart where it left off 13 | -------------------------------------------------------------------------------- /examples/debug/config.properties: -------------------------------------------------------------------------------- 1 | bootstrap.servers = localhost:9092 2 | auto.offset.reset = earliest 3 | enable.auto.commit = false 4 | value.serde = winton_kafka_streams.processor.serialization.serdes.IntegerSerde 5 | -------------------------------------------------------------------------------- /examples/debug/example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Winton Kafka Streams 3 | 4 | Main entrypoints 5 | 6 | """ 7 | 8 | import logging 9 | import time 10 | 11 | from 
winton_kafka_streams.processor import BaseProcessor, TopologyBuilder 12 | import winton_kafka_streams.kafka_config as kafka_config 13 | import winton_kafka_streams.kafka_streams as kafka_streams 14 | import winton_kafka_streams.state as state_stores 15 | 16 | log = logging.getLogger(__name__) 17 | 18 | 19 | class DoubleProcessor(BaseProcessor): 20 |     """ 21 |     Example processor that will double the value passed in 22 | 23 |     """ 24 | 25 |     def initialise(self, name, context): 26 |         super().initialise(name, context) 27 |         self.state = context.get_store('double_store') 28 | 29 |     def process(self, _, value): 30 |         log.debug(f'DoubleProcessor::process({str(value)})') 31 |         doubled = value*2 32 |         items_in_state = len(self.state) 33 |         self.state[items_in_state] = doubled 34 |         if items_in_state >= 4: 35 |             self.punctuate() 36 | 37 |     def punctuate(self): 38 |         for _, value in self.state.items(): 39 |             log.debug(f'Forwarding to sink ({str(value)})') 40 |             self.context.forward(None, value) 41 |         self.state.clear() 42 | 43 | 44 | def _debug_run(config_file): 45 |     kafka_config.read_local_config(config_file) 46 | 47 |     double_store = state_stores.create('double_store'). \ 48 |         with_integer_keys(). \ 49 |         with_integer_values(). \ 50 |         in_memory(). \ 51 |         build() 52 | 53 |     with TopologyBuilder() as topology_builder: 54 |         topology_builder. \ 55 |             source('input-value', ['wks-debug-example-topic-two']). \ 56 |             processor('double', DoubleProcessor, 'input-value'). \ 57 |             state_store(double_store, 'double'). \ 58 |             sink('output-double', 'wks-debug-example-output', 'double') 59 | 60 |     wks = kafka_streams.KafkaStreams(topology_builder, kafka_config) 61 |     wks.start() 62 |     try: 63 |         while True: 64 |             time.sleep(1) 65 |     except KeyboardInterrupt: 66 |         pass 67 |     finally: 68 |         wks.close() 69 | 70 | 71 | if __name__ == '__main__': 72 | 73 |     logging.basicConfig(level=logging.DEBUG) 74 | 75 |     import argparse 76 | 77 |     parser = argparse.ArgumentParser(description="Debug runner for Python Kafka Streams") 78 |     parser.add_argument('--config-file', '-c', help="Local configuration - will override internal defaults", 79 |                         default='config.properties') 80 |     args = parser.parse_args() 81 | 82 |     _debug_run(args.config_file) 83 | -------------------------------------------------------------------------------- /examples/wordcount/README.md: -------------------------------------------------------------------------------- 1 | # Wordcount Winton Kafka Streams Example 2 | 3 | ## Running native 4 | * Edit the config.properties file if necessary to change where Kafka is running 5 | * Run: python example.py 6 | * Start a console producer writing to the topic 'wks-wordcount-example-topic' 7 | 8 | ## Running dockerized 9 | * Install docker and docker-compose 10 | * cd into examples/wordcount/docker 11 | * start the docker services with `docker-compose up -d` 12 | * the kafka-debug service prints the contents of the `wks-wordcount-example-count` topic 13 | * the output should be (after a minute or so): 14 | ``` 15 | $ docker-compose logs kafka-debug 16 | ... 17 | kafka-debug_1 | b 2 18 | kafka-debug_1 | c 1 19 | kafka-debug_1 | a 3 20 | ``` 21 | 22 | ## Features 23 | * Listens to the topic 'wks-wordcount-example-topic' and writes output to 'wks-wordcount-example-count' 24 | * Each string read in will be split by spaces 25 | * The word counts are maintained in an in-memory state store named 'counts'. The store is not persistent, so stopping the example discards the previously accumulated counts. A consumer sketch for checking the output is shown below.
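
## Inspecting the output without Docker

The sketch below tails the output topic so the counts can be checked from a plain Python session. It is illustrative only: it assumes a broker on localhost:9092 and the string serdes configured in config.properties, and the consumer group id 'wordcount-check' is an arbitrary, hypothetical choice.

```
# Illustrative sketch: read back (word, count) pairs from the output topic.
# Assumes a broker on localhost:9092; 'wordcount-check' is an arbitrary group id.
from confluent_kafka import Consumer, KafkaError

consumer = Consumer({'bootstrap.servers': 'localhost:9092',
                     'group.id': 'wordcount-check',
                     'default.topic.config': {'auto.offset.reset': 'earliest'}})
consumer.subscribe(['wks-wordcount-example-count'])

try:
    while True:
        msg = consumer.poll(1.0)
        if msg is None:
            continue
        if msg.error():
            # End-of-partition events are not errors; anything else stops the loop
            if msg.error().code() != KafkaError._PARTITION_EOF:
                print(msg.error())
                break
        else:
            # Keys are words and values are counts, both UTF-8 strings here
            key = msg.key().decode('utf-8') if msg.key() else ''
            print(key, msg.value().decode('utf-8'))
finally:
    consumer.close()
```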
26 | -------------------------------------------------------------------------------- /examples/wordcount/config.properties: -------------------------------------------------------------------------------- 1 | application.id = wordcount-example 2 | bootstrap.servers = localhost:9092 3 | auto.offset.reset = earliest 4 | value.serde = winton_kafka_streams.processor.serialization.serdes.StringSerde 5 | key.serde = winton_kafka_streams.processor.serialization.serdes.StringSerde 6 | -------------------------------------------------------------------------------- /examples/wordcount/custom_serde.py: -------------------------------------------------------------------------------- 1 | from winton_kafka_streams.processor.serialization import IntegerSerializer 2 | from winton_kafka_streams.processor.serialization.serdes.wrapper_serde import WrapperSerde 3 | from winton_kafka_streams.processor.serialization import StringDeserializer 4 | 5 | 6 | class StringIntSerde(WrapperSerde): 7 | def __init__(self): 8 | serializer = IntegerSerializer() 9 | deserializer = StringDeserializer() 10 | super().__init__(serializer, deserializer) 11 | -------------------------------------------------------------------------------- /examples/wordcount/docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | wordcount: 4 | build: 5 | context: ../../../ 6 | dockerfile: examples/wordcount/docker/wordcount/Dockerfile 7 | volumes: 8 | - ../../../:/code 9 | depends_on: 10 | - kafka 11 | source_client: 12 | build: 13 | context: ../../../ 14 | dockerfile: examples/wordcount/docker/source_client/Dockerfile 15 | volumes: 16 | - ../../../:/code 17 | depends_on: 18 | - kafka 19 | kafka: 20 | image: "spotify/kafka" 21 | hostname: kafka 22 | ports: 23 | - 2181:2181 24 | - 9092:9092 25 | - 7203:7203 26 | environment: 27 | - JMX_PORT=7203 28 | - ADVERTISED_HOST=kafka 29 | - ADVERTISED_PORT=9092 30 | kafka-manager: 31 | image: "sheepkiller/kafka-manager" 32 | ports: 33 | - 9000:9000 34 | environment: 35 | - ZK_HOSTS=kafka:2181 36 | - APPLICATION_SECRET=letmein 37 | kafka-debug: 38 | build: 39 | context: ../../../ 40 | dockerfile: examples/wordcount/docker/kafka-debug/Dockerfile 41 | depends_on: 42 | - kafka 43 | -------------------------------------------------------------------------------- /examples/wordcount/docker/kafka-debug/Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | FROM openjdk:8-jre 3 | 4 | ENV SCALA_VERSION 2.11 5 | ENV KAFKA_VERSION 0.10.1.0 6 | ENV KAFKA_HOME /opt/kafka_"$SCALA_VERSION"-"$KAFKA_VERSION" 7 | 8 | # Install Kafka and other needed things 9 | RUN apt-get update && \ 10 | apt-get install -y wget dnsutils && \ 11 | rm -rf /var/lib/apt/lists/* && \ 12 | apt-get clean && \ 13 | wget -q http://apache.mirrors.spacedump.net/kafka/"$KAFKA_VERSION"/kafka_"$SCALA_VERSION"-"$KAFKA_VERSION".tgz -O /tmp/kafka_"$SCALA_VERSION"-"$KAFKA_VERSION".tgz && \ 14 | tar xfz /tmp/kafka_"$SCALA_VERSION"-"$KAFKA_VERSION".tgz -C /opt && \ 15 | rm /tmp/kafka_"$SCALA_VERSION"-"$KAFKA_VERSION".tgz 16 | 17 | 18 | CMD ["/opt/kafka_2.11-0.10.1.0/bin/kafka-console-consumer.sh","--bootstrap-server","kafka:9092","--topic","wks-wordcount-example-count","--from-beginning","--property","print.key=true"] 19 | -------------------------------------------------------------------------------- /examples/wordcount/docker/source_client/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | ADD . /code 3 | 4 | RUN apt-get update 5 | RUN echo "/usr/local/lib" >> /etc/ld.so.conf 6 | RUN git clone https://github.com/edenhill/librdkafka.git /tmp/librdkafka 7 | RUN ls /tmp/ && cd /tmp/librdkafka && ./configure && make && make install && ldconfig 8 | 9 | WORKDIR /code/examples/wordcount/ 10 | RUN pip --version 11 | #RUN pip install -e git+https://github.com/confluentinc/confluent-kafka-python.git#egg=confluent-kafka 12 | RUN pip install -e ../../ 13 | 14 | CMD ["python", "source_client.py"] 15 | -------------------------------------------------------------------------------- /examples/wordcount/docker/wordcount/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | ADD . /code 3 | 4 | RUN apt-get update 5 | RUN echo "/usr/local/lib" >> /etc/ld.so.conf 6 | RUN git clone https://github.com/edenhill/librdkafka.git /tmp/librdkafka 7 | RUN ls /tmp/ && cd /tmp/librdkafka && ./configure && make && make install && ldconfig 8 | 9 | WORKDIR /code/examples/wordcount/ 10 | RUN pip --version 11 | #RUN pip install -e git+https://github.com/confluentinc/confluent-kafka-python.git#egg=confluent-kafka 12 | RUN pip install -e ../../ 13 | 14 | CMD ["python", "example.py", "-c", "docker/wordcount/config.properties"] 15 | -------------------------------------------------------------------------------- /examples/wordcount/docker/wordcount/config.properties: -------------------------------------------------------------------------------- 1 | bootstrap.servers = kafka:9092 2 | auto.offset.reset = earliest 3 | -------------------------------------------------------------------------------- /examples/wordcount/example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Winton Kafka Streams 3 | 4 | Main entrypoints 5 | 6 | """ 7 | 8 | import logging 9 | import sys 10 | import time 11 | 12 | import winton_kafka_streams.kafka_config as kafka_config 13 | import winton_kafka_streams.kafka_streams as kafka_streams 14 | from winton_kafka_streams.processor import BaseProcessor, TopologyBuilder 15 | import winton_kafka_streams.state as state_stores 16 | 17 | log = logging.getLogger(__name__) 18 | 19 | 20 | # An example implementation of word count, 21 | # showing where punctuate can be useful 22 | class WordCount(BaseProcessor): 23 | 24 | def initialise(self, _name, _context): 25 | super().initialise(_name, _context) 26 | self.word_count_store = _context.get_store('counts') 27 | # dirty_words tracks what words have changed since the last punctuate 28 | self.dirty_words = set() 29 | # output updated counts every 10 seconds 30 | self.context.schedule(10.) 31 | 32 | def process(self, key, value): 33 | words = value.split() 34 | log.debug(f'words list ({words})') 35 | for word in words: 36 | count = self.word_count_store.get(word, 0) 37 | self.word_count_store[word] = count + 1 38 | self.dirty_words |= set(words) 39 | 40 | def punctuate(self, timestamp): 41 | for word in self.dirty_words: 42 | count = str(self.word_count_store[word]) 43 | log.debug(f'Forwarding to sink ({word}, {count})') 44 | self.context.forward(word, count) 45 | self.dirty_words = set() 46 | 47 | 48 | def run(config_file, binary_output): 49 | kafka_config.read_local_config(config_file) 50 | if binary_output: 51 | kafka_config.VALUE_SERDE = 'examples.wordcount.custom_serde.StringIntSerde' 52 | 53 | count_store = state_stores.create('counts'). 
\ 53 |         with_string_keys(). \ 54 |         with_integer_values(). \ 55 |         in_memory(). \ 56 |         build() 57 | 58 |     with TopologyBuilder() as topology_builder: 59 |         topology_builder. \ 60 |             source('input-value', ['wks-wordcount-example-topic']). \ 61 |             processor('count', WordCount, 'input-value'). \ 62 |             state_store(count_store, 'count'). \ 63 |             sink('output-count', 'wks-wordcount-example-count', 'count') 64 | 65 |     wks = kafka_streams.KafkaStreams(topology_builder, kafka_config) 66 |     wks.start() 67 |     try: 68 |         while True: 69 |             time.sleep(1) 70 |     except KeyboardInterrupt: 71 |         pass 72 |     finally: 73 |         wks.close() 74 | 75 | 76 | if __name__ == '__main__': 77 |     import argparse 78 | 79 |     parser = argparse.ArgumentParser(description="Wordcount example for Python Kafka Streams") 80 |     parser.add_argument('--config-file', '-c', 81 |                         help="Local configuration - will override internal defaults", 82 |                         default='config.properties') 83 |     parser.add_argument('--binary-output', 84 |                         help="Output topic will contain 4-byte integers", 85 |                         action='store_true') 86 |     parser.add_argument('--verbose', '-v', 87 |                         help="Increase verbosity (repeat to increase level)", 88 |                         action='count', default=0) 89 |     args = parser.parse_args() 90 | 91 |     levels = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG} 92 |     level = levels.get(args.verbose, logging.DEBUG) 93 |     logging.basicConfig(stream=sys.stdout, level=level) 94 |     run(args.config_file, binary_output=args.binary_output) 95 | -------------------------------------------------------------------------------- /examples/wordcount/source_client.py: -------------------------------------------------------------------------------- 1 | from confluent_kafka import Producer 2 | 3 | 4 | p = Producer({'bootstrap.servers': 'localhost:9092'}) 5 | topic = 'wks-wordcount-example-topic' 6 | some_data_source = ["a b c", "a b", "a"] 7 | for data in some_data_source: 8 |     print("producing {} to {}".format(data, topic)) 9 |     p.produce(topic, data.encode('utf-8')) 10 | p.flush() 11 | -------------------------------------------------------------------------------- /requirements_docs.txt: -------------------------------------------------------------------------------- 1 | javaproperties 2 | requests 3 | avro-python3 4 | setuptools_scm 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | addopts = -rsxX -q 3 | testpaths = tests 4 | python_files = test_* 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | setup for Winton Kafka Streams package 6 | 7 | """ 8 | 9 | from setuptools import setup 10 | 11 | with open('README.md') as readme_file: 12 |     readme = readme_file.read() 13 | 14 | requirements = [ 15 |     'javaproperties', 16 |     'confluent-kafka>=0.11.4', 17 |     'requests', 18 |     'avro-python3' 19 | ] 20 | 21 | test_requirements = [ 22 |     'pytest' 23 | ] 24 | 25 | setup( 26 |     name='Winton Kafka Streams', 27 |     use_scm_version=True, 28 |     setup_requires=['setuptools_scm'], 29 |     description="Apache Kafka's Streams API for Python", 30 |     long_description=readme, 31 |     author="Winton Group", 32 |     author_email='opensource@winton.com', 33 |     url='https://github.com/wintoncode/winton_kafka_streams', 34 |     packages=[ 35 |         'winton_kafka_streams', 36 |     ], 37 |     include_package_data=True, 38 |
install_requires=requirements, 39 | license="Apache Software License 2.0", 40 | zip_safe=True, 41 | keywords='streams kafka winton', 42 | classifiers=[ 43 | 'Development Status :: 2 - Pre-Alpha', 44 | 'Intended Audience :: Developers', 45 | 'License :: OSI Approved :: Apache Software License', 46 | 'Natural Language :: English', 47 | 'Programming Language :: Python :: 3.6', 48 | ], 49 | test_suite='tests', 50 | tests_require=test_requirements, 51 | extras_require={ 52 | 'develop': ['pytest', 'sphinx_rtd_theme'], 53 | 'binning_example': ['jupyter', 'pandas', 'bokeh'], 54 | } 55 | ) 56 | -------------------------------------------------------------------------------- /tests/processor/serde/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wintoncode/winton-kafka-streams/5867a1c42fc80bba07173fd1d004b2849b429fdf/tests/processor/serde/__init__.py -------------------------------------------------------------------------------- /tests/processor/serde/mock_schema_registry.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright 2016 Confluent Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | # 20 | # derived from https://github.com/verisign/python-confluent-schemaregistry.git 21 | # 22 | 23 | from confluent_kafka.avro import ClientError 24 | 25 | 26 | class MockSchemaRegistryClient(object): 27 | """ 28 | A client that acts as a schema registry locally. 29 | 30 | Compatibiity related methods are not implemented at this time. 
31 | """ 32 | 33 | def __init__(self, max_schemas_per_subject=1000): 34 | self.max_schemas_per_subject = max_schemas_per_subject 35 | # subj => { schema => id } 36 | self.subject_to_schema_ids = {} 37 | # id => avro_schema 38 | self.id_to_schema = {} 39 | # subj => { schema => version } 40 | self.subject_to_schema_versions = {} 41 | 42 | self.subject_to_latest_schema = {} 43 | 44 | # counters 45 | self.next_id = 1 46 | self.schema_to_id = {} 47 | 48 | def _get_next_id(self, schema): 49 | if schema in self.schema_to_id: 50 | return self.schema_to_id[schema] 51 | result = self.next_id 52 | self.next_id += 1 53 | self.schema_to_id[schema] = result 54 | return result 55 | 56 | def _get_next_version(self, subject): 57 | if subject not in self.subject_to_schema_versions: 58 | self.subject_to_schema_versions[subject] = {} 59 | return len(self.subject_to_schema_versions[subject]) 60 | 61 | def _get_all_versions(self, subject): 62 | versions = self.subject_to_schema_versions.get(subject, {}) 63 | return sorted(versions) 64 | 65 | def _add_to_cache(self, cache, subject, schema, value): 66 | if subject not in cache: 67 | cache[subject] = {} 68 | sub_cache = cache[subject] 69 | sub_cache[schema] = value 70 | 71 | def _cache_schema(self, schema, schema_id, subject, version): 72 | # don't overwrite anything 73 | if schema_id in self.id_to_schema: 74 | schema = self.id_to_schema[schema_id] 75 | else: 76 | self.id_to_schema[schema_id] = schema 77 | 78 | self._add_to_cache(self.subject_to_schema_ids, 79 | subject, schema, schema_id) 80 | 81 | self._add_to_cache(self.subject_to_schema_versions, 82 | subject, schema, version) 83 | 84 | if subject in self.subject_to_latest_schema: 85 | si, s, v = self.subject_to_latest_schema[subject] 86 | if v > version: 87 | return 88 | self.subject_to_latest_schema[subject] = (schema_id, schema, version) 89 | 90 | def register(self, subject, avro_schema): 91 | """ 92 | Register a schema with the registry under the given subject 93 | and receive a schema id. 94 | 95 | avro_schema must be a parsed schema from the python avro library 96 | 97 | Multiple instances of the same schema will result in inconsistencies. 98 | """ 99 | schemas_to_id = self.subject_to_schema_ids.get(subject, {}) 100 | schema_id = schemas_to_id.get(avro_schema, -1) 101 | if schema_id != -1: 102 | return schema_id 103 | 104 | # add it 105 | version = self._get_next_version(subject) 106 | schema_id = self._get_next_id(avro_schema) 107 | 108 | # cache it 109 | self._cache_schema(avro_schema, schema_id, subject, version) 110 | return schema_id 111 | 112 | def get_by_id(self, schema_id): 113 | """Retrieve a parsed avro schema by id or None if not found""" 114 | return self.id_to_schema.get(schema_id, None) 115 | 116 | def get_latest_schema(self, subject): 117 | """ 118 | Return the latest 3-tuple of: 119 | (the schema id, the parsed avro schema, the schema version) 120 | for a particular subject. 121 | 122 | If the subject is not found, (None,None,None) is returned. 123 | """ 124 | return self.subject_to_latest_schema.get(subject, (None, None, None)) 125 | 126 | def get_version(self, subject, avro_schema): 127 | """ 128 | Get the version of a schema for a given subject. 129 | 130 | Returns -1 if not found. 
131 | """ 132 | schemas_to_version = self.subject_to_schema_versions.get(subject, {}) 133 | return schemas_to_version.get(avro_schema, -1) 134 | 135 | def get_id_for_schema(self, subject, avro_schema): 136 | """ 137 | Get the ID of a parsed schema 138 | """ 139 | schemas_to_id = self.subject_to_schema_ids.get(subject, {}) 140 | return schemas_to_id.get(avro_schema, -1) 141 | 142 | def test_compatibility(self, subject, avro_schema, version='latest'): 143 | raise ClientError("not implemented") 144 | 145 | def update_compatibility(self, level, subject=None): 146 | raise ClientError("not implemented") 147 | 148 | def get_compatibility(self, subject=None): 149 | raise ClientError("not implemented") 150 | -------------------------------------------------------------------------------- /tests/processor/serde/test_avro_serde.py: -------------------------------------------------------------------------------- 1 | import io 2 | import struct 3 | from confluent_kafka.avro import loads as avro_loads 4 | from .mock_schema_registry import MockSchemaRegistryClient 5 | from winton_kafka_streams.processor.serialization.serdes import AvroSerde 6 | import winton_kafka_streams.kafka_config as config 7 | 8 | string_avro = '{"type": "string"}' 9 | 10 | 11 | def create_serde(registry, schema): 12 | serde = AvroSerde() 13 | config.AVRO_SCHEMA_REGISTRY = 'nowhere' 14 | config.KEY_AVRO_SCHEMA = schema 15 | 16 | serde.configure(config, True) 17 | serde.serializer._avro_helper._set_serializer(registry) 18 | serde.deserializer._avro_helper._set_serializer(registry) 19 | 20 | serde.test_registry = registry 21 | return serde 22 | 23 | 24 | def test_serialize_avro(): 25 | registry = MockSchemaRegistryClient() 26 | serde = create_serde(registry, string_avro) 27 | 28 | message = serde.serializer.serialize('topic', 'data') 29 | message_io = io.BytesIO(message) 30 | magic, schema_id, length, string = struct.unpack('>bIb4s', message_io.read(10)) 31 | assert(0 == magic) 32 | assert(schema_id in registry.id_to_schema) 33 | assert(8 == length) # (==4) uses variable-length zig-zag encoding 34 | assert(b'data' == string) 35 | message_io.close() 36 | 37 | 38 | def test_deserialize_avro(): 39 | registry = MockSchemaRegistryClient() 40 | serde = create_serde(registry, string_avro) 41 | schema_id = registry.register('topic-value', avro_loads(string_avro)) 42 | 43 | serialized = b'\0' + schema_id.to_bytes(4, 'big') + b'\x08data' 44 | message = serde.deserializer.deserialize('ignored', serialized) 45 | assert('data' == message) 46 | -------------------------------------------------------------------------------- /tests/processor/serde/test_instantiation.py: -------------------------------------------------------------------------------- 1 | import winton_kafka_streams.processor.serialization.serdes as serdes 2 | 3 | 4 | def test_serde_instance_to_string(): 5 | serde = serdes.BytesSerde() 6 | serde_str = serdes.serde_as_string(serde) 7 | assert 'winton_kafka_streams.processor.serialization.serdes.bytes_serde.BytesSerde' == serde_str 8 | 9 | 10 | def test_serde_class_to_string(): 11 | serde = serdes.BytesSerde 12 | serde_str = serdes.serde_as_string(serde) 13 | assert 'winton_kafka_streams.processor.serialization.serdes.bytes_serde.BytesSerde' == serde_str 14 | 15 | 16 | def test_string_to_serde(): 17 | serde_str = 'winton_kafka_streams.processor.serialization.serdes.StringSerde' 18 | serde = serdes.serde_from_string(serde_str) 19 | byte_str = serde.serializer.serialize('topic', 'abc123') 20 | assert b'abc123' == byte_str 21 | 
-------------------------------------------------------------------------------- /tests/processor/serde/test_serialisation.py: -------------------------------------------------------------------------------- 1 | from winton_kafka_streams.processor.serialization.serdes import * 2 | 3 | 4 | def test_bytes_serde(): 5 | bytes_serde = BytesSerde() 6 | assert bytes_serde.serializer.serialize('topic', b'hello') == b'hello' 7 | assert bytes_serde.deserializer.deserialize('topic', b'hello') == b'hello' 8 | 9 | 10 | def test_string_serde(): 11 | string_serde = StringSerde() 12 | assert string_serde.serializer.serialize('topic', 'hello') == b'hello' 13 | assert string_serde.deserializer.deserialize('topic', b'hello') == 'hello' 14 | 15 | 16 | def test_integer_serde(): 17 | int_serde = IntegerSerde() 18 | assert int_serde.serializer.serialize('topic', -2132) == b'\xac\xf7\xff\xff' 19 | assert int_serde.deserializer.deserialize('topic', b'\xac\xf7\xff\xff') == -2132 20 | 21 | 22 | def test_long_serde(): 23 | int_serde = LongSerde() 24 | assert int_serde.serializer.serialize('topic', -2132) == b'\xac\xf7\xff\xff\xff\xff\xff\xff' 25 | assert int_serde.deserializer.deserialize('topic', b'\xac\xf7\xff\xff\xff\xff\xff\xff') == -2132 26 | 27 | 28 | def test_float_serde(): 29 | float_serde = FloatSerde() 30 | assert float_serde.serializer.serialize('topic', -18.125) == b'\x00\x00\x91\xc1' 31 | assert float_serde.deserializer.deserialize('topic', b'\x00\x00\x91\xc1') == -18.125 32 | 33 | 34 | def test_double_serde(): 35 | double_serde = DoubleSerde() 36 | assert double_serde.serializer.serialize('topic', 123.25) == b'\x00\x00\x00\x00\x00\xd0^@' 37 | assert double_serde.deserializer.deserialize('topic', b'\x00\x00\x00\x00\x00\xd0^@') == 123.25 38 | 39 | 40 | def test_json_serde(): 41 | json_serde = JsonSerde() 42 | test_dict = {'key1': 'val1', 'key2': ["val21", "val22"]} 43 | test_bytes = b'{"key1": "val1", "key2": ["val21", "val22"]}' 44 | assert json_serde.serializer.serialize('topic', test_dict) == test_bytes 45 | assert json_serde.deserializer.deserialize('topic', test_bytes) == test_dict 46 | -------------------------------------------------------------------------------- /tests/processor/test_base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the base processor - base class to all 3 | custom processor implementations 4 | """ 5 | 6 | import unittest.mock as mock 7 | 8 | import winton_kafka_streams.processor as wks_processor 9 | from winton_kafka_streams.processor.processor_context import ProcessorContext 10 | from winton_kafka_streams.processor.task_id import TaskId 11 | 12 | 13 | def test_createBaseProcessor(): 14 | wks_processor.BaseProcessor() 15 | 16 | 17 | def test_initialiseBaseProcessor(): 18 | mock_task = mock.Mock() 19 | mock_task.application_id = 'test_id' 20 | mock_task_id = TaskId('test_group', 0) 21 | mock_context = ProcessorContext(mock_task_id, mock_task, None, None, {}) 22 | bp = wks_processor.BaseProcessor() 23 | bp.initialise('my-name', mock_context) 24 | 25 | assert bp.name == 'my-name' 26 | assert isinstance(bp.context, ProcessorContext) 27 | -------------------------------------------------------------------------------- /tests/processor/test_extract_timestamp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests of using wall clock time from a message 3 | 4 | """ 5 | 6 | import unittest.mock as mock 7 | import pytest 8 | 9 | import winton_kafka_streams.processor as 
wks_processor 10 | 11 | expected_time = 1496735099.23712 12 | error_time_offset = 1000 13 | timestamp_create_time = 1 14 | 15 | 16 | class TestRecordTimeStampExtractorImpl(wks_processor.RecordTimeStampExtractor): 17 | def on_error(self, record, timestamp, previous_timestamp): 18 | return timestamp - 1000 19 | 20 | 21 | class MockRecord: 22 | def __init__(self, time): 23 | self.time = time 24 | 25 | def timestamp(self): 26 | return (timestamp_create_time, self.time) 27 | 28 | 29 | def test_RecordTimeStampExtractorNoImpl(): 30 | pytest.raises(TypeError, wks_processor.RecordTimeStampExtractor) 31 | 32 | 33 | def test_RecordTimeStampExtractor(): 34 | rtse = TestRecordTimeStampExtractorImpl() 35 | assert rtse.extract(MockRecord(expected_time), expected_time-12345) == expected_time 36 | 37 | 38 | def test_InvalidRecordTimeStampExtractorNoImpl(): 39 | rtse = TestRecordTimeStampExtractorImpl() 40 | assert rtse.extract(MockRecord(-1), expected_time-12345) == -1 - error_time_offset 41 | -------------------------------------------------------------------------------- /tests/processor/test_punctuation_queue.py: -------------------------------------------------------------------------------- 1 | import winton_kafka_streams.processor._punctuation_queue as punctuation_queue 2 | 3 | 4 | def test_punctuation_queue(): 5 | punctuations = [] 6 | pq = punctuation_queue.PunctuationQueue(lambda ts, node: punctuations.append((ts, node))) 7 | pq.schedule('node', 100) 8 | now = -100 9 | 10 | pq.may_punctuate(now) 11 | assert len(punctuations) == 0 12 | 13 | pq.may_punctuate(now + 99) 14 | assert len(punctuations) == 0 15 | 16 | pq.may_punctuate(now + 100) 17 | assert len(punctuations) == 1 18 | 19 | pq.may_punctuate(now + 199) 20 | assert len(punctuations) == 1 21 | 22 | pq.may_punctuate(now + 200) 23 | assert len(punctuations) == 2 24 | 25 | assert punctuations == [('node', 0), ('node', 100)] 26 | 27 | 28 | def test_punctuation_schedule_can_compare_entires_with_same_timestamp(): 29 | schedule1 = punctuation_queue.PunctuationSchedule(123, {}, 100) 30 | schedule2 = punctuation_queue.PunctuationSchedule(123, {}, 100) 31 | 32 | assert not schedule1 < schedule2 33 | -------------------------------------------------------------------------------- /tests/processor/test_sink_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test of sink processor behaviour 3 | """ 4 | 5 | import unittest.mock as mock 6 | 7 | import winton_kafka_streams.processor as wks_processor 8 | from winton_kafka_streams.processor.task_id import TaskId 9 | 10 | _expected_timestamp = 1234567890 11 | 12 | 13 | def test_createSinkProcessorObject(): 14 | wks_processor.SinkProcessor('topic1') 15 | 16 | 17 | def test_sinkProcessorTopic(): 18 | sink = wks_processor.SinkProcessor('topic1') 19 | assert sink.topic == 'topic1' 20 | 21 | 22 | def test_sinkProcessorProcess(): 23 | 24 | with mock.patch('winton_kafka_streams.processor.ProcessorContext.timestamp', new_callable=mock.PropertyMock) as mock_timestamp: 25 | mock_timestamp.return_value = _expected_timestamp 26 | mock_task = mock.Mock() 27 | mock_task.application_id = 'test_id' 28 | mock_task_id = TaskId('test_group', 0) 29 | processor_context = wks_processor.ProcessorContext(mock_task_id, mock_task, None, None, {}) 30 | processor_context.record_collector = mock.MagicMock() 31 | 32 | sink = wks_processor.SinkProcessor('topic1') 33 | sink.initialise('test-sink', processor_context) 34 | assert sink.name == 'test-sink' 35 | 36 | test_key, test_value = 
'test-key', 'test-value' 37 | sink.process(test_key, test_value) 38 | assert processor_context.record_collector.called_with(test_key, test_value, _expected_timestamp) 39 | -------------------------------------------------------------------------------- /tests/processor/test_source_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test of source processor behaviour 3 | """ 4 | 5 | import winton_kafka_streams.processor as wks_processor 6 | 7 | 8 | def test_createSourceProcessorObject(): 9 | wks_processor.SourceProcessor(['test-topic-name']) 10 | 11 | 12 | def test_sourceProcessorTopic(): 13 | sp1 = wks_processor.SourceProcessor(('topic1',)) 14 | assert sp1.topics == ('topic1',) 15 | sp2 = wks_processor.SourceProcessor(('topic1', 'topic2')) 16 | assert sp2.topics == ('topic1', 'topic2') 17 | -------------------------------------------------------------------------------- /tests/processor/test_stream_task.py: -------------------------------------------------------------------------------- 1 | """ 2 | StreamTask tests 3 | """ 4 | 5 | from unittest.mock import Mock, patch 6 | 7 | import pytest 8 | from confluent_kafka.cimpl import KafkaError, KafkaException 9 | 10 | from winton_kafka_streams import kafka_config 11 | from winton_kafka_streams.errors.task_migrated_error import TaskMigratedError 12 | from winton_kafka_streams.processor import TopologyBuilder 13 | from winton_kafka_streams.processor._stream_task import StreamTask 14 | from winton_kafka_streams.processor.task_id import TaskId 15 | 16 | taskMigratedErrorCodes = [KafkaError.ILLEGAL_GENERATION, 17 | KafkaError.UNKNOWN_MEMBER_ID, 18 | KafkaError.REBALANCE_IN_PROGRESS, 19 | 47 # INVALID_PRODUCER_EPOCH - not supported in all versions for Conluent Kafka so just use the explicit code in this test 20 | ] 21 | 22 | 23 | @pytest.mark.parametrize("error_code", taskMigratedErrorCodes) 24 | def test__given__commit__when__consumer_commit_fails_as_task_migrated__then__throw_task_migrated_error(error_code): 25 | kafka_error_attrs = {'code.return_value': error_code} 26 | kafka_error = Mock(**kafka_error_attrs) 27 | 28 | with patch.object(KafkaException, 'args', [kafka_error]): 29 | consumer_attrs = {'commit.side_effect': KafkaException()} 30 | consumer = Mock(**consumer_attrs) 31 | producer = Mock() 32 | processor_attrs = {'process.return_value': None} 33 | processor = Mock(**processor_attrs) 34 | 35 | topology_builder = TopologyBuilder() 36 | 37 | topology_builder.source('my-source', ['my-input-topic-1']) 38 | topology_builder.processor('my-processor', processor, 'my-source') 39 | topology_builder.sink('my-sink', 'my-output-topic-1', 'my-processor') 40 | 41 | task = StreamTask(TaskId('testgroup', 0), "myapp", [0], topology_builder, consumer, producer, kafka_config) 42 | 43 | record_attrs = {'topic.return_value': 'my-input-topic-1', 44 | 'offset.return_value': 1, 45 | 'partition.return_value': 0} 46 | record = Mock(**record_attrs) 47 | 48 | task.add_records([record]) 49 | 50 | task.process() 51 | 52 | with pytest.raises(TaskMigratedError, message='StreamTask:testgroup_0 migrated.'): 53 | task.commit() 54 | -------------------------------------------------------------------------------- /tests/processor/test_task_id.py: -------------------------------------------------------------------------------- 1 | from winton_kafka_streams.processor.task_id import TaskId 2 | 3 | 4 | def test_taskId(): 5 | task_id = TaskId('group1', 0) 6 | 7 | assert task_id == TaskId('group1', 0) 8 | assert not (task_id != 
TaskId('group1', 0)) 9 | assert task_id != TaskId('group1', 1) 10 | assert task_id != TaskId('group2', 0) 11 | 12 | assert repr(task_id) == 'group1_0' 13 | 14 | assert hash(task_id) == hash(TaskId('group1', 0)) 15 | -------------------------------------------------------------------------------- /tests/processor/test_topology.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test of topology creation 3 | 4 | Low level connection of processor units 5 | """ 6 | 7 | 8 | import unittest 9 | import winton_kafka_streams.processor as wks_processor 10 | 11 | def test_createTopologyBuilder(): 12 | wks_processor.topology.TopologyBuilder() 13 | 14 | 15 | class MyTestProcessor(wks_processor.processor.BaseProcessor): 16 | pass 17 | 18 | 19 | class TestTopology(unittest.TestCase): 20 | def setUp(self): 21 | self.topology = wks_processor.topology.TopologyBuilder() 22 | 23 | def test_source(self): 24 | self.topology.source('my-source', ['my-input-topic-1']) 25 | 26 | def test_processor(self): 27 | self.topology.source('my-source', ['my-input-topic-1']) 28 | self.topology.processor('my-processor', MyTestProcessor, 'my-source') 29 | 30 | self.topology = self.topology.build() 31 | 32 | assert len(self.topology.nodes) == 2 33 | assert 'my-source' in self.topology.nodes.keys() 34 | assert 'my-processor' in self.topology.nodes.keys() 35 | 36 | def test_sink(self): 37 | self.topology.source('my-source', ['my-input-topic-1']) 38 | self.topology.processor('my-processor', MyTestProcessor, 'my-source') 39 | self.topology.sink('my-sink', 'my-output-topic-1', 'my-processor') 40 | 41 | self.topology = self.topology.build() 42 | 43 | assert len(self.topology.nodes) == 3 44 | assert 'my-source' in self.topology.nodes.keys() 45 | assert 'my-processor' in self.topology.nodes.keys() 46 | assert 'my-sink' in self.topology.nodes.keys() 47 | -------------------------------------------------------------------------------- /tests/processor/test_wallclock_timestamp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests of using wall clock time from a message 3 | 4 | """ 5 | 6 | import unittest.mock as mock 7 | 8 | import winton_kafka_streams.processor.wallclock_timestamp as wallclock_timestamp 9 | 10 | expected_time = 1496735099.23712 11 | 12 | 13 | def test_WallClockTimeStampExtractor(): 14 | with mock.patch('time.time', return_value=expected_time): 15 | assert wallclock_timestamp.WallClockTimeStampExtractor().extract(None, expected_time-1) == expected_time 16 | -------------------------------------------------------------------------------- /tests/state/test_in_memory_key_value_store.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from winton_kafka_streams.processor.serialization.serdes import BytesSerde 4 | from winton_kafka_streams.state.in_memory.in_memory_state_store import InMemoryStateStore 5 | 6 | 7 | def test_inMemoryKeyValueStore(): 8 | store = InMemoryStateStore('teststore', BytesSerde(), BytesSerde(), False) 9 | kv_store = store.get_key_value_store() 10 | 11 | kv_store['a'] = 1 12 | assert kv_store['a'] == 1 13 | 14 | kv_store['a'] = 2 15 | assert kv_store['a'] == 2 16 | 17 | del kv_store['a'] 18 | assert kv_store.get('a') is None 19 | with pytest.raises(KeyError): 20 | _ = kv_store['a'] 21 | -------------------------------------------------------------------------------- /tests/test_kafka_streams.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Test the top-level Kafka Streams class 3 | """ 4 | 5 | 6 | import pytest 7 | import unittest.mock as mock 8 | 9 | from winton_kafka_streams import kafka_config 10 | from winton_kafka_streams.errors.kafka_streams_error import KafkaStreamsError 11 | from winton_kafka_streams.kafka_streams import KafkaStreams 12 | from winton_kafka_streams.processor.processor import BaseProcessor 13 | from winton_kafka_streams.processor.topology import TopologyBuilder 14 | 15 | 16 | class MyTestProcessor(BaseProcessor): 17 | pass 18 | 19 | 20 | def test__given__stream_already_started__when__call_start_again__then__raise_error(): 21 | kafka_config.NUM_STREAM_THREADS = 0 22 | topology_builder = TopologyBuilder() 23 | 24 | topology_builder.source('my-source', ['my-input-topic-1']) 25 | topology_builder.processor('my-processor', MyTestProcessor, 'my-source') 26 | topology_builder.sink('my-sink', 'my-output-topic-1', 'my-processor') 27 | 28 | topology = topology_builder.build() 29 | 30 | kafka_streams = KafkaStreams(topology, kafka_config) 31 | kafka_streams.start() 32 | 33 | with pytest.raises(KafkaStreamsError, message='KafkaStreams already started.'): 34 | kafka_streams.start() 35 | 36 | 37 | def test__two__processes__with__two__topic__partitions(): 38 | NUM_STREAM_PROCESSES = 2 39 | kafka_config.NUM_STREAM_THREADS = 1 40 | 41 | consumer = mock.Mock() 42 | producer = mock.Mock() 43 | 44 | processor_attrs = {'process.return_value': None} 45 | processor = mock.Mock(**processor_attrs) 46 | 47 | kafka_client_supplier_attrs = {'consumer.return_value': consumer, 48 | 'producer.return_value': producer} 49 | kafka_client_supplier = mock.Mock(**kafka_client_supplier_attrs) 50 | 51 | topology_builder = TopologyBuilder() 52 | 53 | topology_builder.source('my-source', ['my-input-topic-1']) 54 | topology_builder.processor('my-processor', processor, 'my-source') 55 | topology_builder.sink('my-sink', 'my-output-topic-1', 'my-processor') 56 | 57 | with mock.patch('winton_kafka_streams.kafka_client_supplier.KafkaClientSupplier', return_value=kafka_client_supplier): 58 | for partition in range(NUM_STREAM_PROCESSES): 59 | kafka_stream_process = KafkaStreams(topology_builder, kafka_config) 60 | 61 | topic_partition_attrs = {'topic': 'testtopic', 62 | 'partition': partition} 63 | topic_partition = mock.Mock(**topic_partition_attrs) 64 | 65 | kafka_stream_process.stream_threads[0].add_stream_tasks([topic_partition]) 66 | 67 | record_attrs = {'topic.return_value': 'my-input-topic-1', 68 | 'offset.return_value': 1, 69 | 'partition.return_value': partition} 70 | record = mock.Mock(**record_attrs) 71 | 72 | kafka_stream_process.stream_threads[0].add_records_to_tasks([record]) 73 | -------------------------------------------------------------------------------- /winton_kafka_streams/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wintoncode/winton-kafka-streams/5867a1c42fc80bba07173fd1d004b2849b429fdf/winton_kafka_streams/__init__.py -------------------------------------------------------------------------------- /winton_kafka_streams/errors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | import errors will import all common errors 3 | """ 4 | 5 | from .kafka_streams_error import KafkaStreamsError 6 | -------------------------------------------------------------------------------- 
/winton_kafka_streams/errors/_kafka_error_codes.py: -------------------------------------------------------------------------------- 1 | from confluent_kafka.cimpl import KafkaError 2 | 3 | 4 | def _get_invalid_producer_epoch_code(): 5 | """Some versions of confluent-kafka-python do not explicitly support this error code""" 6 | try: 7 | return KafkaError.INVALID_PRODUCER_EPOCH 8 | except AttributeError: 9 | return 47 10 | -------------------------------------------------------------------------------- /winton_kafka_streams/errors/kafka_streams_error.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run time exception thrown by winton kafka streams on error 3 | 4 | """ 5 | 6 | 7 | class KafkaStreamsError(RuntimeError): 8 | pass 9 | -------------------------------------------------------------------------------- /winton_kafka_streams/errors/task_migrated_error.py: -------------------------------------------------------------------------------- 1 | from .kafka_streams_error import KafkaStreamsError 2 | 3 | 4 | class TaskMigratedError(KafkaStreamsError): 5 | """ 6 | Indicates that a task got migrated to another thread. 7 | Thus, the task raising this exception can be cleaned up and closed as "zombie". 8 | """ 9 | pass 10 | -------------------------------------------------------------------------------- /winton_kafka_streams/kafka_client_supplier.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pprint 3 | 4 | import confluent_kafka as kafka 5 | 6 | log = logging.getLogger(__name__) 7 | 8 | 9 | class KafkaClientSupplier: 10 | def __init__(self, _config): 11 | self.config = _config 12 | 13 | def consumer(self): 14 | log.debug('Starting consumer...') 15 | # TODO: Must set all config values applicable to a consumer 16 | consumer_args = {'bootstrap.servers': self.config.BOOTSTRAP_SERVERS, 17 | 'group.id': self.config.APPLICATION_ID, 18 | 'default.topic.config': {'auto.offset.reset': 19 | self.config.AUTO_OFFSET_RESET}, 20 | 'enable.auto.commit': self.config.ENABLE_AUTO_COMMIT} 21 | 22 | log.debug('Consumer Arguments: %s', pprint.PrettyPrinter().pformat(consumer_args)) 23 | 24 | return kafka.Consumer(consumer_args) 25 | 26 | def producer(self): 27 | # TODO: Must set all config values applicable to a producer 28 | return kafka.Producer({'bootstrap.servers': self.config.BOOTSTRAP_SERVERS}) 29 | -------------------------------------------------------------------------------- /winton_kafka_streams/kafka_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration values that may be set to control behaviour of Winton Kafka Streams 3 | 4 | Configuration may either be set inline in your application using: 5 | 6 | import kafka_config 7 | kafka_config.BOOTSTRAP_SERVERS = 'localhost:9092' 8 | 9 | or as a file in java properties format. The property names are identical to 10 | those used in the Java implementation for ease of sharing between both. 
11 | 12 | External files can be loaded using: 13 | 14 | import kafka_config 15 | kafka_config.read_local_config('path/to/kafka.streams.config') 16 | 17 | 18 | """ 19 | 20 | import logging 21 | import os 22 | import sys 23 | 24 | import javaproperties 25 | from typing import List 26 | 27 | from .processor.serialization.serdes import BytesSerde, serde_as_string 28 | from .errors.kafka_streams_error import KafkaStreamsError 29 | 30 | log = logging.getLogger(__name__) 31 | 32 | #### - Required options - #### 33 | 34 | """ 35 | A list of host/port pairs to use for establishing the 36 | initial connection to the Kafka cluster 37 | 38 | """ 39 | BOOTSTRAP_SERVERS = "localhost:9092" 40 | 41 | """ 42 | An identifier for the stream processing application. 43 | Must be unique within the Kafka cluster. 44 | It is used as: 45 | 1) the default client-id prefix 46 | 2) the group-id for membership management 47 | 3) the changelog topic prefix. 48 | 49 | """ 50 | APPLICATION_ID = "wkstream.application.id" 51 | 52 | #### - Optional Options - #### 53 | 54 | """ 55 | The replication factor for changelog topics and repartition topics created by the application 56 | Default: 1 57 | Importance: Low 58 | """ 59 | REPLICATION_FACTOR = 1 60 | 61 | """ 62 | Directory location for state stores 63 | Default: /var/lib/kafka-streams 64 | Importance: Low 65 | """ 66 | STATE_DIR = "/var/lib/kafka-streams" 67 | 68 | """ 69 | Maximum number of memory bytes to be used for record caches across all threads 70 | Default: 10485760 (bytes) 71 | Importance: Medium 72 | """ 73 | CACHE_MAX_BYTES_BUFFERING = 10485760 74 | 75 | """ 76 | The number of standby replicas for each task 77 | Default: 0 78 | Importance: Medium 79 | """ 80 | NUM_STANDBY_REPLICAS = 0 81 | 82 | """ 83 | The number of threads to execute stream processing 84 | Default: 1 85 | Importance: Medium 86 | """ 87 | NUM_STREAM_THREADS = 1 88 | 89 | """ 90 | Timestamp extractor class that implements the TimestampExtractor interface 91 | Default: see Timestamp Extractor 92 | Importance: Medium 93 | """ 94 | TIMESTAMP_EXTRACTOR = None #  TODO 95 | 96 | """ 97 | A host:port pair pointing to an embedded user defined endpoint that can be used for discovering the locations of state stores within a single Winton Kafka Streams application. The value of this must be different for each instance of the application. 98 | Default "" 99 | Importance: Low 100 | """ 101 | APPLICATION_SERVER = "" 102 | 103 | """ 104 | The maximum number of records to buffer per partition 105 | Default: 1000 106 | Importance: Low 107 | """ 108 | BUFFERED_RECORDS_PER_PARTITION = 1000 109 | 110 | """ 111 | An id string to pass to the server when making requests. (This setting is passed to the consumer/producer clients used internally by Winton Kafka Streams.) 112 | Default: "" 113 | Importance: Low 114 | """ 115 | CLIENT_ID = "" 116 | 117 | """ 118 | The frequency with which to save the position (offsets in source topics) of tasks 119 | Default: 30000 (millisecs) 120 | Importance: Low 121 | """ 122 | COMMIT_INTERVAL_MS = 30_000 123 | 124 | """ 125 | A list of classes to use as metrics reporters 126 | Default: [] 127 | Importance: Low 128 | """ 129 | METRIC_REPORTERS: List[str] = [] 130 | 131 | """ 132 | The number of samples maintained to compute metrics. 133 | Default: 2 134 | Importance: Low 135 | """ 136 | METRICS_NUM_SAMPLES = 2 137 | 138 | """ 139 | The highest recording level for metrics. 
140 | Default: info 141 | Importance: Low 142 | """ 143 | METRICS_RECORDING_LEVEL = 'info' 144 | 145 | """ 146 | The window of time a metrics sample is computed over. 147 | Default: 30000 (millisecs) 148 | Importance: Low 149 | """ 150 | METRICS_SAMPLE_WINDOW_MS = 30_000 151 | 152 | """ 153 | Partition grouper class that implements the PartitionGrouper interface 154 | Default: see Partition Grouper 155 | Importance: Low 156 | """ 157 | PARITION_GROUPER = None # DEBUG 158 | 159 | """ 160 | The amount of time in milliseconds to block waiting for input 161 | Default: 100 (millisecs) 162 | Importance: Low 163 | """ 164 | POLL_MS = 100 165 | 166 | """ 167 | The amount of time in milliseconds to wait before deleting state when a partition has migrated 168 | Default: 60000 (millisecs) 169 | Importance: Low 170 | """ 171 | STATE_CLEANUP_DELAY_MS = 60_000 172 | 173 | """ 174 | Added to a window's maintainMs to ensure data is not deleted from the log prematurely. Allows for clock drift. 175 | Default: 86400000 (milliseconds) = 1 day 176 | Importance: Low 177 | """ 178 | WINDOWSTORE_CHANGELOG_ADDITIONAL_RETENTION_MS = 86_400_000 179 | 180 | #### - Non streams configuration parameters - #### 181 | 182 | """ 183 | linger.ms (low) Producer 184 | Default: 100 185 | Importance: low 186 | """ 187 | LINGER_MS = 100 188 | 189 | """ 190 | Producer 191 | Default: 10 192 | Importance: low 193 | """ 194 | RETRIES = 10 195 | 196 | """ 197 | Consumer 198 | Default: earliest 199 | Importance: low 200 | """ 201 | AUTO_OFFSET_RESET = 'earliest' 202 | 203 | """ 204 | Consumer 205 | Default: false, see Consumer Auto Commit 206 | Importance: low 207 | """ 208 | ENABLE_AUTO_COMMIT = 'false' 209 | 210 | """ 211 | Consumer 212 | Default: Integer.MAX_VALUE 213 | Importance: low 214 | """ 215 | MAX_POLL_INTERVAL_MS = sys.maxsize # TODO: No max for Python, this is word size - is that correct for Java?
216 | 217 | """ 218 | Consumer 219 | Default: 1000 220 | Importance: low 221 | """ 222 | MAX_POLL_RECORDS = 1000 223 | 224 | #### - Serdes Configuration - #### 225 | 226 | """ 227 | Default serializer/deserializer class for record values, implements the Serde interface (see also key.serdes) 228 | Default: winton_kafka_streams.processor.serialization.serdes.BytesSerde 229 | Importance: Medium 230 | """ 231 | VALUE_SERDE = serde_as_string(BytesSerde) 232 | 233 | """ 234 | Default serializer/deserializer class for record keys, implements the Serde interface (see also value.serdes) 235 | Default: winton_kafka_streams.processor.serialization.serdes.BytesSerde 236 | Importance: Medium 237 | """ 238 | KEY_SERDE = serde_as_string(BytesSerde) 239 | 240 | # StringSerde - encoding 241 | SERIALIZER_ENCODING = 'utf-8' 242 | DESERIALIZER_ENCODING = 'utf-8' 243 | KEY_SERIALIZER_ENCODING = None 244 | KEY_DESERIALIZER_ENCODING = None 245 | VALUE_SERIALIZER_ENCODING = None 246 | VALUE_DESERIALIZER_ENCODING = None 247 | 248 | # StringSerde - error mode 249 | SERIALIZER_ERROR = 'strict' 250 | DESERIALIZER_ERROR = 'strict' 251 | KEY_SERIALIZER_ERROR = None 252 | KEY_DESERIALIZER_ERROR = None 253 | VALUE_SERIALIZER_ERROR = None 254 | VALUE_DESERIALIZER_ERROR = None 255 | 256 | # IntegerSerde/LongSerde - byte order 257 | SERIALIZER_BYTEORDER = 'little' 258 | DESERIALIZER_BYTEORDER = 'little' 259 | KEY_SERIALIZER_BYTEORDER = None 260 | KEY_DESERIALIZER_BYTEORDER = None 261 | VALUE_SERIALIZER_BYTEORDER = None 262 | VALUE_DESERIALIZER_BYTEORDER = None 263 | 264 | # IntegerSerde/LongSerde - signed integer 265 | SERIALIZER_SIGNED = 'true' 266 | DESERIALIZER_SIGNED = 'true' 267 | KEY_SERIALIZER_SIGNED = None 268 | KEY_DESERIALIZER_SIGNED = None 269 | VALUE_SERIALIZER_SIGNED = None 270 | VALUE_DESERIALIZER_SIGNED = None 271 | 272 | # AvroSerde 273 | AVRO_SCHEMA_REGISTRY = None 274 | AVRO_SCHEMA = None 275 | KEY_AVRO_SCHEMA_REGISTRY = None 276 | KEY_AVRO_SCHEMA = None 277 | VALUE_AVRO_SCHEMA_REGISTRY = None 278 | VALUE_AVRO_SCHEMA = None 279 | 280 | 281 | def read_local_config(config_file): 282 | if not os.path.exists(config_file): 283 | raise KafkaStreamsError(f'Config file {config_file} does not exist') 284 | 285 | with open(config_file, 'r') as cf: 286 | props = javaproperties.load(cf) 287 | 288 | for k, v in props.items(): 289 | ku = k.upper().replace('.', '_') 290 | if ku not in globals().keys(): 291 | raise KafkaStreamsError(f'Unrecognised property {k} read from config file {config_file}') 292 | globals()[ku] = v 293 | 294 | log.debug('Config from "%s": %s = %s', config_file, k, v) 295 | -------------------------------------------------------------------------------- /winton_kafka_streams/kafka_streams.py: -------------------------------------------------------------------------------- 1 | """ 2 | Primary entrypoint for applications wishing to implement Python Kafka Streams 3 | 4 | """ 5 | 6 | import logging 7 | import threading 8 | from enum import Enum 9 | 10 | from .errors.kafka_streams_error import KafkaStreamsError 11 | from .kafka_client_supplier import KafkaClientSupplier 12 | from .processor import StreamThread 13 | 14 | log = logging.getLogger(__name__) 15 | 16 | 17 | class KafkaStreams: 18 | """ 19 | Encapsulates stream graph processing units 20 | 21 | """ 22 | 23 | """ 24 | Kafka Streams states are the possible state that a Kafka Streams instance can be in. 25 | An instance must only be in one state at a time. 
26 | Note this instance will be in the "Rebalancing" state if any of its threads is rebalancing. 27 | The expected state transitions between the defined states are: 28 |
 30 |                       +--------------+
 31 |               +<----- | Created      |
 32 |               |       +-----+--------+
 33 |               |             |
 34 |               |             v
 35 |               |       +-----+--------+
 36 |               +<----- | Rebalancing  | <----+
 37 |               |       +--------------+      |
 38 |               |                             |
 39 |               |                             |
 40 |               |       +--------------+      |
 41 |               +-----> | Running      | ---->+
 42 |               |       +-----+--------+
 43 |               |             |
 44 |               |             v
 45 |               |       +-----+--------+
 46 |               +-----> | Pending      |
 47 |                       | Shutdown     |
 48 |                       +-----+--------+
 49 |                             |
 50 |                             v
 51 |                       +-----+--------+
 52 |                       | Not Running  |
 53 |                       +--------------+
 54 |       
55 | """ 56 | class State(Enum): 57 | CREATED = 0 58 | RUNNING = 1 59 | REBALANCING = 2 60 | PENDING_SHUTDOWN = 3 61 | NOT_RUNNING = 4 62 | 63 | def valid_transition_to(self, new_state): 64 | if self is self.CREATED: 65 | return new_state in (self.REBALANCING, self.RUNNING, self.PENDING_SHUTDOWN) 66 | elif self is self.RUNNING: 67 | return new_state in (self.REBALANCING, self.PENDING_SHUTDOWN) 68 | elif self is self.REBALANCING: 69 | return new_state in (self.RUNNING, self.REBALANCING, self.PENDING_SHUTDOWN) 70 | elif self is self.PENDING_SHUTDOWN: 71 | return new_state in (self.NOT_RUNNING,) 72 | else: # including NOT_RUNNING 73 | return False 74 | 75 | def is_running(self): 76 | return self in (self.RUNNING, self.REBALANCING) 77 | 78 | def is_created_or_running(self): 79 | return self.is_running() or self == self.CREATED 80 | 81 | def __str__(self): 82 | return self.name 83 | 84 | def __init__(self, topology_builder, kafka_config): 85 | self.kafka_config = kafka_config 86 | 87 | self.state = self.State.CREATED 88 | self.state_lock = threading.Lock() 89 | self.thread_states = {} 90 | 91 | self.consumer = None 92 | 93 | self.stream_threads = [StreamThread(topology_builder, self.kafka_config, KafkaClientSupplier(self.kafka_config)) 94 | for _ in range(int(self.kafka_config.NUM_STREAM_THREADS))] 95 | for stream_thread in self.stream_threads: 96 | stream_thread.set_state_listener(self.on_thread_state_change) 97 | self.thread_states[stream_thread.thread_id()] = stream_thread.state 98 | 99 | def set_state(self, new_state): 100 | old_state = self.state 101 | if not old_state.valid_transition_to(new_state): 102 | log.warn(f'Unexpected state transition from {old_state} to {new_state}.') 103 | else: 104 | log.info(f'State transition from {old_state} to {new_state}.') 105 | self.state = new_state 106 | 107 | def on_thread_state_change(self, stream_thread, old_state, new_state): 108 | with self.state_lock: 109 | self.thread_states[stream_thread.thread_id()] = new_state 110 | if new_state in (StreamThread.State.ASSIGNING_PARTITIONS, StreamThread.State.PARTITIONS_REVOKED): 111 | self.set_state(self.State.REBALANCING) 112 | elif set(self.thread_states.values()) == set([StreamThread.State.RUNNING]): 113 | self.set_state(self.State.RUNNING) 114 | 115 | def start(self): 116 | log.debug('Starting Kafka Streams process') 117 | if self.state == self.State.CREATED: 118 | self.set_state(self.State.RUNNING) 119 | for stream_thread in self.stream_threads: 120 | stream_thread.start() 121 | else: 122 | raise KafkaStreamsError('KafkaStreams already started.') 123 | 124 | def close(self): 125 | if self.state.is_created_or_running(): 126 | self.set_state(self.State.PENDING_SHUTDOWN) 127 | for stream_thread in self.stream_threads: 128 | stream_thread.set_state_listener(None) 129 | stream_thread.close() 130 | self.set_state(self.State.NOT_RUNNING) 131 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Processor generating functions 3 | 4 | """ 5 | 6 | from .topology import TopologyBuilder 7 | from .processor import BaseProcessor, SourceProcessor, SinkProcessor 8 | from .processor_context import ProcessorContext 9 | 10 | from ._stream_thread import StreamThread 11 | 12 | # time extractors 13 | from .wallclock_timestamp import WallClockTimeStampExtractor 14 | from .extract_timestamp import RecordTimeStampExtractor 15 | 
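Taken together, an application wires these pieces up by populating a TopologyBuilder (exported above, defined in processor/topology.py further down), handing it to KafkaStreams together with the kafka_config module, and calling start(). The sketch below is illustrative only: the topic names 'prices-in' and 'prices-out', the file name 'config.properties' and the DoublerProcessor class are assumptions made for the example, while the builder, processor and streams calls are the ones defined in this package.

import winton_kafka_streams.kafka_config as kafka_config
from winton_kafka_streams.kafka_streams import KafkaStreams
from winton_kafka_streams.processor import BaseProcessor, TopologyBuilder


class DoublerProcessor(BaseProcessor):
    """Illustrative processor: forwards every record to its children twice."""

    def process(self, key, value):
        self.context.forward(key, value)
        self.context.forward(key, value)

    def punctuate(self, timestamp):
        pass


kafka_config.read_local_config('config.properties')   # or set attributes directly

with TopologyBuilder() as builder:
    builder.source('input', ['prices-in'])
    builder.processor('doubler', DoublerProcessor, 'input')
    builder.sink('output', 'prices-out', 'doubler')

streams = KafkaStreams(builder, kafka_config)
streams.start()
# ... run until the application decides to stop ...
streams.close()

KafkaStreams then creates NUM_STREAM_THREADS StreamThread instances, each of which subscribes to the builder's topics and drives the StreamTask processing loop shown in _stream_thread.py below.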
-------------------------------------------------------------------------------- /winton_kafka_streams/processor/_context.py: -------------------------------------------------------------------------------- 1 | """ 2 | Processor context is the link to kafka from individual processor objects 3 | 4 | """ 5 | 6 | import functools 7 | import logging 8 | from typing import Any, Callable 9 | 10 | from winton_kafka_streams.state.key_value_state_store import KeyValueStateStore 11 | from ..errors.kafka_streams_error import KafkaStreamsError 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | 16 | def _raise_if_null_record(fn: Callable[..., Any]) -> Callable[..., Any]: 17 | @functools.wraps(fn) 18 | def _inner(*args, **kwargs): 19 | if args[0].current_record is None: 20 | raise KafkaStreamsError(f"Record cannot be unset when retrieving {fn.__name__}") 21 | return fn(*args, **kwargs) 22 | return _inner 23 | 24 | 25 | class Context: 26 | """ 27 | Processor context object 28 | 29 | """ 30 | 31 | def __init__(self, _state_record_collector, _state_stores): 32 | self.current_node = None 33 | self.current_record = None 34 | self.state_record_collector = _state_record_collector 35 | self._state_stores = _state_stores 36 | 37 | def send(self, topic, key, obj): 38 | """ 39 | Send the key value-pair to a Kafka topic 40 | 41 | """ 42 | print(f"Send {obj} to {topic}") 43 | pass 44 | 45 | def schedule(self, timestamp): 46 | """ 47 | Schedule the punctuation function call 48 | 49 | """ 50 | 51 | pass 52 | 53 | @property # type: ignore # waiting on https://github.com/python/mypy/issues/1362 54 | @_raise_if_null_record 55 | def offset(self): 56 | return self.current_record.offset() 57 | 58 | @property # type: ignore 59 | @_raise_if_null_record 60 | def partition(self): 61 | return self.current_record.partition() 62 | 63 | @property # type: ignore 64 | @_raise_if_null_record 65 | def timestamp(self): 66 | return self.current_record.timestamp() 67 | 68 | @property # type: ignore 69 | @_raise_if_null_record 70 | def topic(self): 71 | return self.current_record.topic() 72 | 73 | def get_store(self, name) -> KeyValueStateStore: 74 | if not self.current_node: 75 | raise KafkaStreamsError("Access of state from unknown node") 76 | 77 | # TODO: Need to check for a global state here 78 | # This is the reason that processors access store through context 79 | 80 | if name not in self.current_node.state_stores: 81 | raise KafkaStreamsError(f"Processor {self.current_node.name} does not have access to store {name}") 82 | if name not in self._state_stores: 83 | raise KafkaStreamsError(f"Store {name} is not found") 84 | 85 | return self._state_stores[name].get_key_value_store() 86 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/_punctuation_queue.py: -------------------------------------------------------------------------------- 1 | from queue import PriorityQueue 2 | from collections import namedtuple 3 | 4 | 5 | class PunctuationSchedule(namedtuple('PunctuationSchedule', ['timestamp', 'node', 'interval'])): 6 | def __lt__(self, other): 7 | return self.timestamp < other.timestamp 8 | 9 | 10 | class PunctuationQueue: 11 | 12 | def __init__(self, punctuate): 13 | self.pq = PriorityQueue() 14 | self.punctuate = punctuate 15 | 16 | def schedule(self, node, interval): 17 | self.pq.put(PunctuationSchedule(0, node, interval)) 18 | 19 | def may_punctuate(self, timestamp): 20 | punctuated = False 21 | while not self.pq.empty(): 22 | top = self.pq.get() 23 | if 
top.timestamp <= timestamp: 24 | self.punctuate(top.node, timestamp) 25 | punctuated = True 26 | next_timestamp = top.interval + (timestamp if top.timestamp == 0 else top.timestamp) 27 | self.pq.put(PunctuationSchedule(next_timestamp, top.node, top.interval)) 28 | else: 29 | self.pq.put(top) 30 | break 31 | return punctuated 32 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/_record_collector.py: -------------------------------------------------------------------------------- 1 | """ 2 | Record collector sends produced results to kafka topic 3 | 4 | """ 5 | 6 | import logging 7 | 8 | from ..errors.kafka_streams_error import KafkaStreamsError 9 | 10 | log = logging.getLogger(__name__) 11 | 12 | # When producing a message with partition = UA rdkafka will run a partitioner for us 13 | RD_KAFKA_PARTITION_UA = -1 14 | 15 | 16 | class RecordCollector: 17 | """ 18 | Collects records to be output to Kafka topics after 19 | they have been processed by the topology 20 | 21 | """ 22 | 23 | def __init__(self, _producer, _key_serde, _value_serde): 24 | self.producer = _producer 25 | self.key_serde = _key_serde 26 | self.value_serde = _value_serde 27 | 28 | def send(self, topic, key, value, timestamp, 29 | *, partition=RD_KAFKA_PARTITION_UA, partitioner=None): 30 | ser_key = self.key_serde.serializer.serialize(topic, key) 31 | ser_value = self.value_serde.serializer.serialize(topic, value) 32 | produced = False 33 | 34 | log.debug("Sending to partition %d of topic %s : (%s, %s, %s)", partition, topic, ser_key, ser_value, timestamp) 35 | 36 | while not produced: 37 | try: 38 | self.producer.produce(topic, ser_value, ser_key, partition, self.on_delivery, partitioner, timestamp) 39 | self.producer.poll(0) # Ensure previous message's delivery reports are served 40 | produced = True 41 | except BufferError as be: 42 | log.exception(be) 43 | self.producer.poll(10) # Wait a bit longer to give buffer more time to flush 44 | except NotImplementedError as nie: 45 | log.exception(nie) 46 | produced = True # should not enter infinite loop 47 | 48 | def on_delivery(self, err, msg): 49 | """ 50 | Callback function after a value is output to a source. 51 | 52 | Will raise an exception if an error is detected. 53 | 54 | TODO: Decide if an error should be raised or if this should be demoted? 55 | Can an error be raised if a broker fails? Should we simply warn 56 | and continue to poll and retry in this case? 57 | """ 58 | 59 | # TODO: Is err correct? Should we check if msg has error? 
60 | if err: 61 | raise KafkaStreamsError(f'Error on delivery of message {msg}') 62 | 63 | def flush(self): 64 | """ 65 | Flush all pending items in the queue to the output topic on Kafka 66 | 67 | """ 68 | log.debug('Flushing producer') 69 | self.producer.flush() 70 | 71 | def close(self): 72 | log.debug('Closing producer') 73 | self.producer.close() 74 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/_stream_task.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import queue 3 | 4 | from confluent_kafka import TopicPartition 5 | from confluent_kafka.cimpl import KafkaException, KafkaError 6 | 7 | from winton_kafka_streams.processor.serialization.serdes import BytesSerde 8 | from ..errors._kafka_error_codes import _get_invalid_producer_epoch_code 9 | from ._punctuation_queue import PunctuationQueue 10 | from ._record_collector import RecordCollector 11 | from .processor_context import ProcessorContext 12 | from .wallclock_timestamp import WallClockTimeStampExtractor 13 | from ..errors.task_migrated_error import TaskMigratedError 14 | from ..processor.serialization.serdes import serde_from_string 15 | 16 | 17 | class DummyRecord: 18 | """ 19 | Dummy implementation of Record that provides the minimum needed 20 | to supply a timestamp to Context during punctuate. 21 | """ 22 | 23 | def __init__(self, timestamp): 24 | self._timestamp = timestamp 25 | 26 | def topic(self): 27 | return '__null_topic__' 28 | 29 | def partition(self): 30 | return -1 31 | 32 | def offset(self): 33 | return -1 34 | 35 | def timestamp(self): 36 | return self._timestamp 37 | 38 | 39 | 40 | 41 | _taskMigratedErrorCodes = [KafkaError.ILLEGAL_GENERATION, 42 | KafkaError.REBALANCE_IN_PROGRESS, 43 | KafkaError.UNKNOWN_MEMBER_ID, 44 | _get_invalid_producer_epoch_code()] 45 | 46 | 47 | class StreamTask: 48 | """ 49 | Stream tasks are associated with a partition group(s) 50 | and are responsible for passing values from that partition 51 | to an instance of the topology for processing. 52 | 53 | """ 54 | 55 | def __init__(self, _task_id, _application_id, _partitions, _topology_builder, _consumer, _producer, _config): 56 | self.log = logging.getLogger(__name__ + '(' + str(_task_id) + ')') 57 | self.task_id = _task_id 58 | self.application_id = _application_id 59 | self.partitions = _partitions 60 | self.topology = _topology_builder.build() 61 | self.state_stores = {name: store.get() for name, store in self.topology.state_stores.items()} 62 | self.consumer = _consumer 63 | self.producer = _producer 64 | self.config = _config 65 | 66 | self.key_serde = serde_from_string(self.config.KEY_SERDE) 67 | self.key_serde.configure(self.config, True) 68 | self.value_serde = serde_from_string(self.config.VALUE_SERDE) 69 | self.value_serde.configure(self.config, False) 70 | 71 | self.record_collector = RecordCollector(self.producer, self.key_serde, self.value_serde) 72 | self.state_record_collector = RecordCollector(self.producer, BytesSerde(), BytesSerde()) 73 | 74 | self.queue = queue.Queue() 75 | self.context = ProcessorContext(self.task_id, self, self.record_collector, 76 | self.state_record_collector, self.state_stores) 77 | 78 | self.punctuation_queue = PunctuationQueue(self.punctuate) 79 | # TODO: use the configured timestamp extractor. 
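# For now the wall-clock extractor below is used unconditionally (see the TODO above);
# RecordTimeStampExtractor in processor/extract_timestamp.py is the record-timestamp
# alternative provided by this package.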
80 | self.timestamp_extractor = WallClockTimeStampExtractor() 81 | self.current_timestamp = None 82 | 83 | self.commitRequested = False 84 | self.commitOffsetNeeded = False 85 | self.consumedOffsets = {} 86 | 87 | self._init_state_stores() 88 | self._init_topology(self.context) 89 | 90 | def _init_state_stores(self): 91 | self.log.debug(f'Initialising state stores') 92 | for store in self.state_stores.values(): 93 | store.initialize(self.context, store) 94 | 95 | def _init_topology(self, context): 96 | for node in self.topology.nodes.values(): 97 | try: 98 | context.current_node = node 99 | node.initialise(context) 100 | finally: 101 | context.current_node = None 102 | context.current_record = None 103 | 104 | def add_records(self, records): 105 | for record in records: 106 | self.queue.put(record) 107 | 108 | def process(self): 109 | if self.queue.empty(): 110 | return False 111 | 112 | record = self.queue.get() 113 | self.context.current_record = record 114 | self.current_timestamp = self.timestamp_extractor.extract(record, self.current_timestamp) 115 | 116 | topic = record.topic() 117 | raw_key = record.key() 118 | key = None if raw_key is None else self.key_serde.deserializer.deserialize(topic, record.key()) 119 | value = self.value_serde.deserializer.deserialize(topic, record.value()) 120 | 121 | self.context.current_node = self.topology.sources[topic] 122 | self.topology.sources[topic].process(key, value) 123 | 124 | self.consumedOffsets[(topic, record.partition())] = record.offset() 125 | self.commitOffsetNeeded = True 126 | 127 | self.context.current_record = None 128 | self.context.current_node = None 129 | 130 | return True 131 | 132 | def maybe_punctuate(self): 133 | timestamp = self.current_timestamp 134 | 135 | if timestamp is None: 136 | return False 137 | 138 | return self.punctuation_queue.may_punctuate(timestamp) 139 | 140 | def punctuate(self, node, timestamp): 141 | self.log.debug(f'Punctuating processor {node} at {timestamp}') 142 | self.context.current_record = DummyRecord(timestamp) 143 | self.context.current_node = node 144 | node.punctuate(timestamp) 145 | self.context.current_record = None 146 | self.context.current_node = None 147 | 148 | def commit(self): 149 | try: 150 | self.record_collector.flush() 151 | self.commit_offsets() 152 | self.commitRequested = False 153 | except Exception as e: 154 | self.log.exception(e) 155 | raise 156 | 157 | def commit_offsets(self): 158 | """ Commit consumed offsets if needed """ 159 | 160 | # may be asked to commit on rebalance or shutdown but 161 | # should only commit if the processor has requested. 
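# Kafka convention: the committed offset is the position of the next record to be
# consumed, hence the `o + 1` when building the TopicPartition list below.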
162 | try: 163 | if self.commitOffsetNeeded: 164 | offsets_to_commit = [TopicPartition(t, p, o + 1) for ((t, p), o) in self.consumedOffsets.items()] 165 | self.consumer.commit(offsets=offsets_to_commit, asynchronous=False) 166 | self.consumedOffsets.clear() 167 | self.commitOffsetNeeded = False 168 | 169 | except KafkaException as ke: 170 | kafka_error = ke.args[0].code() 171 | 172 | if kafka_error in _taskMigratedErrorCodes: 173 | raise TaskMigratedError(f'{self} migrated.') 174 | else: 175 | raise 176 | 177 | def commit_needed(self): 178 | return self.commitRequested 179 | 180 | def need_commit(self): 181 | self.commitRequested = True 182 | 183 | def schedule(self, interval): 184 | self.punctuation_queue.schedule(self.context.current_node, interval) 185 | 186 | def __repr__(self): 187 | return self.__class__.__name__ + f":{self.task_id}" 188 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/_stream_thread.py: -------------------------------------------------------------------------------- 1 | """ 2 | Kafka consumer poll thread 3 | 4 | """ 5 | 6 | import logging 7 | import threading 8 | from enum import Enum 9 | from itertools import zip_longest 10 | 11 | from confluent_kafka import KafkaError 12 | 13 | from ..errors.task_migrated_error import TaskMigratedError 14 | from .task_id import TaskId 15 | from ._stream_task import StreamTask 16 | 17 | 18 | class StreamThread: 19 | """ 20 | Stream thread states are the possible states that a stream thread can be in. 21 | A thread must only be in one state at a time 22 | The expected state transitions with the following defined states is: 23 | 24 |
 25 |                      +-------------+
 26 |                      | Not Running | <-------+
 27 |                      +-----+-------+         |
 28 |                            |                 |
 29 |                            v                 |
 30 |                      +-----+-------+         |
 31 |                +<--- | Running     | <----+  |
 32 |                |     +-----+-------+      |  |
 33 |                |           |              |  |
 34 |                |           v              |  |
 35 |                |     +-----+-------+      |  |
 36 |                +<--- | Partitions  |      |  |
 37 |                |     | Revoked     |      |  |
 38 |                |     +-----+-------+      |  |
 39 |                |           |              |  |
 40 |                |           v              |  |
 41 |                |     +-----+-------+      |  |
 42 |                |     | Assigning   |      |  |
 43 |                |     | Partitions  | ---->+  |
 44 |                |     +-----+-------+         |
 45 |                |           |                 |
 46 |                |           v                 |
 47 |                |     +-----+-------+         |
 48 |                +---> | Pending     | ------->+
 49 |                      | Shutdown    |
 50 |                      +-------------+
 51 |       
52 | """ 53 | 54 | class State(Enum): 55 | NOT_RUNNING = 0 56 | RUNNING = 1 57 | PARTITIONS_REVOKED = 2 58 | ASSIGNING_PARTITIONS = 3 59 | PENDING_SHUTDOWN = 4 60 | 61 | def valid_transition_to(self, new_state): 62 | if self is self.NOT_RUNNING: 63 | return new_state in (self.RUNNING,) 64 | elif self is self.RUNNING: 65 | return new_state in (self.PARTITIONS_REVOKED, self.PENDING_SHUTDOWN) 66 | elif self is self.PARTITIONS_REVOKED: 67 | return new_state in (self.PENDING_SHUTDOWN, self.ASSIGNING_PARTITIONS) 68 | elif self is self.ASSIGNING_PARTITIONS: 69 | return new_state in (self.RUNNING, self.PENDING_SHUTDOWN) 70 | elif self is self.PENDING_SHUTDOWN: 71 | return new_state in (self.NOT_RUNNING,) 72 | else: 73 | return False 74 | 75 | def is_running(self): 76 | return self not in (self.NOT_RUNNING, self.PENDING_SHUTDOWN) 77 | 78 | def __str__(self): 79 | return self.name 80 | 81 | def __init__(self, _topology_builder, _config, _kafka_supplier): 82 | super().__init__() 83 | self.topology_builder = _topology_builder 84 | self.config = _config 85 | self.kafka_supplier = _kafka_supplier 86 | 87 | self.tasks = [] 88 | self.tasks_by_partition = {} 89 | self.state = self.State.NOT_RUNNING 90 | 91 | self.topics = _topology_builder.topics 92 | 93 | self.thread = threading.Thread(target=self.run) 94 | self.log = logging.getLogger(__name__ + '(' + self.thread.name + ')') 95 | 96 | self.log.info('Topics for consumer are: %s', self.topics) 97 | self.consumer = self.kafka_supplier.consumer() 98 | 99 | self.state_listener = None 100 | self.set_state(self.State.RUNNING) 101 | 102 | def thread_id(self): 103 | return self.thread.ident 104 | 105 | def set_state(self, new_state): 106 | old_state = self.state 107 | if not old_state.valid_transition_to(new_state): 108 | self.log.warning(f'Unexpected state transition from {old_state} to {new_state}.') 109 | else: 110 | self.log.info(f'State transition from {old_state} to {new_state}.') 111 | self.state = new_state 112 | if self.state_listener: 113 | self.state_listener(self, old_state, new_state) 114 | 115 | def set_state_when_not_in_pending_shutdown(self, new_state): 116 | if self.state is not self.State.PENDING_SHUTDOWN: 117 | self.set_state(new_state) 118 | 119 | def set_state_listener(self, listener): 120 | """ For internal use only. """ 121 | self.state_listener = listener 122 | 123 | def still_running(self): 124 | return self.state.is_running() 125 | 126 | def start(self): 127 | self.thread.start() 128 | 129 | def run(self): 130 | self.log.debug('Running stream thread...') 131 | try: 132 | self.consumer.subscribe(self.topics, on_assign=self.on_assign, on_revoke=self.on_revoke) 133 | 134 | while self.still_running(): 135 | try: 136 | records = self.poll_requests(0.1) 137 | if records: 138 | self.log.debug(f'Processing {len(records)} record(s)') 139 | self.add_records_to_tasks(records) 140 | self.process_and_punctuate() 141 | except TaskMigratedError as error: 142 | self.log.warning(f"Detected a task that got migrated to another thread. " + 143 | "This implies that this thread missed a rebalance and dropped out of the " 144 | "consumer group. " + 145 | "Trying to rejoin the consumer group now. 
%s", error) 146 | 147 | self.log.debug('Ending stream thread...') 148 | finally: 149 | self.commit_all() 150 | self.shutdown() 151 | 152 | def poll_requests(self, poll_timeout): 153 | """ Get the next batch of records """ 154 | 155 | # The current python kafka client gives us messages one by one, 156 | # but for better throughput we want to process many records at once. 157 | # Keep polling until we get no more records out. 158 | records = [] 159 | record = self.consumer.poll(poll_timeout) 160 | while record is not None: 161 | if not record.error(): 162 | self.log.debug('Received message at offset: %d', record.offset()) 163 | records.append(record) 164 | record = self.consumer.poll(0.) 165 | elif record.error().code() == KafkaError._PARTITION_EOF: 166 | record = self.consumer.poll(0.) 167 | elif record.error(): 168 | self.log.error('Record error received: %s', record.error()) 169 | 170 | return records 171 | 172 | def add_records_to_tasks(self, records): 173 | for record in records: 174 | self.tasks_by_partition[record.partition()].add_records([record]) 175 | 176 | def process_and_punctuate(self): 177 | while True: 178 | total_processed_each_round = 0 179 | 180 | for task in self.tasks: 181 | if task.process(): 182 | total_processed_each_round += 1 183 | 184 | if total_processed_each_round == 0: 185 | break 186 | 187 | for task in self.tasks: 188 | task.maybe_punctuate() 189 | if task.commit_needed(): 190 | self.commit(task) 191 | 192 | def commit(self, task): 193 | self.log.debug('Commit task "%s"', task) 194 | task.commit() 195 | 196 | def commit_all(self): 197 | for task in self.tasks: 198 | self.commit(task) 199 | 200 | def shutdown(self): 201 | self.set_state(self.State.NOT_RUNNING) 202 | 203 | def add_stream_tasks(self, assignment): 204 | # simplistic, but good enough for now. should take co-locating topics etc. 
into account in the future 205 | grouped_tasks = {TaskId(topic_partition.topic, topic_partition.partition): {topic_partition} 206 | for topic_partition in assignment} 207 | self.tasks = [StreamTask(task_id, self.config.APPLICATION_ID, 208 | partitions, self.topology_builder, self.consumer, 209 | self.kafka_supplier.producer(), self.config) 210 | for (task_id, partitions) 211 | in grouped_tasks.items()] 212 | 213 | for task in self.tasks: 214 | self.tasks_by_partition.update( 215 | zip_longest((topic_partition.partition for topic_partition in task.partitions), [], fillvalue=task)) 216 | 217 | def on_assign(self, consumer, partitions): 218 | self.log.debug('Assigning partitions %s', partitions) 219 | 220 | self.set_state_when_not_in_pending_shutdown(self.State.ASSIGNING_PARTITIONS) 221 | self.add_stream_tasks(partitions) 222 | self.set_state_when_not_in_pending_shutdown(self.State.RUNNING) 223 | 224 | def on_revoke(self, consumer, partitions): 225 | self.log.debug('Revoking partitions %s', partitions) 226 | self.commit_all() 227 | self.set_state_when_not_in_pending_shutdown(self.State.PARTITIONS_REVOKED) 228 | self.tasks = [] 229 | self.tasks_by_partition = {} 230 | 231 | def close(self): 232 | self.log.debug('Closing stream thread and consumer') 233 | self.set_state(self.State.PENDING_SHUTDOWN) 234 | self.consumer.close() 235 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/_timestamp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for all timestamp extractors 3 | 4 | """ 5 | 6 | import abc 7 | 8 | 9 | class TimeStampExtractor(metaclass=abc.ABCMeta): 10 | @abc.abstractmethod 11 | def extract(self, record, previous_timestamp): 12 | pass 13 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/extract_timestamp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Time extractor from the message being processed 3 | 4 | """ 5 | 6 | import abc 7 | 8 | from ._timestamp import TimeStampExtractor 9 | 10 | 11 | class RecordTimeStampExtractor(TimeStampExtractor): 12 | """ 13 | Time stamp extractor that returns a time taken from the message itself 14 | 15 | This is an abstract class, the on_error function must be implemented to 16 | use this extractor. 
17 | """ 18 | 19 | def extract(self, record, previous_timestamp): 20 | """ 21 | Returns kafka timestamp for message 22 | 23 | Parameters: 24 | ----------- 25 | record : Kafka record 26 | New record from which time should be assigned 27 | previous_timestamp : long 28 | Last extracted timestamp (seconds since the epoch) 29 | 30 | Returns: 31 | -------- 32 | time : long 33 | Time in seconds since the epoch 34 | """ 35 | (timestamp_type, timestamp) = record.timestamp() 36 | if timestamp < 0: 37 | return self.on_error(record, timestamp, previous_timestamp) 38 | 39 | return timestamp 40 | 41 | @abc.abstractmethod 42 | def on_error(self, record, timestamp, previous_timestamp): 43 | """ 44 | Called when an invalid timestamp is found in a record 45 | 46 | Parameters: 47 | record : Kafka record 48 | The current record being processed 49 | timestamp : long 50 | The (invalid) timestamp that was processed 51 | previous_timestamp : long 52 | Last extracted timestamp (seconds since the epoch) 53 | """ 54 | pass 55 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base definitions for all processors 3 | 4 | """ 5 | 6 | import logging 7 | 8 | log = logging.getLogger(__name__) 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | super().__init__() 14 | 15 | self.name = None 16 | self.context = None 17 | 18 | def initialise(self, _name, _context): 19 | self.name = _name 20 | self.context = _context 21 | 22 | 23 | class SourceProcessor(BaseProcessor): 24 | """ 25 | Fetches values from a kafka topic(s)and forwards 26 | them to child node for processing 27 | 28 | """ 29 | 30 | def __init__(self, topics): 31 | super().__init__() 32 | self.topics = topics 33 | 34 | def process(self, key, value): 35 | self.context.forward(key, value) 36 | 37 | def punctuate(self, timestamp): 38 | pass 39 | 40 | 41 | class SinkProcessor(BaseProcessor): 42 | """ 43 | Forward values from processor nodes to the record collector 44 | from where they will be written to a Kafka topic 45 | 46 | """ 47 | 48 | def __init__(self, _topic): 49 | super().__init__() 50 | self.topic = _topic 51 | 52 | def process(self, key, value): 53 | self._send(key, value, self.context.timestamp) 54 | 55 | def punctuate(self, timestamp): 56 | pass 57 | 58 | def _send(self, key, value, timestamp): 59 | self.context.record_collector.send(self.topic, key, value, timestamp) 60 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/processor_context.py: -------------------------------------------------------------------------------- 1 | """ 2 | Default context passed to every processor 3 | 4 | """ 5 | import logging 6 | 7 | from . import _context 8 | 9 | log = logging.getLogger(__name__) 10 | 11 | 12 | class ProcessorContext(_context.Context): 13 | """ 14 | The same processor context is shared betwen all nodes in 15 | a single topology instance. It takes care of forwarding 16 | values to downstream processors. 
17 | 18 | """ 19 | def __init__(self, _task_id, _task, _record_collector, _state_record_collector, _state_stores): 20 | 21 | super().__init__(_state_record_collector, _state_stores) 22 | 23 | self.application_id = _task.application_id 24 | self.task_id = _task_id 25 | self.task = _task 26 | self.record_collector = _record_collector 27 | 28 | def commit(self): 29 | """ 30 | Request a commit 31 | 32 | Returns: 33 | -------- 34 | - None 35 | 36 | """ 37 | 38 | self.task.need_commit() 39 | 40 | def forward(self, key, value): 41 | """ 42 | Forward the key/value to the next node in the topology 43 | 44 | """ 45 | previous_node = self.current_node 46 | try: 47 | for child in self.current_node.children: 48 | self.current_node = child 49 | child.process(key, value) 50 | finally: 51 | self.current_node = previous_node 52 | 53 | def schedule(self, timestamp): 54 | """ 55 | Schedule the punctuation function call 56 | 57 | """ 58 | self.task.schedule(timestamp) 59 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/__init__.py: -------------------------------------------------------------------------------- 1 | from .serde import Serde 2 | from .serializer import Serializer 3 | from .deserializer import Deserializer 4 | 5 | from ._avro import AvroDeserializer, AvroSerializer 6 | from ._bytes import BytesDeserializer, BytesSerializer 7 | from ._double import DoubleDeserializer, DoubleSerializer 8 | from ._float import FloatDeserializer, FloatSerializer 9 | from ._integer import IntegerDeserializer, IntegerSerializer 10 | from ._json import JsonDeserializer, JsonSerializer 11 | from ._long import LongDeserializer, LongSerializer 12 | from ._string import StringDeserializer, StringSerializer 13 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/_avro.py: -------------------------------------------------------------------------------- 1 | from confluent_kafka.avro import CachedSchemaRegistryClient, MessageSerializer 2 | from confluent_kafka.avro import loads as avro_loads 3 | 4 | from .serde import extract_config_property 5 | from .deserializer import Deserializer 6 | from .serializer import Serializer 7 | 8 | 9 | class AvroHelper: 10 | def __init__(self): 11 | self._is_key = False 12 | self._schema_registry = None 13 | self._serializer = None 14 | self._schema = None 15 | 16 | def _set_serializer(self, schema_registry): 17 | self._schema_registry = schema_registry 18 | self._serializer = MessageSerializer(registry_client=self._schema_registry) 19 | 20 | def configure(self, configs, is_key): 21 | self._is_key = is_key 22 | schema_registry_url = extract_config_property(configs, is_key, 'AVRO_SCHEMA_REGISTRY') 23 | schema = extract_config_property(configs, is_key, 'AVRO_SCHEMA') 24 | 25 | if schema_registry_url is None: 26 | raise Exception("Missing Avro Schema Registry Url") 27 | else: 28 | self._set_serializer(CachedSchemaRegistryClient(url=schema_registry_url)) 29 | 30 | if schema: 31 | self._schema = avro_loads(schema) 32 | 33 | def serialize(self, topic, data): 34 | if self._schema is None: 35 | raise Exception("Missing Avro Schema") 36 | 37 | return self._serializer.encode_record_with_schema(topic, self._schema, data, is_key=self._is_key) 38 | 39 | def deserialize(self, topic, data): 40 | return self._serializer.decode_message(data) 41 | 42 | 43 | class AvroSerializer(Serializer): 44 | def __init__(self): 45 | self._avro_helper = AvroHelper() 46 | 47 
| def serialize(self, topic, data): 48 | return self._avro_helper.serialize(topic, data) 49 | 50 | def configure(self, configs, is_key): 51 | self._avro_helper.configure(configs, is_key) 52 | 53 | def close(self): 54 | pass 55 | 56 | 57 | class AvroDeserializer(Deserializer): 58 | def __init__(self): 59 | self._avro_helper = AvroHelper() 60 | 61 | def deserialize(self, topic, data): 62 | return self._avro_helper.deserialize(topic, data) 63 | 64 | def configure(self, configs, is_key): 65 | self._avro_helper.configure(configs, is_key) 66 | 67 | def close(self): 68 | pass 69 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/_bytes.py: -------------------------------------------------------------------------------- 1 | from .deserializer import Deserializer 2 | from .serializer import Serializer 3 | 4 | 5 | class BytesSerializer(Serializer[bytes]): 6 | def serialize(self, topic: str, data: bytes) -> bytes: 7 | return data 8 | 9 | def configure(self, configs, is_key): 10 | pass 11 | 12 | def close(self): 13 | pass 14 | 15 | 16 | class BytesDeserializer(Deserializer[bytes]): 17 | def deserialize(self, topic: str, data: bytes) -> bytes: 18 | return data 19 | 20 | def configure(self, configs, is_key): 21 | pass 22 | 23 | def close(self): 24 | pass 25 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/_double.py: -------------------------------------------------------------------------------- 1 | from .deserializer import Deserializer 2 | from .serializer import Serializer 3 | import struct 4 | 5 | 6 | class DoubleSerializer(Serializer[float]): 7 | def serialize(self, topic: str, data: float) -> bytes: 8 | return struct.pack('d', data) 9 | 10 | def configure(self, configs, is_key): 11 | pass 12 | 13 | def close(self): 14 | pass 15 | 16 | 17 | class DoubleDeserializer(Deserializer[float]): 18 | def deserialize(self, topic: str, data: bytes) -> float: 19 | return struct.unpack('d', data)[0] 20 | 21 | def configure(self, configs, is_key): 22 | pass 23 | 24 | def close(self): 25 | pass 26 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/_float.py: -------------------------------------------------------------------------------- 1 | from .deserializer import Deserializer 2 | from .serializer import Serializer 3 | import struct 4 | 5 | 6 | class FloatSerializer(Serializer[float]): 7 | def serialize(self, topic: str, data: float) -> bytes: 8 | return struct.pack('f', data) 9 | 10 | def configure(self, configs, is_key): 11 | pass 12 | 13 | def close(self): 14 | pass 15 | 16 | 17 | class FloatDeserializer(Deserializer[float]): 18 | def deserialize(self, topic: str, data: bytes) -> float: 19 | return struct.unpack('f', data)[0] 20 | 21 | def configure(self, configs, is_key): 22 | pass 23 | 24 | def close(self): 25 | pass 26 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/_integer.py: -------------------------------------------------------------------------------- 1 | from .serde import extract_config_property 2 | from .deserializer import Deserializer 3 | from .serializer import Serializer 4 | 5 | 6 | class IntegerSerializer(Serializer[int]): 7 | def __init__(self): 8 | self.byte_order = 'little' 9 | self.signed = True 10 | self.int_size = 4 11 | 12 | def serialize(self, topic: str, data: int) -> 
bytes: 13 | return int(data).to_bytes(length=self.int_size, byteorder=self.byte_order, signed=self.signed) 14 | 15 | def configure(self, configs, is_key): 16 | self.byte_order = extract_config_property(configs, is_key, 'SERIALIZER_BYTEORDER', self.byte_order) 17 | self.signed = extract_config_property(configs, is_key, 'SERIALIZER_SIGNED', str(self.signed)).lower() == 'true' 18 | 19 | def close(self): 20 | pass 21 | 22 | 23 | class IntegerDeserializer(Deserializer[int]): 24 | def __init__(self): 25 | self.byte_order = 'little' 26 | self.signed = True 27 | 28 | def deserialize(self, topic: str, data: bytes) -> int: 29 | return int.from_bytes(bytes=data, byteorder=self.byte_order, signed=self.signed) 30 | 31 | def configure(self, configs, is_key): 32 | self.byte_order = extract_config_property(configs, is_key, 'DESERIALIZER_BYTEORDER', self.byte_order) 33 | self.signed = extract_config_property(configs, is_key, 'DESERIALIZER_SIGNED', str(self.signed)).lower() == 'true' 34 | 35 | def close(self): 36 | pass 37 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/_json.py: -------------------------------------------------------------------------------- 1 | from ._string import StringSerializer, StringDeserializer 2 | from .deserializer import Deserializer 3 | from .serializer import Serializer 4 | import json 5 | 6 | 7 | class JsonSerializer(Serializer): 8 | def __init__(self): 9 | self.string_serializer = StringSerializer() 10 | 11 | def serialize(self, topic, data): 12 | string_form = json.dumps(data) 13 | return self.string_serializer.serialize(topic, string_form) 14 | 15 | def configure(self, configs, is_key): 16 | self.string_serializer.configure(configs, is_key) 17 | 18 | def close(self): 19 | pass 20 | 21 | 22 | class JsonDeserializer(Deserializer): 23 | def __init__(self): 24 | self.string_deserializer = StringDeserializer() 25 | 26 | def deserialize(self, topic, data): 27 | string_form = self.string_deserializer.deserialize(topic, data) 28 | return json.loads(string_form) 29 | 30 | def configure(self, configs, is_key): 31 | self.string_deserializer.configure(configs, is_key) 32 | 33 | def close(self): 34 | pass 35 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/_long.py: -------------------------------------------------------------------------------- 1 | from ._integer import IntegerSerializer, IntegerDeserializer 2 | 3 | 4 | class LongSerializer(IntegerSerializer): 5 | def __init__(self): 6 | super(LongSerializer, self).__init__() 7 | self.int_size = 8 8 | 9 | 10 | class LongDeserializer(IntegerDeserializer): 11 | def __init__(self): 12 | super(LongDeserializer, self).__init__() 13 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/_string.py: -------------------------------------------------------------------------------- 1 | from .serde import extract_config_property 2 | from .deserializer import Deserializer 3 | from .serializer import Serializer 4 | 5 | 6 | class StringSerializer(Serializer[str]): 7 | def __init__(self): 8 | self.encoding = 'utf-8' 9 | self.on_error = 'strict' 10 | 11 | def serialize(self, topic: str, data: str) -> bytes: 12 | return str(data).encode(self.encoding, self.on_error) 13 | 14 | def configure(self, configs, is_key): 15 | self.encoding = extract_config_property(configs, is_key, 'SERIALIZER_ENCODING', self.encoding) 16 | 
self.on_error = extract_config_property(configs, is_key, 'SERIALIZER_ERROR', self.on_error) 17 | 18 | def close(self): 19 | pass 20 | 21 | 22 | class StringDeserializer(Deserializer[str]): 23 | def __init__(self): 24 | self.encoding = 'utf-8' 25 | self.on_error = 'strict' 26 | 27 | def deserialize(self, topic: str, data: bytes) -> str: 28 | return data.decode(self.encoding, self.on_error) 29 | 30 | def configure(self, configs, is_key): 31 | self.encoding = extract_config_property(configs, is_key, 'DESERIALIZER_ENCODING', self.encoding) 32 | self.on_error = extract_config_property(configs, is_key, 'DESERIALIZER_ERROR', self.on_error) 33 | 34 | def close(self): 35 | pass 36 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/deserializer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for deserializer implementations 3 | 4 | """ 5 | 6 | import abc 7 | 8 | from typing import TypeVar, Generic 9 | 10 | T = TypeVar('T') 11 | 12 | 13 | class Deserializer(Generic[T], metaclass=abc.ABCMeta): 14 | """ 15 | Configure this deserializer. 16 | 17 | Parameters: 18 | ----------- 19 | configs : dict 20 | configs in key/value pairs 21 | is_key : bool 22 | whether is for key or value 23 | """ 24 | @abc.abstractmethod 25 | def configure(self, configs, is_key): 26 | pass 27 | 28 | """ 29 | Convert a bytes into typed data. 30 | 31 | Parameters: 32 | ----------- 33 | topic : string 34 | data : bytes 35 | 36 | Returns: 37 | -------- 38 | deserialized_data : typed data 39 | """ 40 | @abc.abstractmethod 41 | def deserialize(self, topic: str, data: bytes) -> T: 42 | pass 43 | 44 | """ 45 | Close this deserializer. 46 | This method has to be idempotent because it might be called multiple times. 
47 | """ 48 | @abc.abstractmethod 49 | def close(self): 50 | pass 51 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serde.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for deserializer implementations 3 | 4 | """ 5 | 6 | import abc 7 | 8 | from typing import TypeVar, Generic 9 | 10 | from .deserializer import Deserializer 11 | from .serializer import Serializer 12 | 13 | T = TypeVar('T') 14 | TSer = TypeVar('TSer') 15 | TDe = TypeVar('TDe') 16 | 17 | 18 | def extract_config_property(configs, is_key, property_name, default_value=None): 19 | overridden_property_name = ('KEY_%s' % property_name) if is_key else ('VALUE_%s' % property_name) 20 | prop_value = getattr(configs, overridden_property_name, None) 21 | if prop_value is None: 22 | prop_value = getattr(configs, property_name, default_value) 23 | return prop_value 24 | 25 | 26 | class AsymmetricSerde(Generic[TSer, TDe], metaclass=abc.ABCMeta): 27 | @property 28 | @abc.abstractmethod 29 | def serializer(self) -> Serializer[TSer]: 30 | pass 31 | 32 | @property 33 | @abc.abstractmethod 34 | def deserializer(self) -> Deserializer[TDe]: 35 | pass 36 | 37 | @abc.abstractmethod 38 | def configure(self, configs, is_key): 39 | pass 40 | 41 | @abc.abstractmethod 42 | def close(self): 43 | pass 44 | 45 | 46 | class Serde(AsymmetricSerde[T, T]): 47 | """ 48 | Get Serializer 49 | 50 | Returns: 51 | -------- 52 | serializer : Serializer 53 | """ 54 | 55 | @property 56 | @abc.abstractmethod 57 | def serializer(self) -> Serializer[T]: 58 | pass 59 | 60 | """ 61 | Get Deserializer 62 | 63 | Returns: 64 | -------- 65 | deserializer : Deserializer 66 | """ 67 | 68 | @property 69 | @abc.abstractmethod 70 | def deserializer(self) -> Deserializer[T]: 71 | pass 72 | 73 | """ 74 | Configure this class, which will configure the underlying serializer and deserializer. 75 | 76 | Parameters: 77 | ----------- 78 | configs : dict 79 | configs in key/value pairs 80 | is_key : bool 81 | whether is for key or value 82 | """ 83 | 84 | @abc.abstractmethod 85 | def configure(self, configs, is_key): 86 | pass 87 | 88 | """ 89 | Close this serde class, which will close the underlying serializer and deserializer. 90 | This method has to be idempotent because it might be called multiple times. 
91 | """ 92 | 93 | @abc.abstractmethod 94 | def close(self): 95 | pass 96 | 97 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/__init__.py: -------------------------------------------------------------------------------- 1 | from .bytes_serde import BytesSerde 2 | from .float_serde import FloatSerde 3 | from .double_serde import DoubleSerde 4 | from .integer_serde import IntegerSerde 5 | from .long_serde import LongSerde 6 | from .string_serde import StringSerde 7 | from .json_serde import JsonSerde 8 | from .avro_serde import AvroSerde 9 | 10 | from ._serdes import serde_from_string 11 | from ._serdes import serde_as_string 12 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/_serdes.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import inspect 3 | 4 | 5 | def serde_from_string(serde_name): 6 | module_name, class_name = serde_name.rsplit(".", 1) 7 | module = importlib.import_module(module_name) 8 | SerdeClass = getattr(module, class_name) 9 | return SerdeClass() 10 | 11 | 12 | def serde_as_string(serde): 13 | module_name = serde.__module__ 14 | class_name = serde.__name__ if inspect.isclass(serde) else serde.__class__.__name__ 15 | return module_name + "." + class_name 16 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/avro_serde.py: -------------------------------------------------------------------------------- 1 | """ 2 | Avro Serde 3 | 4 | """ 5 | from .._avro import AvroSerializer, AvroDeserializer 6 | from .wrapper_serde import WrapperSerde 7 | 8 | 9 | class AvroSerde(WrapperSerde): 10 | """ 11 | Avro Serde that will use Avro and a schema registry 12 | for serialization and deserialization 13 | """ 14 | 15 | def __init__(self): 16 | serializer = AvroSerializer() 17 | deserializer = AvroDeserializer() 18 | super().__init__(serializer, deserializer) 19 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/bytes_serde.py: -------------------------------------------------------------------------------- 1 | """ 2 | Bytes Serde (default) 3 | 4 | """ 5 | from .._bytes import BytesSerializer, BytesDeserializer 6 | from .wrapper_serde import WrapperSerde 7 | 8 | 9 | class BytesSerde(WrapperSerde[bytes]): 10 | """ 11 | Bytes Serde that makes no changes to values 12 | during serialization or deserialization 13 | """ 14 | 15 | def __init__(self): 16 | serializer = BytesSerializer() 17 | deserializer = BytesDeserializer() 18 | super().__init__(serializer, deserializer) 19 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/double_serde.py: -------------------------------------------------------------------------------- 1 | """ 2 | Float Serde 3 | 4 | """ 5 | from .._double import DoubleDeserializer, DoubleSerializer 6 | from .wrapper_serde import WrapperSerde 7 | 8 | 9 | class DoubleSerde(WrapperSerde[float]): 10 | def __init__(self): 11 | serializer = DoubleSerializer() 12 | deserializer = DoubleDeserializer() 13 | super().__init__(serializer, deserializer) 14 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/float_serde.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Float Serde 3 | 4 | """ 5 | from .._float import FloatDeserializer, FloatSerializer 6 | from .wrapper_serde import WrapperSerde 7 | 8 | 9 | class FloatSerde(WrapperSerde[float]): 10 | def __init__(self): 11 | serializer = FloatSerializer() 12 | deserializer = FloatDeserializer() 13 | super().__init__(serializer, deserializer) 14 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/integer_serde.py: -------------------------------------------------------------------------------- 1 | """ 2 | Integer Serde 3 | 4 | """ 5 | from .._integer import IntegerDeserializer, IntegerSerializer 6 | from .wrapper_serde import WrapperSerde 7 | 8 | 9 | class IntegerSerde(WrapperSerde[int]): 10 | def __init__(self): 11 | serializer = IntegerSerializer() 12 | deserializer = IntegerDeserializer() 13 | super().__init__(serializer, deserializer) 14 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/json_serde.py: -------------------------------------------------------------------------------- 1 | """ 2 | Json Serde 3 | 4 | """ 5 | from .._json import JsonSerializer, JsonDeserializer 6 | from .wrapper_serde import WrapperSerde 7 | 8 | 9 | class JsonSerde(WrapperSerde): 10 | def __init__(self): 11 | serializer = JsonSerializer() 12 | deserializer = JsonDeserializer() 13 | super().__init__(serializer, deserializer) 14 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/long_serde.py: -------------------------------------------------------------------------------- 1 | """ 2 | Long Serde 3 | 4 | """ 5 | from .._long import LongDeserializer, LongSerializer 6 | from .wrapper_serde import WrapperSerde 7 | 8 | 9 | class LongSerde(WrapperSerde[int]): 10 | def __init__(self): 11 | serializer = LongSerializer() 12 | deserializer = LongDeserializer() 13 | super().__init__(serializer, deserializer) 14 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/string_serde.py: -------------------------------------------------------------------------------- 1 | """ 2 | String Serde 3 | 4 | """ 5 | from .._string import StringSerializer, StringDeserializer 6 | from .wrapper_serde import WrapperSerde 7 | 8 | 9 | class StringSerde(WrapperSerde[str]): 10 | def __init__(self): 11 | serializer = StringSerializer() 12 | deserializer = StringDeserializer() 13 | super().__init__(serializer, deserializer) 14 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serdes/wrapper_serde.py: -------------------------------------------------------------------------------- 1 | """ 2 | Serde from a Serializer and Deserializer 3 | 4 | """ 5 | from typing import TypeVar 6 | 7 | from ..deserializer import Deserializer 8 | from ..serializer import Serializer 9 | from ..serde import AsymmetricSerde, Serde 10 | 11 | TSer = TypeVar('TSer') 12 | TDe = TypeVar('TDe') 13 | 14 | 15 | class AsymmetricWrapperSerde(AsymmetricSerde[TSer, TDe]): 16 | def __init__(self, serializer: Serializer[TSer], deserializer: Deserializer[TDe]) -> None: 17 | self._serializer = serializer 18 | self._deserializer = deserializer 19 | 20 | @property 21 | def serializer(self) -> 
Serializer[TSer]: 22 | return self._serializer 23 | 24 | @property 25 | def deserializer(self) -> Deserializer[TDe]: 26 | return self._deserializer 27 | 28 | def configure(self, configs, is_key) -> None: 29 | self.serializer.configure(configs, is_key) 30 | self.deserializer.configure(configs, is_key) 31 | 32 | def close(self) -> None: 33 | self.serializer.close() 34 | self.deserializer.close() 35 | 36 | 37 | T = TypeVar('T') 38 | 39 | 40 | class WrapperSerde(Serde[T]): 41 | def __init__(self, serializer: Serializer[T], deserializer: Deserializer[T]) -> None: 42 | self._serializer = serializer 43 | self._deserializer = deserializer 44 | 45 | @property 46 | def serializer(self) -> Serializer[T]: 47 | return self._serializer 48 | 49 | @property 50 | def deserializer(self) -> Deserializer[T]: 51 | return self._deserializer 52 | 53 | def configure(self, configs, is_key) -> None: 54 | self.serializer.configure(configs, is_key) 55 | self.deserializer.configure(configs, is_key) 56 | 57 | def close(self) -> None: 58 | self.serializer.close() 59 | self.deserializer.close() 60 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/serialization/serializer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for serializer implementations 3 | 4 | """ 5 | 6 | import abc 7 | 8 | from typing import TypeVar, Generic 9 | 10 | T = TypeVar('T') 11 | 12 | 13 | class Serializer(Generic[T], metaclass=abc.ABCMeta): 14 | """ 15 | Configure this serializer. 16 | 17 | Parameters: 18 | ----------- 19 | configs : dict 20 | configs in key/value pairs 21 | is_key : bool 22 | whether is for key or value 23 | """ 24 | @abc.abstractmethod 25 | def configure(self, configs, is_key): 26 | pass 27 | 28 | """ 29 | Convert typed data into a bytes. 30 | 31 | Parameters: 32 | ----------- 33 | topic : string 34 | data : typed data 35 | 36 | Returns: 37 | -------- 38 | serialized_bytearray : bytes 39 | """ 40 | @abc.abstractmethod 41 | def serialize(self, topic: str, data: T) -> bytes: 42 | pass 43 | 44 | """ 45 | Close this serializer. 46 | This method has to be idempotent because it might be called multiple times. 
47 | """ 48 | @abc.abstractmethod 49 | def close(self): 50 | pass 51 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/task_id.py: -------------------------------------------------------------------------------- 1 | class TaskId: 2 | def __init__(self, topic_group_id, partition): 3 | self.topic_group_id = topic_group_id 4 | self.partition = partition 5 | 6 | def __repr__(self): 7 | return f"{self.topic_group_id}_{self.partition}" 8 | 9 | def __eq__(self, other): 10 | if other.__class__ is self.__class__: 11 | return (self.topic_group_id, self.partition) == (other.topic_group_id, other.partition) 12 | return False 13 | 14 | def __ne__(self, other): 15 | return not self.__eq__(other) 16 | 17 | def __hash__(self): 18 | return hash((self.topic_group_id, self.partition)) 19 | -------------------------------------------------------------------------------- /winton_kafka_streams/processor/topology.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classes for building a graph topology comprising processor derived nodes 3 | 4 | """ 5 | 6 | import logging 7 | 8 | from ..errors.kafka_streams_error import KafkaStreamsError 9 | from .processor import SourceProcessor, SinkProcessor 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | 14 | class ProcessorNode: 15 | def __init__(self, name, processor): 16 | self.name = name 17 | self.processor = processor 18 | self.children = [] 19 | self.state_stores = set() 20 | 21 | def initialise(self, _context): 22 | self.processor.initialise(self.name, _context) 23 | 24 | def process(self, key, value): 25 | self.processor.process(key, value) 26 | 27 | def punctuate(self, timestamp): 28 | self.processor.punctuate(timestamp) 29 | 30 | def __repr__(self): 31 | return self.__class__.__name__ + f"({self.processor.__class__}({self.name}))" 32 | 33 | 34 | class Topology: 35 | """ 36 | A realised instance of a topology 37 | 38 | """ 39 | def __init__(self, sources, processors, sinks, store_suppliers): 40 | self.nodes = {} 41 | self.sources = {} 42 | sources_list = [source_builder(self) for source_builder in sources] 43 | for source_node in sources_list: 44 | for topic in source_node.processor.topics: 45 | if topic in self.sources: 46 | raise KafkaStreamsError(f'Topic {topic} associated with more than one Source Processor') 47 | self.sources[topic] = source_node 48 | 49 | self.processors = [processor_builder(self) for processor_builder in processors] 50 | self.sinks = [sink_builder(self) for sink_builder in sinks] 51 | 52 | self.state_stores = {} 53 | for store_supplier, store_processors in store_suppliers.items(): 54 | self.state_stores[store_supplier.name] = store_supplier 55 | for p in store_processors: 56 | self.nodes[p].state_stores.add(store_supplier.name) 57 | 58 | def _add_node(self, name, processor, inputs): 59 | if name in self.nodes: 60 | raise KafkaStreamsError(f"A processor with the name '{name}' already added to this topology") 61 | self.nodes[name] = processor 62 | 63 | node_inputs = list(self.nodes[i] for i in inputs) 64 | 65 | if any(n.name == name for n in node_inputs): 66 | raise KafkaStreamsError("A processor cannot have itself as an input") 67 | if any(n.name not in self.nodes for n in node_inputs): 68 | raise KafkaStreamsError("Input(s) {} to processor {} do not yet exist".format( 69 | (set(inputs) - set(n.name for n in node_inputs)), name)) 70 | 71 | for i in inputs: 72 | self.nodes[i].children.append(processor) 73 | 74 | 75 | class 
76 |     """
77 |     Convenience class for building a graph topology
78 |     """
79 |     def __init__(self):
80 |         self._sources = []
81 |         self._processors = []
82 |         self._sinks = []
83 |         self._store_suppliers = {}
84 |         self.topics = []
85 | 
86 |     def __enter__(self):
87 |         return self
88 | 
89 |     def __exit__(self, exc_type, exc_val, exc_tb):
90 |         pass
91 | 
92 |     @property
93 |     def sources(self):
94 |         return self._sources
95 | 
96 |     @property
97 |     def sinks(self):
98 |         return self._sinks
99 | 
100 |     @property
101 |     def state_stores(self):
102 |         return self._store_suppliers
103 | 
104 |     def state_store(self, store_supplier, *processors):
105 |         """
106 |         Add a store and connect it to processors
107 | 
108 |         Parameters:
109 |         -----------
110 |         store_supplier : winton_kafka_streams.state.state_store_supplier.StateStoreSupplier
111 |         *processors : names of the processors to which the store should be attached
112 | 
113 |         Raises:
114 |         KafkaStreamsError
115 |             * If store_supplier is None
116 |             * If a store with the same name already exists
117 |         """
118 |         if store_supplier is None:
119 |             raise KafkaStreamsError("store_supplier cannot be None")
120 | 
121 |         if any(store_supplier.name == s.name for s in self._store_suppliers):
122 |             raise KafkaStreamsError(f"Store with name {store_supplier.name} already exists")
123 | 
124 |         self._store_suppliers[store_supplier] = processors
125 |         return self
126 | 
127 |     def source(self, name, topics):
128 |         """
129 |         Add a source to the topology
130 | 
131 |         Parameters:
132 |         -----------
133 |         name : str
134 |             The name of the node
135 |         topics : list of str
136 |             The source topics to subscribe to
137 | 
138 |         Returns:
139 |         --------
140 |         topology : TopologyBuilder
141 | 
142 |         Raises:
143 |         KafkaStreamsError
144 |             * If a node with the same name already exists
145 |         """
146 | 
147 |         def build_source(topology):
148 |             log.debug(f'TopologyBuilder is building source {name}')
149 |             source = ProcessorNode(name, SourceProcessor(topics))
150 |             topology._add_node(name, source, [])
151 |             return source
152 | 
153 |         self.topics.extend(topics)
154 |         self._sources.append(build_source)
155 |         return self
156 | 
157 |     def processor(self, name, processor_type, *parents):
158 |         """
159 |         Add a processor to the topology
160 | 
161 |         Parameters:
162 |         -----------
163 |         name : str
164 |             The name of the node
165 |         processor_type : class
166 |             BaseProcessor subclass; instantiated to process each (key, value) pair passed
167 |         *parents:
168 |             Parent nodes supplying inputs
169 | 
170 |         Returns:
171 |         --------
172 |         topology : TopologyBuilder
173 | 
174 |         Raises:
175 |         KafkaStreamsError
176 |             * If no inputs are specified
177 |         """
178 |         if not parents:
179 |             raise KafkaStreamsError(f"Processor '{name}' must have a minimum of 1 input")
180 | 
181 |         def build_processor(topology):
182 |             log.debug(f'TopologyBuilder is building processor {name}')
183 |             processor_node = ProcessorNode(name, processor_type())
184 |             topology._add_node(name, processor_node, parents)
185 |             return processor_node
186 | 
187 |         self._processors.append(build_processor)
188 |         return self
189 | 
190 |     def sink(self, name, topic, *parents):
191 |         def build_sink(topology):
192 |             log.debug(f'TopologyBuilder is building sink {name}')
193 |             sink = ProcessorNode(name, SinkProcessor(topic))
194 |             topology._add_node(name, sink, parents)
195 |             return sink
196 |         self._sinks.append(build_sink)
197 |         return self
198 | 
199 |     def build(self):
200 |         return Topology(self._sources, self._processors, self._sinks, self._store_suppliers)
201 | 
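A minimal usage sketch of the fluent TopologyBuilder API above (not part of topology.py). It assumes that BaseProcessor is re-exported from winton_kafka_streams.processor, that instances expose self.context after initialisation, and that the context provides forward(key, value); the WordSplitter class and the topic names are hypothetical placeholders.

from winton_kafka_streams.processor import BaseProcessor  # assumed re-export, see note above
from winton_kafka_streams.processor.topology import TopologyBuilder


class WordSplitter(BaseProcessor):
    # Hypothetical processor: split each value into words and pass them downstream.
    def process(self, key, value):
        for word in value.split():
            self.context.forward(key, word)  # assumes context.forward(key, value) is available


with TopologyBuilder() as builder:
    builder. \
        source('split-input', ['words-input-topic']). \
        processor('split', WordSplitter, 'split-input'). \
        sink('split-output', 'words-output-topic', 'split')
    topology = builder.build()  # realises the graph; raises KafkaStreamsError on inconsistent wiring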
--------------------------------------------------------------------------------
/winton_kafka_streams/processor/wallclock_timestamp.py:
--------------------------------------------------------------------------------
1 | """
2 | Wall clock time extractor
3 | 
4 | """
5 | 
6 | import time
7 | 
8 | from ._timestamp import TimeStampExtractor
9 | 
10 | 
11 | class WallClockTimeStampExtractor(TimeStampExtractor):
12 |     """
13 |     Time stamp extractor that returns wall clock time at the point
14 |     a record is processed
15 |     """
16 | 
17 |     def extract(self, record, previous_timestamp):
18 |         """
19 |         Returns wall clock time for every message
20 | 
21 |         Parameters:
22 |         -----------
23 |         record : Kafka record
24 |             New record from which time should be assigned
25 |         previous_timestamp : float
26 |             Last extracted timestamp (seconds since the epoch)
27 | 
28 |         Returns:
29 |         --------
30 |         time : float
31 |             Wall clock time in seconds since the epoch, as returned by time.time()
32 |         """
33 |         return time.time()
34 | 
--------------------------------------------------------------------------------
/winton_kafka_streams/state/__init__.py:
--------------------------------------------------------------------------------
1 | from winton_kafka_streams.state.factory.store_factory import StoreFactory
2 | 
3 | 
4 | def create(name: str) -> StoreFactory:
5 |     # TODO replace this Java-esque factory with a Pythonic DSL as part of the other work on a Streams DSL
6 |     return StoreFactory(name)
7 | 
--------------------------------------------------------------------------------
/winton_kafka_streams/state/factory/__init__.py:
--------------------------------------------------------------------------------
1 | # TODO replace this Java-esque factory with a Pythonic DSL as part of the other work on a Streams DSL
2 | 
--------------------------------------------------------------------------------
/winton_kafka_streams/state/factory/base_storage_key_value_store_factory.py:
--------------------------------------------------------------------------------
1 | from typing import Generic, TypeVar
2 | 
3 | from winton_kafka_streams.processor.serialization import Serde
4 | from abc import ABC, abstractmethod
5 | 
6 | from winton_kafka_streams.state.state_store_supplier import StateStoreSupplier
7 | 
8 | KT = TypeVar('KT')  # Key type.
9 | VT = TypeVar('VT')  # Value type.
10 | 11 | 12 | class BaseStorageKeyValueStoreFactory(ABC, Generic[KT, VT]): 13 | def __init__(self, name: str, key_serde: Serde[KT], value_serde: Serde[VT]) -> None: 14 | self.name: str = name 15 | self.key_serde: Serde[KT] = key_serde 16 | self.value_serde: Serde[VT] = value_serde 17 | self.logging_enabled: bool = True 18 | 19 | def enable_logging(self, config_map): 20 | # TODO changelog extra config gets handled here 21 | self.logging_enabled = True 22 | return self 23 | 24 | def disable_logging(self): 25 | self.logging_enabled = False 26 | return self 27 | 28 | @abstractmethod 29 | def build(self) -> StateStoreSupplier[KT, VT]: 30 | pass 31 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/factory/in_memory_key_value_store_factory.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar 2 | 3 | from winton_kafka_streams.processor.serialization import Serde 4 | from winton_kafka_streams.state.factory.base_storage_key_value_store_factory import BaseStorageKeyValueStoreFactory 5 | from winton_kafka_streams.state.in_memory.in_memory_state_store_supplier import InMemoryStateStoreSupplier 6 | 7 | KT = TypeVar('KT') # Key type. 8 | VT = TypeVar('VT') # Value type. 9 | 10 | 11 | class InMemoryKeyValueStoreFactory(BaseStorageKeyValueStoreFactory[KT, VT]): 12 | def __init__(self, name: str, key_serde: Serde[KT], value_serde: Serde[VT]) -> None: 13 | super(InMemoryKeyValueStoreFactory, self).__init__(name, key_serde, value_serde) 14 | 15 | def build(self) -> InMemoryStateStoreSupplier: 16 | return InMemoryStateStoreSupplier(self.name, self.key_serde, self.value_serde, self.logging_enabled) 17 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/factory/key_value_store_factory.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar, Generic 2 | 3 | from winton_kafka_streams.processor.serialization import Serde 4 | from winton_kafka_streams.state.factory.in_memory_key_value_store_factory import InMemoryKeyValueStoreFactory 5 | 6 | KT = TypeVar('KT') # Key type. 7 | VT = TypeVar('VT') # Value type. 8 | 9 | 10 | class KeyValueStoreFactory(Generic[KT, VT]): 11 | def __init__(self, name: str, key_serde: Serde[KT], value_serde: Serde[VT]) -> None: 12 | self.name: str = name 13 | self.key_serde: Serde[KT] = key_serde 14 | self.value_serde: Serde[VT] = value_serde 15 | 16 | def in_memory(self) -> InMemoryKeyValueStoreFactory[KT, VT]: 17 | return InMemoryKeyValueStoreFactory[KT, VT](self.name, self.key_serde, self.value_serde) 18 | 19 | def persistent(self): 20 | raise NotImplementedError("Persistent State Store not implemented") 21 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/factory/store_factory.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar 2 | 3 | from winton_kafka_streams.processor.serialization import Serde 4 | from winton_kafka_streams.processor.serialization.serdes import * 5 | from winton_kafka_streams.state.factory.value_store_factory import ValueStoreFactory 6 | 7 | KT = TypeVar('KT') # Key type. 
8 | 9 | 10 | class StoreFactory: 11 | def __init__(self, name: str) -> None: 12 | self.name: str = name 13 | 14 | def _with_key_serde(self, serde: Serde[KT]) -> ValueStoreFactory[KT]: 15 | key_serde: Serde[KT] = serde 16 | configs = None # TODO 17 | is_key = True 18 | key_serde.configure(configs, is_key) 19 | return ValueStoreFactory[KT](self.name, key_serde) 20 | 21 | def with_string_keys(self) -> ValueStoreFactory[str]: 22 | return self._with_key_serde(StringSerde()) 23 | 24 | def with_integer_keys(self) -> ValueStoreFactory[int]: 25 | return self._with_key_serde(IntegerSerde()) 26 | 27 | def with_long_keys(self) -> ValueStoreFactory[int]: 28 | return self._with_key_serde(LongSerde()) 29 | 30 | def with_double_keys(self) -> ValueStoreFactory[float]: 31 | return self._with_key_serde(DoubleSerde()) 32 | 33 | def with_bytes_keys(self) -> ValueStoreFactory[bytes]: 34 | return self._with_key_serde(BytesSerde()) 35 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/factory/value_store_factory.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar, Generic 2 | 3 | from winton_kafka_streams.processor.serialization import Serde 4 | from winton_kafka_streams.processor.serialization.serdes import * 5 | from .key_value_store_factory import KeyValueStoreFactory 6 | 7 | KT = TypeVar('KT') # Key type. 8 | VT = TypeVar('VT') # Value type. 9 | 10 | 11 | class ValueStoreFactory(Generic[KT]): 12 | def __init__(self, name: str, key_serde: Serde[KT]) -> None: 13 | self.name: str = name 14 | self.key_serde: Serde[KT] = key_serde 15 | 16 | def _with_value_serde(self, serde: Serde[VT]) -> KeyValueStoreFactory[KT, VT]: 17 | value_serde: Serde[VT] = serde 18 | configs = None 19 | is_key = False 20 | value_serde.configure(configs, is_key) 21 | return KeyValueStoreFactory[KT, VT](self.name, self.key_serde, value_serde) 22 | 23 | def with_string_values(self) -> KeyValueStoreFactory[KT, str]: 24 | return self._with_value_serde(StringSerde()) 25 | 26 | def with_integer_values(self) -> KeyValueStoreFactory[KT, int]: 27 | return self._with_value_serde(IntegerSerde()) 28 | 29 | def with_long_values(self) -> KeyValueStoreFactory[KT, int]: 30 | return self._with_value_serde(LongSerde()) 31 | 32 | def with_double_values(self) -> KeyValueStoreFactory[KT, float]: 33 | return self._with_value_serde(DoubleSerde()) 34 | 35 | def with_bytes_values(self) -> KeyValueStoreFactory[KT, bytes]: 36 | return self._with_value_serde(BytesSerde()) 37 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/in_memory/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wintoncode/winton-kafka-streams/5867a1c42fc80bba07173fd1d004b2849b429fdf/winton_kafka_streams/state/in_memory/__init__.py -------------------------------------------------------------------------------- /winton_kafka_streams/state/in_memory/in_memory_state_store.py: -------------------------------------------------------------------------------- 1 | from typing import Iterator, TypeVar, MutableMapping 2 | 3 | from winton_kafka_streams.processor.serialization import Serde 4 | from ..key_value_state_store import KeyValueStateStore 5 | from ..state_store import StateStore 6 | 7 | KT = TypeVar('KT') # Key type. 8 | VT = TypeVar('VT') # Value type. 
9 | 10 | 11 | class InMemoryStateStore(StateStore[KT, VT]): 12 | def __init__(self, name: str, key_serde: Serde[KT], value_serde: Serde[VT], logging_enabled: bool) -> None: 13 | super().__init__(name, key_serde, value_serde, logging_enabled) 14 | self.dict: MutableMapping[KT, VT] = {} 15 | 16 | def initialize(self, context, root) -> None: 17 | pass 18 | 19 | def get_key_value_store(self) -> KeyValueStateStore[KT, VT]: 20 | parent = self 21 | 22 | class InMemoryKeyValueStateStore(KeyValueStateStore[KT, VT]): 23 | def __setitem__(self, k: KT, v: VT) -> None: 24 | parent.dict[k] = v 25 | 26 | def __delitem__(self, v: KT) -> None: 27 | del parent.dict[v] 28 | 29 | def __getitem__(self, k: KT) -> VT: 30 | return parent.dict[k] 31 | 32 | def __len__(self) -> int: 33 | return len(parent.dict) 34 | 35 | def __iter__(self) -> Iterator[KT]: 36 | return parent.dict.__iter__() 37 | 38 | return InMemoryKeyValueStateStore() 39 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/in_memory/in_memory_state_store_supplier.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar 2 | 3 | from winton_kafka_streams.processor.serialization import Serde 4 | from .in_memory_state_store import InMemoryStateStore 5 | from ..state_store import StateStore 6 | from ..state_store_supplier import StateStoreSupplier 7 | 8 | KT = TypeVar('KT') # Key type. 9 | VT = TypeVar('VT') # Value type. 10 | 11 | 12 | class InMemoryStateStoreSupplier(StateStoreSupplier): 13 | def __init__(self, name: str, key_serde: Serde[KT], value_serde: Serde[VT], logging_enabled: bool) -> None: 14 | super().__init__(name, key_serde, value_serde, logging_enabled) 15 | 16 | def _build_state_store(self) -> StateStore: 17 | return InMemoryStateStore(self.name, self._key_serde, self._value_serde, self.logging_enabled) 18 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/key_value_state_store.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar, Iterator, MutableMapping 2 | 3 | 4 | from abc import abstractmethod 5 | 6 | KT = TypeVar('KT') # Key type. 7 | VT = TypeVar('VT') # Value type. 
8 | 9 | 10 | class KeyValueStateStore(MutableMapping[KT, VT]): 11 | """ 12 | Dict-like class is injected into a processors to provide an interface to the underlying StateStore 13 | """ 14 | @abstractmethod 15 | def __setitem__(self, k: KT, v: VT) -> None: 16 | pass 17 | 18 | @abstractmethod 19 | def __delitem__(self, v: KT) -> None: 20 | pass 21 | 22 | @abstractmethod 23 | def __getitem__(self, k: KT) -> VT: 24 | pass 25 | 26 | @abstractmethod 27 | def __len__(self) -> int: 28 | pass 29 | 30 | @abstractmethod 31 | def __iter__(self) -> Iterator[KT]: 32 | pass 33 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/logging/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wintoncode/winton-kafka-streams/5867a1c42fc80bba07173fd1d004b2849b429fdf/winton_kafka_streams/state/logging/__init__.py -------------------------------------------------------------------------------- /winton_kafka_streams/state/logging/change_logging_state_store.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar, Iterator 2 | 3 | from winton_kafka_streams.processor.serialization import Serde 4 | from ..key_value_state_store import KeyValueStateStore 5 | from ..state_store import StateStore 6 | from .store_change_logger import StoreChangeLogger 7 | 8 | KT = TypeVar('KT') # Key type. 9 | VT = TypeVar('VT') # Value type. 10 | 11 | 12 | class ChangeLoggingStateStore(StateStore[KT, VT]): 13 | def __init__(self, name: str, key_serde: Serde[KT], value_serde: Serde[VT], logging_enabled: bool, 14 | inner_state_store: StateStore[KT, VT]) -> None: 15 | super().__init__(name, key_serde, value_serde, logging_enabled) 16 | self.inner_state_store = inner_state_store 17 | self.change_logger = None 18 | 19 | def initialize(self, context, root): 20 | self.inner_state_store.initialize(context, root) 21 | self.change_logger = StoreChangeLogger(self.inner_state_store.name, context) 22 | # TODO rebuild state into inner here 23 | 24 | def get_key_value_store(self) -> KeyValueStateStore[KT, VT]: 25 | parent = self 26 | 27 | class ChangeLoggingKeyValueStore(KeyValueStateStore[KT, VT]): 28 | # TODO : add write buffer 29 | # TODO : use topic compaction to optimise state-rebuilding 30 | 31 | def __init__(self, change_logger: StoreChangeLogger) -> None: 32 | super(ChangeLoggingKeyValueStore, self).__init__() 33 | self.change_logger: StoreChangeLogger = change_logger 34 | self.inner_kv_store: KeyValueStateStore[KT, VT] = parent.inner_state_store.get_key_value_store() 35 | 36 | def __len__(self) -> int: 37 | return len(self.inner_kv_store) 38 | 39 | def __iter__(self) -> Iterator[KT]: 40 | return self.inner_kv_store.__iter__() 41 | 42 | def __setitem__(self, key: KT, value: VT): 43 | key_bytes = parent.serialize_key(key) 44 | value_bytes = parent.serialize_value(value) 45 | self.inner_kv_store.__setitem__(key, value) 46 | self.change_logger.log_change(key_bytes, value_bytes) 47 | 48 | def __getitem__(self, key: KT) -> VT: 49 | return self.inner_kv_store.__getitem__(key) 50 | 51 | def __delitem__(self, key: KT): 52 | key_bytes = parent.serialize_key(key) 53 | self.inner_kv_store.__delitem__(key) 54 | self.change_logger.log_change(key_bytes, b'') 55 | 56 | return ChangeLoggingKeyValueStore(self.change_logger) 57 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/logging/store_change_logger.py: 
-------------------------------------------------------------------------------- 1 | class StoreChangeLogger: 2 | def __init__(self, store_name, context) -> None: 3 | self.topic = f'{context.application_id}-{store_name}-changelog' 4 | self.context = context 5 | self.partition = context.task_id.partition 6 | self.record_collector = context.state_record_collector 7 | 8 | def log_change(self, key: bytes, value: bytes) -> None: 9 | if self.record_collector: 10 | self.record_collector.send(self.topic, key, value, self.context.timestamp, partition=self.partition) 11 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/state_store.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import TypeVar, Generic 3 | 4 | from ..processor.serialization import Serde 5 | from .key_value_state_store import KeyValueStateStore 6 | 7 | KT = TypeVar('KT') # Key type. 8 | VT = TypeVar('VT') # Value type. 9 | 10 | 11 | class StateStore(ABC, Generic[KT, VT]): 12 | """ 13 | StateStores are created by Suppliers for use in StreamTasks 14 | """ 15 | def __init__(self, name: str, key_serde: Serde[KT], value_serde: Serde[VT], logging_enabled: bool) -> None: 16 | self.logging_enabled: bool = logging_enabled 17 | self._value_serde: Serde[VT] = value_serde 18 | self._key_serde: Serde[KT] = key_serde 19 | self._name: str = name 20 | 21 | @property 22 | def name(self) -> str: 23 | return self._name 24 | 25 | def serialize_key(self, key: KT) -> bytes: 26 | return self._key_serde.serializer.serialize("", key) 27 | 28 | def deserialize_key(self, data: bytes) -> KT: 29 | return self._key_serde.deserializer.deserialize("", data) 30 | 31 | def serialize_value(self, value: VT) -> bytes: 32 | return self._value_serde.serializer.serialize("", value) 33 | 34 | def deserialize_value(self, data: bytes) -> VT: 35 | return self._value_serde.deserializer.deserialize("", data) 36 | 37 | @abstractmethod 38 | def initialize(self, context, root): 39 | """ 40 | Initialize is called within a StreamTask once partitions are assigned, before processing starts. 41 | State is rebuilt from the change log at this point. 42 | :param context: 43 | :param root: 44 | :return: None 45 | """ 46 | pass 47 | 48 | @abstractmethod 49 | def get_key_value_store(self) -> KeyValueStateStore[KT, VT]: 50 | pass 51 | -------------------------------------------------------------------------------- /winton_kafka_streams/state/state_store_supplier.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | from typing import TypeVar, Generic 4 | 5 | from winton_kafka_streams.state.logging.change_logging_state_store import ChangeLoggingStateStore 6 | from .state_store import StateStore 7 | from ..processor.serialization import Serde 8 | 9 | KT = TypeVar('KT') # Key type. 10 | VT = TypeVar('VT') # Value type. 
11 | 12 | 13 | class StateStoreSupplier(ABC, Generic[KT, VT]): 14 | """ 15 | StateStoreSuppliers are added to a topology and are accessible from each StreamThread 16 | 17 | """ 18 | 19 | def __init__(self, name: str, key_serde: Serde[KT], value_serde: Serde[VT], logging_enabled: bool) -> None: 20 | self.logging_enabled: bool = logging_enabled 21 | self._value_serde: Serde[VT] = value_serde 22 | self._key_serde: Serde[KT] = key_serde 23 | self._name: str = name 24 | 25 | @property 26 | def name(self) -> str: 27 | return self._name 28 | 29 | @abstractmethod 30 | def _build_state_store(self) -> StateStore[KT, VT]: 31 | pass 32 | 33 | def get(self) -> StateStore[KT, VT]: 34 | """Create a StateStore for each StreamTask. *These StateStores may exist in different threads.*""" 35 | inner = self._build_state_store() 36 | if self.logging_enabled: 37 | return ChangeLoggingStateStore(self.name, self._key_serde, self._value_serde, self.logging_enabled, inner) 38 | else: 39 | return inner 40 | -------------------------------------------------------------------------------- /winton_kafka_streams/version.py: -------------------------------------------------------------------------------- 1 | from setuptools_scm import get_version 2 | version = get_version() 3 | --------------------------------------------------------------------------------
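To close, a short usage sketch (not part of the repository) showing how the state-store factory chain above fits together: StoreFactory, ValueStoreFactory, KeyValueStoreFactory and InMemoryKeyValueStoreFactory build an InMemoryStateStoreSupplier, which is then attached to a processor via TopologyBuilder.state_store. The store name 'counts', the processor name 'count' and the omitted builder wiring are hypothetical placeholders.

import winton_kafka_streams.state as state_stores
from winton_kafka_streams.processor.topology import TopologyBuilder

# Build a supplier for an in-memory key/value store with string keys and integer values.
# logging_enabled defaults to True in BaseStorageKeyValueStoreFactory, so
# StateStoreSupplier.get() will wrap the InMemoryStateStore in a ChangeLoggingStateStore.
count_store_supplier = state_stores.create('counts') \
    .with_string_keys() \
    .with_integer_values() \
    .in_memory() \
    .build()

builder = TopologyBuilder()
# ... a source and a processor named 'count' would be added to the builder here ...
builder.state_store(count_store_supplier, 'count')

# Once the framework has created and initialised the store for a stream task, the
# processor accesses it through the dict-like KeyValueStateStore returned by
# get_key_value_store().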