├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── benchmark-requirements.txt ├── benchmark.py ├── docker-compose.yaml ├── docs ├── DESIGN.md ├── YETANOTHER.md └── benchmark.png ├── kinesis ├── __init__.py ├── aggregators.py ├── base.py ├── checkpointers.py ├── consumer.py ├── exceptions.py ├── processors.py ├── producer.py ├── serializers.py └── utils.py ├── mypy.ini ├── requirements.txt ├── setup.cfg ├── setup.py ├── test-requirements.txt ├── tests.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | *.egg-info 3 | .idea 4 | *.pyc 5 | build 6 | notes.txt 7 | deploy.sh 8 | temp.py 9 | .env 10 | .mypy_cache 11 | .tox 12 | release.sh 13 | test_* -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-slim 2 | 3 | RUN apt-get update && apt-get install -y gcc python-dev gettext-base 4 | 5 | RUN mkdir /app 6 | 7 | COPY requirements.txt /app/requirements.txt 8 | 9 | RUN pip install -r /app/requirements.txt 10 | 11 | COPY test-requirements.txt /app/test-requirements.txt 12 | 13 | RUN pip install -r /app/test-requirements.txt 14 | 15 | COPY kinesis /app/kinesis/ 16 | 17 | COPY tests.py /app/tests.py 18 | 19 | WORKDIR /app/ 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # async-kinesis 2 | 3 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black) [![PyPI version](https://badge.fury.io/py/async-kinesis.svg)](https://badge.fury.io/py/async-kinesis) [![Python 3.7](https://img.shields.io/badge/python-3.7-blue.svg)](https://www.python.org/downloads/release/python-370/) [![Python 3.8](https://img.shields.io/badge/python-3.8-blue.svg)](https://www.python.org/downloads/release/python-380/) 4 | 5 | ``` 6 | pip install async-kinesis 7 | ``` 8 | 9 | ## Features 10 | 11 | - uses queues for both producer and consumer 12 | - producer flushes with put_records() if has enough to flush or after "buffer_time" reached 13 | - consumer iterates over msg queue independent of shard readers 14 | - Configurable to handle Sharding limits but will throttle/retry if required 15 | - ie multiple independent clients are saturating the Shards 16 | - Checkpointing with heartbeats 17 | - deadlock + reallocation of shards if checkpoint fails to heartbeat within "session_timeout" 18 | - processors (aggregator + serializer) 19 | - json line delimited, msgpack 20 | 21 | 22 | See [docs/design](./docs/DESIGN.md) for more details. 23 | See [docs/yetanother](docs/YETANOTHER.md) as to why reinvent the wheel. 24 | 25 | ## Environment Variables 26 | 27 | As required by boto3 28 | 29 | ``` 30 | AWS_ACCESS_KEY_ID 31 | AWS_SECRET_ACCESS_KEY 32 | ``` 33 | 34 | ## Producer 35 | 36 | from kinesis import Producer 37 | 38 | async with Producer(stream_name="test") as producer: 39 | # Put item onto queue to be flushed via put_records() 40 | await producer.put({'my': 'data'}) 41 | 42 | 43 | Options: 44 | 45 | (comments in quotes are Kinesis Limits as per AWS Docs) 46 | 47 | | Arg | Default | Description | 48 | | --- | --- | --- | 49 | | session | None | AioSession (to use non default profile etc) | 50 | | region_name | None | AWS Region | 51 | | buffer_time | 0.5 | Buffer time in seconds before auto flushing records | 52 | | put_rate_limit_per_shard | 1000 | "A single shard can ingest up to 1 MiB of data per second (including partition keys) or 1,000 records per second for writes" | 53 | | put_bandwidth_limit_per_shard | 1024 | Kb per sec. max is 1024 per shard (ie 1 MiB). Keep below to minimize ProvisionedThroughputExceeded" errors * | 54 | | batch_size | 500 | "Each PutRecords request can support up to 500 records" | 55 | | max_queue_size | 10000 | put() method will block when queue is at max | 56 | | after_flush_fun | None | async function to call after doing a flush (err put_records()) call | 57 | | processor | JsonProcessor() | Record aggregator/serializer. Default is JSON without aggregation. Note this is highly inefficient as each record can be up to 1Mib | 58 | | retry_limit | None | How many connection attempts should be made before raising a exception | 59 | | expo_backoff | None | Exponential Backoff when connection attempt fails | 60 | | expo_backoff_limit | 120 | Max amount of seconds Exponential Backoff can grow | 61 | | create_stream | False | Creates a Kinesis Stream based on the `stream_name` keyword argument. Note if stream already existing it will ignore | 62 | | create_stream_shards | 1 | Sets the amount of shard you want for your new stream. Note if stream already existing it will ignore | 63 | 64 | * Throughput exceeded. 
The docs (for Java/KPL see: https://docs.aws.amazon.com/streams/latest/dev/kinesis-producer-adv-retries-rate-limiting.html) state:
65 |
66 | > You can lower this limit to reduce spamming due to excessive retries. However, the best practice is for each producer to retry for maximum throughput aggressively and to handle any resulting throttling determined as excessive by expanding the capacity of the stream and implementing an appropriate partition key strategy.
67 |
68 | Even though our default here is to limit at this threshold (1024 Kb), in reality the threshold seems lower (~80%).
69 | If you wish to avoid excessive throttling, or have multiple producers on a stream, you will want to set this quite a bit lower.
70 |
71 |
72 | ## Consumer
73 |
74 |     from kinesis import Consumer
75 |
76 |     async with Consumer(stream_name="test") as consumer:
77 |         while True:
78 |             async for item in consumer:
79 |                 print(item)
80 |             # caught up.. take a breather~
81 |
82 |
83 | Options:
84 |
85 | (comments in quotes are Kinesis Limits as per AWS Docs)
86 |
87 |
88 | | Arg | Default | Description |
89 | | --- | --- | --- |
90 | | session | None | AioSession (to use a non-default profile etc) |
91 | | region_name | None | AWS Region |
92 | | max_queue_size | 10000 | the fetch() task for a shard will block when the queue is at max |
93 | | max_shard_consumers | None | Max number of shards to use. None = all |
94 | | record_limit | 10000 | Number of records to fetch with get_records() |
95 | | sleep_time_no_records | 2 | Number of seconds to sleep when caught up |
96 | | iterator_type | TRIM_HORIZON | Default shard iterator type for new/unknown shards (ie start from start of stream). Alternatives are "LATEST" (ie end of stream) and "AT_TIMESTAMP" (ie a particular point in time, requires defining the `timestamp` arg) |
97 | | shard_fetch_rate | 1 | Number of fetches per second (max = 5). 1 is recommended as it allows having multiple consumers without hitting the max limit. |
98 | | checkpointer | MemoryCheckPointer() | Checkpointer to use |
99 | | processor | JsonProcessor() | Record aggregator/serializer. Must match the processor used by Producer() |
100 | | retry_limit | None | How many connection attempts should be made before raising an exception |
101 | | expo_backoff | None | Exponential Backoff when a connection attempt fails |
102 | | expo_backoff_limit | 120 | Max number of seconds Exponential Backoff can grow to |
103 | | create_stream | False | Creates a Kinesis Stream based on the `stream_name` keyword argument. Note: if the stream already exists this is ignored |
104 | | create_stream_shards | 1 | Sets the number of shards you want for your new stream. Note: if the stream already exists this is ignored |
105 | | timestamp | None | Timestamp to start reading the stream from. Used with iterator type "AT_TIMESTAMP" |
106 |
107 |
108 | ## Checkpointers
109 |
110 | - memory (the default, but kinda pointless)
111 |
112 | ```
113 | MemoryCheckPointer()
114 | ```
115 |
116 | - redis
117 |
118 | ```
119 | RedisCheckPointer(name, session_timeout=60, heartbeat_frequency=15, is_cluster=False)
120 | ```
121 |
122 | Requires ENV:
123 |
124 | ```
125 | REDIS_HOST
126 | ```
127 |
128 | Requires `pip install aredis`
129 |
130 |
131 | ## Processors (Aggregator + Serializer)
132 |
133 |
134 | Aggregation enables batching up multiple records to make more efficient use of the stream, for example:
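A minimal end-to-end sketch is shown below. It assumes a stream named "test" already exists (or is created by passing `create_stream=True`), and that the producer and consumer share the same processor instance, here a `JsonLineProcessor`, so several JSON records are packed into each Kinesis record:

```
import asyncio

from kinesis import Consumer, JsonLineProcessor, Producer


async def main():
    # Both sides must use the same processor so records are aggregated
    # and de-aggregated consistently.
    processor = JsonLineProcessor()

    async with Producer(stream_name="test", processor=processor) as producer:
        for i in range(100):
            # put() buffers items; they are flushed in aggregated batches via put_records()
            await producer.put({"msg": i})
        await producer.flush()

    async with Consumer(stream_name="test", processor=processor) as consumer:
        # Iterates until the consumer has caught up with the stream
        async for item in consumer:
            print(item)


asyncio.run(main())
```

Each `put()` item is serialized to JSON and newline-joined with the others until the aggregate approaches the record size limit, which is what makes it much cheaper than the default (non-aggregating) `JsonProcessor` for small items.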
135 | Refer https://aws.amazon.com/blogs/big-data/implementing-efficient-and-reliable-producers-with-the-amazon-kinesis-producer-library/ 136 | 137 | 138 | | Class | Aggregator | Serializer | Description | 139 | | --- | --- | --- | --- | 140 | | StringProcessor | SimpleAggregator | StringSerializer | Single String record | 141 | | JsonProcessor | SimpleAggregator | JsonSerializer | Single JSON record | 142 | | JsonLineProcessor | NewlineAggregator | JsonSerializer | Multiple JSON record separated by new line char | 143 | | JsonListProcessor | ListAggregator | JsonSerializer | Multiple JSON record returned by list | 144 | | MsgpackProcessor | NetstringAggregator | MsgpackSerializer | Multiple Msgpack record framed with Netstring Protocol (https://en.wikipedia.org/wiki/Netstring) | 145 | | KPLJsonProcessor | KPLAggregator | JsonSerializer | Multiple JSON record in a KPL Aggregated Record (https://github.com/awslabs/amazon-kinesis-producer/blob/master/aggregation-format.md) | 146 | | KPLStringProcessor | KPLAggregator | StringSerializer | Multiple String record in a KPL Aggregated Record (https://github.com/awslabs/amazon-kinesis-producer/blob/master/aggregation-format.md) | 147 | 148 | Note you can define your own processor easily as it's simply a class inheriting the Aggregator + Serializer. 149 | 150 | ``` 151 | class MsgpackProcessor(Processor, NetstringAggregator, MsgpackSerializer): 152 | pass 153 | ``` 154 | 155 | Just define a new Serializer class with serialize() and deserialize() methods. 156 | 157 | Note: 158 | 159 | * Json will use `pip install ujson` if installed 160 | * Msgpack requires `pip install msgpack` to be installed 161 | * KPL requires `pip install aws-kinesis-agg` to be installed 162 | 163 | ## Benchmark/Example 164 | 165 | See [benchmark.py](./benchmark.py) for code 166 | 167 | 50k items of approx 1k (python) in size, using single shard. 168 | 169 | ![Benchmark](docs/benchmark.png) 170 | 171 | 172 | ## Unit Testing 173 | 174 | Uses https://github.com/mhart/kinesalite for local testing. 175 | 176 | Run tests via docker 177 | 178 | ``` 179 | docker-compose up --abort-on-container-exit --exit-code-from test 180 | ``` 181 | 182 | For local testing use 183 | 184 | ``` 185 | docker-compose up kinesis redis 186 | ``` 187 | 188 | then within your virtualenv 189 | 190 | ``` 191 | nosetests 192 | 193 | # or run individual test 194 | nosetests tests.py:KinesisTests.test_create_stream_shard_limit_exceeded 195 | ``` 196 | 197 | Note there are a few test cases using the *actual* AWS Kinesis (AWSKinesisTests) 198 | These require setting an env in order to run 199 | 200 | Create an ".env" file with 201 | 202 | ``` 203 | TESTING_USE_AWS_KINESIS=1 204 | ``` 205 | 206 | Note you can ignore these tests if submitting PR unless core batching/processing behaviour is being changed. 207 | 208 | 209 | -------------------------------------------------------------------------------- /benchmark-requirements.txt: -------------------------------------------------------------------------------- 1 | mimesis==3.2.0 2 | humanize==0.5.1 3 | terminaltables==3.1.0 4 | coloredlogs==10.0 5 | contexttimer==0.3.3 6 | -------------------------------------------------------------------------------- /benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install -r benchmark-requirements.txt 3 | 4 | Note: This will create a shard called "test" on your AWS. 5 | Your responsibility to delete it afterwards!! 
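For example, when you are done you can remove it with the AWS CLI:

    aws kinesis delete-stream --stream-name test

AWS credentials are expected via AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY, as per the README.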
6 | 7 | 8 | """ 9 | import asyncio 10 | import math 11 | import logging 12 | import coloredlogs 13 | import copy 14 | import itertools 15 | import sys 16 | import humanize 17 | from terminaltables import AsciiTable 18 | from contexttimer import Timer 19 | from kinesis import ( 20 | Producer, 21 | Consumer, 22 | JsonProcessor, 23 | JsonLineProcessor, 24 | MsgpackProcessor, 25 | ) 26 | from mimesis import Person, Address, Datetime 27 | 28 | coloredlogs.install(level="DEBUG") 29 | 30 | logging.getLogger("botocore").setLevel(logging.WARNING) 31 | logging.getLogger("kinesis.consumer").setLevel(logging.DEBUG) 32 | logging.getLogger("kinesis.checkpointers").setLevel(logging.INFO) 33 | 34 | log = logging.getLogger(__name__) 35 | 36 | 37 | def generate_random_data(): 38 | p = Person() 39 | a = Address() 40 | 41 | return { 42 | "name": p.full_name(), 43 | "email": p.email(), 44 | "nationality": p.nationality(), 45 | "occupation": p.occupation(), 46 | "password": p.password(), 47 | "phone": p.telephone(), 48 | "address": a.address(), 49 | "city": a.city(), 50 | "street_no": a.street_number(), 51 | "created": Datetime().date().isoformat(), 52 | } 53 | 54 | 55 | def generate_dataset(n): 56 | return [generate_random_data() for _ in range(n)] 57 | 58 | 59 | def copy_dataset(data, n): 60 | return list(itertools.chain.from_iterable([copy.copy(data) for _ in range(n)])) 61 | 62 | 63 | async def test_producer(data, processor): 64 | log.info("Testing with {}".format(processor.__class__.__name__)) 65 | async with Producer( 66 | stream_name="test", processor=processor, max_queue_size=100000 67 | ) as producer: 68 | 69 | await producer.create_stream(shards=1, ignore_exists=True) 70 | 71 | async with Consumer( 72 | stream_name="test", 73 | processor=processor, 74 | max_queue_size=100000, 75 | iterator_type="LATEST", 76 | ) as consumer: 77 | 78 | # ensure set up before producer puts records as using LATEST 79 | await consumer.start_consumer(wait_iterations=0) 80 | 81 | with Timer() as t: 82 | for item in data: 83 | await producer.put(item) 84 | await producer.flush() 85 | 86 | total = 0 87 | while total < len(data): 88 | async for _ in consumer: 89 | total += 1 90 | 91 | if len(data) != total: 92 | log.error( 93 | "Failed to read all records.. 
expected {} read {}".format(len(data), total) 94 | ) 95 | return False, None 96 | 97 | log.info( 98 | "Completed {} records (read: {}) in {} seconds".format( 99 | len(data), total, round(t.elapsed, 2) 100 | ) 101 | ) 102 | 103 | return True, round(t.elapsed, 2) 104 | 105 | 106 | async def test(): 107 | n = 50000 108 | 109 | data = generate_dataset(500) 110 | 111 | multiplier = math.ceil(n / 500) 112 | 113 | python_bytes = sum([sys.getsizeof(x) for x in data]) * multiplier 114 | 115 | result = [] 116 | 117 | for processor in [JsonProcessor(), JsonLineProcessor(), MsgpackProcessor()]: 118 | 119 | all_data = copy_dataset(data, multiplier) 120 | 121 | aggregator_bytes = 0 122 | for x in data: 123 | for size, _, _ in processor.add_item(x): 124 | aggregator_bytes += size 125 | 126 | if processor.has_items(): 127 | for size, _, _ in processor.get_items(): 128 | aggregator_bytes += size 129 | 130 | aggregator_bytes *= multiplier 131 | 132 | success, elapsed_ts = await test_producer(data=all_data, processor=processor) 133 | 134 | if success: 135 | result.append( 136 | [ 137 | processor.__class__.__name__, 138 | humanize.naturalsize(python_bytes), 139 | humanize.naturalsize(aggregator_bytes), 140 | elapsed_ts, 141 | round(n / elapsed_ts), 142 | humanize.naturalsize(python_bytes / elapsed_ts), 143 | humanize.naturalsize(aggregator_bytes / elapsed_ts), 144 | ] 145 | ) 146 | 147 | # Pause a bit 148 | await asyncio.sleep(2) 149 | 150 | print("\n\n Results for {} records:\n".format(n)) 151 | print( 152 | AsciiTable( 153 | [ 154 | [ 155 | "Aggregator", 156 | "Python Bytes", 157 | "Kinesis Bytes", 158 | "Time (Seconds)", 159 | "RPS", 160 | "Python BPS", 161 | "Kinesis BPS", 162 | ] 163 | ] 164 | + result 165 | ).table 166 | ) 167 | print("\n") 168 | 169 | 170 | asyncio.run(test()) 171 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.3' 2 | services: 3 | 4 | kinesis: 5 | image: vsouza/kinesis-local:latest 6 | command: --port 4567 --shardLimit 10000 7 | restart: always 8 | ports: 9 | - 4567:4567 10 | redis: 11 | image: redis:latest 12 | restart: always 13 | ports: 14 | - 16379:6379 15 | 16 | test: 17 | container_name: async-kinesis-test 18 | command: ['nosetests'] 19 | volumes: 20 | - ./tests.py:/app/tests.py 21 | - ./kinesis:/app/kinesis 22 | build: 23 | context: . 
24 | dockerfile: Dockerfile 25 | environment: 26 | - AWS_DEFAULT_REGION=ap-southeast-2 27 | - ENDPOINT_URL=http://kinesis:4567 28 | - REDIS_HOST=redis 29 | - REDIS_PORT=6379 30 | - AWS_ACCESS_KEY_ID= 31 | - AWS_SECRET_ACCESS_KEY= 32 | links: 33 | - kinesis:kinesis 34 | - redis:redis 35 | -------------------------------------------------------------------------------- /docs/DESIGN.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Consumer Design 4 | 5 | (Bears some explanation, kinda complex~) 6 | 7 | - fetch() gets called periodically (0.2 sec (ie max 5x per second as is the limit on shard get_records())) 8 | - iterate over the list of shards (set on startup, does not currently detect resharding) 9 | - assign shard if not in use and not at "max_shard_consumers" limit otherwise ignore/continue 10 | - ignore/continue if this shard is still fetching 11 | - process records if shard is done fetching 12 | - put records on queue 13 | - add checkpoint record to queue 14 | - assign NextShardIterator 15 | - create (get_records()) task again 16 | 17 | Note that get_records() is throttled via "shard_fetch_rate=5" (ie the same 0.2 sec/ 5x limit) 18 | 19 | This pattern seemed like the easiest way to maintain a pool of consumers without needing to think too hard about starting it's next job or handling new shards etc. 20 | 21 | 22 | See also 23 | 24 | https://aws.amazon.com/blogs/big-data/implementing-efficient-and-reliable-producers-with-the-amazon-kinesis-producer-library/ 25 | 26 | -------------------------------------------------------------------------------- /docs/YETANOTHER.md: -------------------------------------------------------------------------------- 1 | 2 | ## Yet another Python Kinesis Library? 3 | 4 | Sadly I had issues with every other library I could find :( 5 | 6 | * https://github.com/NerdWalletOSS/kinesis-python 7 | * pro: 8 | * kinda works 9 | * con 10 | * threaded 11 | * Outstanding PR to fix some issues 12 | * checkpoints on every record on main thread 13 | 14 | * https://github.com/ungikim/kinsumer 15 | * pro: 16 | * handles shard changes 17 | * no producer 18 | * no redis checkpointer/heartbeat 19 | * threaded/seems kinda complicated~ 20 | * con 21 | * consumer only 22 | 23 | * https://github.com/bufferapp/kiner 24 | * pro: 25 | * Batching 26 | * con 27 | * Producer only 28 | 29 | * https://github.com/niklio/aiokinesis 30 | * pro: 31 | * asyncio 32 | * no checkpointing 33 | * con 34 | * limited to 1 shard / too simplistic 35 | 36 | * https://github.com/ticketea/pynesis 37 | * pro: 38 | * checkpoints 39 | * con 40 | * hasn't been updated for 1 year 41 | * doesnt use put_records() 42 | * single threaded / round robin reads shards 43 | 44 | * https://github.com/whale2/async-kinesis-client 45 | * pro: 46 | * checkpoints 47 | * asyncio 48 | * con 49 | * ? 50 | 51 | (Actually I only found this one recently, might be ok alternative?) 
52 | -------------------------------------------------------------------------------- /docs/benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hampsterx/async-kinesis/d07efb2bcc6c4963e77b524c0eed1a250d3db401/docs/benchmark.png -------------------------------------------------------------------------------- /kinesis/__init__.py: -------------------------------------------------------------------------------- 1 | from .producer import Producer 2 | from .consumer import Consumer 3 | from .processors import ( 4 | StringProcessor, 5 | JsonProcessor, 6 | JsonLineProcessor, 7 | JsonListProcessor, 8 | MsgpackProcessor, 9 | ) 10 | from .serializers import StringSerializer, JsonSerializer, MsgpackSerializer 11 | from .checkpointers import MemoryCheckPointer, RedisCheckPointer 12 | from .aggregators import ( 13 | SimpleAggregator, 14 | NewlineAggregator, 15 | NetstringAggregator, 16 | ListAggregator, 17 | ) 18 | -------------------------------------------------------------------------------- /kinesis/aggregators.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | from collections import namedtuple 4 | from .exceptions import ValidationError 5 | from .exceptions import ExceededPutLimit 6 | 7 | try: 8 | import aws_kinesis_agg 9 | import aws_kinesis_agg.aggregator 10 | import aws_kinesis_agg.kpl_pb2 11 | except ModuleNotFoundError: 12 | pass 13 | 14 | log = logging.getLogger(__name__) 15 | 16 | OutputItem = namedtuple("OutputItem", ["size", "n", "data"]) 17 | 18 | 19 | class BaseAggregator: 20 | def __init__(self, max_size=None): 21 | 22 | if not max_size: 23 | max_size = 1024 24 | 25 | put_units = math.floor(max_size / 25) 26 | 27 | if put_units <= 0: 28 | raise ValidationError( 29 | "max_size is too low. Should be at least one PUT Payload Unit (25Kb)" 30 | ) 31 | 32 | if put_units > 40: 33 | raise ValidationError( 34 | "max_size is too high. 
Should be no higher than 40x PUT Payload Units (25Kb)" 35 | ) 36 | 37 | self.max_bytes = put_units * 25 * 1024 38 | 39 | log.debug( 40 | "setting max_bytes to {} ({} PUT Payload Units (25kb))".format( 41 | self.max_bytes, put_units 42 | ) 43 | ) 44 | 45 | self.buffer = [] 46 | self.size = 0 47 | 48 | def validate_size(self, size): 49 | if size > self.max_bytes: 50 | raise ExceededPutLimit("Put of {} bytes exceeded 1MB limit".format(size)) 51 | 52 | def parse(self, data): 53 | yield self.deserialize(data) 54 | 55 | 56 | class SimpleAggregator(BaseAggregator): 57 | """ 58 | Simple Aggregator (Does NOT aggregate) 59 | Sends a single record only (high inefficient) 60 | """ 61 | 62 | def has_items(self): 63 | return False 64 | 65 | def add_item(self, item): 66 | output = self.serialize(item) 67 | size = len(output) 68 | 69 | self.validate_size(size) 70 | 71 | yield OutputItem(size=size, n=1, data=output) 72 | 73 | 74 | class Aggregator(BaseAggregator): 75 | """ 76 | Aggregator 77 | Sends an aggregated record 78 | """ 79 | 80 | def has_items(self): 81 | return self.size > 0 82 | 83 | def get_header_size(self, data): 84 | raise NotImplementedError() 85 | 86 | def add_item(self, item): 87 | output = self.serialize(item) 88 | size = len(output) 89 | 90 | self.validate_size(size) 91 | 92 | header_size = self.get_header_size(output) 93 | 94 | if size + self.size + header_size < self.max_bytes: 95 | 96 | self.buffer.append((size, output)) 97 | self.size += size + header_size 98 | 99 | else: 100 | log.debug( 101 | "Yielding item to queue with {} individual records with size of {} kb".format( 102 | len(self.buffer), round(self.size / 1024) 103 | ) 104 | ) 105 | yield OutputItem(size=self.size, n=len(self.buffer), data=self.output()) 106 | self.buffer = [(size, output)] 107 | self.size = size 108 | 109 | log.debug("Adding item to queue with size of {} kb".format(round(size / 1024))) 110 | 111 | def get_items(self): 112 | log.debug( 113 | "Yielding (final) item to queue with {} individual records with size of {} kb".format( 114 | len(self.buffer), round(self.size / 1024) 115 | ) 116 | ) 117 | yield OutputItem(size=self.size, n=len(self.buffer), data=self.output()) 118 | self.buffer = [] 119 | self.size = 0 120 | 121 | 122 | class NewlineAggregator(Aggregator): 123 | def get_header_size(self, output): 124 | return 1 125 | 126 | def output(self): 127 | return b"\n".join([x[1] for x in self.buffer] + [b""]) 128 | 129 | def parse(self, data): 130 | for row in data.split(b"\n"): 131 | if row: 132 | yield self.deserialize(row) 133 | 134 | 135 | class ListAggregator(Aggregator): 136 | def get_header_size(self, output): 137 | return 1 138 | 139 | def output(self): 140 | return self.serialize([self.deserialize(x[1]) for x in self.buffer]) 141 | 142 | def parse(self, data): 143 | yield self.deserialize(data) 144 | 145 | 146 | class NetstringAggregator(Aggregator): 147 | """ 148 | Netstring Aggregation 149 | Framing = {x} bytes (ascii int for size) + 1 byte (":") + data + trailing "," 150 | See: https://en.wikipedia.org/wiki/Netstring 151 | """ 152 | 153 | def get_header_size(self, output): 154 | return len(str(len(output))) + 2 155 | 156 | def output(self): 157 | frame = [] 158 | 159 | for size, data in self.buffer: 160 | frame.append(str(size).encode("ascii")) 161 | frame.append(b":") 162 | frame.append(data) 163 | frame.append(b",") 164 | 165 | return b"".join(frame) 166 | 167 | def parse(self, data): 168 | 169 | i = 0 170 | length = len(data) 171 | 172 | while True: 173 | header_offset = data[i:].index(b":") 174 
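            # e.g. for data == b"5:hello,": header_offset is 1 (the position of b":"),
            # size becomes 5 and item is data[2:7] == b"hello"; i then advances by
            # header_offset + size + 2 (skipping the b":" and trailing b",") to 8 == len(data).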
| size = int(data[i : i + header_offset].decode("ascii")) 175 | item = data[i + header_offset + 1 : i + header_offset + 1 + size] 176 | yield self.deserialize(item) 177 | 178 | i += header_offset + size + 2 179 | if i == length: 180 | break 181 | 182 | 183 | class KPLAggregator(Aggregator): 184 | """ 185 | KPL Aggregated Record Aggregation 186 | See: https://github.com/awslabs/kinesis-aggregation/tree/master/python 187 | """ 188 | 189 | def __init__(self, max_size=None): 190 | if max_size: 191 | self.agg = aws_kinesis_agg.aggregator.RecordAggregator(max_size=max_size) 192 | else: 193 | self.agg = aws_kinesis_agg.aggregator.RecordAggregator() 194 | 195 | def has_items(self): 196 | return self.agg.get_num_user_records() > 0 197 | 198 | def add_item(self, item): 199 | output = self.serialize(item) 200 | record = self.agg.add_user_record("a", output) 201 | self.size = self.agg.get_num_user_records() 202 | if record: 203 | size = record.get_size_bytes() 204 | n = record.get_num_user_records() 205 | partition_key, explicit_hash_key, data = record.get_contents() 206 | yield OutputItem(size=size, n=n, data=data) 207 | 208 | def get_items(self): 209 | record = self.agg.clear_and_get() 210 | if record: 211 | size = record.get_size_bytes() 212 | n = record.get_num_user_records() 213 | partition_key, explicit_hash_key, data = record.get_contents() 214 | yield OutputItem(size=size, n=n, data=data) 215 | 216 | def parse(self, data): 217 | message_data = data[len(aws_kinesis_agg.MAGIC) : -aws_kinesis_agg.DIGEST_SIZE] 218 | ar = aws_kinesis_agg.kpl_pb2.AggregatedRecord() 219 | ar.ParseFromString(message_data) 220 | for record in ar.records: 221 | yield self.deserialize(record.data) 222 | -------------------------------------------------------------------------------- /kinesis/base.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from async_timeout import timeout 4 | from aiobotocore.session import AioSession 5 | from asyncio import CancelledError 6 | from botocore.exceptions import ClientError 7 | from botocore.config import Config 8 | import time 9 | 10 | from . 
import exceptions 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | 15 | class Base: 16 | def __init__( 17 | self, 18 | stream_name, 19 | session=None, 20 | endpoint_url=None, 21 | region_name=None, 22 | retry_limit=None, 23 | expo_backoff=None, 24 | expo_backoff_limit=120, 25 | skip_describe_stream=False, 26 | create_stream=False, 27 | create_stream_shards=1, 28 | ): 29 | 30 | self.stream_name = stream_name 31 | 32 | if session: 33 | assert isinstance(session, AioSession) 34 | self.session = session 35 | else: 36 | self.session = AioSession() 37 | 38 | self.endpoint_url = endpoint_url 39 | self.region_name = region_name 40 | 41 | self.client = None 42 | self.shards = None 43 | 44 | self.stream_status = None 45 | 46 | self.retry_limit = retry_limit 47 | self.expo_backoff = expo_backoff 48 | self.expo_backoff_limit = expo_backoff_limit 49 | 50 | # connection states of kinesis client 51 | self.RECONNECT = "RECONNECT" 52 | self.ACTIVE = "ACTIVE" 53 | self.INITIALIZE = "INITIALIZE" 54 | 55 | self.stream_status = self.INITIALIZE 56 | # Short Lived producer might want to skip describing stream on startup 57 | self.skip_describe_stream = skip_describe_stream 58 | self._conn_lock = asyncio.Lock() 59 | self._reconnect_timeout = time.monotonic() 60 | self.create_stream = create_stream 61 | self.create_stream_shards = create_stream_shards 62 | 63 | async def __aenter__(self): 64 | 65 | log.info( 66 | "creating client with {}".format( 67 | self.endpoint_url if self.endpoint_url else "AWS default endpoint" 68 | ) 69 | ) 70 | 71 | try: 72 | await self.get_conn() 73 | except exceptions.StreamDoesNotExist: 74 | await self.close() 75 | raise 76 | except: 77 | raise 78 | 79 | return self 80 | 81 | async def __aexit__(self, exc_type, exc, tb): 82 | await self.close() 83 | await self.client.__aexit__(exc_type, exc, tb) 84 | 85 | async def get_client(self): 86 | 87 | # Note: max_attempts = 0 88 | # Boto RetryHandler only handles these errors: 89 | # GENERAL_CONNECTION_ERROR => ConnectionError, ConnectionClosedError, ReadTimeoutError, EndpointConnectionError 90 | # Still have to handle ClientError anyway~ 91 | 92 | self.client = await self.session.create_client( 93 | "kinesis", 94 | endpoint_url=self.endpoint_url, 95 | region_name=self.region_name, 96 | config=Config( 97 | connect_timeout=5, read_timeout=90, retries={"max_attempts": 0} 98 | ), 99 | ).__aenter__() 100 | 101 | async def get_stream_description(self): 102 | 103 | try: 104 | return (await self.client.describe_stream(StreamName=self.stream_name))[ 105 | "StreamDescription" 106 | ] 107 | except ClientError as err: 108 | code = err.response["Error"]["Code"] 109 | if code == "ResourceNotFoundException": 110 | raise exceptions.StreamDoesNotExist( 111 | "Stream '{}' does not exist".format(self.stream_name) 112 | ) from None 113 | raise 114 | 115 | async def start(self): 116 | 117 | await self.get_client() 118 | 119 | if self.create_stream: 120 | await self._create_stream() 121 | self.create_stream = False 122 | 123 | if self.skip_describe_stream: 124 | log.debug( 125 | "Skipping Describe stream '{}'. 
Assuming it exists..".format( 126 | self.stream_name 127 | ) 128 | ) 129 | self.shards = [] 130 | 131 | log.debug("Checking stream '{}' is active".format(self.stream_name)) 132 | 133 | async with timeout(60) as cm: 134 | try: 135 | while True: 136 | stream_info = await self.get_stream_description() 137 | stream_status = stream_info["StreamStatus"] 138 | 139 | if stream_status == self.ACTIVE: 140 | self.stream_status = stream_status 141 | break 142 | 143 | if stream_status in ["CREATING", "UPDATING"]: 144 | await asyncio.sleep(0.25) 145 | 146 | else: 147 | raise exceptions.StreamStatusInvalid( 148 | "Stream '{}' is {}".format(self.stream_name, stream_status) 149 | ) 150 | except CancelledError: 151 | pass 152 | 153 | else: 154 | self.shards = stream_info["Shards"] 155 | 156 | if cm.expired: 157 | raise exceptions.StreamStatusInvalid( 158 | "Stream '{}' is still {}".format(self.stream_name, stream_status) 159 | ) 160 | 161 | async def close(self): 162 | raise NotImplementedError 163 | 164 | async def get_conn(self): 165 | 166 | async with self._conn_lock: 167 | 168 | log.debug( 169 | f"Get Connection (stream name: {self.stream_name}), stream status: {self.stream_status})" 170 | ) 171 | 172 | if self.stream_status == self.INITIALIZE: 173 | try: 174 | await self.start() 175 | log.info(f"Connection Successfully Initialized") 176 | except exceptions.StreamDoesNotExist: 177 | # Do not attempt to reconnect if stream does not exist 178 | log.error(f"Stream does not exist ({self.stream_name})") 179 | raise 180 | except Exception as e: 181 | log.warning(f"Connection Failed to Initialize : {e.__class__} {e}") 182 | await self._get_reconn_helper() 183 | elif ( 184 | self.stream_status == self.ACTIVE 185 | and (time.monotonic() - self._reconnect_timeout) > 120 186 | ): 187 | # reconnect_timeout is a Lock so a new connection is not created immediately 188 | # after a successfully reconnection has been made since self.start() sets self.stream_status = "ACTIVE" 189 | # immediately after a successful reconnect. 190 | # Based on testing a hardcode 120 seconds backoff is best since, there could be a lot of pending 191 | # coroutines reattempting the connection when the client connection it's already healthy. 192 | await self._get_reconn_helper() 193 | 194 | async def _get_reconn_helper(self): 195 | # Logic used to reconnect to connect to kinesis if there is a error 196 | 197 | self.stream_status = self.RECONNECT 198 | backoff_delay = 5 199 | conn_attempts = 1 200 | await self.close() 201 | while True: 202 | self._reconnect_timeout = time.monotonic() 203 | try: 204 | log.warning( 205 | f"Connection Error. Rebuilding connection. Sleeping for {backoff_delay} seconds. 
Reconnection Attempt: {conn_attempts}" 206 | ) 207 | await asyncio.sleep(backoff_delay) 208 | await self.start() 209 | log.warning( 210 | f"Connection Reestablished After {conn_attempts} and Sleeping for {backoff_delay}" 211 | ) 212 | break 213 | except Exception as e: 214 | if isinstance(e, exceptions.StreamDoesNotExist): 215 | raise e 216 | log.warning(e) 217 | conn_attempts += 1 218 | if isinstance(self.retry_limit, int): 219 | if conn_attempts >= (self.retry_limit + 1): 220 | await self.close() 221 | raise ConnectionError( 222 | f"Kinesis client has exceeded {self.retry_limit} connection attempts" 223 | ) 224 | if self.expo_backoff: 225 | backoff_delay = (conn_attempts ** 2) * self.expo_backoff 226 | if backoff_delay >= self.expo_backoff_limit: 227 | backoff_delay = self.expo_backoff_limit 228 | await self.close() 229 | 230 | async def _create_stream(self, ignore_exists=True): 231 | 232 | log.debug( 233 | "Creating (or ignoring) stream {} with {} shards".format( 234 | self.stream_name, self.create_stream_shards 235 | ) 236 | ) 237 | 238 | if self.create_stream_shards < 1: 239 | raise Exception("Min shard count is one") 240 | 241 | try: 242 | await self.client.create_stream( 243 | StreamName=self.stream_name, ShardCount=self.create_stream_shards 244 | ) 245 | except ClientError as err: 246 | code = err.response["Error"]["Code"] 247 | 248 | if code == "ResourceInUseException": 249 | if not ignore_exists: 250 | raise exceptions.StreamExists( 251 | "Stream '{}' exists, cannot create it".format(self.stream_name) 252 | ) from None 253 | elif code == "LimitExceededException": 254 | raise exceptions.StreamShardLimit( 255 | "Stream '{}' exceeded shard limit".format(self.stream_name) 256 | ) 257 | else: 258 | raise 259 | -------------------------------------------------------------------------------- /kinesis/checkpointers.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import asyncio 3 | import os 4 | import json 5 | from datetime import timezone, datetime 6 | 7 | log = logging.getLogger(__name__) 8 | 9 | 10 | class BaseCheckPointer: 11 | def __init__(self, name="", id=None): 12 | self._id = id if id else os.getpid() 13 | self._name = name 14 | self._items = {} 15 | 16 | def get_id(self): 17 | return self._id 18 | 19 | def get_ref(self): 20 | return "{}/{}".format(self._name, self._id) 21 | 22 | def get_all_checkpoints(self): 23 | return self._items.copy() 24 | 25 | def get_checkpoint(self, shard_id): 26 | return self._items.get(shard_id) 27 | 28 | async def close(self): 29 | log.info("{} stopping..".format(self.get_ref())) 30 | await asyncio.gather( 31 | *[self.deallocate(shard_id) for shard_id in self._items.keys()] 32 | ) 33 | 34 | def is_allocated(self, shard_id): 35 | return shard_id in self._items 36 | 37 | 38 | class BaseHeartbeatCheckPointer(BaseCheckPointer): 39 | def __init__( 40 | self, 41 | name, 42 | id=None, 43 | session_timeout=60, 44 | heartbeat_frequency=15, 45 | auto_checkpoint=True, 46 | ): 47 | super().__init__(name=name, id=id) 48 | 49 | self.session_timeout = session_timeout 50 | self.heartbeat_frequency = heartbeat_frequency 51 | self.auto_checkpoint = auto_checkpoint 52 | self._manual_checkpoints = {} 53 | 54 | self.heartbeat_task = asyncio.Task(self.heartbeat()) 55 | 56 | async def close(self): 57 | log.debug("Cancelling heartbeat task..") 58 | self.heartbeat_task.cancel() 59 | 60 | await super().close() 61 | 62 | async def heartbeat(self): 63 | while True: 64 | await asyncio.sleep(self.heartbeat_frequency) 65 
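            # Refresh {ref, ts, sequence} for every allocated shard; allocate() on another
            # consumer only takes over a shard once the stored ts is older than session_timeout.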
| 66 | # todo: don't heartbeat if checkpoint already updated it recently 67 | for shard_id, sequence in self._items.items(): 68 | key = self.get_key(shard_id) 69 | val = {"ref": self.get_ref(), "ts": self.get_ts(), "sequence": sequence} 70 | log.debug("Heartbeating {}@{}".format(shard_id, sequence)) 71 | await self.do_heartbeat(key, val) 72 | 73 | 74 | class MemoryCheckPointer(BaseCheckPointer): 75 | async def deallocate(self, shard_id): 76 | log.info( 77 | "{} deallocated on {}@{}".format( 78 | self.get_ref(), shard_id, self._items[shard_id] 79 | ) 80 | ) 81 | self._items[shard_id]["active"] = False 82 | 83 | def is_allocated(self, shard_id): 84 | return shard_id in self._items and self._items[shard_id]["active"] 85 | 86 | async def allocate(self, shard_id): 87 | if shard_id not in self._items: 88 | self._items[shard_id] = {"sequence": None} 89 | 90 | self._items[shard_id]["active"] = True 91 | 92 | return True, self._items[shard_id]["sequence"] 93 | 94 | async def checkpoint(self, shard_id, sequence): 95 | log.debug( 96 | "{} checkpointed on {} @ {}".format(self.get_ref(), shard_id, sequence) 97 | ) 98 | self._items[shard_id]["sequence"] = sequence 99 | 100 | 101 | class RedisCheckPointer(BaseHeartbeatCheckPointer): 102 | def __init__( 103 | self, 104 | name, 105 | id=None, 106 | session_timeout=60, 107 | heartbeat_frequency=15, 108 | is_cluster=False, 109 | auto_checkpoint=True, 110 | ): 111 | super().__init__( 112 | name=name, 113 | id=id, 114 | session_timeout=session_timeout, 115 | heartbeat_frequency=heartbeat_frequency, 116 | auto_checkpoint=auto_checkpoint, 117 | ) 118 | 119 | if is_cluster: 120 | from aredis import StrictRedisCluster as Redis 121 | else: 122 | from aredis import StrictRedis as Redis 123 | 124 | params = { 125 | "host": os.environ.get("REDIS_HOST", "localhost"), 126 | "port": int(os.environ.get("REDIS_PORT", "6379")), 127 | "password": os.environ.get("REDIS_PASSWORD"), 128 | } 129 | 130 | if not is_cluster: 131 | db = int(os.environ.get("REDIS_DB", 0)) 132 | if db > 0: 133 | params["db"] = db 134 | else: 135 | params["skip_full_coverage_check"] = True 136 | 137 | self.client = Redis(**params) 138 | 139 | async def do_heartbeat(self, key, value): 140 | await self.client.set(key, json.dumps(value)) 141 | 142 | def get_key(self, shard_id): 143 | return "pyredis-{}-{}".format(self._name, shard_id) 144 | 145 | def get_ts(self): 146 | return round(int(datetime.now(tz=timezone.utc).timestamp())) 147 | 148 | async def checkpoint(self, shard_id, sequence): 149 | 150 | if not self.auto_checkpoint: 151 | log.debug( 152 | "{} updated manual checkpoint {}@{}".format( 153 | self.get_ref(), shard_id, sequence 154 | ) 155 | ) 156 | self._manual_checkpoints[shard_id] = sequence 157 | return 158 | 159 | await self._checkpoint(shard_id, sequence) 160 | 161 | async def manual_checkpoint(self): 162 | items = [(k, v) for k, v in self._manual_checkpoints.items()] 163 | 164 | self._manual_checkpoints = {} 165 | 166 | for shard_id, sequence in items: 167 | await self._checkpoint(shard_id, sequence) 168 | 169 | async def _checkpoint(self, shard_id, sequence): 170 | 171 | key = self.get_key(shard_id) 172 | 173 | val = {"ref": self.get_ref(), "ts": self.get_ts(), "sequence": sequence} 174 | 175 | previous_val = await self.client.getset(key, json.dumps(val)) 176 | previous_val = json.loads(previous_val) if previous_val else None 177 | 178 | if not previous_val: 179 | raise NotImplementedError( 180 | "{} checkpointed on {} but key did not exist?".format( 181 | self.get_ref(), shard_id 182 | ) 
183 | ) 184 | 185 | if previous_val["ref"] != self.get_ref(): 186 | raise NotImplementedError( 187 | "{} checkpointed on {} but ref is different {}".format( 188 | self.get_ref(), shard_id, val["ref"] 189 | ) 190 | ) 191 | 192 | log.debug("{} checkpointed on {}@{}".format(self.get_ref(), shard_id, sequence)) 193 | self._items[shard_id] = sequence 194 | 195 | async def deallocate(self, shard_id): 196 | 197 | key = self.get_key(shard_id) 198 | 199 | val = {"ref": None, "ts": None, "sequence": self._items[shard_id]} 200 | 201 | await self.client.set(key, json.dumps(val)) 202 | 203 | log.info( 204 | "{} deallocated on {}@{}".format( 205 | self.get_ref(), shard_id, self._items[shard_id] 206 | ) 207 | ) 208 | 209 | self._items.pop(shard_id) 210 | 211 | async def allocate(self, shard_id): 212 | 213 | key = self.get_key(shard_id) 214 | 215 | ts = self.get_ts() 216 | 217 | # try to set lock 218 | success = await self.client.set( 219 | key, 220 | json.dumps({"ref": self.get_ref(), "ts": ts, "sequence": None}), 221 | nx=True, 222 | ) 223 | 224 | val = await self.client.get(key) 225 | val = json.loads(val) if val else None 226 | 227 | original_ts = val["ts"] 228 | 229 | if success: 230 | log.info( 231 | "{} allocated {} (new checkpoint)".format(self.get_ref(), shard_id) 232 | ) 233 | self._items[shard_id] = None 234 | return True, None 235 | 236 | if val["ts"]: 237 | 238 | log.info( 239 | "{} could not allocate {}, still in use by {}".format( 240 | self.get_ref(), shard_id, val["ref"] 241 | ) 242 | ) 243 | 244 | # Wait a bit before carrying on to avoid spamming ourselves 245 | await asyncio.sleep(1) 246 | 247 | age = ts - original_ts 248 | 249 | # still alive? 250 | if age < self.session_timeout: 251 | return False, None 252 | 253 | log.info( 254 | "Attempting to take lock as {} is {} seconds over due..".format( 255 | val["ref"], age - self.session_timeout 256 | ) 257 | ) 258 | 259 | val["ref"] = self.get_ref() 260 | val["ts"] = ts 261 | 262 | previous_val = await self.client.getset(key, json.dumps(val)) 263 | previous_val = json.loads(previous_val) if previous_val else None 264 | 265 | if previous_val["ts"] != original_ts: 266 | log.info("{} beat me to the lock..".format(previous_val["ref"])) 267 | return False, None 268 | 269 | log.info( 270 | "{} allocating {}@{}".format(self.get_ref(), shard_id, val["sequence"]) 271 | ) 272 | 273 | self._items[shard_id] = val["sequence"] 274 | 275 | return True, val["sequence"] 276 | -------------------------------------------------------------------------------- /kinesis/consumer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from datetime import datetime, timezone 4 | from aiohttp import ClientConnectionError 5 | from asyncio import TimeoutError 6 | from asyncio.queues import QueueEmpty 7 | from botocore.exceptions import ClientError 8 | from .utils import Throttler 9 | from .base import Base 10 | from .checkpointers import MemoryCheckPointer 11 | from .processors import JsonProcessor 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | 16 | class ShardStats: 17 | def __init__(self): 18 | self._throttled = 0 19 | self._success = 0 20 | 21 | def succeded(self): 22 | self._success += 1 23 | 24 | def throttled(self): 25 | self._throttled += 1 26 | 27 | def to_data(self): 28 | return {"throttled": self._throttled, "success": self._success} 29 | 30 | 31 | class Consumer(Base): 32 | def __init__( 33 | self, 34 | stream_name, 35 | session=None, 36 | endpoint_url=None, 37 | region_name=None, 
38 | max_queue_size=10000, 39 | max_shard_consumers=None, 40 | record_limit=10000, 41 | sleep_time_no_records=2, 42 | iterator_type="TRIM_HORIZON", 43 | shard_fetch_rate=1, 44 | checkpointer=None, 45 | processor=None, 46 | retry_limit=None, 47 | expo_backoff=None, 48 | expo_backoff_limit=120, 49 | skip_describe_stream=False, 50 | create_stream=False, 51 | create_stream_shards=1, 52 | timestamp=None, 53 | ): 54 | 55 | super(Consumer, self).__init__( 56 | stream_name, 57 | session=session, 58 | endpoint_url=endpoint_url, 59 | region_name=region_name, 60 | retry_limit=retry_limit, 61 | expo_backoff=expo_backoff, 62 | expo_backoff_limit=expo_backoff_limit, 63 | skip_describe_stream=skip_describe_stream, 64 | create_stream=create_stream, 65 | create_stream_shards=create_stream_shards, 66 | ) 67 | 68 | self.queue = asyncio.Queue(maxsize=max_queue_size) 69 | 70 | self.sleep_time_no_records = sleep_time_no_records 71 | 72 | self.max_shard_consumers = max_shard_consumers 73 | 74 | self.record_limit = record_limit 75 | 76 | self.is_fetching = True 77 | 78 | self.checkpointer = checkpointer if checkpointer else MemoryCheckPointer() 79 | 80 | self.processor = processor if processor else JsonProcessor() 81 | 82 | self.iterator_type = iterator_type 83 | 84 | self.fetch_task = None 85 | 86 | self.shard_fetch_rate = shard_fetch_rate 87 | 88 | self.timestamp = timestamp 89 | 90 | def __aiter__(self): 91 | return self 92 | 93 | async def close(self): 94 | log.debug("Closing Connection..") 95 | if not self.stream_status == self.RECONNECT: 96 | 97 | await self.flush() 98 | 99 | if self.fetch_task: 100 | self.fetch_task.cancel() 101 | self.fetch_task = None 102 | 103 | if self.checkpointer: 104 | await self.checkpointer.close() 105 | await self.client.close() 106 | 107 | async def flush(self): 108 | 109 | self.is_fetching = False 110 | 111 | if not self.shards: 112 | return 113 | 114 | # Wait for shard fetches to finish 115 | # todo: use gather 116 | for shard in self.shards: 117 | if shard.get("fetch"): 118 | if not shard["fetch"].done(): 119 | await shard["fetch"] 120 | 121 | async def _fetch(self): 122 | while self.is_fetching: 123 | # Ensure fetch is performed at most 5 times per second (the limit per shard) 124 | await asyncio.sleep(0.2) 125 | try: 126 | await self.fetch() 127 | except asyncio.CancelledError: 128 | pass 129 | except Exception as e: 130 | log.exception(e) 131 | 132 | async def fetch(self): 133 | 134 | if not self.is_fetching: 135 | return 136 | 137 | # todo: check for/handle new shards 138 | 139 | shards_in_use = [ 140 | s for s in self.shards if self.checkpointer.is_allocated(s["ShardId"]) 141 | ] 142 | 143 | # log.debug("shards in use: {}".format([s["ShardId"] for s in shards_in_use])) 144 | 145 | for shard in self.shards: 146 | 147 | if not self.is_fetching: 148 | break 149 | 150 | if not self.checkpointer.is_allocated(shard["ShardId"]): 151 | if ( 152 | self.max_shard_consumers 153 | and len(shards_in_use) >= self.max_shard_consumers 154 | ): 155 | continue 156 | 157 | if self.checkpointer is None: 158 | log.debug("Marking shard in use {}".format(shard["ShardId"])) 159 | shard["ShardIterator"] = await self.get_shard_iterator( 160 | shard_id=shard["ShardId"] 161 | ) 162 | 163 | else: 164 | success, checkpoint = await self.checkpointer.allocate( 165 | shard["ShardId"] 166 | ) 167 | 168 | if not success: 169 | log.debug( 170 | "Shard in use. 
Could not assign shard {} to checkpointer[{}]".format( 171 | shard["ShardId"], self.checkpointer.get_ref() 172 | ) 173 | ) 174 | continue 175 | 176 | log.debug( 177 | "Marking shard in use {} by checkpointer[{}] @ {}".format( 178 | shard["ShardId"], self.checkpointer.get_ref(), checkpoint 179 | ) 180 | ) 181 | 182 | shard["ShardIterator"] = await self.get_shard_iterator( 183 | shard_id=shard["ShardId"], last_sequence_number=checkpoint 184 | ) 185 | 186 | if "ShardIterator" in shard: 187 | shard["stats"] = ShardStats() 188 | shard["throttler"] = Throttler( 189 | rate_limit=self.shard_fetch_rate, period=1 190 | ) 191 | shards_in_use.append(shard) 192 | 193 | log.debug("Shard count now at {}".format(len(shards_in_use))) 194 | 195 | if shard.get("fetch"): 196 | if shard["fetch"].done(): 197 | result = shard["fetch"].result() 198 | 199 | if not result: 200 | shard["fetch"] = None 201 | continue 202 | 203 | records = result["Records"] 204 | 205 | if records: 206 | log.debug( 207 | "Shard {} got {} records".format( 208 | shard["ShardId"], len(records) 209 | ) 210 | ) 211 | 212 | total_items = 0 213 | for row in result["Records"]: 214 | for n, output in enumerate( 215 | self.processor.parse(row["Data"]) 216 | ): 217 | await self.queue.put(output) 218 | total_items += n + 1 219 | 220 | # Get approx minutes behind.. 221 | last_arrival = records[-1].get("ApproximateArrivalTimestamp") 222 | if last_arrival: 223 | last_arrival = round( 224 | ( 225 | ( 226 | datetime.now(timezone.utc) - last_arrival 227 | ).total_seconds() 228 | / 60 229 | ) 230 | ) 231 | 232 | log.debug( 233 | "Shard {} added {} items from {} records. Consumer is {}m behind".format( 234 | shard["ShardId"], 235 | total_items, 236 | len(records), 237 | last_arrival, 238 | ), 239 | extra={"consumer_behind_m": last_arrival}, 240 | ) 241 | 242 | else: 243 | # ApproximateArrivalTimestamp not available in kinesis-lite 244 | log.debug( 245 | "Shard {} added {} items from {} records".format( 246 | shard["ShardId"], total_items, len(records) 247 | ) 248 | ) 249 | 250 | # Add checkpoint record 251 | last_record = result["Records"][-1] 252 | await self.queue.put( 253 | { 254 | "__CHECKPOINT__": { 255 | "ShardId": shard["ShardId"], 256 | "SequenceNumber": last_record["SequenceNumber"], 257 | } 258 | } 259 | ) 260 | 261 | shard["LastSequenceNumber"] = last_record["SequenceNumber"] 262 | 263 | else: 264 | log.debug( 265 | "Shard {} caught up, sleeping {}s".format( 266 | shard["ShardId"], self.sleep_time_no_records 267 | ) 268 | ) 269 | await asyncio.sleep(self.sleep_time_no_records) 270 | 271 | if not result["NextShardIterator"]: 272 | raise NotImplementedError("Shard is closed?") 273 | 274 | shard["ShardIterator"] = result["NextShardIterator"] 275 | 276 | shard["fetch"] = None 277 | 278 | else: 279 | # log.debug("shard {} fetch in progress..".format(shard['ShardId'])) 280 | continue 281 | 282 | if "ShardIterator" in shard and shard["ShardIterator"] is not None: 283 | shard["fetch"] = asyncio.create_task(self.get_records(shard=shard)) 284 | 285 | async def get_records(self, shard): 286 | 287 | # Note: "This operation has a limit of five transactions per second per account." 
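        # shard["throttler"] is a Throttler(rate_limit=shard_fetch_rate, period=1) from utils.py:
        # entering it blocks until fewer than shard_fetch_rate GetRecords calls have been made
        # against this shard within the last second, keeping us under the per-shard transaction limit.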
288 | 289 | async with shard["throttler"]: 290 | # log.debug("get_records shard={}".format(shard['ShardId'])) 291 | 292 | try: 293 | 294 | result = await self.client.get_records( 295 | ShardIterator=shard["ShardIterator"], Limit=self.record_limit 296 | ) 297 | 298 | shard["stats"].succeded() 299 | return result 300 | 301 | except ClientConnectionError as e: 302 | await self.get_conn() 303 | except TimeoutError as e: 304 | log.warning("Timeout {}. sleeping..".format(e)) 305 | await asyncio.sleep(3) 306 | 307 | except ClientError as e: 308 | code = e.response["Error"]["Code"] 309 | if code == "ProvisionedThroughputExceededException": 310 | log.warning( 311 | "{} hit ProvisionedThroughputExceededException".format( 312 | shard["ShardId"] 313 | ) 314 | ) 315 | shard["stats"].throttled() 316 | # todo: control the throttle ? 317 | await asyncio.sleep(0.25) 318 | 319 | elif code == "ExpiredIteratorException": 320 | log.warning( 321 | "{} hit ExpiredIteratorException".format(shard["ShardId"]) 322 | ) 323 | 324 | shard["ShardIterator"] = await self.get_shard_iterator( 325 | shard_id=shard["ShardId"], 326 | last_sequence_number=shard.get("LastSequenceNumber"), 327 | ) 328 | 329 | elif code == "InternalFailure": 330 | log.warning( 331 | "Received InternalFailure from Kinesis, rebuilding connection.. " 332 | ) 333 | await self.get_conn() 334 | 335 | else: 336 | log.warning("ClientError {}. sleeping..".format(code)) 337 | await asyncio.sleep(3) 338 | 339 | except Exception as e: 340 | log.warning("Unknown error {}. sleeping..".format(e)) 341 | await asyncio.sleep(3) 342 | 343 | # Connection or other issue 344 | return None 345 | 346 | async def get_shard_iterator(self, shard_id, last_sequence_number=None): 347 | 348 | log.debug( 349 | "getting shard iterator for {} @ {}".format( 350 | shard_id, 351 | last_sequence_number if last_sequence_number else self.iterator_type, 352 | ) 353 | ) 354 | 355 | params = { 356 | "StreamName": self.stream_name, 357 | "ShardId": shard_id, 358 | "ShardIteratorType": "AFTER_SEQUENCE_NUMBER" 359 | if last_sequence_number 360 | else self.iterator_type, 361 | } 362 | 363 | if last_sequence_number: 364 | params["StartingSequenceNumber"] = last_sequence_number 365 | 366 | if self.iterator_type == 'AT_TIMESTAMP' and self.timestamp: 367 | params['Timestamp'] = self.timestamp 368 | 369 | response = await self.client.get_shard_iterator(**params) 370 | return response["ShardIterator"] 371 | 372 | async def start_consumer(self, wait_iterations=10, wait_sleep=0.25): 373 | 374 | # Start task to fetch periodically 375 | 376 | self.fetch_task = asyncio.create_task(self._fetch()) 377 | 378 | # Wait a while until we have some results 379 | for i in range(0, wait_iterations): 380 | if self.fetch_task and self.queue.qsize() == 0: 381 | await asyncio.sleep(wait_sleep) 382 | 383 | log.debug("start_consumer completed.. 
queue size={}".format(self.queue.qsize())) 384 | 385 | async def __anext__(self): 386 | 387 | if not self.shards: 388 | await self.get_conn() 389 | 390 | if not self.fetch_task: 391 | await self.start_consumer() 392 | 393 | # Raise exception from Fetch Task to main task otherwise raise exception inside 394 | # Fetch Task will fail silently 395 | if self.fetch_task.done(): 396 | raise self.fetch_task.exception() 397 | 398 | while True: 399 | try: 400 | item = self.queue.get_nowait() 401 | 402 | if item and isinstance(item, dict) and "__CHECKPOINT__" in item: 403 | if self.checkpointer: 404 | await self.checkpointer.checkpoint( 405 | item["__CHECKPOINT__"]["ShardId"], 406 | item["__CHECKPOINT__"]["SequenceNumber"], 407 | ) 408 | continue 409 | 410 | return item 411 | 412 | except QueueEmpty: 413 | log.debug("Queue empty..") 414 | await asyncio.sleep(self.sleep_time_no_records) 415 | raise StopAsyncIteration 416 | -------------------------------------------------------------------------------- /kinesis/exceptions.py: -------------------------------------------------------------------------------- 1 | class StreamExists(Exception): 2 | pass 3 | 4 | 5 | class StreamDoesNotExist(Exception): 6 | pass 7 | 8 | 9 | class StreamShardLimit(Exception): 10 | pass 11 | 12 | 13 | class StreamStatusInvalid(Exception): 14 | pass 15 | 16 | 17 | class ExceededPutLimit(Exception): 18 | pass 19 | 20 | 21 | class UnknownException(Exception): 22 | pass 23 | 24 | 25 | class ValidationError(Exception): 26 | pass 27 | -------------------------------------------------------------------------------- /kinesis/processors.py: -------------------------------------------------------------------------------- 1 | from .aggregators import ( 2 | NewlineAggregator, 3 | SimpleAggregator, 4 | NetstringAggregator, 5 | ListAggregator, 6 | KPLAggregator, 7 | ) 8 | from .serializers import StringSerializer, JsonSerializer, MsgpackSerializer 9 | 10 | 11 | class Processor: 12 | pass 13 | 14 | 15 | class StringProcessor(Processor, SimpleAggregator, StringSerializer): 16 | pass 17 | 18 | 19 | class JsonProcessor(Processor, SimpleAggregator, JsonSerializer): 20 | pass 21 | 22 | 23 | class JsonLineProcessor(Processor, NewlineAggregator, JsonSerializer): 24 | pass 25 | 26 | 27 | class JsonListProcessor(Processor, ListAggregator, JsonSerializer): 28 | pass 29 | 30 | 31 | class MsgpackProcessor(Processor, NetstringAggregator, MsgpackSerializer): 32 | pass 33 | 34 | 35 | class KPLJsonProcessor(Processor, KPLAggregator, JsonSerializer): 36 | pass 37 | 38 | 39 | class KPLStringProcessor(Processor, KPLAggregator, StringSerializer): 40 | pass 41 | -------------------------------------------------------------------------------- /kinesis/producer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import time 4 | import math 5 | from aiohttp import ClientConnectionError 6 | 7 | from asyncio.queues import QueueEmpty 8 | 9 | from .utils import Throttler 10 | from botocore.exceptions import ClientError 11 | 12 | from .base import Base 13 | from . 
import exceptions 14 | from .processors import JsonProcessor 15 | 16 | log = logging.getLogger(__name__) 17 | 18 | 19 | class Producer(Base): 20 | def __init__( 21 | self, 22 | stream_name, 23 | session=None, 24 | endpoint_url=None, 25 | region_name=None, 26 | buffer_time=0.5, 27 | put_rate_limit_per_shard=1000, 28 | put_bandwidth_limit_per_shard=1024, 29 | after_flush_fun=None, 30 | batch_size=500, 31 | max_queue_size=10000, 32 | processor=None, 33 | skip_describe_stream=False, 34 | retry_limit=None, 35 | expo_backoff=None, 36 | expo_backoff_limit=120, 37 | create_stream=False, 38 | create_stream_shards=1, 39 | ): 40 | 41 | super(Producer, self).__init__( 42 | stream_name, 43 | session=session, 44 | endpoint_url=endpoint_url, 45 | region_name=region_name, 46 | retry_limit=retry_limit, 47 | expo_backoff=expo_backoff, 48 | expo_backoff_limit=expo_backoff_limit, 49 | skip_describe_stream=skip_describe_stream, 50 | create_stream=create_stream, 51 | create_stream_shards=create_stream_shards, 52 | ) 53 | 54 | self.buffer_time = buffer_time 55 | 56 | self.processor = processor if processor else JsonProcessor() 57 | 58 | self.queue = asyncio.Queue(maxsize=max_queue_size) 59 | 60 | self.batch_size = batch_size 61 | 62 | # A single shard can ingest up to 1 MiB of data per second (including partition keys) 63 | # or 1,000 records per second for writes 64 | self.put_rate_limit_per_shard = put_rate_limit_per_shard 65 | self.put_rate_throttle = None 66 | self.put_bandwidth_limit_per_shard = put_bandwidth_limit_per_shard 67 | self.put_bandwidth_throttle = None 68 | 69 | if put_bandwidth_limit_per_shard > 1024: 70 | log.warning( 71 | ( 72 | "Put bandwidth {}kb exceeds 1024kb. Expect throughput errors..".format( 73 | put_bandwidth_limit_per_shard 74 | ) 75 | ) 76 | ) 77 | self.set_put_rate_throttle() 78 | 79 | self.flush_task = asyncio.create_task(self._flush()) 80 | self.is_flushing = False 81 | self.after_flush_fun = after_flush_fun 82 | 83 | # keep track of these (used by unit test only) 84 | self.throughput_exceeded_count = 0 85 | 86 | # overflow buffer 87 | self.overflow = [] 88 | 89 | self.flush_total_records = 0 90 | self.flush_total_size = 0 91 | 92 | def set_put_rate_throttle(self): 93 | self.put_rate_throttle = Throttler( 94 | rate_limit=self.put_rate_limit_per_shard 95 | * (len(self.shards) if self.shards else 1), 96 | period=1, 97 | ) 98 | self.put_bandwidth_throttle = Throttler( 99 | # kb per second. Go below a bit to avoid hitting the threshold 100 | size_limit=self.put_bandwidth_limit_per_shard 101 | * (len(self.shards) if self.shards else 1), 102 | period=1, 103 | ) 104 | 105 | async def put(self, data): 106 | 107 | # Raise exception from Flush Task to main task otherwise raise exception inside 108 | # Flush Task will fail silently 109 | if self.flush_task.done(): 110 | raise self.flush_task.exception() 111 | 112 | if not self.stream_status == self.ACTIVE: 113 | await self.get_conn() 114 | 115 | elif self.queue.qsize() >= self.batch_size: 116 | await self.flush() 117 | 118 | for output in self.processor.add_item(data): 119 | await self.queue.put(output) 120 | 121 | async def close(self): 122 | log.debug(f"Closing Connection.. 
(stream status:{self.stream_status})") 123 | if not self.stream_status == self.RECONNECT: 124 | # Cancel Flush Task 125 | self.flush_task.cancel() 126 | # final flush (probably not required but no harm) 127 | await self.flush() 128 | 129 | await self.client.close() 130 | 131 | async def _flush(self): 132 | while True: 133 | if self.stream_status == self.ACTIVE: 134 | if not self.is_flushing: 135 | await self.flush() 136 | await asyncio.sleep(self.buffer_time) 137 | 138 | async def flush(self): 139 | 140 | if self.is_flushing: 141 | log.debug("Flush already in progress, ignoring..") 142 | return 143 | 144 | self.is_flushing = True 145 | 146 | if self.processor.has_items(): 147 | for output in self.processor.get_items(): 148 | await self.queue.put(output) 149 | 150 | while True: 151 | 152 | self.flush_total_records = 0 153 | self.flush_total_size = 0 154 | 155 | if self.queue.qsize() > 0 or len(self.overflow) > 0: 156 | log.debug( 157 | "flush queue={} overflow={}".format( 158 | self.queue.qsize(), len(self.overflow) 159 | ) 160 | ) 161 | 162 | items = await self.get_batch() 163 | 164 | if not items: 165 | break 166 | 167 | else: 168 | result = await self._push_kinesis(items) 169 | await self.process_result(result, items) 170 | 171 | self.is_flushing = False 172 | 173 | async def process_result(self, result, items): 174 | if result["FailedRecordCount"]: 175 | 176 | errors = list( 177 | set( 178 | [ 179 | r.get("ErrorCode") 180 | for r in result["Records"] 181 | if r.get("ErrorCode") 182 | ] 183 | ) 184 | ) 185 | 186 | if not errors: 187 | raise exceptions.UnknownException( 188 | "Failed to put records but no errorCodes return in results" 189 | ) 190 | 191 | if "ProvisionedThroughputExceededException" in errors: 192 | log.warning( 193 | "Throughput exceeded ({} records failed, added back..), pausing for 0.25s..".format( 194 | result["FailedRecordCount"] 195 | ) 196 | ) 197 | 198 | self.throughput_exceeded_count += 1 199 | 200 | for i, record in enumerate(result["Records"]): 201 | if "ErrorCode" in record: 202 | self.overflow.append(items[i]) 203 | 204 | # log.debug("items={} overflow={}".format(len(items), len(overflow))) 205 | 206 | await asyncio.sleep(0.25) 207 | 208 | elif "InternalFailure" in errors: 209 | log.warning("Received InternalFailure from Kinesis") 210 | await self.get_conn() 211 | 212 | for i, record in enumerate(result["Records"]): 213 | if "ErrorCode" in record: 214 | self.overflow.append(items[i]) 215 | 216 | else: 217 | raise exceptions.UnknownException( 218 | "Failed to put records due to: {}".format(", ".join(errors)) 219 | ) 220 | 221 | else: 222 | 223 | if self.after_flush_fun: 224 | await self.after_flush_fun(items) 225 | 226 | async def get_batch(self): 227 | items = [] 228 | flush_max_size = 0 229 | 230 | for num in range(self.queue.qsize() + len(self.overflow)): 231 | async with self.put_rate_throttle: 232 | 233 | if self.overflow: 234 | item = self.overflow.pop() 235 | 236 | else: 237 | try: 238 | item = self.queue.get_nowait() 239 | except QueueEmpty: 240 | break 241 | 242 | size_kb = math.ceil(item[0] / 1024) 243 | 244 | flush_max_size += size_kb 245 | 246 | if flush_max_size > 1024: 247 | self.overflow.append(item) 248 | 249 | elif num <= self.batch_size: 250 | async with self.put_bandwidth_throttle(size=self.flush_total_size): 251 | items.append(item) 252 | self.flush_total_size += size_kb 253 | self.flush_total_records += item[1] 254 | else: 255 | self.overflow.append(item) 256 | 257 | return items 258 | 259 | async def _push_kinesis(self, items): 260 | 261 | 
log.debug( 262 | "doing flush with {} record ({} items) @ {} kb".format( 263 | len(items), self.flush_total_records, self.flush_total_size 264 | ) 265 | ) 266 | 267 | while True: 268 | 269 | try: 270 | 271 | # todo: custom partition key 272 | results = await self.client.put_records( 273 | Records=[ 274 | { 275 | "Data": item.data, 276 | "PartitionKey": "{0}{1}".format( 277 | time.perf_counter(), time.time() 278 | ), 279 | } 280 | for item in items 281 | ], 282 | StreamName=self.stream_name, 283 | ) 284 | 285 | log.info( 286 | "flush complete with {} record ({} items) @ {} kb".format( 287 | len(items), self.flush_total_records, self.flush_total_size 288 | ) 289 | ) 290 | return results 291 | 292 | except ClientError as err: 293 | 294 | code = err.response["Error"]["Code"] 295 | 296 | if code == "ValidationException": 297 | if ( 298 | "must have length less than or equal" 299 | in err.response["Error"]["Message"] 300 | ): 301 | log.warning( 302 | "Batch size {} exceeded the limit. retrying with less".format( 303 | len(items) 304 | ) 305 | ) 306 | 307 | existing_batch_size = self.batch_size 308 | self.batch_size -= round(self.batch_size / 10) 309 | 310 | # Must be small batch of big items, take at least one out.. 311 | if existing_batch_size == self.batch_size: 312 | self.batch_size -= 1 313 | 314 | self.overflow.extend(items) 315 | 316 | self.flush_total_records = 0 317 | self.flush_max_size = 0 318 | self.flush_total_size = 0 319 | 320 | items = await self.get_batch() 321 | 322 | else: 323 | log.warning( 324 | f'Unknown ValidationException error code {err.response["Error"]["Code"]}' 325 | ) 326 | log.exception(err) 327 | await self.get_conn() 328 | # raise err 329 | elif code == "ResourceNotFoundException": 330 | raise exceptions.StreamDoesNotExist( 331 | "Stream '{}' does not exist".format(self.stream_name) 332 | ) from None 333 | else: 334 | log.warning( 335 | f'Unknown Client error code {err.response["Error"]["Code"]}' 336 | ) 337 | log.exception(err) 338 | await self.get_conn() 339 | # raise err 340 | except ClientConnectionError as err: 341 | await self.get_conn() 342 | except asyncio.CancelledError: 343 | return 344 | except Exception as e: 345 | log.exception(e) 346 | log.critical("Unknown Exception Caught") 347 | await self.get_conn() 348 | -------------------------------------------------------------------------------- /kinesis/serializers.py: -------------------------------------------------------------------------------- 1 | try: 2 | import ujson as json 3 | except ModuleNotFoundError: 4 | # https://github.com/python/mypy/issues/1153 (mypy bug with try/except conditional imports) 5 | import json # type: ignore 6 | 7 | try: 8 | import msgpack 9 | except ModuleNotFoundError: 10 | pass 11 | 12 | 13 | class Serializer: 14 | pass 15 | 16 | 17 | class StringSerializer(Serializer): 18 | def serialize(self, item): 19 | return str(item).encode("utf-8") 20 | 21 | def deserialize(self, data): 22 | return data.decode("utf-8") 23 | 24 | 25 | class JsonSerializer(Serializer): 26 | def serialize(self, item): 27 | return json.dumps(item).encode("utf-8") 28 | 29 | def deserialize(self, data): 30 | return json.loads(data.decode("utf-8")) 31 | 32 | 33 | class MsgpackSerializer(Serializer): 34 | def serialize(self, item): 35 | result = msgpack.packb(item, use_bin_type=True) 36 | return result 37 | 38 | def deserialize(self, data): 39 | return msgpack.unpackb(data, raw=False) 40 | -------------------------------------------------------------------------------- /kinesis/utils.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Source: https://github.com/hallazzang/asyncio-throttle 3 | 4 | Mods: 5 | - add size_limit to support throttling by size 6 | """ 7 | 8 | import time 9 | import math 10 | import asyncio 11 | import logging 12 | from collections import deque 13 | 14 | log = logging.getLogger(__name__) 15 | 16 | 17 | class Throttler: 18 | def __init__( 19 | self, 20 | rate_limit=None, 21 | size_limit=None, 22 | period=1.0, 23 | retry_interval=0.05, 24 | ): 25 | self.rate_limit = rate_limit 26 | self.size_limit = size_limit 27 | self.period = period 28 | self.retry_interval = retry_interval 29 | 30 | self._task_logs = deque() 31 | 32 | self.size = None 33 | 34 | def flush(self): 35 | now = time.time() 36 | while self._task_logs: 37 | if now - self._task_logs[0][0] > self.period: 38 | self._task_logs.popleft() 39 | else: 40 | break 41 | 42 | def is_below_rate(self): 43 | 44 | if self.rate_limit: 45 | below_rate_requests = len(self._task_logs) < self.rate_limit 46 | 47 | if not below_rate_requests: 48 | return False 49 | 50 | if self.size_limit is None or not self._task_logs: 51 | return True 52 | 53 | size = sum([x[1] for x in self._task_logs]) 54 | 55 | period = time.time() - self._task_logs[0][0] 56 | 57 | period_used_ratio = (self.period - period) / self.period 58 | 59 | remaining = self.size_limit - math.ceil(size * period_used_ratio) 60 | 61 | # log.debug("rate check: size={} requested={} period={} period_used_ratio={} remaining={}".format(size, self.size, round(period,3), round(period_used_ratio, 2), round(remaining,2))) 62 | 63 | return self.size <= remaining 64 | 65 | async def acquire(self): 66 | 67 | while True: 68 | self.flush() 69 | if self.is_below_rate(): 70 | break 71 | await asyncio.sleep( 72 | self.retry_interval, 73 | ) 74 | 75 | self._task_logs.append((time.time(), self.size)) 76 | 77 | def __call__(self, size=1): 78 | self.size = size 79 | return self 80 | 81 | async def __aenter__(self): 82 | await self.acquire() 83 | 84 | async def __aexit__(self, exc_type, exc, tb): 85 | pass 86 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiobotocore>=1.3.3 2 | async-timeout==3.0.1 3 | aredis==1.1.8 4 | msgpack==1.0.0 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [nosetests] 2 | stop=1 3 | with-spec=1 4 | spec-color=1 5 | nologcapture=1 6 | nocapture=1 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | with open("README.md") as f: 4 | long_description = f.read() 5 | 6 | setup( 7 | name="async-kinesis", 8 | description="AsyncIO Kinesis Library", 9 | long_description=long_description, 10 | long_description_content_type="text/markdown", 11 | version="1.1.5", 12 | url="https://github.com/hampsterx/async-kinesis", 13 | author="hampsterx", 14 | author_email="tim.vdh@gmail.com", 15 | license="Apache2", 16 | classifiers=[ 17 | "Development Status :: 4 - Beta", 18 | 
"Programming Language :: Python", 19 | "Programming Language :: Python :: 3", 20 | "Programming Language :: Python :: 3.5", 21 | "Programming Language :: Python :: 3.6", 22 | "Programming Language :: Python :: 3.7", 23 | "Programming Language :: Python :: 3.8", 24 | "Intended Audience :: Developers", 25 | "License :: OSI Approved :: Apache Software License", 26 | ], 27 | packages=["kinesis"], 28 | install_requires=[ 29 | "aiobotocore>=1.0.4", 30 | "async-timeout>=3.0.1", 31 | "asyncio-throttle>=0.1.1", 32 | ], 33 | extras_require={ 34 | "kpl": ["aws-kinesis-agg>=1.1.6"], 35 | "redis": ["aredis>=1.1.8"], 36 | "msgpack": ["msgpack>=0.6.1"], 37 | }, 38 | ) 39 | -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | 3 | aws-kinesis-agg==1.1.6 4 | coloredlogs==10.0 5 | nose==1.3.7 6 | pinocchio==0.4.2 7 | python-dotenv==0.9.1 8 | asynctest==0.12.3 9 | pluggy==0.13.1 10 | tox==3.20.1 11 | -------------------------------------------------------------------------------- /tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uuid 3 | import asyncio 4 | import logging, coloredlogs 5 | 6 | from aiobotocore import AioSession 7 | from dotenv import load_dotenv 8 | from asynctest import TestCase as AsynTestCase, fail_on 9 | from unittest import skipUnless, TestCase 10 | from kinesis import Consumer, Producer, MemoryCheckPointer, RedisCheckPointer 11 | from kinesis.processors import ( 12 | StringProcessor, 13 | JsonProcessor, 14 | JsonLineProcessor, 15 | JsonListProcessor, 16 | MsgpackProcessor, 17 | Processor, 18 | ) 19 | from kinesis.aggregators import ( 20 | Aggregator, 21 | KPLAggregator, 22 | SimpleAggregator, 23 | NewlineAggregator, 24 | ListAggregator, 25 | NetstringAggregator, 26 | OutputItem, 27 | ) 28 | from kinesis.serializers import StringSerializer, JsonSerializer, Serializer 29 | from kinesis import exceptions 30 | 31 | coloredlogs.install(level="DEBUG", fmt="%(name)s %(levelname)s %(message)s") 32 | 33 | logging.getLogger("botocore").setLevel(logging.WARNING) 34 | logging.getLogger("aiobotocore").setLevel(logging.INFO) 35 | 36 | 37 | log = logging.getLogger(__name__) 38 | 39 | load_dotenv() 40 | 41 | # https://github.com/mhart/kinesalite 42 | # ./node_modules/.bin/kinesalite --shardLimit 1000 43 | # see also docker-compose.yaml 44 | ENDPOINT_URL = os.environ.get("ENDPOINT_URL", "http://localhost:4567") 45 | 46 | TESTING_USE_AWS_KINESIS = os.environ.get("TESTING_USE_AWS_KINESIS", "0") == "1" 47 | 48 | # Use docker-compose one 49 | if "REDIS_PORT" not in os.environ: 50 | os.environ["REDIS_PORT"] = "16379" 51 | 52 | 53 | class BaseTests: 54 | def random_string(self, length): 55 | from random import choice 56 | from string import ascii_uppercase 57 | 58 | return "".join(choice(ascii_uppercase) for i in range(length)) 59 | 60 | 61 | class BaseKinesisTests(AsynTestCase, BaseTests): 62 | async def setUp(self): 63 | self.stream_name = "test_{}".format(str(uuid.uuid4())[0:8]) 64 | producer = await Producer( 65 | stream_name=self.stream_name, 66 | endpoint_url=ENDPOINT_URL, 67 | create_stream=self.stream_name, 68 | create_stream_shards=1, 69 | ).__aenter__() 70 | await producer.__aexit__(None, None, None) 71 | 72 | async def add_record_delayed(self, msg, producer, delay): 73 | log.debug("Adding record. 
delay={}".format(delay)) 74 | await asyncio.sleep(delay) 75 | await producer.put(msg) 76 | 77 | 78 | class ProcessorAndAggregatorTests(TestCase, BaseTests): 79 | """ 80 | Processor and Aggregator Tests 81 | """ 82 | 83 | def test_aggregator_min_size(self): 84 | 85 | with self.assertRaises(exceptions.ValidationError): 86 | Aggregator(max_size=20) 87 | 88 | def test_aggregator_max_size(self): 89 | 90 | with self.assertRaises(exceptions.ValidationError): 91 | Aggregator(max_size=2000) 92 | 93 | def test_processor_exceed_put_limit(self): 94 | processor = StringProcessor() 95 | 96 | with self.assertRaises(exceptions.ExceededPutLimit): 97 | list(processor.add_item(self.random_string(1024 * 1024 + 1))) 98 | 99 | def test_newline_aggregator(self): 100 | 101 | # in reality does not make sense as strings can contain new lines 102 | # so is not a suitable combination to use 103 | class NewlineTestProcessor(NewlineAggregator, StringSerializer): 104 | pass 105 | 106 | processor = NewlineTestProcessor() 107 | 108 | # Expect nothing as batching 109 | self.assertEqual([], list(processor.add_item(123))) 110 | self.assertEqual([], list(processor.add_item("test"))) 111 | 112 | self.assertTrue(processor.has_items()) 113 | 114 | output = list(processor.get_items()) 115 | 116 | self.assertEqual(len(output), 1) 117 | 118 | self.assertEqual(output[0].size, 9) 119 | self.assertEqual(output[0].n, 2) 120 | self.assertEqual(output[0].data, b"123\ntest\n") 121 | 122 | self.assertListEqual(list(processor.parse(output[0].data)), ["123", "test"]) 123 | 124 | def test_list_aggregator(self): 125 | class JsonListTestProcessor(ListAggregator, JsonSerializer): 126 | pass 127 | 128 | processor = JsonListTestProcessor() 129 | 130 | # Expect nothing as batching 131 | self.assertEqual([], list(processor.add_item(123))) 132 | self.assertEqual([], list(processor.add_item("test"))) 133 | 134 | self.assertTrue(processor.has_items()) 135 | 136 | output = list(processor.get_items()) 137 | 138 | self.assertEqual(len(output), 1) 139 | 140 | self.assertEqual(output[0].size, 11) 141 | self.assertEqual(output[0].n, 2) 142 | self.assertEqual(output[0].data, b'[123, "test"]') 143 | 144 | self.assertListEqual(next(processor.parse(output[0].data)), [123, "test"]) 145 | 146 | def test_netstring_aggregator(self): 147 | class NetstringTestProcessor(NetstringAggregator, StringSerializer): 148 | pass 149 | 150 | processor = NetstringTestProcessor() 151 | 152 | # Expect nothing as batching 153 | self.assertEqual([], list(processor.add_item(123))) 154 | self.assertEqual([], list(processor.add_item("test"))) 155 | 156 | self.assertTrue(processor.has_items()) 157 | 158 | output = list(processor.get_items()) 159 | 160 | self.assertEqual(len(output), 1) 161 | 162 | self.assertEqual(output[0].size, 13) 163 | self.assertEqual(output[0].n, 2) 164 | self.assertEqual(output[0].data, b"3:123,4:test,") 165 | 166 | self.assertListEqual(list(processor.parse(output[0].data)), ["123", "test"]) 167 | 168 | def test_kpl_aggregator(self): 169 | class KPLTestProcessor(KPLAggregator, StringSerializer): 170 | pass 171 | 172 | processor = KPLTestProcessor() 173 | 174 | # Expect nothing as batching 175 | self.assertEqual([], list(processor.add_item(123))) 176 | self.assertEqual([], list(processor.add_item("test"))) 177 | 178 | self.assertTrue(processor.has_items()) 179 | 180 | output = list(processor.get_items()) 181 | 182 | self.assertEqual(len(output), 1) 183 | 184 | self.assertEqual(output[0].n, 2) 185 | 186 | self.assertListEqual(list(processor.parse(output[0].data)), 
["123", "test"]) 187 | 188 | def test_kpl_aggregator_max_size(self): 189 | class BytesSerializer: 190 | def serialize(self, item): 191 | return item 192 | 193 | def deserialize(self, data): 194 | return data 195 | 196 | class KPLTestProcessor(KPLAggregator, BytesSerializer): 197 | pass 198 | 199 | # 100 K max_size 200 | processor = KPLTestProcessor(max_size=1024 * 100) 201 | 202 | # Expect nothing as batching first two 40K records 203 | self.assertEqual([], list(processor.add_item(bytes(40 * 1024)))) 204 | self.assertEqual([], list(processor.add_item(bytes(40 * 1024)))) 205 | 206 | # output as we exceed 207 | output = list(processor.add_item(bytes(40 * 1024))) 208 | 209 | self.assertEqual(len(output), 1) 210 | 211 | self.assertEqual(output[0].n, 2) 212 | 213 | def test_string_processor(self): 214 | 215 | processor = StringProcessor() 216 | 217 | self.assertEquals(processor.max_bytes, 1024 * 25 * 40) 218 | 219 | output = list(processor.add_item("test")) 220 | 221 | self.assertEqual(len(output), 1) 222 | self.assertIsInstance(output[0], OutputItem) 223 | 224 | self.assertEqual(output[0].size, len("test")) 225 | self.assertEqual(output[0].n, 1) 226 | self.assertEqual(output[0].data, b"test") 227 | 228 | self.assertFalse(processor.has_items()) 229 | 230 | def test_json_processor(self): 231 | 232 | processor = JsonProcessor() 233 | 234 | output = list(processor.add_item({"test": 123})) 235 | 236 | self.assertEqual(len(output), 1) 237 | self.assertIsInstance(output[0], OutputItem) 238 | 239 | self.assertEqual(output[0].size, 13) 240 | self.assertEqual(output[0].n, 1) 241 | self.assertEqual(output[0].data, b'{"test": 123}') 242 | 243 | self.assertFalse(processor.has_items()) 244 | 245 | self.assertListEqual(list(processor.parse(output[0].data)), [{"test": 123}]) 246 | 247 | def test_json_line_processor(self): 248 | 249 | processor = JsonLineProcessor(max_size=25) 250 | 251 | # Expect nothing as batching 252 | self.assertEqual([], list(processor.add_item({"test": 123}))) 253 | self.assertEqual([], list(processor.add_item({"test": 456}))) 254 | 255 | self.assertTrue(processor.has_items()) 256 | 257 | output = list(processor.get_items()) 258 | 259 | self.assertEqual(len(output), 1) 260 | 261 | self.assertEqual(output[0].size, 28) 262 | self.assertEqual(output[0].n, 2) 263 | self.assertEqual(output[0].data, b'{"test": 123}\n{"test": 456}\n') 264 | 265 | self.assertListEqual( 266 | list(processor.parse(output[0].data)), 267 | [{"test": 123}, {"test": 456}], 268 | ) 269 | 270 | # Expect empty now 271 | self.assertFalse(processor.has_items()) 272 | 273 | result = [] 274 | for x in range(1000): 275 | output = list(processor.add_item({"test": "test with some more data"})) 276 | if output: 277 | self.assertEqual(len(output), 1) 278 | result.append(output[0]) 279 | 280 | # Expected at least one record to be output 281 | self.assertEqual(len(result), 1) 282 | 283 | self.assertEqual(result[0].size, 25567) # expect below 25*1024=25600 284 | self.assertEqual(result[0].n, 691) 285 | 286 | # Expect some left 287 | self.assertTrue(processor.has_items()) 288 | 289 | output = list(processor.get_items()) 290 | 291 | self.assertEqual(len(output), 1) 292 | 293 | self.assertEqual(output[0].size, 11432) 294 | self.assertEqual(output[0].n, 309) 295 | 296 | self.assertFalse(processor.has_items()) 297 | 298 | def test_json_list_processor(self): 299 | 300 | processor = JsonListProcessor(max_size=25) 301 | 302 | # Expect nothing as batching 303 | self.assertEqual([], list(processor.add_item({"test": 123}))) 304 | 
self.assertEqual([], list(processor.add_item({"test": 456}))) 305 | 306 | self.assertTrue(processor.has_items()) 307 | 308 | output = list(processor.get_items()) 309 | 310 | self.assertEqual(len(output), 1) 311 | 312 | self.assertEqual(output[0].size, 28) 313 | self.assertEqual(output[0].n, 2) 314 | self.assertEqual(output[0].data, b'[{"test": 123}, {"test": 456}]') 315 | 316 | # Need to use next() otherwise list() creates double nested list 317 | self.assertListEqual( 318 | next(processor.parse(output[0].data)), [{"test": 123}, {"test": 456}] 319 | ) 320 | 321 | # Expect empty now 322 | self.assertFalse(processor.has_items()) 323 | 324 | result = [] 325 | for x in range(1000): 326 | output = list(processor.add_item({"test": "test with some more data"})) 327 | if output: 328 | self.assertEqual(len(output), 1) 329 | result.append(output[0]) 330 | 331 | # Expected at least one record to be output 332 | self.assertEqual(len(result), 1) 333 | 334 | self.assertEqual(result[0].size, 25567) # expect below 25*1024=25600 335 | self.assertEqual(result[0].n, 691) 336 | 337 | # Expect some left 338 | self.assertTrue(processor.has_items()) 339 | 340 | output = list(processor.get_items()) 341 | 342 | self.assertEqual(len(output), 1) 343 | 344 | self.assertEqual(output[0].size, 11432) 345 | self.assertEqual(output[0].n, 309) 346 | 347 | self.assertFalse(processor.has_items()) 348 | 349 | def test_msgpack_processor(self): 350 | 351 | processor = MsgpackProcessor(max_size=25) 352 | 353 | # Expect nothing as batching 354 | self.assertEqual([], list(processor.add_item({"test": 123}))) 355 | self.assertEqual([], list(processor.add_item({"test": 456}))) 356 | 357 | self.assertTrue(processor.has_items()) 358 | 359 | output = list(processor.get_items()) 360 | 361 | self.assertEqual(len(output), 1) 362 | 363 | self.assertEqual(output[0].size, 22) 364 | self.assertEqual(output[0].n, 2) 365 | self.assertEqual(output[0].data, b"7:\x81\xa4test{,9:\x81\xa4test\xcd\x01\xc8,") 366 | 367 | self.assertListEqual( 368 | list(processor.parse(output[0].data)), [{"test": 123}, {"test": 456}] 369 | ) 370 | 371 | # Expect empty now 372 | self.assertFalse(processor.has_items()) 373 | 374 | result = [] 375 | for x in range(1000): 376 | output = list(processor.add_item({"test": "test with some more data"})) 377 | if output: 378 | self.assertEqual(len(output), 1) 379 | result.append(output[0]) 380 | 381 | # Expected at least one record to be output 382 | self.assertEqual(len(result), 1) 383 | 384 | self.assertEqual(result[0].size, 25585) # expect below 25*1024=25600 385 | self.assertEqual(result[0].n, 731) 386 | 387 | # Expect some left 388 | self.assertTrue(processor.has_items()) 389 | 390 | output = list(processor.get_items()) 391 | 392 | self.assertEqual(len(output), 1) 393 | 394 | self.assertEqual(output[0].size, 9411) 395 | self.assertEqual(output[0].n, 269) 396 | 397 | self.assertFalse(processor.has_items()) 398 | 399 | 400 | class CheckpointTests(BaseKinesisTests): 401 | """ 402 | Checkpoint Tests 403 | """ 404 | 405 | @classmethod 406 | def patch_consumer_fetch(cls, consumer): 407 | async def get_shard_iterator(shard_id, last_sequence_number=None): 408 | log.info( 409 | "getting shard iterator for {} @ {}".format( 410 | shard_id, last_sequence_number 411 | ) 412 | ) 413 | return True 414 | 415 | consumer.get_shard_iterator = get_shard_iterator 416 | 417 | async def get_records(shard): 418 | log.info("get records shard={}".format(shard["ShardId"])) 419 | return {} 420 | 421 | consumer.get_records = get_records 422 | 423 | 
consumer.is_fetching = True 424 | 425 | async def test_memory_checkpoint(self): 426 | # first consumer 427 | checkpointer = MemoryCheckPointer(name="test") 428 | 429 | consumer_a = Consumer( 430 | stream_name=None, 431 | checkpointer=checkpointer, 432 | max_shard_consumers=1, 433 | endpoint_url=ENDPOINT_URL, 434 | ) 435 | 436 | self.patch_consumer_fetch(consumer_a) 437 | 438 | consumer_a.shards = [{"ShardId": "test-1"}, {"ShardId": "test-2"}] 439 | 440 | await consumer_a.fetch() 441 | 442 | shards = [s["ShardId"] for s in consumer_a.shards if s.get("stats")] 443 | 444 | # Expect only one shard assigned as max = 1 445 | self.assertEqual(["test-1"], shards) 446 | 447 | # second consumer (note: max_shard_consumers needs to be 2 as uses checkpointer to get allocated shards) 448 | 449 | consumer_b = Consumer( 450 | stream_name=None, 451 | checkpointer=checkpointer, 452 | max_shard_consumers=2, 453 | endpoint_url=ENDPOINT_URL, 454 | ) 455 | 456 | self.patch_consumer_fetch(consumer_b) 457 | 458 | consumer_b.shards = [{"ShardId": "test-1"}, {"ShardId": "test-2"}] 459 | 460 | await consumer_b.fetch() 461 | 462 | shards = [s["ShardId"] for s in consumer_b.shards if s.get("stats")] 463 | 464 | # Expect only one shard assigned as max = 1 465 | self.assertEqual(["test-2"], shards) 466 | 467 | async def test_redis_checkpoint_locking(self): 468 | name = "test-{}".format(str(uuid.uuid4())[0:8]) 469 | 470 | # first consumer 471 | checkpointer_a = RedisCheckPointer(name=name, id="proc-1") 472 | 473 | # second consumer 474 | checkpointer_b = RedisCheckPointer(name=name, id="proc-2") 475 | 476 | # try to allocate the same shard 477 | 478 | result = await asyncio.gather( 479 | *[checkpointer_a.allocate("test"), checkpointer_b.allocate("test")] 480 | ) 481 | 482 | result = list(sorted([x[0] for x in result])) 483 | 484 | # Expect only one to have succeeded 485 | self.assertEquals([False, True], result) 486 | 487 | await checkpointer_a.close() 488 | await checkpointer_b.close() 489 | 490 | async def test_redis_checkpoint_reallocate(self): 491 | name = "test-{}".format(str(uuid.uuid4())[0:8]) 492 | 493 | # first consumer 494 | checkpointer_a = RedisCheckPointer(name=name, id="proc-1") 495 | 496 | await checkpointer_a.allocate("test") 497 | 498 | # checkpoint 499 | await checkpointer_a.checkpoint("test", "123") 500 | 501 | # stop on this shard 502 | await checkpointer_a.deallocate("test") 503 | 504 | # second consumer 505 | checkpointer_b = RedisCheckPointer(name=name, id="proc-2") 506 | 507 | success, sequence = await checkpointer_b.allocate("test") 508 | 509 | self.assertTrue(success) 510 | self.assertEquals("123", sequence) 511 | 512 | await checkpointer_b.close() 513 | 514 | self.assertEquals(checkpointer_b.get_all_checkpoints(), {}) 515 | 516 | await checkpointer_a.close() 517 | 518 | async def test_redis_checkpoint_hearbeat(self): 519 | name = "test-{}".format(str(uuid.uuid4())[0:8]) 520 | 521 | checkpointer = RedisCheckPointer(name=name, heartbeat_frequency=0.5) 522 | 523 | await checkpointer.allocate("test") 524 | await checkpointer.checkpoint("test", "123") 525 | 526 | await asyncio.sleep(1) 527 | 528 | await checkpointer.close() 529 | 530 | # nothing to assert 531 | self.assertTrue(True) 532 | 533 | 534 | class KinesisTests(BaseKinesisTests): 535 | """ 536 | Kinesalite Tests 537 | """ 538 | 539 | async def test_stream_does_not_exist(self): 540 | 541 | await asyncio.sleep(2) 542 | 543 | # Producer 544 | with self.assertRaises(exceptions.StreamDoesNotExist): 545 | async with Producer( 546 | 
session=AioSession(), 547 | stream_name="test_stream_does_not_exist", endpoint_url=ENDPOINT_URL 548 | ) as producer: 549 | await producer.put("test") 550 | 551 | # Consumer 552 | with self.assertRaises(exceptions.StreamDoesNotExist): 553 | async with Consumer( 554 | stream_name="test_stream_does_not_exist", endpoint_url=ENDPOINT_URL 555 | ): 556 | pass 557 | 558 | @fail_on(unused_loop=True, active_handles=True) 559 | async def test_producer_put(self): 560 | async with Producer( 561 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL 562 | ) as producer: 563 | await producer.put("test") 564 | 565 | async def test_producer_put_below_limit(self): 566 | async with Producer( 567 | stream_name=self.stream_name, 568 | processor=StringProcessor(), 569 | endpoint_url=ENDPOINT_URL, 570 | ) as producer: 571 | # The maximum size of the data payload of a record before base64-encoding is up to 1 MiB. 572 | # Limit is set in aggregators.BaseAggregator (few bytes short of 1MiB) 573 | await producer.put(self.random_string(40 * 25 * 1024)) 574 | 575 | async def test_producer_put_exceed_batch_size(self): 576 | # Expect to complete by lowering batch size until successful (500 is max) 577 | async with Producer( 578 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL, batch_size=600 579 | ) as producer: 580 | 581 | for x in range(1000): 582 | await producer.put("test") 583 | 584 | async def test_producer_and_consumer(self): 585 | 586 | async with Producer( 587 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL 588 | ) as producer: 589 | pass 590 | 591 | async with Consumer( 592 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL 593 | ): 594 | pass 595 | 596 | async def test_producer_and_consumer_consume_from_start_flush(self): 597 | async with Producer( 598 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL 599 | ) as producer: 600 | 601 | await producer.put({"test": 123}) 602 | 603 | await producer.flush() 604 | 605 | results = [] 606 | 607 | async with Consumer( 608 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL 609 | ) as consumer: 610 | async for item in consumer: 611 | results.append(item) 612 | 613 | # Expect to have consumed from start as default iterator_type=TRIM_HORIZON 614 | self.assertEquals([{"test": 123}], results) 615 | 616 | async def test_producer_and_consumer_consume_from_start_after(self): 617 | 618 | # Don't flush, close producer immediately to test all data is written to stream on exit. 
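        # Relies on Producer.close() (run when the context manager exits) cancelling the
        # background flush task and performing a final flush, so queued records are still sent.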
619 | async with Producer( 620 | stream_name=self.stream_name, 621 | endpoint_url=ENDPOINT_URL, 622 | processor=StringProcessor(), 623 | ) as producer: 624 | # Put enough data to ensure it will require more than one put 625 | # ie test overflow behaviour 626 | for _ in range(15): 627 | await producer.put(self.random_string(100 * 1024)) 628 | 629 | results = [] 630 | 631 | async with Consumer( 632 | stream_name=self.stream_name, 633 | endpoint_url=ENDPOINT_URL, 634 | processor=StringProcessor(), 635 | ) as consumer: 636 | async for item in consumer: 637 | results.append(item) 638 | 639 | # Expect to have consumed from start as default iterator_type=TRIM_HORIZON 640 | self.assertEquals(len(results), 15) 641 | 642 | async def test_producer_and_consumer_consume_with_json_line_aggregator(self): 643 | 644 | processor = JsonLineProcessor() 645 | 646 | async with Producer( 647 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL, processor=processor 648 | ) as producer: 649 | 650 | for x in range(0, 10): 651 | await producer.put({"test": x}) 652 | 653 | await producer.flush() 654 | 655 | results = [] 656 | 657 | async with Consumer( 658 | stream_name=self.stream_name, 659 | endpoint_url=ENDPOINT_URL, 660 | processor=processor, 661 | ) as consumer: 662 | async for item in consumer: 663 | results.append(item) 664 | 665 | # Expect to have consumed from start as default iterator_type=TRIM_HORIZON 666 | 667 | self.assertEqual(len(results), 10) 668 | 669 | self.assertEquals(results[0], {"test": 0}) 670 | self.assertEquals(results[-1], {"test": 9}) 671 | 672 | async def test_producer_and_consumer_consume_with_msgpack_aggregator(self): 673 | 674 | processor = MsgpackProcessor() 675 | 676 | async with Producer( 677 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL, processor=processor 678 | ) as producer: 679 | 680 | for x in range(0, 10): 681 | await producer.put({"test": x}) 682 | 683 | await producer.flush() 684 | 685 | results = [] 686 | 687 | async with Consumer( 688 | stream_name=self.stream_name, 689 | endpoint_url=ENDPOINT_URL, 690 | processor=processor, 691 | ) as consumer: 692 | async for item in consumer: 693 | results.append(item) 694 | 695 | # Expect to have consumed from start as default iterator_type=TRIM_HORIZON 696 | 697 | self.assertEqual(len(results), 10) 698 | 699 | self.assertEquals(results[0], {"test": 0}) 700 | self.assertEquals(results[-1], {"test": 9}) 701 | 702 | async def test_producer_and_consumer_consume_with_bytes(self): 703 | class ByteSerializer(Serializer): 704 | def serialize(self, msg): 705 | result = str.encode(msg) 706 | return result 707 | 708 | def deserialize(self, data): 709 | return data 710 | 711 | class ByteProcessor(Processor, NetstringAggregator, ByteSerializer): 712 | pass 713 | 714 | processor = ByteProcessor() 715 | 716 | async with Producer( 717 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL, processor=processor 718 | ) as producer: 719 | 720 | for x in range(0, 2): 721 | await producer.put(f"{x}") 722 | 723 | await producer.flush() 724 | 725 | results = [] 726 | 727 | checkpointer = MemoryCheckPointer(name="test") 728 | 729 | async with Consumer( 730 | stream_name=self.stream_name, 731 | endpoint_url=ENDPOINT_URL, 732 | processor=processor, 733 | checkpointer=checkpointer, 734 | ) as consumer: 735 | async for item in consumer: 736 | results.append(item) 737 | await checkpointer.checkpoint( 738 | shard_id=consumer.shards[0]["ShardId"], sequence="seq" 739 | ) 740 | 741 | async for item in consumer: 742 | results.append(item) 743 | 744 
| self.assertEquals(len(results), 2) 745 | 746 | await checkpointer.close() 747 | 748 | self.assertEquals(len(checkpointer.get_all_checkpoints()), 1) 749 | 750 | async def test_producer_and_consumer_consume_queue_full(self): 751 | async with Producer( 752 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL 753 | ) as producer: 754 | 755 | for i in range(0, 100): 756 | await producer.put("test") 757 | 758 | await producer.flush() 759 | 760 | results = [] 761 | 762 | async with Consumer( 763 | stream_name=self.stream_name, 764 | endpoint_url=ENDPOINT_URL, 765 | max_queue_size=20, 766 | ) as consumer: 767 | 768 | async for item in consumer: 769 | results.append(item) 770 | 771 | # Expect 20 only as queue is full and we don't wait on queue 772 | self.assertEqual(20, len(results)) 773 | 774 | async def test_producer_and_consumer_consume_throttle(self): 775 | async with Producer( 776 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL 777 | ) as producer: 778 | 779 | for i in range(0, 100): 780 | await producer.put("test") 781 | 782 | await producer.flush() 783 | 784 | results = [] 785 | 786 | async with Consumer( 787 | stream_name=self.stream_name, 788 | endpoint_url=ENDPOINT_URL, 789 | record_limit=10, 790 | # 2 per second 791 | shard_fetch_rate=2, 792 | ) as consumer: 793 | 794 | from datetime import datetime 795 | 796 | dt = datetime.now() 797 | 798 | while (datetime.now() - dt).total_seconds() < 3.05: 799 | async for item in consumer: 800 | results.append(item) 801 | 802 | # Expect 2*3*10 = 60 ie at most 6 iterations of 10 records 803 | self.assertGreaterEqual(len(results), 50) 804 | self.assertLessEqual(len(results), 70) 805 | 806 | async def test_producer_and_consumer_consume_with_checkpointer_and_latest(self): 807 | async with Producer( 808 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL 809 | ) as producer: 810 | 811 | await producer.put("test.A") 812 | 813 | results = [] 814 | 815 | checkpointer = MemoryCheckPointer(name="test") 816 | 817 | async with Consumer( 818 | stream_name=self.stream_name, 819 | endpoint_url=ENDPOINT_URL, 820 | checkpointer=checkpointer, 821 | iterator_type="LATEST", 822 | ) as consumer: 823 | 824 | async for item in consumer: 825 | results.append(item) 826 | 827 | # Expect none as LATEST 828 | self.assertEquals([], results) 829 | 830 | checkpoints = checkpointer.get_all_checkpoints() 831 | 832 | # Expect 1 as only 1 shard 833 | self.assertEquals(1, len(checkpoints)) 834 | 835 | # none as no records yet (using LATEST) 836 | self.assertIsNone(checkpoints[list(checkpoints.keys())[0]]["sequence"]) 837 | 838 | results = [] 839 | 840 | log.info("checkpointer checkpoints: {}".format(checkpoints)) 841 | 842 | log.info("Starting consumer again..") 843 | 844 | async with Consumer( 845 | stream_name=self.stream_name, 846 | endpoint_url=ENDPOINT_URL, 847 | checkpointer=checkpointer, 848 | iterator_type="LATEST", 849 | sleep_time_no_records=0.5, 850 | ) as consumer: 851 | 852 | # Manually start 853 | await consumer.start_consumer() 854 | 855 | await producer.put("test.B") 856 | 857 | await producer.flush() 858 | 859 | log.info("waiting..") 860 | 861 | await asyncio.sleep(1) 862 | 863 | log.info("about to consume..") 864 | 865 | async for item in consumer: 866 | results.append(item) 867 | 868 | self.assertEquals(["test.B"], results) 869 | 870 | checkpoints = checkpointer.get_all_checkpoints() 871 | 872 | log.info("checkpointer checkpoints: {}".format(checkpoints)) 873 | 874 | # expect not None as has processed records 875 | 
self.assertIsNotNone(checkpoints[list(checkpoints.keys())[0]]["sequence"]) 876 | 877 | # now add some records 878 | for i in range(0, 10): 879 | await producer.put("test.{}".format(i)) 880 | 881 | await producer.flush() 882 | 883 | await asyncio.sleep(1) 884 | 885 | results = [] 886 | 887 | async with Consumer( 888 | stream_name=self.stream_name, 889 | endpoint_url=ENDPOINT_URL, 890 | checkpointer=checkpointer, 891 | iterator_type="LATEST", 892 | sleep_time_no_records=0.5, 893 | ) as consumer: 894 | 895 | async for item in consumer: 896 | results.append(item) 897 | 898 | # Expect results as checkpointer resumed from prior sequence 899 | self.assertEquals(10, len(results)) 900 | 901 | async def test_producer_and_consumer_consume_multiple_shards_with_redis_checkpointer( 902 | self, 903 | ): 904 | stream_name = "test_{}".format(str(uuid.uuid4())[0:8]) 905 | async with Producer( 906 | stream_name=stream_name, 907 | endpoint_url=ENDPOINT_URL, 908 | create_stream=stream_name, 909 | create_stream_shards=2, 910 | ) as producer: 911 | 912 | for i in range(0, 100): 913 | await producer.put("test.{}".format(i)) 914 | 915 | await producer.flush() 916 | 917 | results = [] 918 | 919 | checkpointer = RedisCheckPointer( 920 | name="test-{}".format(str(uuid.uuid4())[0:8]), heartbeat_frequency=3 921 | ) 922 | 923 | async with Consumer( 924 | stream_name=stream_name, 925 | endpoint_url=ENDPOINT_URL, 926 | checkpointer=checkpointer, 927 | record_limit=10, 928 | ) as consumer: 929 | 930 | # consumer will stop if no msgs 931 | for i in range(0, 6): 932 | async for item in consumer: 933 | results.append(item) 934 | await asyncio.sleep(0.5) 935 | 936 | self.assertEquals(100, len(results)) 937 | 938 | checkpoints = checkpointer.get_all_checkpoints() 939 | 940 | self.assertEquals(2, len(checkpoints)) 941 | 942 | # Expect both shards to have been used/set 943 | for item in checkpoints.values(): 944 | self.assertIsNotNone(item) 945 | 946 | 947 | class AWSKinesisTests(BaseKinesisTests): 948 | """ 949 | AWS Kinesis Tests 950 | """ 951 | 952 | STREAM_NAME_SINGLE_SHARD = "pykinesis-test-single-shard" 953 | STREAM_NAME_MULTI_SHARD = "pykinesis-test-multi-shard" 954 | 955 | forbid_get_event_loop = True 956 | 957 | @classmethod 958 | def setUpClass(cls): 959 | if not TESTING_USE_AWS_KINESIS: 960 | return 961 | 962 | log.info( 963 | "Creating (or ignoring if exists) *Actual* Kinesis stream: {}".format( 964 | cls.STREAM_NAME_SINGLE_SHARD 965 | ) 966 | ) 967 | 968 | async def create(stream_name, shards): 969 | async with Producer(stream_name=stream_name, create_stream=True, create_stream_shards=shards) as producer: 970 | await producer.start() 971 | 972 | asyncio.run(create(stream_name=cls.STREAM_NAME_SINGLE_SHARD, shards=1)) 973 | 974 | @classmethod 975 | def tearDownClass(cls): 976 | if not TESTING_USE_AWS_KINESIS: 977 | return 978 | 979 | log.warning( 980 | "Don't forget to delete your $$ streams: {} and {}".format( 981 | cls.STREAM_NAME_SINGLE_SHARD, cls.STREAM_NAME_MULTI_SHARD 982 | ) 983 | ) 984 | 985 | @skipUnless( 986 | TESTING_USE_AWS_KINESIS, "Requires TESTING_USE_AWS_KINESIS flag to be set" 987 | ) 988 | async def test_consumer_checkpoint(self): 989 | 990 | checkpointer = MemoryCheckPointer(name="test") 991 | 992 | results = [] 993 | 994 | async with Producer( 995 | stream_name=self.STREAM_NAME_SINGLE_SHARD, 996 | processor=StringProcessor(), 997 | ) as producer: 998 | 999 | async with Consumer( 1000 | stream_name=self.STREAM_NAME_SINGLE_SHARD, 1001 | checkpointer=checkpointer, 1002 | processor=StringProcessor(), 
1003 | iterator_type="LATEST", 1004 | ) as consumer: 1005 | 1006 | # Manually start 1007 | await consumer.start_consumer() 1008 | 1009 | await producer.put("test") 1010 | 1011 | await producer.flush() 1012 | 1013 | for i in range(3): 1014 | async for item in consumer: 1015 | results.append(item) 1016 | 1017 | checkpoints = checkpointer.get_all_checkpoints() 1018 | 1019 | # Expect 1 as only 1 shard 1020 | self.assertEquals(1, len(checkpoints)) 1021 | 1022 | self.assertIsNotNone(checkpoints[list(checkpoints.keys())[0]]["sequence"]) 1023 | 1024 | self.assertListEqual(results, ["test"]) 1025 | 1026 | @skipUnless( 1027 | TESTING_USE_AWS_KINESIS, "Requires TESTING_USE_AWS_KINESIS flag to be set" 1028 | ) 1029 | async def test_consumer_consume_fetch_limit(self): 1030 | 1031 | async with Consumer( 1032 | stream_name=self.STREAM_NAME_SINGLE_SHARD, 1033 | sleep_time_no_records=0.0001, 1034 | shard_fetch_rate=500, 1035 | iterator_type="LATEST", 1036 | ) as consumer: 1037 | await consumer.start() 1038 | 1039 | # GetShardIterator has a limit of five transactions per second per account per open shard 1040 | 1041 | for i in range(0, 500): 1042 | await consumer.fetch() 1043 | # sleep 50ms 1044 | await asyncio.sleep(0.05) 1045 | 1046 | shard_stats = [s["stats"] for s in consumer.shards][0].to_data() 1047 | 1048 | self.assertTrue( 1049 | shard_stats["throttled"] > 0, msg="Expected to be throttled" 1050 | ) 1051 | 1052 | @skipUnless( 1053 | TESTING_USE_AWS_KINESIS, "Requires TESTING_USE_AWS_KINESIS flag to be set" 1054 | ) 1055 | async def test_producer_producer_limit(self): 1056 | # Expect some throughput errors 1057 | 1058 | async with Producer( 1059 | stream_name=self.STREAM_NAME_SINGLE_SHARD, 1060 | processor=StringProcessor(), 1061 | put_bandwidth_limit_per_shard=1500, 1062 | ) as producer: 1063 | 1064 | async with Consumer( 1065 | stream_name=self.STREAM_NAME_SINGLE_SHARD, 1066 | processor=StringProcessor(), 1067 | iterator_type="LATEST", 1068 | ) as consumer: 1069 | 1070 | await consumer.start_consumer() 1071 | 1072 | # Wait a bit just to be sure iterator is gonna get late 1073 | await asyncio.sleep(3) 1074 | 1075 | for x in range(20): 1076 | await producer.put(self.random_string(1024 * 250)) 1077 | 1078 | # todo: async timeout 1079 | output = [] 1080 | while len(output) < 20: 1081 | async for item in consumer: 1082 | output.append(item) 1083 | 1084 | self.assertEquals(len(output), 20) 1085 | self.assertTrue(producer.throughput_exceeded_count > 0) 1086 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py37,py38,black,mypy 3 | 4 | [testenv] 5 | passenv = TESTING_USE_AWS_KINESIS 6 | commands = 7 | nosetests [] 8 | 9 | deps = 10 | -r test-requirements.txt 11 | 12 | [testenv:black] 13 | deps = 14 | black 15 | commands = 16 | black --check --diff kinesis 17 | skip_install = true 18 | 19 | [testenv:mypy] 20 | deps = 21 | mypy 22 | commands = 23 | mypy kinesis --------------------------------------------------------------------------------
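For reference, a minimal end-to-end usage sketch based on the Producer and Consumer classes above. The stream name is a placeholder, and the endpoint URL assumes a local kinesalite instance as configured in tests.py / docker-compose.yaml; against real AWS Kinesis you would drop endpoint_url and rely on the usual botocore credentials.

import asyncio

from kinesis import Consumer, Producer


async def main():
    # Placeholder stream; create_stream provisions it (1 shard) if it does not exist yet.
    async with Producer(
        stream_name="example-stream",
        endpoint_url="http://localhost:4567",
        create_stream=True,
        create_stream_shards=1,
    ) as producer:
        # JsonProcessor is the default processor, so plain dicts are serialized as JSON.
        await producer.put({"event": "hello"})
        await producer.flush()

    async with Consumer(
        stream_name="example-stream",
        endpoint_url="http://localhost:4567",
    ) as consumer:
        # Default iterator_type is TRIM_HORIZON, so this reads from the start of the stream
        # and stops iterating once the internal queue stays empty.
        async for item in consumer:
            print(item)


if __name__ == "__main__":
    asyncio.run(main())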