├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── benchmark-requirements.txt ├── benchmark.py ├── docker-compose.yaml ├── docs ├── DESIGN.md ├── YETANOTHER.md └── benchmark.png ├── kinesis ├── __init__.py ├── aggregators.py ├── base.py ├── checkpointers.py ├── consumer.py ├── exceptions.py ├── processors.py ├── producer.py ├── serializers.py └── utils.py ├── mypy.ini ├── requirements.txt ├── setup.cfg ├── setup.py ├── test-requirements.txt ├── tests.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | *.egg-info 3 | .idea 4 | *.pyc 5 | build 6 | notes.txt 7 | deploy.sh 8 | temp.py 9 | .env 10 | .mypy_cache 11 | .tox 12 | release.sh 13 | test_* -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-slim 2 | 3 | RUN apt-get update && apt-get install -y gcc python-dev gettext-base 4 | 5 | RUN mkdir /app 6 | 7 | COPY requirements.txt /app/requirements.txt 8 | 9 | RUN pip install -r /app/requirements.txt 10 | 11 | COPY test-requirements.txt /app/test-requirements.txt 12 | 13 | RUN pip install -r /app/test-requirements.txt 14 | 15 | COPY kinesis /app/kinesis/ 16 | 17 | COPY tests.py /app/tests.py 18 | 19 | WORKDIR /app/ 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # async-kinesis 2 | 3 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black) [![PyPI version](https://badge.fury.io/py/async-kinesis.svg)](https://badge.fury.io/py/async-kinesis) [![Python 3.7](https://img.shields.io/badge/python-3.7-blue.svg)](https://www.python.org/downloads/release/python-370/) [![Python 3.8](https://img.shields.io/badge/python-3.8-blue.svg)](https://www.python.org/downloads/release/python-380/) 4 | 5 | ``` 6 | pip install async-kinesis 7 | ``` 8 | 9 | ## Features 10 | 11 | - uses queues for both producer and consumer 12 | - producer flushes with put_records() if has enough to flush or after "buffer_time" reached 13 | - consumer iterates over msg queue independent of shard readers 14 | - Configurable to handle Sharding limits but will throttle/retry if required 15 | - ie multiple independent clients are saturating the Shards 16 | - Checkpointing with heartbeats 17 | - deadlock + reallocation of shards if checkpoint fails to heartbeat within "session_timeout" 18 | - processors (aggregator + serializer) 19 | - json line delimited, msgpack 20 | 21 | 22 | See [docs/design](./docs/DESIGN.md) for more details. 23 | See [docs/yetanother](docs/YETANOTHER.md) as to why reinvent the wheel. 24 | 25 | ## Environment Variables 26 | 27 | As required by boto3 28 | 29 | ``` 30 | AWS_ACCESS_KEY_ID 31 | AWS_SECRET_ACCESS_KEY 32 | ``` 33 | 34 | ## Producer 35 | 36 | from kinesis import Producer 37 | 38 | async with Producer(stream_name="test") as producer: 39 | # Put item onto queue to be flushed via put_records() 40 | await producer.put({'my': 'data'}) 41 | 42 | 43 | Options: 44 | 45 | (comments in quotes are Kinesis Limits as per AWS Docs) 46 | 47 | | Arg | Default | Description | 48 | | --- | --- | --- | 49 | | session | None | AioSession (to use non default profile etc) | 50 | | region_name | None | AWS Region | 51 | | buffer_time | 0.5 | Buffer time in seconds before auto flushing records | 52 | | put_rate_limit_per_shard | 1000 | "A single shard can ingest up to 1 MiB of data per second (including partition keys) or 1,000 records per second for writes" | 53 | | put_bandwidth_limit_per_shard | 1024 | Kb per sec. max is 1024 per shard (ie 1 MiB). Keep below to minimize ProvisionedThroughputExceeded" errors * | 54 | | batch_size | 500 | "Each PutRecords request can support up to 500 records" | 55 | | max_queue_size | 10000 | put() method will block when queue is at max | 56 | | after_flush_fun | None | async function to call after doing a flush (err put_records()) call | 57 | | processor | JsonProcessor() | Record aggregator/serializer. Default is JSON without aggregation. Note this is highly inefficient as each record can be up to 1Mib | 58 | | retry_limit | None | How many connection attempts should be made before raising a exception | 59 | | expo_backoff | None | Exponential Backoff when connection attempt fails | 60 | | expo_backoff_limit | 120 | Max amount of seconds Exponential Backoff can grow | 61 | | create_stream | False | Creates a Kinesis Stream based on the `stream_name` keyword argument. Note if stream already existing it will ignore | 62 | | create_stream_shards | 1 | Sets the amount of shard you want for your new stream. Note if stream already existing it will ignore | 63 | 64 | * Throughput exceeded. 
The docs (for Java/KPL see: https://docs.aws.amazon.com/streams/latest/dev/kinesis-producer-adv-retries-rate-limiting.html) state:
65 |
66 | > You can lower this limit to reduce spamming due to excessive retries. However, the best practice is for each producer to retry for maximum throughput aggressively and to handle any resulting throttling determined as excessive by expanding the capacity of the stream and implementing an appropriate partition key strategy.
67 |
68 | Even though our default here is to limit at this threshold (1024 Kb), in reality the threshold seems lower (~80%).
69 | If you wish to avoid excessive throttling, or have multiple producers on a stream, you will want to set this quite a bit lower.
70 |
71 |
72 | ## Consumer
73 |
74 |     from kinesis import Consumer
75 |
76 |     async with Consumer(stream_name="test") as consumer:
77 |         while True:
78 |             async for item in consumer:
79 |                 print(item)
80 |             # caught up.. take a breather~
81 |
82 |
83 | Options:
84 |
85 | (comments in quotes are Kinesis Limits as per AWS Docs)
86 |
87 |
88 | | Arg | Default | Description |
89 | | --- | --- | --- |
90 | | session | None | AioSession (to use a non-default profile etc) |
91 | | region_name | None | AWS Region |
92 | | max_queue_size | 10000 | the fetch() task for a shard will block when the queue is at max |
93 | | max_shard_consumers | None | Max number of shards to use. None = all |
94 | | record_limit | 10000 | Number of records to fetch with get_records() |
95 | | sleep_time_no_records | 2 | Number of seconds to sleep when caught up |
96 | | iterator_type | TRIM_HORIZON | Default shard iterator type for new/unknown shards (ie start from start of stream). Alternatives are "LATEST" (ie end of stream) and "AT_TIMESTAMP" (ie a particular point in time, requires defining the `timestamp` arg) |
97 | | shard_fetch_rate | 1 | Number of fetches per second (max = 5). 1 is recommended as it allows having multiple consumers without hitting the max limit. |
98 | | checkpointer | MemoryCheckPointer() | Checkpointer to use |
99 | | processor | JsonProcessor() | Record aggregator/serializer. Must match the processor used by Producer() |
100 | | retry_limit | None | How many connection attempts should be made before raising an exception |
101 | | expo_backoff | None | Exponential Backoff when a connection attempt fails |
102 | | expo_backoff_limit | 120 | Max number of seconds Exponential Backoff can grow to |
103 | | create_stream | False | Creates a Kinesis Stream based on the `stream_name` keyword argument. Note: if the stream already exists this is ignored |
104 | | create_stream_shards | 1 | Sets the number of shards you want for your new stream. Note: if the stream already exists this is ignored |
105 | | timestamp | None | Timestamp to start reading the stream from. Used with iterator type "AT_TIMESTAMP" |
106 |
107 |
108 | ## Checkpointers
109 |
110 | - memory (the default, but kinda pointless)
111 |
112 | ```
113 | MemoryCheckPointer()
114 | ```
115 |
116 | - redis
117 |
118 | ```
119 | RedisCheckPointer(name, session_timeout=60, heartbeat_frequency=15, is_cluster=False)
120 | ```
121 |
122 | Requires ENV:
123 |
124 | ```
125 | REDIS_HOST
126 | ```
127 |
128 | Requires `pip install aredis`
129 |
130 |
131 | ## Processors (Aggregator + Serializer)
132 |
133 |
134 | Aggregation enables batching up multiple records to make more efficient use of the stream, for example:
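A minimal end-to-end sketch is shown below. It assumes a stream named "test" already exists (or is created by passing `create_stream=True`), and that the producer and consumer share the same processor instance, here a `JsonLineProcessor`, so several JSON records are packed into each Kinesis record:

```
import asyncio

from kinesis import Consumer, JsonLineProcessor, Producer


async def main():
    # Both sides must use the same processor so records are aggregated
    # and de-aggregated consistently.
    processor = JsonLineProcessor()

    async with Producer(stream_name="test", processor=processor) as producer:
        for i in range(100):
            # put() buffers items; they are flushed in aggregated batches via put_records()
            await producer.put({"msg": i})
        await producer.flush()

    async with Consumer(stream_name="test", processor=processor) as consumer:
        # Iterates until the consumer has caught up with the stream
        async for item in consumer:
            print(item)


asyncio.run(main())
```

Each `put()` item is serialized to JSON and newline-joined with the others until the aggregate approaches the record size limit, which is what makes it much cheaper than the default (non-aggregating) `JsonProcessor` for small items.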
135 | Refer https://aws.amazon.com/blogs/big-data/implementing-efficient-and-reliable-producers-with-the-amazon-kinesis-producer-library/ 136 | 137 | 138 | | Class | Aggregator | Serializer | Description | 139 | | --- | --- | --- | --- | 140 | | StringProcessor | SimpleAggregator | StringSerializer | Single String record | 141 | | JsonProcessor | SimpleAggregator | JsonSerializer | Single JSON record | 142 | | JsonLineProcessor | NewlineAggregator | JsonSerializer | Multiple JSON record separated by new line char | 143 | | JsonListProcessor | ListAggregator | JsonSerializer | Multiple JSON record returned by list | 144 | | MsgpackProcessor | NetstringAggregator | MsgpackSerializer | Multiple Msgpack record framed with Netstring Protocol (https://en.wikipedia.org/wiki/Netstring) | 145 | | KPLJsonProcessor | KPLAggregator | JsonSerializer | Multiple JSON record in a KPL Aggregated Record (https://github.com/awslabs/amazon-kinesis-producer/blob/master/aggregation-format.md) | 146 | | KPLStringProcessor | KPLAggregator | StringSerializer | Multiple String record in a KPL Aggregated Record (https://github.com/awslabs/amazon-kinesis-producer/blob/master/aggregation-format.md) | 147 | 148 | Note you can define your own processor easily as it's simply a class inheriting the Aggregator + Serializer. 149 | 150 | ``` 151 | class MsgpackProcessor(Processor, NetstringAggregator, MsgpackSerializer): 152 | pass 153 | ``` 154 | 155 | Just define a new Serializer class with serialize() and deserialize() methods. 156 | 157 | Note: 158 | 159 | * Json will use `pip install ujson` if installed 160 | * Msgpack requires `pip install msgpack` to be installed 161 | * KPL requires `pip install aws-kinesis-agg` to be installed 162 | 163 | ## Benchmark/Example 164 | 165 | See [benchmark.py](./benchmark.py) for code 166 | 167 | 50k items of approx 1k (python) in size, using single shard. 168 | 169 | ![Benchmark](docs/benchmark.png) 170 | 171 | 172 | ## Unit Testing 173 | 174 | Uses https://github.com/mhart/kinesalite for local testing. 175 | 176 | Run tests via docker 177 | 178 | ``` 179 | docker-compose up --abort-on-container-exit --exit-code-from test 180 | ``` 181 | 182 | For local testing use 183 | 184 | ``` 185 | docker-compose up kinesis redis 186 | ``` 187 | 188 | then within your virtualenv 189 | 190 | ``` 191 | nosetests 192 | 193 | # or run individual test 194 | nosetests tests.py:KinesisTests.test_create_stream_shard_limit_exceeded 195 | ``` 196 | 197 | Note there are a few test cases using the *actual* AWS Kinesis (AWSKinesisTests) 198 | These require setting an env in order to run 199 | 200 | Create an ".env" file with 201 | 202 | ``` 203 | TESTING_USE_AWS_KINESIS=1 204 | ``` 205 | 206 | Note you can ignore these tests if submitting PR unless core batching/processing behaviour is being changed. 207 | 208 | 209 | -------------------------------------------------------------------------------- /benchmark-requirements.txt: -------------------------------------------------------------------------------- 1 | mimesis==3.2.0 2 | humanize==0.5.1 3 | terminaltables==3.1.0 4 | coloredlogs==10.0 5 | contexttimer==0.3.3 6 | -------------------------------------------------------------------------------- /benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | pip install -r benchmark-requirements.txt 3 | 4 | Note: This will create a shard called "test" on your AWS. 5 | Your responsibility to delete it afterwards!! 
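For example, when you are done you can remove it with the AWS CLI:

    aws kinesis delete-stream --stream-name test

AWS credentials are expected via AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY, as per the README.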
6 | 7 | 8 | """ 9 | import asyncio 10 | import math 11 | import logging 12 | import coloredlogs 13 | import copy 14 | import itertools 15 | import sys 16 | import humanize 17 | from terminaltables import AsciiTable 18 | from contexttimer import Timer 19 | from kinesis import ( 20 | Producer, 21 | Consumer, 22 | JsonProcessor, 23 | JsonLineProcessor, 24 | MsgpackProcessor, 25 | ) 26 | from mimesis import Person, Address, Datetime 27 | 28 | coloredlogs.install(level="DEBUG") 29 | 30 | logging.getLogger("botocore").setLevel(logging.WARNING) 31 | logging.getLogger("kinesis.consumer").setLevel(logging.DEBUG) 32 | logging.getLogger("kinesis.checkpointers").setLevel(logging.INFO) 33 | 34 | log = logging.getLogger(__name__) 35 | 36 | 37 | def generate_random_data(): 38 | p = Person() 39 | a = Address() 40 | 41 | return { 42 | "name": p.full_name(), 43 | "email": p.email(), 44 | "nationality": p.nationality(), 45 | "occupation": p.occupation(), 46 | "password": p.password(), 47 | "phone": p.telephone(), 48 | "address": a.address(), 49 | "city": a.city(), 50 | "street_no": a.street_number(), 51 | "created": Datetime().date().isoformat(), 52 | } 53 | 54 | 55 | def generate_dataset(n): 56 | return [generate_random_data() for _ in range(n)] 57 | 58 | 59 | def copy_dataset(data, n): 60 | return list(itertools.chain.from_iterable([copy.copy(data) for _ in range(n)])) 61 | 62 | 63 | async def test_producer(data, processor): 64 | log.info("Testing with {}".format(processor.__class__.__name__)) 65 | async with Producer( 66 | stream_name="test", processor=processor, max_queue_size=100000 67 | ) as producer: 68 | 69 | await producer.create_stream(shards=1, ignore_exists=True) 70 | 71 | async with Consumer( 72 | stream_name="test", 73 | processor=processor, 74 | max_queue_size=100000, 75 | iterator_type="LATEST", 76 | ) as consumer: 77 | 78 | # ensure set up before producer puts records as using LATEST 79 | await consumer.start_consumer(wait_iterations=0) 80 | 81 | with Timer() as t: 82 | for item in data: 83 | await producer.put(item) 84 | await producer.flush() 85 | 86 | total = 0 87 | while total < len(data): 88 | async for _ in consumer: 89 | total += 1 90 | 91 | if len(data) != total: 92 | log.error( 93 | "Failed to read all records.. 
expected {} read {}".format(len(data), total) 94 | ) 95 | return False, None 96 | 97 | log.info( 98 | "Completed {} records (read: {}) in {} seconds".format( 99 | len(data), total, round(t.elapsed, 2) 100 | ) 101 | ) 102 | 103 | return True, round(t.elapsed, 2) 104 | 105 | 106 | async def test(): 107 | n = 50000 108 | 109 | data = generate_dataset(500) 110 | 111 | multiplier = math.ceil(n / 500) 112 | 113 | python_bytes = sum([sys.getsizeof(x) for x in data]) * multiplier 114 | 115 | result = [] 116 | 117 | for processor in [JsonProcessor(), JsonLineProcessor(), MsgpackProcessor()]: 118 | 119 | all_data = copy_dataset(data, multiplier) 120 | 121 | aggregator_bytes = 0 122 | for x in data: 123 | for size, _, _ in processor.add_item(x): 124 | aggregator_bytes += size 125 | 126 | if processor.has_items(): 127 | for size, _, _ in processor.get_items(): 128 | aggregator_bytes += size 129 | 130 | aggregator_bytes *= multiplier 131 | 132 | success, elapsed_ts = await test_producer(data=all_data, processor=processor) 133 | 134 | if success: 135 | result.append( 136 | [ 137 | processor.__class__.__name__, 138 | humanize.naturalsize(python_bytes), 139 | humanize.naturalsize(aggregator_bytes), 140 | elapsed_ts, 141 | round(n / elapsed_ts), 142 | humanize.naturalsize(python_bytes / elapsed_ts), 143 | humanize.naturalsize(aggregator_bytes / elapsed_ts), 144 | ] 145 | ) 146 | 147 | # Pause a bit 148 | await asyncio.sleep(2) 149 | 150 | print("\n\n Results for {} records:\n".format(n)) 151 | print( 152 | AsciiTable( 153 | [ 154 | [ 155 | "Aggregator", 156 | "Python Bytes", 157 | "Kinesis Bytes", 158 | "Time (Seconds)", 159 | "RPS", 160 | "Python BPS", 161 | "Kinesis BPS", 162 | ] 163 | ] 164 | + result 165 | ).table 166 | ) 167 | print("\n") 168 | 169 | 170 | asyncio.run(test()) 171 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.3' 2 | services: 3 | 4 | kinesis: 5 | image: vsouza/kinesis-local:latest 6 | command: --port 4567 --shardLimit 10000 7 | restart: always 8 | ports: 9 | - 4567:4567 10 | redis: 11 | image: redis:latest 12 | restart: always 13 | ports: 14 | - 16379:6379 15 | 16 | test: 17 | container_name: async-kinesis-test 18 | command: ['nosetests'] 19 | volumes: 20 | - ./tests.py:/app/tests.py 21 | - ./kinesis:/app/kinesis 22 | build: 23 | context: . 
24 | dockerfile: Dockerfile 25 | environment: 26 | - AWS_DEFAULT_REGION=ap-southeast-2 27 | - ENDPOINT_URL=http://kinesis:4567 28 | - REDIS_HOST=redis 29 | - REDIS_PORT=6379 30 | - AWS_ACCESS_KEY_ID= 31 | - AWS_SECRET_ACCESS_KEY= 32 | links: 33 | - kinesis:kinesis 34 | - redis:redis 35 | -------------------------------------------------------------------------------- /docs/DESIGN.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Consumer Design 4 | 5 | (Bears some explanation, kinda complex~) 6 | 7 | - fetch() gets called periodically (0.2 sec (ie max 5x per second as is the limit on shard get_records())) 8 | - iterate over the list of shards (set on startup, does not currently detect resharding) 9 | - assign shard if not in use and not at "max_shard_consumers" limit otherwise ignore/continue 10 | - ignore/continue if this shard is still fetching 11 | - process records if shard is done fetching 12 | - put records on queue 13 | - add checkpoint record to queue 14 | - assign NextShardIterator 15 | - create (get_records()) task again 16 | 17 | Note that get_records() is throttled via "shard_fetch_rate=5" (ie the same 0.2 sec/ 5x limit) 18 | 19 | This pattern seemed like the easiest way to maintain a pool of consumers without needing to think too hard about starting it's next job or handling new shards etc. 20 | 21 | 22 | See also 23 | 24 | https://aws.amazon.com/blogs/big-data/implementing-efficient-and-reliable-producers-with-the-amazon-kinesis-producer-library/ 25 | 26 | -------------------------------------------------------------------------------- /docs/YETANOTHER.md: -------------------------------------------------------------------------------- 1 | 2 | ## Yet another Python Kinesis Library? 3 | 4 | Sadly I had issues with every other library I could find :( 5 | 6 | * https://github.com/NerdWalletOSS/kinesis-python 7 | * pro: 8 | * kinda works 9 | * con 10 | * threaded 11 | * Outstanding PR to fix some issues 12 | * checkpoints on every record on main thread 13 | 14 | * https://github.com/ungikim/kinsumer 15 | * pro: 16 | * handles shard changes 17 | * no producer 18 | * no redis checkpointer/heartbeat 19 | * threaded/seems kinda complicated~ 20 | * con 21 | * consumer only 22 | 23 | * https://github.com/bufferapp/kiner 24 | * pro: 25 | * Batching 26 | * con 27 | * Producer only 28 | 29 | * https://github.com/niklio/aiokinesis 30 | * pro: 31 | * asyncio 32 | * no checkpointing 33 | * con 34 | * limited to 1 shard / too simplistic 35 | 36 | * https://github.com/ticketea/pynesis 37 | * pro: 38 | * checkpoints 39 | * con 40 | * hasn't been updated for 1 year 41 | * doesnt use put_records() 42 | * single threaded / round robin reads shards 43 | 44 | * https://github.com/whale2/async-kinesis-client 45 | * pro: 46 | * checkpoints 47 | * asyncio 48 | * con 49 | * ? 50 | 51 | (Actually I only found this one recently, might be ok alternative?) 
52 | -------------------------------------------------------------------------------- /docs/benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hampsterx/async-kinesis/d07efb2bcc6c4963e77b524c0eed1a250d3db401/docs/benchmark.png -------------------------------------------------------------------------------- /kinesis/__init__.py: -------------------------------------------------------------------------------- 1 | from .producer import Producer 2 | from .consumer import Consumer 3 | from .processors import ( 4 | StringProcessor, 5 | JsonProcessor, 6 | JsonLineProcessor, 7 | JsonListProcessor, 8 | MsgpackProcessor, 9 | ) 10 | from .serializers import StringSerializer, JsonSerializer, MsgpackSerializer 11 | from .checkpointers import MemoryCheckPointer, RedisCheckPointer 12 | from .aggregators import ( 13 | SimpleAggregator, 14 | NewlineAggregator, 15 | NetstringAggregator, 16 | ListAggregator, 17 | ) 18 | -------------------------------------------------------------------------------- /kinesis/aggregators.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | from collections import namedtuple 4 | from .exceptions import ValidationError 5 | from .exceptions import ExceededPutLimit 6 | 7 | try: 8 | import aws_kinesis_agg 9 | import aws_kinesis_agg.aggregator 10 | import aws_kinesis_agg.kpl_pb2 11 | except ModuleNotFoundError: 12 | pass 13 | 14 | log = logging.getLogger(__name__) 15 | 16 | OutputItem = namedtuple("OutputItem", ["size", "n", "data"]) 17 | 18 | 19 | class BaseAggregator: 20 | def __init__(self, max_size=None): 21 | 22 | if not max_size: 23 | max_size = 1024 24 | 25 | put_units = math.floor(max_size / 25) 26 | 27 | if put_units <= 0: 28 | raise ValidationError( 29 | "max_size is too low. Should be at least one PUT Payload Unit (25Kb)" 30 | ) 31 | 32 | if put_units > 40: 33 | raise ValidationError( 34 | "max_size is too high. 
Should be no higher than 40x PUT Payload Units (25Kb)" 35 | ) 36 | 37 | self.max_bytes = put_units * 25 * 1024 38 | 39 | log.debug( 40 | "setting max_bytes to {} ({} PUT Payload Units (25kb))".format( 41 | self.max_bytes, put_units 42 | ) 43 | ) 44 | 45 | self.buffer = [] 46 | self.size = 0 47 | 48 | def validate_size(self, size): 49 | if size > self.max_bytes: 50 | raise ExceededPutLimit("Put of {} bytes exceeded 1MB limit".format(size)) 51 | 52 | def parse(self, data): 53 | yield self.deserialize(data) 54 | 55 | 56 | class SimpleAggregator(BaseAggregator): 57 | """ 58 | Simple Aggregator (Does NOT aggregate) 59 | Sends a single record only (high inefficient) 60 | """ 61 | 62 | def has_items(self): 63 | return False 64 | 65 | def add_item(self, item): 66 | output = self.serialize(item) 67 | size = len(output) 68 | 69 | self.validate_size(size) 70 | 71 | yield OutputItem(size=size, n=1, data=output) 72 | 73 | 74 | class Aggregator(BaseAggregator): 75 | """ 76 | Aggregator 77 | Sends an aggregated record 78 | """ 79 | 80 | def has_items(self): 81 | return self.size > 0 82 | 83 | def get_header_size(self, data): 84 | raise NotImplementedError() 85 | 86 | def add_item(self, item): 87 | output = self.serialize(item) 88 | size = len(output) 89 | 90 | self.validate_size(size) 91 | 92 | header_size = self.get_header_size(output) 93 | 94 | if size + self.size + header_size < self.max_bytes: 95 | 96 | self.buffer.append((size, output)) 97 | self.size += size + header_size 98 | 99 | else: 100 | log.debug( 101 | "Yielding item to queue with {} individual records with size of {} kb".format( 102 | len(self.buffer), round(self.size / 1024) 103 | ) 104 | ) 105 | yield OutputItem(size=self.size, n=len(self.buffer), data=self.output()) 106 | self.buffer = [(size, output)] 107 | self.size = size 108 | 109 | log.debug("Adding item to queue with size of {} kb".format(round(size / 1024))) 110 | 111 | def get_items(self): 112 | log.debug( 113 | "Yielding (final) item to queue with {} individual records with size of {} kb".format( 114 | len(self.buffer), round(self.size / 1024) 115 | ) 116 | ) 117 | yield OutputItem(size=self.size, n=len(self.buffer), data=self.output()) 118 | self.buffer = [] 119 | self.size = 0 120 | 121 | 122 | class NewlineAggregator(Aggregator): 123 | def get_header_size(self, output): 124 | return 1 125 | 126 | def output(self): 127 | return b"\n".join([x[1] for x in self.buffer] + [b""]) 128 | 129 | def parse(self, data): 130 | for row in data.split(b"\n"): 131 | if row: 132 | yield self.deserialize(row) 133 | 134 | 135 | class ListAggregator(Aggregator): 136 | def get_header_size(self, output): 137 | return 1 138 | 139 | def output(self): 140 | return self.serialize([self.deserialize(x[1]) for x in self.buffer]) 141 | 142 | def parse(self, data): 143 | yield self.deserialize(data) 144 | 145 | 146 | class NetstringAggregator(Aggregator): 147 | """ 148 | Netstring Aggregation 149 | Framing = {x} bytes (ascii int for size) + 1 byte (":") + data + trailing "," 150 | See: https://en.wikipedia.org/wiki/Netstring 151 | """ 152 | 153 | def get_header_size(self, output): 154 | return len(str(len(output))) + 2 155 | 156 | def output(self): 157 | frame = [] 158 | 159 | for size, data in self.buffer: 160 | frame.append(str(size).encode("ascii")) 161 | frame.append(b":") 162 | frame.append(data) 163 | frame.append(b",") 164 | 165 | return b"".join(frame) 166 | 167 | def parse(self, data): 168 | 169 | i = 0 170 | length = len(data) 171 | 172 | while True: 173 | header_offset = data[i:].index(b":") 174 
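            # e.g. for data == b"5:hello,": header_offset is 1 (the position of b":"),
            # size becomes 5 and item is data[2:7] == b"hello"; i then advances by
            # header_offset + size + 2 (skipping the b":" and trailing b",") to 8 == len(data).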
| size = int(data[i : i + header_offset].decode("ascii")) 175 | item = data[i + header_offset + 1 : i + header_offset + 1 + size] 176 | yield self.deserialize(item) 177 | 178 | i += header_offset + size + 2 179 | if i == length: 180 | break 181 | 182 | 183 | class KPLAggregator(Aggregator): 184 | """ 185 | KPL Aggregated Record Aggregation 186 | See: https://github.com/awslabs/kinesis-aggregation/tree/master/python 187 | """ 188 | 189 | def __init__(self, max_size=None): 190 | if max_size: 191 | self.agg = aws_kinesis_agg.aggregator.RecordAggregator(max_size=max_size) 192 | else: 193 | self.agg = aws_kinesis_agg.aggregator.RecordAggregator() 194 | 195 | def has_items(self): 196 | return self.agg.get_num_user_records() > 0 197 | 198 | def add_item(self, item): 199 | output = self.serialize(item) 200 | record = self.agg.add_user_record("a", output) 201 | self.size = self.agg.get_num_user_records() 202 | if record: 203 | size = record.get_size_bytes() 204 | n = record.get_num_user_records() 205 | partition_key, explicit_hash_key, data = record.get_contents() 206 | yield OutputItem(size=size, n=n, data=data) 207 | 208 | def get_items(self): 209 | record = self.agg.clear_and_get() 210 | if record: 211 | size = record.get_size_bytes() 212 | n = record.get_num_user_records() 213 | partition_key, explicit_hash_key, data = record.get_contents() 214 | yield OutputItem(size=size, n=n, data=data) 215 | 216 | def parse(self, data): 217 | message_data = data[len(aws_kinesis_agg.MAGIC) : -aws_kinesis_agg.DIGEST_SIZE] 218 | ar = aws_kinesis_agg.kpl_pb2.AggregatedRecord() 219 | ar.ParseFromString(message_data) 220 | for record in ar.records: 221 | yield self.deserialize(record.data) 222 | -------------------------------------------------------------------------------- /kinesis/base.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from async_timeout import timeout 4 | from aiobotocore.session import AioSession 5 | from asyncio import CancelledError 6 | from botocore.exceptions import ClientError 7 | from botocore.config import Config 8 | import time 9 | 10 | from . 
import exceptions 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | 15 | class Base: 16 | def __init__( 17 | self, 18 | stream_name, 19 | session=None, 20 | endpoint_url=None, 21 | region_name=None, 22 | retry_limit=None, 23 | expo_backoff=None, 24 | expo_backoff_limit=120, 25 | skip_describe_stream=False, 26 | create_stream=False, 27 | create_stream_shards=1, 28 | ): 29 | 30 | self.stream_name = stream_name 31 | 32 | if session: 33 | assert isinstance(session, AioSession) 34 | self.session = session 35 | else: 36 | self.session = AioSession() 37 | 38 | self.endpoint_url = endpoint_url 39 | self.region_name = region_name 40 | 41 | self.client = None 42 | self.shards = None 43 | 44 | self.stream_status = None 45 | 46 | self.retry_limit = retry_limit 47 | self.expo_backoff = expo_backoff 48 | self.expo_backoff_limit = expo_backoff_limit 49 | 50 | # connection states of kinesis client 51 | self.RECONNECT = "RECONNECT" 52 | self.ACTIVE = "ACTIVE" 53 | self.INITIALIZE = "INITIALIZE" 54 | 55 | self.stream_status = self.INITIALIZE 56 | # Short Lived producer might want to skip describing stream on startup 57 | self.skip_describe_stream = skip_describe_stream 58 | self._conn_lock = asyncio.Lock() 59 | self._reconnect_timeout = time.monotonic() 60 | self.create_stream = create_stream 61 | self.create_stream_shards = create_stream_shards 62 | 63 | async def __aenter__(self): 64 | 65 | log.info( 66 | "creating client with {}".format( 67 | self.endpoint_url if self.endpoint_url else "AWS default endpoint" 68 | ) 69 | ) 70 | 71 | try: 72 | await self.get_conn() 73 | except exceptions.StreamDoesNotExist: 74 | await self.close() 75 | raise 76 | except: 77 | raise 78 | 79 | return self 80 | 81 | async def __aexit__(self, exc_type, exc, tb): 82 | await self.close() 83 | await self.client.__aexit__(exc_type, exc, tb) 84 | 85 | async def get_client(self): 86 | 87 | # Note: max_attempts = 0 88 | # Boto RetryHandler only handles these errors: 89 | # GENERAL_CONNECTION_ERROR => ConnectionError, ConnectionClosedError, ReadTimeoutError, EndpointConnectionError 90 | # Still have to handle ClientError anyway~ 91 | 92 | self.client = await self.session.create_client( 93 | "kinesis", 94 | endpoint_url=self.endpoint_url, 95 | region_name=self.region_name, 96 | config=Config( 97 | connect_timeout=5, read_timeout=90, retries={"max_attempts": 0} 98 | ), 99 | ).__aenter__() 100 | 101 | async def get_stream_description(self): 102 | 103 | try: 104 | return (await self.client.describe_stream(StreamName=self.stream_name))[ 105 | "StreamDescription" 106 | ] 107 | except ClientError as err: 108 | code = err.response["Error"]["Code"] 109 | if code == "ResourceNotFoundException": 110 | raise exceptions.StreamDoesNotExist( 111 | "Stream '{}' does not exist".format(self.stream_name) 112 | ) from None 113 | raise 114 | 115 | async def start(self): 116 | 117 | await self.get_client() 118 | 119 | if self.create_stream: 120 | await self._create_stream() 121 | self.create_stream = False 122 | 123 | if self.skip_describe_stream: 124 | log.debug( 125 | "Skipping Describe stream '{}'. 
Assuming it exists..".format( 126 | self.stream_name 127 | ) 128 | ) 129 | self.shards = [] 130 | 131 | log.debug("Checking stream '{}' is active".format(self.stream_name)) 132 | 133 | async with timeout(60) as cm: 134 | try: 135 | while True: 136 | stream_info = await self.get_stream_description() 137 | stream_status = stream_info["StreamStatus"] 138 | 139 | if stream_status == self.ACTIVE: 140 | self.stream_status = stream_status 141 | break 142 | 143 | if stream_status in ["CREATING", "UPDATING"]: 144 | await asyncio.sleep(0.25) 145 | 146 | else: 147 | raise exceptions.StreamStatusInvalid( 148 | "Stream '{}' is {}".format(self.stream_name, stream_status) 149 | ) 150 | except CancelledError: 151 | pass 152 | 153 | else: 154 | self.shards = stream_info["Shards"] 155 | 156 | if cm.expired: 157 | raise exceptions.StreamStatusInvalid( 158 | "Stream '{}' is still {}".format(self.stream_name, stream_status) 159 | ) 160 | 161 | async def close(self): 162 | raise NotImplementedError 163 | 164 | async def get_conn(self): 165 | 166 | async with self._conn_lock: 167 | 168 | log.debug( 169 | f"Get Connection (stream name: {self.stream_name}), stream status: {self.stream_status})" 170 | ) 171 | 172 | if self.stream_status == self.INITIALIZE: 173 | try: 174 | await self.start() 175 | log.info(f"Connection Successfully Initialized") 176 | except exceptions.StreamDoesNotExist: 177 | # Do not attempt to reconnect if stream does not exist 178 | log.error(f"Stream does not exist ({self.stream_name})") 179 | raise 180 | except Exception as e: 181 | log.warning(f"Connection Failed to Initialize : {e.__class__} {e}") 182 | await self._get_reconn_helper() 183 | elif ( 184 | self.stream_status == self.ACTIVE 185 | and (time.monotonic() - self._reconnect_timeout) > 120 186 | ): 187 | # reconnect_timeout is a Lock so a new connection is not created immediately 188 | # after a successfully reconnection has been made since self.start() sets self.stream_status = "ACTIVE" 189 | # immediately after a successful reconnect. 190 | # Based on testing a hardcode 120 seconds backoff is best since, there could be a lot of pending 191 | # coroutines reattempting the connection when the client connection it's already healthy. 192 | await self._get_reconn_helper() 193 | 194 | async def _get_reconn_helper(self): 195 | # Logic used to reconnect to connect to kinesis if there is a error 196 | 197 | self.stream_status = self.RECONNECT 198 | backoff_delay = 5 199 | conn_attempts = 1 200 | await self.close() 201 | while True: 202 | self._reconnect_timeout = time.monotonic() 203 | try: 204 | log.warning( 205 | f"Connection Error. Rebuilding connection. Sleeping for {backoff_delay} seconds. 
Reconnection Attempt: {conn_attempts}" 206 | ) 207 | await asyncio.sleep(backoff_delay) 208 | await self.start() 209 | log.warning( 210 | f"Connection Reestablished After {conn_attempts} and Sleeping for {backoff_delay}" 211 | ) 212 | break 213 | except Exception as e: 214 | if isinstance(e, exceptions.StreamDoesNotExist): 215 | raise e 216 | log.warning(e) 217 | conn_attempts += 1 218 | if isinstance(self.retry_limit, int): 219 | if conn_attempts >= (self.retry_limit + 1): 220 | await self.close() 221 | raise ConnectionError( 222 | f"Kinesis client has exceeded {self.retry_limit} connection attempts" 223 | ) 224 | if self.expo_backoff: 225 | backoff_delay = (conn_attempts ** 2) * self.expo_backoff 226 | if backoff_delay >= self.expo_backoff_limit: 227 | backoff_delay = self.expo_backoff_limit 228 | await self.close() 229 | 230 | async def _create_stream(self, ignore_exists=True): 231 | 232 | log.debug( 233 | "Creating (or ignoring) stream {} with {} shards".format( 234 | self.stream_name, self.create_stream_shards 235 | ) 236 | ) 237 | 238 | if self.create_stream_shards < 1: 239 | raise Exception("Min shard count is one") 240 | 241 | try: 242 | await self.client.create_stream( 243 | StreamName=self.stream_name, ShardCount=self.create_stream_shards 244 | ) 245 | except ClientError as err: 246 | code = err.response["Error"]["Code"] 247 | 248 | if code == "ResourceInUseException": 249 | if not ignore_exists: 250 | raise exceptions.StreamExists( 251 | "Stream '{}' exists, cannot create it".format(self.stream_name) 252 | ) from None 253 | elif code == "LimitExceededException": 254 | raise exceptions.StreamShardLimit( 255 | "Stream '{}' exceeded shard limit".format(self.stream_name) 256 | ) 257 | else: 258 | raise 259 | -------------------------------------------------------------------------------- /kinesis/checkpointers.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import asyncio 3 | import os 4 | import json 5 | from datetime import timezone, datetime 6 | 7 | log = logging.getLogger(__name__) 8 | 9 | 10 | class BaseCheckPointer: 11 | def __init__(self, name="", id=None): 12 | self._id = id if id else os.getpid() 13 | self._name = name 14 | self._items = {} 15 | 16 | def get_id(self): 17 | return self._id 18 | 19 | def get_ref(self): 20 | return "{}/{}".format(self._name, self._id) 21 | 22 | def get_all_checkpoints(self): 23 | return self._items.copy() 24 | 25 | def get_checkpoint(self, shard_id): 26 | return self._items.get(shard_id) 27 | 28 | async def close(self): 29 | log.info("{} stopping..".format(self.get_ref())) 30 | await asyncio.gather( 31 | *[self.deallocate(shard_id) for shard_id in self._items.keys()] 32 | ) 33 | 34 | def is_allocated(self, shard_id): 35 | return shard_id in self._items 36 | 37 | 38 | class BaseHeartbeatCheckPointer(BaseCheckPointer): 39 | def __init__( 40 | self, 41 | name, 42 | id=None, 43 | session_timeout=60, 44 | heartbeat_frequency=15, 45 | auto_checkpoint=True, 46 | ): 47 | super().__init__(name=name, id=id) 48 | 49 | self.session_timeout = session_timeout 50 | self.heartbeat_frequency = heartbeat_frequency 51 | self.auto_checkpoint = auto_checkpoint 52 | self._manual_checkpoints = {} 53 | 54 | self.heartbeat_task = asyncio.Task(self.heartbeat()) 55 | 56 | async def close(self): 57 | log.debug("Cancelling heartbeat task..") 58 | self.heartbeat_task.cancel() 59 | 60 | await super().close() 61 | 62 | async def heartbeat(self): 63 | while True: 64 | await asyncio.sleep(self.heartbeat_frequency) 65 
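            # Refresh {ref, ts, sequence} for every allocated shard; allocate() on another
            # consumer only takes over a shard once the stored ts is older than session_timeout.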
| 66 | # todo: don't heartbeat if checkpoint already updated it recently 67 | for shard_id, sequence in self._items.items(): 68 | key = self.get_key(shard_id) 69 | val = {"ref": self.get_ref(), "ts": self.get_ts(), "sequence": sequence} 70 | log.debug("Heartbeating {}@{}".format(shard_id, sequence)) 71 | await self.do_heartbeat(key, val) 72 | 73 | 74 | class MemoryCheckPointer(BaseCheckPointer): 75 | async def deallocate(self, shard_id): 76 | log.info( 77 | "{} deallocated on {}@{}".format( 78 | self.get_ref(), shard_id, self._items[shard_id] 79 | ) 80 | ) 81 | self._items[shard_id]["active"] = False 82 | 83 | def is_allocated(self, shard_id): 84 | return shard_id in self._items and self._items[shard_id]["active"] 85 | 86 | async def allocate(self, shard_id): 87 | if shard_id not in self._items: 88 | self._items[shard_id] = {"sequence": None} 89 | 90 | self._items[shard_id]["active"] = True 91 | 92 | return True, self._items[shard_id]["sequence"] 93 | 94 | async def checkpoint(self, shard_id, sequence): 95 | log.debug( 96 | "{} checkpointed on {} @ {}".format(self.get_ref(), shard_id, sequence) 97 | ) 98 | self._items[shard_id]["sequence"] = sequence 99 | 100 | 101 | class RedisCheckPointer(BaseHeartbeatCheckPointer): 102 | def __init__( 103 | self, 104 | name, 105 | id=None, 106 | session_timeout=60, 107 | heartbeat_frequency=15, 108 | is_cluster=False, 109 | auto_checkpoint=True, 110 | ): 111 | super().__init__( 112 | name=name, 113 | id=id, 114 | session_timeout=session_timeout, 115 | heartbeat_frequency=heartbeat_frequency, 116 | auto_checkpoint=auto_checkpoint, 117 | ) 118 | 119 | if is_cluster: 120 | from aredis import StrictRedisCluster as Redis 121 | else: 122 | from aredis import StrictRedis as Redis 123 | 124 | params = { 125 | "host": os.environ.get("REDIS_HOST", "localhost"), 126 | "port": int(os.environ.get("REDIS_PORT", "6379")), 127 | "password": os.environ.get("REDIS_PASSWORD"), 128 | } 129 | 130 | if not is_cluster: 131 | db = int(os.environ.get("REDIS_DB", 0)) 132 | if db > 0: 133 | params["db"] = db 134 | else: 135 | params["skip_full_coverage_check"] = True 136 | 137 | self.client = Redis(**params) 138 | 139 | async def do_heartbeat(self, key, value): 140 | await self.client.set(key, json.dumps(value)) 141 | 142 | def get_key(self, shard_id): 143 | return "pyredis-{}-{}".format(self._name, shard_id) 144 | 145 | def get_ts(self): 146 | return round(int(datetime.now(tz=timezone.utc).timestamp())) 147 | 148 | async def checkpoint(self, shard_id, sequence): 149 | 150 | if not self.auto_checkpoint: 151 | log.debug( 152 | "{} updated manual checkpoint {}@{}".format( 153 | self.get_ref(), shard_id, sequence 154 | ) 155 | ) 156 | self._manual_checkpoints[shard_id] = sequence 157 | return 158 | 159 | await self._checkpoint(shard_id, sequence) 160 | 161 | async def manual_checkpoint(self): 162 | items = [(k, v) for k, v in self._manual_checkpoints.items()] 163 | 164 | self._manual_checkpoints = {} 165 | 166 | for shard_id, sequence in items: 167 | await self._checkpoint(shard_id, sequence) 168 | 169 | async def _checkpoint(self, shard_id, sequence): 170 | 171 | key = self.get_key(shard_id) 172 | 173 | val = {"ref": self.get_ref(), "ts": self.get_ts(), "sequence": sequence} 174 | 175 | previous_val = await self.client.getset(key, json.dumps(val)) 176 | previous_val = json.loads(previous_val) if previous_val else None 177 | 178 | if not previous_val: 179 | raise NotImplementedError( 180 | "{} checkpointed on {} but key did not exist?".format( 181 | self.get_ref(), shard_id 182 | ) 
183 | ) 184 | 185 | if previous_val["ref"] != self.get_ref(): 186 | raise NotImplementedError( 187 | "{} checkpointed on {} but ref is different {}".format( 188 | self.get_ref(), shard_id, val["ref"] 189 | ) 190 | ) 191 | 192 | log.debug("{} checkpointed on {}@{}".format(self.get_ref(), shard_id, sequence)) 193 | self._items[shard_id] = sequence 194 | 195 | async def deallocate(self, shard_id): 196 | 197 | key = self.get_key(shard_id) 198 | 199 | val = {"ref": None, "ts": None, "sequence": self._items[shard_id]} 200 | 201 | await self.client.set(key, json.dumps(val)) 202 | 203 | log.info( 204 | "{} deallocated on {}@{}".format( 205 | self.get_ref(), shard_id, self._items[shard_id] 206 | ) 207 | ) 208 | 209 | self._items.pop(shard_id) 210 | 211 | async def allocate(self, shard_id): 212 | 213 | key = self.get_key(shard_id) 214 | 215 | ts = self.get_ts() 216 | 217 | # try to set lock 218 | success = await self.client.set( 219 | key, 220 | json.dumps({"ref": self.get_ref(), "ts": ts, "sequence": None}), 221 | nx=True, 222 | ) 223 | 224 | val = await self.client.get(key) 225 | val = json.loads(val) if val else None 226 | 227 | original_ts = val["ts"] 228 | 229 | if success: 230 | log.info( 231 | "{} allocated {} (new checkpoint)".format(self.get_ref(), shard_id) 232 | ) 233 | self._items[shard_id] = None 234 | return True, None 235 | 236 | if val["ts"]: 237 | 238 | log.info( 239 | "{} could not allocate {}, still in use by {}".format( 240 | self.get_ref(), shard_id, val["ref"] 241 | ) 242 | ) 243 | 244 | # Wait a bit before carrying on to avoid spamming ourselves 245 | await asyncio.sleep(1) 246 | 247 | age = ts - original_ts 248 | 249 | # still alive? 250 | if age < self.session_timeout: 251 | return False, None 252 | 253 | log.info( 254 | "Attempting to take lock as {} is {} seconds over due..".format( 255 | val["ref"], age - self.session_timeout 256 | ) 257 | ) 258 | 259 | val["ref"] = self.get_ref() 260 | val["ts"] = ts 261 | 262 | previous_val = await self.client.getset(key, json.dumps(val)) 263 | previous_val = json.loads(previous_val) if previous_val else None 264 | 265 | if previous_val["ts"] != original_ts: 266 | log.info("{} beat me to the lock..".format(previous_val["ref"])) 267 | return False, None 268 | 269 | log.info( 270 | "{} allocating {}@{}".format(self.get_ref(), shard_id, val["sequence"]) 271 | ) 272 | 273 | self._items[shard_id] = val["sequence"] 274 | 275 | return True, val["sequence"] 276 | -------------------------------------------------------------------------------- /kinesis/consumer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from datetime import datetime, timezone 4 | from aiohttp import ClientConnectionError 5 | from asyncio import TimeoutError 6 | from asyncio.queues import QueueEmpty 7 | from botocore.exceptions import ClientError 8 | from .utils import Throttler 9 | from .base import Base 10 | from .checkpointers import MemoryCheckPointer 11 | from .processors import JsonProcessor 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | 16 | class ShardStats: 17 | def __init__(self): 18 | self._throttled = 0 19 | self._success = 0 20 | 21 | def succeded(self): 22 | self._success += 1 23 | 24 | def throttled(self): 25 | self._throttled += 1 26 | 27 | def to_data(self): 28 | return {"throttled": self._throttled, "success": self._success} 29 | 30 | 31 | class Consumer(Base): 32 | def __init__( 33 | self, 34 | stream_name, 35 | session=None, 36 | endpoint_url=None, 37 | region_name=None, 
38 | max_queue_size=10000, 39 | max_shard_consumers=None, 40 | record_limit=10000, 41 | sleep_time_no_records=2, 42 | iterator_type="TRIM_HORIZON", 43 | shard_fetch_rate=1, 44 | checkpointer=None, 45 | processor=None, 46 | retry_limit=None, 47 | expo_backoff=None, 48 | expo_backoff_limit=120, 49 | skip_describe_stream=False, 50 | create_stream=False, 51 | create_stream_shards=1, 52 | timestamp=None, 53 | ): 54 | 55 | super(Consumer, self).__init__( 56 | stream_name, 57 | session=session, 58 | endpoint_url=endpoint_url, 59 | region_name=region_name, 60 | retry_limit=retry_limit, 61 | expo_backoff=expo_backoff, 62 | expo_backoff_limit=expo_backoff_limit, 63 | skip_describe_stream=skip_describe_stream, 64 | create_stream=create_stream, 65 | create_stream_shards=create_stream_shards, 66 | ) 67 | 68 | self.queue = asyncio.Queue(maxsize=max_queue_size) 69 | 70 | self.sleep_time_no_records = sleep_time_no_records 71 | 72 | self.max_shard_consumers = max_shard_consumers 73 | 74 | self.record_limit = record_limit 75 | 76 | self.is_fetching = True 77 | 78 | self.checkpointer = checkpointer if checkpointer else MemoryCheckPointer() 79 | 80 | self.processor = processor if processor else JsonProcessor() 81 | 82 | self.iterator_type = iterator_type 83 | 84 | self.fetch_task = None 85 | 86 | self.shard_fetch_rate = shard_fetch_rate 87 | 88 | self.timestamp = timestamp 89 | 90 | def __aiter__(self): 91 | return self 92 | 93 | async def close(self): 94 | log.debug("Closing Connection..") 95 | if not self.stream_status == self.RECONNECT: 96 | 97 | await self.flush() 98 | 99 | if self.fetch_task: 100 | self.fetch_task.cancel() 101 | self.fetch_task = None 102 | 103 | if self.checkpointer: 104 | await self.checkpointer.close() 105 | await self.client.close() 106 | 107 | async def flush(self): 108 | 109 | self.is_fetching = False 110 | 111 | if not self.shards: 112 | return 113 | 114 | # Wait for shard fetches to finish 115 | # todo: use gather 116 | for shard in self.shards: 117 | if shard.get("fetch"): 118 | if not shard["fetch"].done(): 119 | await shard["fetch"] 120 | 121 | async def _fetch(self): 122 | while self.is_fetching: 123 | # Ensure fetch is performed at most 5 times per second (the limit per shard) 124 | await asyncio.sleep(0.2) 125 | try: 126 | await self.fetch() 127 | except asyncio.CancelledError: 128 | pass 129 | except Exception as e: 130 | log.exception(e) 131 | 132 | async def fetch(self): 133 | 134 | if not self.is_fetching: 135 | return 136 | 137 | # todo: check for/handle new shards 138 | 139 | shards_in_use = [ 140 | s for s in self.shards if self.checkpointer.is_allocated(s["ShardId"]) 141 | ] 142 | 143 | # log.debug("shards in use: {}".format([s["ShardId"] for s in shards_in_use])) 144 | 145 | for shard in self.shards: 146 | 147 | if not self.is_fetching: 148 | break 149 | 150 | if not self.checkpointer.is_allocated(shard["ShardId"]): 151 | if ( 152 | self.max_shard_consumers 153 | and len(shards_in_use) >= self.max_shard_consumers 154 | ): 155 | continue 156 | 157 | if self.checkpointer is None: 158 | log.debug("Marking shard in use {}".format(shard["ShardId"])) 159 | shard["ShardIterator"] = await self.get_shard_iterator( 160 | shard_id=shard["ShardId"] 161 | ) 162 | 163 | else: 164 | success, checkpoint = await self.checkpointer.allocate( 165 | shard["ShardId"] 166 | ) 167 | 168 | if not success: 169 | log.debug( 170 | "Shard in use. 
Could not assign shard {} to checkpointer[{}]".format( 171 | shard["ShardId"], self.checkpointer.get_ref() 172 | ) 173 | ) 174 | continue 175 | 176 | log.debug( 177 | "Marking shard in use {} by checkpointer[{}] @ {}".format( 178 | shard["ShardId"], self.checkpointer.get_ref(), checkpoint 179 | ) 180 | ) 181 | 182 | shard["ShardIterator"] = await self.get_shard_iterator( 183 | shard_id=shard["ShardId"], last_sequence_number=checkpoint 184 | ) 185 | 186 | if "ShardIterator" in shard: 187 | shard["stats"] = ShardStats() 188 | shard["throttler"] = Throttler( 189 | rate_limit=self.shard_fetch_rate, period=1 190 | ) 191 | shards_in_use.append(shard) 192 | 193 | log.debug("Shard count now at {}".format(len(shards_in_use))) 194 | 195 | if shard.get("fetch"): 196 | if shard["fetch"].done(): 197 | result = shard["fetch"].result() 198 | 199 | if not result: 200 | shard["fetch"] = None 201 | continue 202 | 203 | records = result["Records"] 204 | 205 | if records: 206 | log.debug( 207 | "Shard {} got {} records".format( 208 | shard["ShardId"], len(records) 209 | ) 210 | ) 211 | 212 | total_items = 0 213 | for row in result["Records"]: 214 | for n, output in enumerate( 215 | self.processor.parse(row["Data"]) 216 | ): 217 | await self.queue.put(output) 218 | total_items += n + 1 219 | 220 | # Get approx minutes behind.. 221 | last_arrival = records[-1].get("ApproximateArrivalTimestamp") 222 | if last_arrival: 223 | last_arrival = round( 224 | ( 225 | ( 226 | datetime.now(timezone.utc) - last_arrival 227 | ).total_seconds() 228 | / 60 229 | ) 230 | ) 231 | 232 | log.debug( 233 | "Shard {} added {} items from {} records. Consumer is {}m behind".format( 234 | shard["ShardId"], 235 | total_items, 236 | len(records), 237 | last_arrival, 238 | ), 239 | extra={"consumer_behind_m": last_arrival}, 240 | ) 241 | 242 | else: 243 | # ApproximateArrivalTimestamp not available in kinesis-lite 244 | log.debug( 245 | "Shard {} added {} items from {} records".format( 246 | shard["ShardId"], total_items, len(records) 247 | ) 248 | ) 249 | 250 | # Add checkpoint record 251 | last_record = result["Records"][-1] 252 | await self.queue.put( 253 | { 254 | "__CHECKPOINT__": { 255 | "ShardId": shard["ShardId"], 256 | "SequenceNumber": last_record["SequenceNumber"], 257 | } 258 | } 259 | ) 260 | 261 | shard["LastSequenceNumber"] = last_record["SequenceNumber"] 262 | 263 | else: 264 | log.debug( 265 | "Shard {} caught up, sleeping {}s".format( 266 | shard["ShardId"], self.sleep_time_no_records 267 | ) 268 | ) 269 | await asyncio.sleep(self.sleep_time_no_records) 270 | 271 | if not result["NextShardIterator"]: 272 | raise NotImplementedError("Shard is closed?") 273 | 274 | shard["ShardIterator"] = result["NextShardIterator"] 275 | 276 | shard["fetch"] = None 277 | 278 | else: 279 | # log.debug("shard {} fetch in progress..".format(shard['ShardId'])) 280 | continue 281 | 282 | if "ShardIterator" in shard and shard["ShardIterator"] is not None: 283 | shard["fetch"] = asyncio.create_task(self.get_records(shard=shard)) 284 | 285 | async def get_records(self, shard): 286 | 287 | # Note: "This operation has a limit of five transactions per second per account." 
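        # shard["throttler"] is a Throttler(rate_limit=shard_fetch_rate, period=1) from utils.py:
        # entering it blocks until fewer than shard_fetch_rate GetRecords calls have been made
        # against this shard within the last second, keeping us under the per-shard transaction limit.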
288 | 289 | async with shard["throttler"]: 290 | # log.debug("get_records shard={}".format(shard['ShardId'])) 291 | 292 | try: 293 | 294 | result = await self.client.get_records( 295 | ShardIterator=shard["ShardIterator"], Limit=self.record_limit 296 | ) 297 | 298 | shard["stats"].succeded() 299 | return result 300 | 301 | except ClientConnectionError as e: 302 | await self.get_conn() 303 | except TimeoutError as e: 304 | log.warning("Timeout {}. sleeping..".format(e)) 305 | await asyncio.sleep(3) 306 | 307 | except ClientError as e: 308 | code = e.response["Error"]["Code"] 309 | if code == "ProvisionedThroughputExceededException": 310 | log.warning( 311 | "{} hit ProvisionedThroughputExceededException".format( 312 | shard["ShardId"] 313 | ) 314 | ) 315 | shard["stats"].throttled() 316 | # todo: control the throttle ? 317 | await asyncio.sleep(0.25) 318 | 319 | elif code == "ExpiredIteratorException": 320 | log.warning( 321 | "{} hit ExpiredIteratorException".format(shard["ShardId"]) 322 | ) 323 | 324 | shard["ShardIterator"] = await self.get_shard_iterator( 325 | shard_id=shard["ShardId"], 326 | last_sequence_number=shard.get("LastSequenceNumber"), 327 | ) 328 | 329 | elif code == "InternalFailure": 330 | log.warning( 331 | "Received InternalFailure from Kinesis, rebuilding connection.. " 332 | ) 333 | await self.get_conn() 334 | 335 | else: 336 | log.warning("ClientError {}. sleeping..".format(code)) 337 | await asyncio.sleep(3) 338 | 339 | except Exception as e: 340 | log.warning("Unknown error {}. sleeping..".format(e)) 341 | await asyncio.sleep(3) 342 | 343 | # Connection or other issue 344 | return None 345 | 346 | async def get_shard_iterator(self, shard_id, last_sequence_number=None): 347 | 348 | log.debug( 349 | "getting shard iterator for {} @ {}".format( 350 | shard_id, 351 | last_sequence_number if last_sequence_number else self.iterator_type, 352 | ) 353 | ) 354 | 355 | params = { 356 | "StreamName": self.stream_name, 357 | "ShardId": shard_id, 358 | "ShardIteratorType": "AFTER_SEQUENCE_NUMBER" 359 | if last_sequence_number 360 | else self.iterator_type, 361 | } 362 | 363 | if last_sequence_number: 364 | params["StartingSequenceNumber"] = last_sequence_number 365 | 366 | if self.iterator_type == 'AT_TIMESTAMP' and self.timestamp: 367 | params['Timestamp'] = self.timestamp 368 | 369 | response = await self.client.get_shard_iterator(**params) 370 | return response["ShardIterator"] 371 | 372 | async def start_consumer(self, wait_iterations=10, wait_sleep=0.25): 373 | 374 | # Start task to fetch periodically 375 | 376 | self.fetch_task = asyncio.create_task(self._fetch()) 377 | 378 | # Wait a while until we have some results 379 | for i in range(0, wait_iterations): 380 | if self.fetch_task and self.queue.qsize() == 0: 381 | await asyncio.sleep(wait_sleep) 382 | 383 | log.debug("start_consumer completed.. 
queue size={}".format(self.queue.qsize())) 384 | 385 | async def __anext__(self): 386 | 387 | if not self.shards: 388 | await self.get_conn() 389 | 390 | if not self.fetch_task: 391 | await self.start_consumer() 392 | 393 | # Raise exception from Fetch Task to main task otherwise raise exception inside 394 | # Fetch Task will fail silently 395 | if self.fetch_task.done(): 396 | raise self.fetch_task.exception() 397 | 398 | while True: 399 | try: 400 | item = self.queue.get_nowait() 401 | 402 | if item and isinstance(item, dict) and "__CHECKPOINT__" in item: 403 | if self.checkpointer: 404 | await self.checkpointer.checkpoint( 405 | item["__CHECKPOINT__"]["ShardId"], 406 | item["__CHECKPOINT__"]["SequenceNumber"], 407 | ) 408 | continue 409 | 410 | return item 411 | 412 | except QueueEmpty: 413 | log.debug("Queue empty..") 414 | await asyncio.sleep(self.sleep_time_no_records) 415 | raise StopAsyncIteration 416 | -------------------------------------------------------------------------------- /kinesis/exceptions.py: -------------------------------------------------------------------------------- 1 | class StreamExists(Exception): 2 | pass 3 | 4 | 5 | class StreamDoesNotExist(Exception): 6 | pass 7 | 8 | 9 | class StreamShardLimit(Exception): 10 | pass 11 | 12 | 13 | class StreamStatusInvalid(Exception): 14 | pass 15 | 16 | 17 | class ExceededPutLimit(Exception): 18 | pass 19 | 20 | 21 | class UnknownException(Exception): 22 | pass 23 | 24 | 25 | class ValidationError(Exception): 26 | pass 27 | -------------------------------------------------------------------------------- /kinesis/processors.py: -------------------------------------------------------------------------------- 1 | from .aggregators import ( 2 | NewlineAggregator, 3 | SimpleAggregator, 4 | NetstringAggregator, 5 | ListAggregator, 6 | KPLAggregator, 7 | ) 8 | from .serializers import StringSerializer, JsonSerializer, MsgpackSerializer 9 | 10 | 11 | class Processor: 12 | pass 13 | 14 | 15 | class StringProcessor(Processor, SimpleAggregator, StringSerializer): 16 | pass 17 | 18 | 19 | class JsonProcessor(Processor, SimpleAggregator, JsonSerializer): 20 | pass 21 | 22 | 23 | class JsonLineProcessor(Processor, NewlineAggregator, JsonSerializer): 24 | pass 25 | 26 | 27 | class JsonListProcessor(Processor, ListAggregator, JsonSerializer): 28 | pass 29 | 30 | 31 | class MsgpackProcessor(Processor, NetstringAggregator, MsgpackSerializer): 32 | pass 33 | 34 | 35 | class KPLJsonProcessor(Processor, KPLAggregator, JsonSerializer): 36 | pass 37 | 38 | 39 | class KPLStringProcessor(Processor, KPLAggregator, StringSerializer): 40 | pass 41 | -------------------------------------------------------------------------------- /kinesis/producer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import time 4 | import math 5 | from aiohttp import ClientConnectionError 6 | 7 | from asyncio.queues import QueueEmpty 8 | 9 | from .utils import Throttler 10 | from botocore.exceptions import ClientError 11 | 12 | from .base import Base 13 | from . 
import exceptions 14 | from .processors import JsonProcessor 15 | 16 | log = logging.getLogger(__name__) 17 | 18 | 19 | class Producer(Base): 20 | def __init__( 21 | self, 22 | stream_name, 23 | session=None, 24 | endpoint_url=None, 25 | region_name=None, 26 | buffer_time=0.5, 27 | put_rate_limit_per_shard=1000, 28 | put_bandwidth_limit_per_shard=1024, 29 | after_flush_fun=None, 30 | batch_size=500, 31 | max_queue_size=10000, 32 | processor=None, 33 | skip_describe_stream=False, 34 | retry_limit=None, 35 | expo_backoff=None, 36 | expo_backoff_limit=120, 37 | create_stream=False, 38 | create_stream_shards=1, 39 | ): 40 | 41 | super(Producer, self).__init__( 42 | stream_name, 43 | session=session, 44 | endpoint_url=endpoint_url, 45 | region_name=region_name, 46 | retry_limit=retry_limit, 47 | expo_backoff=expo_backoff, 48 | expo_backoff_limit=expo_backoff_limit, 49 | skip_describe_stream=skip_describe_stream, 50 | create_stream=create_stream, 51 | create_stream_shards=create_stream_shards, 52 | ) 53 | 54 | self.buffer_time = buffer_time 55 | 56 | self.processor = processor if processor else JsonProcessor() 57 | 58 | self.queue = asyncio.Queue(maxsize=max_queue_size) 59 | 60 | self.batch_size = batch_size 61 | 62 | # A single shard can ingest up to 1 MiB of data per second (including partition keys) 63 | # or 1,000 records per second for writes 64 | self.put_rate_limit_per_shard = put_rate_limit_per_shard 65 | self.put_rate_throttle = None 66 | self.put_bandwidth_limit_per_shard = put_bandwidth_limit_per_shard 67 | self.put_bandwidth_throttle = None 68 | 69 | if put_bandwidth_limit_per_shard > 1024: 70 | log.warning( 71 | ( 72 | "Put bandwidth {}kb exceeds 1024kb. Expect throughput errors..".format( 73 | put_bandwidth_limit_per_shard 74 | ) 75 | ) 76 | ) 77 | self.set_put_rate_throttle() 78 | 79 | self.flush_task = asyncio.create_task(self._flush()) 80 | self.is_flushing = False 81 | self.after_flush_fun = after_flush_fun 82 | 83 | # keep track of these (used by unit test only) 84 | self.throughput_exceeded_count = 0 85 | 86 | # overflow buffer 87 | self.overflow = [] 88 | 89 | self.flush_total_records = 0 90 | self.flush_total_size = 0 91 | 92 | def set_put_rate_throttle(self): 93 | self.put_rate_throttle = Throttler( 94 | rate_limit=self.put_rate_limit_per_shard 95 | * (len(self.shards) if self.shards else 1), 96 | period=1, 97 | ) 98 | self.put_bandwidth_throttle = Throttler( 99 | # kb per second. Go below a bit to avoid hitting the threshold 100 | size_limit=self.put_bandwidth_limit_per_shard 101 | * (len(self.shards) if self.shards else 1), 102 | period=1, 103 | ) 104 | 105 | async def put(self, data): 106 | 107 | # Raise exception from Flush Task to main task otherwise raise exception inside 108 | # Flush Task will fail silently 109 | if self.flush_task.done(): 110 | raise self.flush_task.exception() 111 | 112 | if not self.stream_status == self.ACTIVE: 113 | await self.get_conn() 114 | 115 | elif self.queue.qsize() >= self.batch_size: 116 | await self.flush() 117 | 118 | for output in self.processor.add_item(data): 119 | await self.queue.put(output) 120 | 121 | async def close(self): 122 | log.debug(f"Closing Connection.. 
(stream status:{self.stream_status})") 123 | if not self.stream_status == self.RECONNECT: 124 | # Cancel Flush Task 125 | self.flush_task.cancel() 126 | # final flush (probably not required but no harm) 127 | await self.flush() 128 | 129 | await self.client.close() 130 | 131 | async def _flush(self): 132 | while True: 133 | if self.stream_status == self.ACTIVE: 134 | if not self.is_flushing: 135 | await self.flush() 136 | await asyncio.sleep(self.buffer_time) 137 | 138 | async def flush(self): 139 | 140 | if self.is_flushing: 141 | log.debug("Flush already in progress, ignoring..") 142 | return 143 | 144 | self.is_flushing = True 145 | 146 | if self.processor.has_items(): 147 | for output in self.processor.get_items(): 148 | await self.queue.put(output) 149 | 150 | while True: 151 | 152 | self.flush_total_records = 0 153 | self.flush_total_size = 0 154 | 155 | if self.queue.qsize() > 0 or len(self.overflow) > 0: 156 | log.debug( 157 | "flush queue={} overflow={}".format( 158 | self.queue.qsize(), len(self.overflow) 159 | ) 160 | ) 161 | 162 | items = await self.get_batch() 163 | 164 | if not items: 165 | break 166 | 167 | else: 168 | result = await self._push_kinesis(items) 169 | await self.process_result(result, items) 170 | 171 | self.is_flushing = False 172 | 173 | async def process_result(self, result, items): 174 | if result["FailedRecordCount"]: 175 | 176 | errors = list( 177 | set( 178 | [ 179 | r.get("ErrorCode") 180 | for r in result["Records"] 181 | if r.get("ErrorCode") 182 | ] 183 | ) 184 | ) 185 | 186 | if not errors: 187 | raise exceptions.UnknownException( 188 | "Failed to put records but no errorCodes return in results" 189 | ) 190 | 191 | if "ProvisionedThroughputExceededException" in errors: 192 | log.warning( 193 | "Throughput exceeded ({} records failed, added back..), pausing for 0.25s..".format( 194 | result["FailedRecordCount"] 195 | ) 196 | ) 197 | 198 | self.throughput_exceeded_count += 1 199 | 200 | for i, record in enumerate(result["Records"]): 201 | if "ErrorCode" in record: 202 | self.overflow.append(items[i]) 203 | 204 | # log.debug("items={} overflow={}".format(len(items), len(overflow))) 205 | 206 | await asyncio.sleep(0.25) 207 | 208 | elif "InternalFailure" in errors: 209 | log.warning("Received InternalFailure from Kinesis") 210 | await self.get_conn() 211 | 212 | for i, record in enumerate(result["Records"]): 213 | if "ErrorCode" in record: 214 | self.overflow.append(items[i]) 215 | 216 | else: 217 | raise exceptions.UnknownException( 218 | "Failed to put records due to: {}".format(", ".join(errors)) 219 | ) 220 | 221 | else: 222 | 223 | if self.after_flush_fun: 224 | await self.after_flush_fun(items) 225 | 226 | async def get_batch(self): 227 | items = [] 228 | flush_max_size = 0 229 | 230 | for num in range(self.queue.qsize() + len(self.overflow)): 231 | async with self.put_rate_throttle: 232 | 233 | if self.overflow: 234 | item = self.overflow.pop() 235 | 236 | else: 237 | try: 238 | item = self.queue.get_nowait() 239 | except QueueEmpty: 240 | break 241 | 242 | size_kb = math.ceil(item[0] / 1024) 243 | 244 | flush_max_size += size_kb 245 | 246 | if flush_max_size > 1024: 247 | self.overflow.append(item) 248 | 249 | elif num <= self.batch_size: 250 | async with self.put_bandwidth_throttle(size=self.flush_total_size): 251 | items.append(item) 252 | self.flush_total_size += size_kb 253 | self.flush_total_records += item[1] 254 | else: 255 | self.overflow.append(item) 256 | 257 | return items 258 | 259 | async def _push_kinesis(self, items): 260 | 261 | 
log.debug( 262 | "doing flush with {} record ({} items) @ {} kb".format( 263 | len(items), self.flush_total_records, self.flush_total_size 264 | ) 265 | ) 266 | 267 | while True: 268 | 269 | try: 270 | 271 | # todo: custom partition key 272 | results = await self.client.put_records( 273 | Records=[ 274 | { 275 | "Data": item.data, 276 | "PartitionKey": "{0}{1}".format( 277 | time.perf_counter(), time.time() 278 | ), 279 | } 280 | for item in items 281 | ], 282 | StreamName=self.stream_name, 283 | ) 284 | 285 | log.info( 286 | "flush complete with {} record ({} items) @ {} kb".format( 287 | len(items), self.flush_total_records, self.flush_total_size 288 | ) 289 | ) 290 | return results 291 | 292 | except ClientError as err: 293 | 294 | code = err.response["Error"]["Code"] 295 | 296 | if code == "ValidationException": 297 | if ( 298 | "must have length less than or equal" 299 | in err.response["Error"]["Message"] 300 | ): 301 | log.warning( 302 | "Batch size {} exceeded the limit. retrying with less".format( 303 | len(items) 304 | ) 305 | ) 306 | 307 | existing_batch_size = self.batch_size 308 | self.batch_size -= round(self.batch_size / 10) 309 | 310 | # Must be small batch of big items, take at least one out.. 311 | if existing_batch_size == self.batch_size: 312 | self.batch_size -= 1 313 | 314 | self.overflow.extend(items) 315 | 316 | self.flush_total_records = 0 317 | self.flush_max_size = 0 318 | self.flush_total_size = 0 319 | 320 | items = await self.get_batch() 321 | 322 | else: 323 | log.warning( 324 | f'Unknown ValidationException error code {err.response["Error"]["Code"]}' 325 | ) 326 | log.exception(err) 327 | await self.get_conn() 328 | # raise err 329 | elif code == "ResourceNotFoundException": 330 | raise exceptions.StreamDoesNotExist( 331 | "Stream '{}' does not exist".format(self.stream_name) 332 | ) from None 333 | else: 334 | log.warning( 335 | f'Unknown Client error code {err.response["Error"]["Code"]}' 336 | ) 337 | log.exception(err) 338 | await self.get_conn() 339 | # raise err 340 | except ClientConnectionError as err: 341 | await self.get_conn() 342 | except asyncio.CancelledError: 343 | return 344 | except Exception as e: 345 | log.exception(e) 346 | log.critical("Unknown Exception Caught") 347 | await self.get_conn() 348 | -------------------------------------------------------------------------------- /kinesis/serializers.py: -------------------------------------------------------------------------------- 1 | try: 2 | import ujson as json 3 | except ModuleNotFoundError: 4 | # https://github.com/python/mypy/issues/1153 (mypy bug with try/except conditional imports) 5 | import json # type: ignore 6 | 7 | try: 8 | import msgpack 9 | except ModuleNotFoundError: 10 | pass 11 | 12 | 13 | class Serializer: 14 | pass 15 | 16 | 17 | class StringSerializer(Serializer): 18 | def serialize(self, item): 19 | return str(item).encode("utf-8") 20 | 21 | def deserialize(self, data): 22 | return data.decode("utf-8") 23 | 24 | 25 | class JsonSerializer(Serializer): 26 | def serialize(self, item): 27 | return json.dumps(item).encode("utf-8") 28 | 29 | def deserialize(self, data): 30 | return json.loads(data.decode("utf-8")) 31 | 32 | 33 | class MsgpackSerializer(Serializer): 34 | def serialize(self, item): 35 | result = msgpack.packb(item, use_bin_type=True) 36 | return result 37 | 38 | def deserialize(self, data): 39 | return msgpack.unpackb(data, raw=False) 40 | -------------------------------------------------------------------------------- /kinesis/utils.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Source: https://github.com/hallazzang/asyncio-throttle 3 | 4 | Mods: 5 | - add size_limit to support throttling by size 6 | """ 7 | 8 | import time 9 | import math 10 | import asyncio 11 | import logging 12 | from collections import deque 13 | 14 | log = logging.getLogger(__name__) 15 | 16 | 17 | class Throttler: 18 | def __init__( 19 | self, 20 | rate_limit=None, 21 | size_limit=None, 22 | period=1.0, 23 | retry_interval=0.05, 24 | ): 25 | self.rate_limit = rate_limit 26 | self.size_limit = size_limit 27 | self.period = period 28 | self.retry_interval = retry_interval 29 | 30 | self._task_logs = deque() 31 | 32 | self.size = None 33 | 34 | def flush(self): 35 | now = time.time() 36 | while self._task_logs: 37 | if now - self._task_logs[0][0] > self.period: 38 | self._task_logs.popleft() 39 | else: 40 | break 41 | 42 | def is_below_rate(self): 43 | 44 | if self.rate_limit: 45 | below_rate_requests = len(self._task_logs) < self.rate_limit 46 | 47 | if not below_rate_requests: 48 | return False 49 | 50 | if self.size_limit is None or not self._task_logs: 51 | return True 52 | 53 | size = sum([x[1] for x in self._task_logs]) 54 | 55 | period = time.time() - self._task_logs[0][0] 56 | 57 | period_used_ratio = (self.period - period) / self.period 58 | 59 | remaining = self.size_limit - math.ceil(size * period_used_ratio) 60 | 61 | # log.debug("rate check: size={} requested={} period={} period_used_ratio={} remaining={}".format(size, self.size, round(period,3), round(period_used_ratio, 2), round(remaining,2))) 62 | 63 | return self.size <= remaining 64 | 65 | async def acquire(self): 66 | 67 | while True: 68 | self.flush() 69 | if self.is_below_rate(): 70 | break 71 | await asyncio.sleep( 72 | self.retry_interval, 73 | ) 74 | 75 | self._task_logs.append((time.time(), self.size)) 76 | 77 | def __call__(self, size=1): 78 | self.size = size 79 | return self 80 | 81 | async def __aenter__(self): 82 | await self.acquire() 83 | 84 | async def __aexit__(self, exc_type, exc, tb): 85 | pass 86 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiobotocore>=1.3.3 2 | async-timeout==3.0.1 3 | aredis==1.1.8 4 | msgpack==1.0.0 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [nosetests] 2 | stop=1 3 | with-spec=1 4 | spec-color=1 5 | nologcapture=1 6 | nocapture=1 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | with open("README.md") as f: 4 | long_description = f.read() 5 | 6 | setup( 7 | name="async-kinesis", 8 | description="AsyncIO Kinesis Library", 9 | long_description=long_description, 10 | long_description_content_type="text/markdown", 11 | version="1.1.5", 12 | url="https://github.com/hampsterx/async-kinesis", 13 | author="hampsterx", 14 | author_email="tim.vdh@gmail.com", 15 | license="Apache2", 16 | classifiers=[ 17 | "Development Status :: 4 - Beta", 18 | 
"Programming Language :: Python", 19 | "Programming Language :: Python :: 3", 20 | "Programming Language :: Python :: 3.5", 21 | "Programming Language :: Python :: 3.6", 22 | "Programming Language :: Python :: 3.7", 23 | "Programming Language :: Python :: 3.8", 24 | "Intended Audience :: Developers", 25 | "License :: OSI Approved :: Apache Software License", 26 | ], 27 | packages=["kinesis"], 28 | install_requires=[ 29 | "aiobotocore>=1.0.4", 30 | "async-timeout>=3.0.1", 31 | "asyncio-throttle>=0.1.1", 32 | ], 33 | extras_require={ 34 | "kpl": ["aws-kinesis-agg>=1.1.6"], 35 | "redis": ["aredis>=1.1.8"], 36 | "msgpack": ["msgpack>=0.6.1"], 37 | }, 38 | ) 39 | -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | 3 | aws-kinesis-agg==1.1.6 4 | coloredlogs==10.0 5 | nose==1.3.7 6 | pinocchio==0.4.2 7 | python-dotenv==0.9.1 8 | asynctest==0.12.3 9 | pluggy==0.13.1 10 | tox==3.20.1 11 | -------------------------------------------------------------------------------- /tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uuid 3 | import asyncio 4 | import logging, coloredlogs 5 | 6 | from aiobotocore import AioSession 7 | from dotenv import load_dotenv 8 | from asynctest import TestCase as AsynTestCase, fail_on 9 | from unittest import skipUnless, TestCase 10 | from kinesis import Consumer, Producer, MemoryCheckPointer, RedisCheckPointer 11 | from kinesis.processors import ( 12 | StringProcessor, 13 | JsonProcessor, 14 | JsonLineProcessor, 15 | JsonListProcessor, 16 | MsgpackProcessor, 17 | Processor, 18 | ) 19 | from kinesis.aggregators import ( 20 | Aggregator, 21 | KPLAggregator, 22 | SimpleAggregator, 23 | NewlineAggregator, 24 | ListAggregator, 25 | NetstringAggregator, 26 | OutputItem, 27 | ) 28 | from kinesis.serializers import StringSerializer, JsonSerializer, Serializer 29 | from kinesis import exceptions 30 | 31 | coloredlogs.install(level="DEBUG", fmt="%(name)s %(levelname)s %(message)s") 32 | 33 | logging.getLogger("botocore").setLevel(logging.WARNING) 34 | logging.getLogger("aiobotocore").setLevel(logging.INFO) 35 | 36 | 37 | log = logging.getLogger(__name__) 38 | 39 | load_dotenv() 40 | 41 | # https://github.com/mhart/kinesalite 42 | # ./node_modules/.bin/kinesalite --shardLimit 1000 43 | # see also docker-compose.yaml 44 | ENDPOINT_URL = os.environ.get("ENDPOINT_URL", "http://localhost:4567") 45 | 46 | TESTING_USE_AWS_KINESIS = os.environ.get("TESTING_USE_AWS_KINESIS", "0") == "1" 47 | 48 | # Use docker-compose one 49 | if "REDIS_PORT" not in os.environ: 50 | os.environ["REDIS_PORT"] = "16379" 51 | 52 | 53 | class BaseTests: 54 | def random_string(self, length): 55 | from random import choice 56 | from string import ascii_uppercase 57 | 58 | return "".join(choice(ascii_uppercase) for i in range(length)) 59 | 60 | 61 | class BaseKinesisTests(AsynTestCase, BaseTests): 62 | async def setUp(self): 63 | self.stream_name = "test_{}".format(str(uuid.uuid4())[0:8]) 64 | producer = await Producer( 65 | stream_name=self.stream_name, 66 | endpoint_url=ENDPOINT_URL, 67 | create_stream=self.stream_name, 68 | create_stream_shards=1, 69 | ).__aenter__() 70 | await producer.__aexit__(None, None, None) 71 | 72 | async def add_record_delayed(self, msg, producer, delay): 73 | log.debug("Adding record. 
delay={}".format(delay)) 74 | await asyncio.sleep(delay) 75 | await producer.put(msg) 76 | 77 | 78 | class ProcessorAndAggregatorTests(TestCase, BaseTests): 79 | """ 80 | Processor and Aggregator Tests 81 | """ 82 | 83 | def test_aggregator_min_size(self): 84 | 85 | with self.assertRaises(exceptions.ValidationError): 86 | Aggregator(max_size=20) 87 | 88 | def test_aggregator_max_size(self): 89 | 90 | with self.assertRaises(exceptions.ValidationError): 91 | Aggregator(max_size=2000) 92 | 93 | def test_processor_exceed_put_limit(self): 94 | processor = StringProcessor() 95 | 96 | with self.assertRaises(exceptions.ExceededPutLimit): 97 | list(processor.add_item(self.random_string(1024 * 1024 + 1))) 98 | 99 | def test_newline_aggregator(self): 100 | 101 | # in reality does not make sense as strings can contain new lines 102 | # so is not a suitable combination to use 103 | class NewlineTestProcessor(NewlineAggregator, StringSerializer): 104 | pass 105 | 106 | processor = NewlineTestProcessor() 107 | 108 | # Expect nothing as batching 109 | self.assertEqual([], list(processor.add_item(123))) 110 | self.assertEqual([], list(processor.add_item("test"))) 111 | 112 | self.assertTrue(processor.has_items()) 113 | 114 | output = list(processor.get_items()) 115 | 116 | self.assertEqual(len(output), 1) 117 | 118 | self.assertEqual(output[0].size, 9) 119 | self.assertEqual(output[0].n, 2) 120 | self.assertEqual(output[0].data, b"123\ntest\n") 121 | 122 | self.assertListEqual(list(processor.parse(output[0].data)), ["123", "test"]) 123 | 124 | def test_list_aggregator(self): 125 | class JsonListTestProcessor(ListAggregator, JsonSerializer): 126 | pass 127 | 128 | processor = JsonListTestProcessor() 129 | 130 | # Expect nothing as batching 131 | self.assertEqual([], list(processor.add_item(123))) 132 | self.assertEqual([], list(processor.add_item("test"))) 133 | 134 | self.assertTrue(processor.has_items()) 135 | 136 | output = list(processor.get_items()) 137 | 138 | self.assertEqual(len(output), 1) 139 | 140 | self.assertEqual(output[0].size, 11) 141 | self.assertEqual(output[0].n, 2) 142 | self.assertEqual(output[0].data, b'[123, "test"]') 143 | 144 | self.assertListEqual(next(processor.parse(output[0].data)), [123, "test"]) 145 | 146 | def test_netstring_aggregator(self): 147 | class NetstringTestProcessor(NetstringAggregator, StringSerializer): 148 | pass 149 | 150 | processor = NetstringTestProcessor() 151 | 152 | # Expect nothing as batching 153 | self.assertEqual([], list(processor.add_item(123))) 154 | self.assertEqual([], list(processor.add_item("test"))) 155 | 156 | self.assertTrue(processor.has_items()) 157 | 158 | output = list(processor.get_items()) 159 | 160 | self.assertEqual(len(output), 1) 161 | 162 | self.assertEqual(output[0].size, 13) 163 | self.assertEqual(output[0].n, 2) 164 | self.assertEqual(output[0].data, b"3:123,4:test,") 165 | 166 | self.assertListEqual(list(processor.parse(output[0].data)), ["123", "test"]) 167 | 168 | def test_kpl_aggregator(self): 169 | class KPLTestProcessor(KPLAggregator, StringSerializer): 170 | pass 171 | 172 | processor = KPLTestProcessor() 173 | 174 | # Expect nothing as batching 175 | self.assertEqual([], list(processor.add_item(123))) 176 | self.assertEqual([], list(processor.add_item("test"))) 177 | 178 | self.assertTrue(processor.has_items()) 179 | 180 | output = list(processor.get_items()) 181 | 182 | self.assertEqual(len(output), 1) 183 | 184 | self.assertEqual(output[0].n, 2) 185 | 186 | self.assertListEqual(list(processor.parse(output[0].data)), 
["123", "test"]) 187 | 188 | def test_kpl_aggregator_max_size(self): 189 | class BytesSerializer: 190 | def serialize(self, item): 191 | return item 192 | 193 | def deserialize(self, data): 194 | return data 195 | 196 | class KPLTestProcessor(KPLAggregator, BytesSerializer): 197 | pass 198 | 199 | # 100 K max_size 200 | processor = KPLTestProcessor(max_size=1024 * 100) 201 | 202 | # Expect nothing as batching first two 40K records 203 | self.assertEqual([], list(processor.add_item(bytes(40 * 1024)))) 204 | self.assertEqual([], list(processor.add_item(bytes(40 * 1024)))) 205 | 206 | # output as we exceed 207 | output = list(processor.add_item(bytes(40 * 1024))) 208 | 209 | self.assertEqual(len(output), 1) 210 | 211 | self.assertEqual(output[0].n, 2) 212 | 213 | def test_string_processor(self): 214 | 215 | processor = StringProcessor() 216 | 217 | self.assertEquals(processor.max_bytes, 1024 * 25 * 40) 218 | 219 | output = list(processor.add_item("test")) 220 | 221 | self.assertEqual(len(output), 1) 222 | self.assertIsInstance(output[0], OutputItem) 223 | 224 | self.assertEqual(output[0].size, len("test")) 225 | self.assertEqual(output[0].n, 1) 226 | self.assertEqual(output[0].data, b"test") 227 | 228 | self.assertFalse(processor.has_items()) 229 | 230 | def test_json_processor(self): 231 | 232 | processor = JsonProcessor() 233 | 234 | output = list(processor.add_item({"test": 123})) 235 | 236 | self.assertEqual(len(output), 1) 237 | self.assertIsInstance(output[0], OutputItem) 238 | 239 | self.assertEqual(output[0].size, 13) 240 | self.assertEqual(output[0].n, 1) 241 | self.assertEqual(output[0].data, b'{"test": 123}') 242 | 243 | self.assertFalse(processor.has_items()) 244 | 245 | self.assertListEqual(list(processor.parse(output[0].data)), [{"test": 123}]) 246 | 247 | def test_json_line_processor(self): 248 | 249 | processor = JsonLineProcessor(max_size=25) 250 | 251 | # Expect nothing as batching 252 | self.assertEqual([], list(processor.add_item({"test": 123}))) 253 | self.assertEqual([], list(processor.add_item({"test": 456}))) 254 | 255 | self.assertTrue(processor.has_items()) 256 | 257 | output = list(processor.get_items()) 258 | 259 | self.assertEqual(len(output), 1) 260 | 261 | self.assertEqual(output[0].size, 28) 262 | self.assertEqual(output[0].n, 2) 263 | self.assertEqual(output[0].data, b'{"test": 123}\n{"test": 456}\n') 264 | 265 | self.assertListEqual( 266 | list(processor.parse(output[0].data)), 267 | [{"test": 123}, {"test": 456}], 268 | ) 269 | 270 | # Expect empty now 271 | self.assertFalse(processor.has_items()) 272 | 273 | result = [] 274 | for x in range(1000): 275 | output = list(processor.add_item({"test": "test with some more data"})) 276 | if output: 277 | self.assertEqual(len(output), 1) 278 | result.append(output[0]) 279 | 280 | # Expected at least one record to be output 281 | self.assertEqual(len(result), 1) 282 | 283 | self.assertEqual(result[0].size, 25567) # expect below 25*1024=25600 284 | self.assertEqual(result[0].n, 691) 285 | 286 | # Expect some left 287 | self.assertTrue(processor.has_items()) 288 | 289 | output = list(processor.get_items()) 290 | 291 | self.assertEqual(len(output), 1) 292 | 293 | self.assertEqual(output[0].size, 11432) 294 | self.assertEqual(output[0].n, 309) 295 | 296 | self.assertFalse(processor.has_items()) 297 | 298 | def test_json_list_processor(self): 299 | 300 | processor = JsonListProcessor(max_size=25) 301 | 302 | # Expect nothing as batching 303 | self.assertEqual([], list(processor.add_item({"test": 123}))) 304 | 
self.assertEqual([], list(processor.add_item({"test": 456}))) 305 | 306 | self.assertTrue(processor.has_items()) 307 | 308 | output = list(processor.get_items()) 309 | 310 | self.assertEqual(len(output), 1) 311 | 312 | self.assertEqual(output[0].size, 28) 313 | self.assertEqual(output[0].n, 2) 314 | self.assertEqual(output[0].data, b'[{"test": 123}, {"test": 456}]') 315 | 316 | # Need to use next() otherwise list() creates double nested list 317 | self.assertListEqual( 318 | next(processor.parse(output[0].data)), [{"test": 123}, {"test": 456}] 319 | ) 320 | 321 | # Expect empty now 322 | self.assertFalse(processor.has_items()) 323 | 324 | result = [] 325 | for x in range(1000): 326 | output = list(processor.add_item({"test": "test with some more data"})) 327 | if output: 328 | self.assertEqual(len(output), 1) 329 | result.append(output[0]) 330 | 331 | # Expected at least one record to be output 332 | self.assertEqual(len(result), 1) 333 | 334 | self.assertEqual(result[0].size, 25567) # expect below 25*1024=25600 335 | self.assertEqual(result[0].n, 691) 336 | 337 | # Expect some left 338 | self.assertTrue(processor.has_items()) 339 | 340 | output = list(processor.get_items()) 341 | 342 | self.assertEqual(len(output), 1) 343 | 344 | self.assertEqual(output[0].size, 11432) 345 | self.assertEqual(output[0].n, 309) 346 | 347 | self.assertFalse(processor.has_items()) 348 | 349 | def test_msgpack_processor(self): 350 | 351 | processor = MsgpackProcessor(max_size=25) 352 | 353 | # Expect nothing as batching 354 | self.assertEqual([], list(processor.add_item({"test": 123}))) 355 | self.assertEqual([], list(processor.add_item({"test": 456}))) 356 | 357 | self.assertTrue(processor.has_items()) 358 | 359 | output = list(processor.get_items()) 360 | 361 | self.assertEqual(len(output), 1) 362 | 363 | self.assertEqual(output[0].size, 22) 364 | self.assertEqual(output[0].n, 2) 365 | self.assertEqual(output[0].data, b"7:\x81\xa4test{,9:\x81\xa4test\xcd\x01\xc8,") 366 | 367 | self.assertListEqual( 368 | list(processor.parse(output[0].data)), [{"test": 123}, {"test": 456}] 369 | ) 370 | 371 | # Expect empty now 372 | self.assertFalse(processor.has_items()) 373 | 374 | result = [] 375 | for x in range(1000): 376 | output = list(processor.add_item({"test": "test with some more data"})) 377 | if output: 378 | self.assertEqual(len(output), 1) 379 | result.append(output[0]) 380 | 381 | # Expected at least one record to be output 382 | self.assertEqual(len(result), 1) 383 | 384 | self.assertEqual(result[0].size, 25585) # expect below 25*1024=25600 385 | self.assertEqual(result[0].n, 731) 386 | 387 | # Expect some left 388 | self.assertTrue(processor.has_items()) 389 | 390 | output = list(processor.get_items()) 391 | 392 | self.assertEqual(len(output), 1) 393 | 394 | self.assertEqual(output[0].size, 9411) 395 | self.assertEqual(output[0].n, 269) 396 | 397 | self.assertFalse(processor.has_items()) 398 | 399 | 400 | class CheckpointTests(BaseKinesisTests): 401 | """ 402 | Checkpoint Tests 403 | """ 404 | 405 | @classmethod 406 | def patch_consumer_fetch(cls, consumer): 407 | async def get_shard_iterator(shard_id, last_sequence_number=None): 408 | log.info( 409 | "getting shard iterator for {} @ {}".format( 410 | shard_id, last_sequence_number 411 | ) 412 | ) 413 | return True 414 | 415 | consumer.get_shard_iterator = get_shard_iterator 416 | 417 | async def get_records(shard): 418 | log.info("get records shard={}".format(shard["ShardId"])) 419 | return {} 420 | 421 | consumer.get_records = get_records 422 | 423 | 
consumer.is_fetching = True 424 | 425 | async def test_memory_checkpoint(self): 426 | # first consumer 427 | checkpointer = MemoryCheckPointer(name="test") 428 | 429 | consumer_a = Consumer( 430 | stream_name=None, 431 | checkpointer=checkpointer, 432 | max_shard_consumers=1, 433 | endpoint_url=ENDPOINT_URL, 434 | ) 435 | 436 | self.patch_consumer_fetch(consumer_a) 437 | 438 | consumer_a.shards = [{"ShardId": "test-1"}, {"ShardId": "test-2"}] 439 | 440 | await consumer_a.fetch() 441 | 442 | shards = [s["ShardId"] for s in consumer_a.shards if s.get("stats")] 443 | 444 | # Expect only one shard assigned as max = 1 445 | self.assertEqual(["test-1"], shards) 446 | 447 | # second consumer (note: max_shard_consumers needs to be 2 as uses checkpointer to get allocated shards) 448 | 449 | consumer_b = Consumer( 450 | stream_name=None, 451 | checkpointer=checkpointer, 452 | max_shard_consumers=2, 453 | endpoint_url=ENDPOINT_URL, 454 | ) 455 | 456 | self.patch_consumer_fetch(consumer_b) 457 | 458 | consumer_b.shards = [{"ShardId": "test-1"}, {"ShardId": "test-2"}] 459 | 460 | await consumer_b.fetch() 461 | 462 | shards = [s["ShardId"] for s in consumer_b.shards if s.get("stats")] 463 | 464 | # Expect only one shard assigned as max = 1 465 | self.assertEqual(["test-2"], shards) 466 | 467 | async def test_redis_checkpoint_locking(self): 468 | name = "test-{}".format(str(uuid.uuid4())[0:8]) 469 | 470 | # first consumer 471 | checkpointer_a = RedisCheckPointer(name=name, id="proc-1") 472 | 473 | # second consumer 474 | checkpointer_b = RedisCheckPointer(name=name, id="proc-2") 475 | 476 | # try to allocate the same shard 477 | 478 | result = await asyncio.gather( 479 | *[checkpointer_a.allocate("test"), checkpointer_b.allocate("test")] 480 | ) 481 | 482 | result = list(sorted([x[0] for x in result])) 483 | 484 | # Expect only one to have succeeded 485 | self.assertEquals([False, True], result) 486 | 487 | await checkpointer_a.close() 488 | await checkpointer_b.close() 489 | 490 | async def test_redis_checkpoint_reallocate(self): 491 | name = "test-{}".format(str(uuid.uuid4())[0:8]) 492 | 493 | # first consumer 494 | checkpointer_a = RedisCheckPointer(name=name, id="proc-1") 495 | 496 | await checkpointer_a.allocate("test") 497 | 498 | # checkpoint 499 | await checkpointer_a.checkpoint("test", "123") 500 | 501 | # stop on this shard 502 | await checkpointer_a.deallocate("test") 503 | 504 | # second consumer 505 | checkpointer_b = RedisCheckPointer(name=name, id="proc-2") 506 | 507 | success, sequence = await checkpointer_b.allocate("test") 508 | 509 | self.assertTrue(success) 510 | self.assertEquals("123", sequence) 511 | 512 | await checkpointer_b.close() 513 | 514 | self.assertEquals(checkpointer_b.get_all_checkpoints(), {}) 515 | 516 | await checkpointer_a.close() 517 | 518 | async def test_redis_checkpoint_hearbeat(self): 519 | name = "test-{}".format(str(uuid.uuid4())[0:8]) 520 | 521 | checkpointer = RedisCheckPointer(name=name, heartbeat_frequency=0.5) 522 | 523 | await checkpointer.allocate("test") 524 | await checkpointer.checkpoint("test", "123") 525 | 526 | await asyncio.sleep(1) 527 | 528 | await checkpointer.close() 529 | 530 | # nothing to assert 531 | self.assertTrue(True) 532 | 533 | 534 | class KinesisTests(BaseKinesisTests): 535 | """ 536 | Kinesalite Tests 537 | """ 538 | 539 | async def test_stream_does_not_exist(self): 540 | 541 | await asyncio.sleep(2) 542 | 543 | # Producer 544 | with self.assertRaises(exceptions.StreamDoesNotExist): 545 | async with Producer( 546 | 
session=AioSession(), 547 | stream_name="test_stream_does_not_exist", endpoint_url=ENDPOINT_URL 548 | ) as producer: 549 | await producer.put("test") 550 | 551 | # Consumer 552 | with self.assertRaises(exceptions.StreamDoesNotExist): 553 | async with Consumer( 554 | stream_name="test_stream_does_not_exist", endpoint_url=ENDPOINT_URL 555 | ): 556 | pass 557 | 558 | @fail_on(unused_loop=True, active_handles=True) 559 | async def test_producer_put(self): 560 | async with Producer( 561 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL 562 | ) as producer: 563 | await producer.put("test") 564 | 565 | async def test_producer_put_below_limit(self): 566 | async with Producer( 567 | stream_name=self.stream_name, 568 | processor=StringProcessor(), 569 | endpoint_url=ENDPOINT_URL, 570 | ) as producer: 571 | # The maximum size of the data payload of a record before base64-encoding is up to 1 MiB. 572 | # Limit is set in aggregators.BaseAggregator (few bytes short of 1MiB) 573 | await producer.put(self.random_string(40 * 25 * 1024)) 574 | 575 | async def test_producer_put_exceed_batch_size(self): 576 | # Expect to complete by lowering batch size until successful (500 is max) 577 | async with Producer( 578 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL, batch_size=600 579 | ) as producer: 580 | 581 | for x in range(1000): 582 | await producer.put("test") 583 | 584 | async def test_producer_and_consumer(self): 585 | 586 | async with Producer( 587 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL 588 | ) as producer: 589 | pass 590 | 591 | async with Consumer( 592 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL 593 | ): 594 | pass 595 | 596 | async def test_producer_and_consumer_consume_from_start_flush(self): 597 | async with Producer( 598 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL 599 | ) as producer: 600 | 601 | await producer.put({"test": 123}) 602 | 603 | await producer.flush() 604 | 605 | results = [] 606 | 607 | async with Consumer( 608 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL 609 | ) as consumer: 610 | async for item in consumer: 611 | results.append(item) 612 | 613 | # Expect to have consumed from start as default iterator_type=TRIM_HORIZON 614 | self.assertEquals([{"test": 123}], results) 615 | 616 | async def test_producer_and_consumer_consume_from_start_after(self): 617 | 618 | # Don't flush, close producer immediately to test all data is written to stream on exit. 
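        # Relies on Producer.close() (run when the context manager exits) cancelling the
        # background flush task and performing a final flush, so queued records are still sent.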
619 | async with Producer( 620 | stream_name=self.stream_name, 621 | endpoint_url=ENDPOINT_URL, 622 | processor=StringProcessor(), 623 | ) as producer: 624 | # Put enough data to ensure it will require more than one put 625 | # ie test overflow behaviour 626 | for _ in range(15): 627 | await producer.put(self.random_string(100 * 1024)) 628 | 629 | results = [] 630 | 631 | async with Consumer( 632 | stream_name=self.stream_name, 633 | endpoint_url=ENDPOINT_URL, 634 | processor=StringProcessor(), 635 | ) as consumer: 636 | async for item in consumer: 637 | results.append(item) 638 | 639 | # Expect to have consumed from start as default iterator_type=TRIM_HORIZON 640 | self.assertEquals(len(results), 15) 641 | 642 | async def test_producer_and_consumer_consume_with_json_line_aggregator(self): 643 | 644 | processor = JsonLineProcessor() 645 | 646 | async with Producer( 647 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL, processor=processor 648 | ) as producer: 649 | 650 | for x in range(0, 10): 651 | await producer.put({"test": x}) 652 | 653 | await producer.flush() 654 | 655 | results = [] 656 | 657 | async with Consumer( 658 | stream_name=self.stream_name, 659 | endpoint_url=ENDPOINT_URL, 660 | processor=processor, 661 | ) as consumer: 662 | async for item in consumer: 663 | results.append(item) 664 | 665 | # Expect to have consumed from start as default iterator_type=TRIM_HORIZON 666 | 667 | self.assertEqual(len(results), 10) 668 | 669 | self.assertEquals(results[0], {"test": 0}) 670 | self.assertEquals(results[-1], {"test": 9}) 671 | 672 | async def test_producer_and_consumer_consume_with_msgpack_aggregator(self): 673 | 674 | processor = MsgpackProcessor() 675 | 676 | async with Producer( 677 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL, processor=processor 678 | ) as producer: 679 | 680 | for x in range(0, 10): 681 | await producer.put({"test": x}) 682 | 683 | await producer.flush() 684 | 685 | results = [] 686 | 687 | async with Consumer( 688 | stream_name=self.stream_name, 689 | endpoint_url=ENDPOINT_URL, 690 | processor=processor, 691 | ) as consumer: 692 | async for item in consumer: 693 | results.append(item) 694 | 695 | # Expect to have consumed from start as default iterator_type=TRIM_HORIZON 696 | 697 | self.assertEqual(len(results), 10) 698 | 699 | self.assertEquals(results[0], {"test": 0}) 700 | self.assertEquals(results[-1], {"test": 9}) 701 | 702 | async def test_producer_and_consumer_consume_with_bytes(self): 703 | class ByteSerializer(Serializer): 704 | def serialize(self, msg): 705 | result = str.encode(msg) 706 | return result 707 | 708 | def deserialize(self, data): 709 | return data 710 | 711 | class ByteProcessor(Processor, NetstringAggregator, ByteSerializer): 712 | pass 713 | 714 | processor = ByteProcessor() 715 | 716 | async with Producer( 717 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL, processor=processor 718 | ) as producer: 719 | 720 | for x in range(0, 2): 721 | await producer.put(f"{x}") 722 | 723 | await producer.flush() 724 | 725 | results = [] 726 | 727 | checkpointer = MemoryCheckPointer(name="test") 728 | 729 | async with Consumer( 730 | stream_name=self.stream_name, 731 | endpoint_url=ENDPOINT_URL, 732 | processor=processor, 733 | checkpointer=checkpointer, 734 | ) as consumer: 735 | async for item in consumer: 736 | results.append(item) 737 | await checkpointer.checkpoint( 738 | shard_id=consumer.shards[0]["ShardId"], sequence="seq" 739 | ) 740 | 741 | async for item in consumer: 742 | results.append(item) 743 | 744 
| self.assertEquals(len(results), 2) 745 | 746 | await checkpointer.close() 747 | 748 | self.assertEquals(len(checkpointer.get_all_checkpoints()), 1) 749 | 750 | async def test_producer_and_consumer_consume_queue_full(self): 751 | async with Producer( 752 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL 753 | ) as producer: 754 | 755 | for i in range(0, 100): 756 | await producer.put("test") 757 | 758 | await producer.flush() 759 | 760 | results = [] 761 | 762 | async with Consumer( 763 | stream_name=self.stream_name, 764 | endpoint_url=ENDPOINT_URL, 765 | max_queue_size=20, 766 | ) as consumer: 767 | 768 | async for item in consumer: 769 | results.append(item) 770 | 771 | # Expect 20 only as queue is full and we don't wait on queue 772 | self.assertEqual(20, len(results)) 773 | 774 | async def test_producer_and_consumer_consume_throttle(self): 775 | async with Producer( 776 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL 777 | ) as producer: 778 | 779 | for i in range(0, 100): 780 | await producer.put("test") 781 | 782 | await producer.flush() 783 | 784 | results = [] 785 | 786 | async with Consumer( 787 | stream_name=self.stream_name, 788 | endpoint_url=ENDPOINT_URL, 789 | record_limit=10, 790 | # 2 per second 791 | shard_fetch_rate=2, 792 | ) as consumer: 793 | 794 | from datetime import datetime 795 | 796 | dt = datetime.now() 797 | 798 | while (datetime.now() - dt).total_seconds() < 3.05: 799 | async for item in consumer: 800 | results.append(item) 801 | 802 | # Expect 2*3*10 = 60 ie at most 6 iterations of 10 records 803 | self.assertGreaterEqual(len(results), 50) 804 | self.assertLessEqual(len(results), 70) 805 | 806 | async def test_producer_and_consumer_consume_with_checkpointer_and_latest(self): 807 | async with Producer( 808 | stream_name=self.stream_name, endpoint_url=ENDPOINT_URL 809 | ) as producer: 810 | 811 | await producer.put("test.A") 812 | 813 | results = [] 814 | 815 | checkpointer = MemoryCheckPointer(name="test") 816 | 817 | async with Consumer( 818 | stream_name=self.stream_name, 819 | endpoint_url=ENDPOINT_URL, 820 | checkpointer=checkpointer, 821 | iterator_type="LATEST", 822 | ) as consumer: 823 | 824 | async for item in consumer: 825 | results.append(item) 826 | 827 | # Expect none as LATEST 828 | self.assertEquals([], results) 829 | 830 | checkpoints = checkpointer.get_all_checkpoints() 831 | 832 | # Expect 1 as only 1 shard 833 | self.assertEquals(1, len(checkpoints)) 834 | 835 | # none as no records yet (using LATEST) 836 | self.assertIsNone(checkpoints[list(checkpoints.keys())[0]]["sequence"]) 837 | 838 | results = [] 839 | 840 | log.info("checkpointer checkpoints: {}".format(checkpoints)) 841 | 842 | log.info("Starting consumer again..") 843 | 844 | async with Consumer( 845 | stream_name=self.stream_name, 846 | endpoint_url=ENDPOINT_URL, 847 | checkpointer=checkpointer, 848 | iterator_type="LATEST", 849 | sleep_time_no_records=0.5, 850 | ) as consumer: 851 | 852 | # Manually start 853 | await consumer.start_consumer() 854 | 855 | await producer.put("test.B") 856 | 857 | await producer.flush() 858 | 859 | log.info("waiting..") 860 | 861 | await asyncio.sleep(1) 862 | 863 | log.info("about to consume..") 864 | 865 | async for item in consumer: 866 | results.append(item) 867 | 868 | self.assertEquals(["test.B"], results) 869 | 870 | checkpoints = checkpointer.get_all_checkpoints() 871 | 872 | log.info("checkpointer checkpoints: {}".format(checkpoints)) 873 | 874 | # expect not None as has processed records 875 | 
self.assertIsNotNone(checkpoints[list(checkpoints.keys())[0]]["sequence"]) 876 | 877 | # now add some records 878 | for i in range(0, 10): 879 | await producer.put("test.{}".format(i)) 880 | 881 | await producer.flush() 882 | 883 | await asyncio.sleep(1) 884 | 885 | results = [] 886 | 887 | async with Consumer( 888 | stream_name=self.stream_name, 889 | endpoint_url=ENDPOINT_URL, 890 | checkpointer=checkpointer, 891 | iterator_type="LATEST", 892 | sleep_time_no_records=0.5, 893 | ) as consumer: 894 | 895 | async for item in consumer: 896 | results.append(item) 897 | 898 | # Expect results as checkpointer resumed from prior sequence 899 | self.assertEquals(10, len(results)) 900 | 901 | async def test_producer_and_consumer_consume_multiple_shards_with_redis_checkpointer( 902 | self, 903 | ): 904 | stream_name = "test_{}".format(str(uuid.uuid4())[0:8]) 905 | async with Producer( 906 | stream_name=stream_name, 907 | endpoint_url=ENDPOINT_URL, 908 | create_stream=stream_name, 909 | create_stream_shards=2, 910 | ) as producer: 911 | 912 | for i in range(0, 100): 913 | await producer.put("test.{}".format(i)) 914 | 915 | await producer.flush() 916 | 917 | results = [] 918 | 919 | checkpointer = RedisCheckPointer( 920 | name="test-{}".format(str(uuid.uuid4())[0:8]), heartbeat_frequency=3 921 | ) 922 | 923 | async with Consumer( 924 | stream_name=stream_name, 925 | endpoint_url=ENDPOINT_URL, 926 | checkpointer=checkpointer, 927 | record_limit=10, 928 | ) as consumer: 929 | 930 | # consumer will stop if no msgs 931 | for i in range(0, 6): 932 | async for item in consumer: 933 | results.append(item) 934 | await asyncio.sleep(0.5) 935 | 936 | self.assertEquals(100, len(results)) 937 | 938 | checkpoints = checkpointer.get_all_checkpoints() 939 | 940 | self.assertEquals(2, len(checkpoints)) 941 | 942 | # Expect both shards to have been used/set 943 | for item in checkpoints.values(): 944 | self.assertIsNotNone(item) 945 | 946 | 947 | class AWSKinesisTests(BaseKinesisTests): 948 | """ 949 | AWS Kinesis Tests 950 | """ 951 | 952 | STREAM_NAME_SINGLE_SHARD = "pykinesis-test-single-shard" 953 | STREAM_NAME_MULTI_SHARD = "pykinesis-test-multi-shard" 954 | 955 | forbid_get_event_loop = True 956 | 957 | @classmethod 958 | def setUpClass(cls): 959 | if not TESTING_USE_AWS_KINESIS: 960 | return 961 | 962 | log.info( 963 | "Creating (or ignoring if exists) *Actual* Kinesis stream: {}".format( 964 | cls.STREAM_NAME_SINGLE_SHARD 965 | ) 966 | ) 967 | 968 | async def create(stream_name, shards): 969 | async with Producer(stream_name=stream_name, create_stream=True, create_stream_shards=shards) as producer: 970 | await producer.start() 971 | 972 | asyncio.run(create(stream_name=cls.STREAM_NAME_SINGLE_SHARD, shards=1)) 973 | 974 | @classmethod 975 | def tearDownClass(cls): 976 | if not TESTING_USE_AWS_KINESIS: 977 | return 978 | 979 | log.warning( 980 | "Don't forget to delete your $$ streams: {} and {}".format( 981 | cls.STREAM_NAME_SINGLE_SHARD, cls.STREAM_NAME_MULTI_SHARD 982 | ) 983 | ) 984 | 985 | @skipUnless( 986 | TESTING_USE_AWS_KINESIS, "Requires TESTING_USE_AWS_KINESIS flag to be set" 987 | ) 988 | async def test_consumer_checkpoint(self): 989 | 990 | checkpointer = MemoryCheckPointer(name="test") 991 | 992 | results = [] 993 | 994 | async with Producer( 995 | stream_name=self.STREAM_NAME_SINGLE_SHARD, 996 | processor=StringProcessor(), 997 | ) as producer: 998 | 999 | async with Consumer( 1000 | stream_name=self.STREAM_NAME_SINGLE_SHARD, 1001 | checkpointer=checkpointer, 1002 | processor=StringProcessor(), 
1003 | iterator_type="LATEST", 1004 | ) as consumer: 1005 | 1006 | # Manually start 1007 | await consumer.start_consumer() 1008 | 1009 | await producer.put("test") 1010 | 1011 | await producer.flush() 1012 | 1013 | for i in range(3): 1014 | async for item in consumer: 1015 | results.append(item) 1016 | 1017 | checkpoints = checkpointer.get_all_checkpoints() 1018 | 1019 | # Expect 1 as only 1 shard 1020 | self.assertEquals(1, len(checkpoints)) 1021 | 1022 | self.assertIsNotNone(checkpoints[list(checkpoints.keys())[0]]["sequence"]) 1023 | 1024 | self.assertListEqual(results, ["test"]) 1025 | 1026 | @skipUnless( 1027 | TESTING_USE_AWS_KINESIS, "Requires TESTING_USE_AWS_KINESIS flag to be set" 1028 | ) 1029 | async def test_consumer_consume_fetch_limit(self): 1030 | 1031 | async with Consumer( 1032 | stream_name=self.STREAM_NAME_SINGLE_SHARD, 1033 | sleep_time_no_records=0.0001, 1034 | shard_fetch_rate=500, 1035 | iterator_type="LATEST", 1036 | ) as consumer: 1037 | await consumer.start() 1038 | 1039 | # GetShardIterator has a limit of five transactions per second per account per open shard 1040 | 1041 | for i in range(0, 500): 1042 | await consumer.fetch() 1043 | # sleep 50ms 1044 | await asyncio.sleep(0.05) 1045 | 1046 | shard_stats = [s["stats"] for s in consumer.shards][0].to_data() 1047 | 1048 | self.assertTrue( 1049 | shard_stats["throttled"] > 0, msg="Expected to be throttled" 1050 | ) 1051 | 1052 | @skipUnless( 1053 | TESTING_USE_AWS_KINESIS, "Requires TESTING_USE_AWS_KINESIS flag to be set" 1054 | ) 1055 | async def test_producer_producer_limit(self): 1056 | # Expect some throughput errors 1057 | 1058 | async with Producer( 1059 | stream_name=self.STREAM_NAME_SINGLE_SHARD, 1060 | processor=StringProcessor(), 1061 | put_bandwidth_limit_per_shard=1500, 1062 | ) as producer: 1063 | 1064 | async with Consumer( 1065 | stream_name=self.STREAM_NAME_SINGLE_SHARD, 1066 | processor=StringProcessor(), 1067 | iterator_type="LATEST", 1068 | ) as consumer: 1069 | 1070 | await consumer.start_consumer() 1071 | 1072 | # Wait a bit just to be sure iterator is gonna get late 1073 | await asyncio.sleep(3) 1074 | 1075 | for x in range(20): 1076 | await producer.put(self.random_string(1024 * 250)) 1077 | 1078 | # todo: async timeout 1079 | output = [] 1080 | while len(output) < 20: 1081 | async for item in consumer: 1082 | output.append(item) 1083 | 1084 | self.assertEquals(len(output), 20) 1085 | self.assertTrue(producer.throughput_exceeded_count > 0) 1086 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py37,py38,black,mypy 3 | 4 | [testenv] 5 | passenv = TESTING_USE_AWS_KINESIS 6 | commands = 7 | nosetests [] 8 | 9 | deps = 10 | -r test-requirements.txt 11 | 12 | [testenv:black] 13 | deps = 14 | black 15 | commands = 16 | black --check --diff kinesis 17 | skip_install = true 18 | 19 | [testenv:mypy] 20 | deps = 21 | mypy 22 | commands = 23 | mypy kinesis --------------------------------------------------------------------------------
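For reference, a minimal end-to-end usage sketch based on the Producer and Consumer classes above. The stream name is a placeholder, and the endpoint URL assumes a local kinesalite instance as configured in tests.py / docker-compose.yaml; against real AWS Kinesis you would drop endpoint_url and rely on the usual botocore credentials.

import asyncio

from kinesis import Consumer, Producer


async def main():
    # Placeholder stream; create_stream provisions it (1 shard) if it does not exist yet.
    async with Producer(
        stream_name="example-stream",
        endpoint_url="http://localhost:4567",
        create_stream=True,
        create_stream_shards=1,
    ) as producer:
        # JsonProcessor is the default processor, so plain dicts are serialized as JSON.
        await producer.put({"event": "hello"})
        await producer.flush()

    async with Consumer(
        stream_name="example-stream",
        endpoint_url="http://localhost:4567",
    ) as consumer:
        # Default iterator_type is TRIM_HORIZON, so this reads from the start of the stream
        # and stops iterating once the internal queue stays empty.
        async for item in consumer:
            print(item)


if __name__ == "__main__":
    asyncio.run(main())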