├── kiner ├── __init__.py └── producer.py ├── .editorconfig ├── Pipfile ├── .travis.yml ├── setup.py ├── LICENSE ├── .gitignore ├── README.md ├── .github └── workflows │ └── codeql-analysis.yml └── tests └── test_producer.py /kiner/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logging.getLogger(__name__).addHandler(logging.NullHandler()) 4 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 4 6 | end_of_line = lf 7 | charset = utf-8 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | 11 | [*.md] 12 | trim_trailing_whitespace = true 13 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | "boto3" = "*" 8 | 9 | [dev-packages] 10 | "moto" = "*" 11 | "pytest" = "*" 12 | "kiner" = {editable = true, path = "."} 13 | 14 | [requires] 15 | python_version = "3.7" 16 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 3.6 4 | 5 | env: 6 | - AWS_ACCESS_KEY_ID=1234 AWS_DEFAULT_REGION=us-west-1 AWS_SECRET_ACCESS_KEY=shhh 7 | 8 | install: 9 | - pip install pipenv 10 | - pipenv install --dev --skip-lock 11 | 12 | script: pipenv run pytest 13 | 14 | deploy: 15 | provider: pypi 16 | user: $PYPI_USERNAME 17 | password: $PYPI_PASSWORD 18 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | 3 | from setuptools import setup 4 | 5 | setup( 6 | name="kiner", 7 | packages=["kiner"], 8 | version="0.8.1", 9 | description="Python AWS Kinesis Producer", 10 | author="David Gasquez", 11 | license="MIT", 12 | author_email="davidgasquez@buffer.com", 13 | url="https://github.com/bufferapp/kiner", 14 | keywords=["kinesis", "producer", "aws"], 15 | install_requires=["boto3", "future"], 16 | ) 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Buffer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | Pipfile.lock 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # dotenv 85 | .env 86 | 87 | # virtualenv 88 | .venv 89 | venv/ 90 | ENV/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 |

6 | A simple Python AWS Kinesis Producer. 7 |

8 | 9 |      10 | 11 | [![Build Status](https://travis-ci.org/bufferapp/kiner.svg?branch=master)](https://travis-ci.org/bufferapp/kiner) 12 | [![PyPI version](https://badge.fury.io/py/kiner.svg)](https://badge.fury.io/py/kiner) 13 | [![License](https://img.shields.io/github/license/mashape/apistatus.svg)](LICENSE) 14 | 15 | 16 | ### Features 17 | 18 | - Error handling and retrying with exponential backoff 19 | - Automatic batching and flush callbacks 20 | - Threaded execution 21 | 22 | Inspired by the AWS blog post [Implementing Efficient and Reliable Producers with the Amazon Kinesis Producer Library](https://aws.amazon.com/blogs/big-data/implementing-efficient-and-reliable-producers-with-the-amazon-kinesis-producer-library/). 23 | 24 | ## Installation 25 | 26 | You can use `pip` to install Kiner. 27 | 28 | ```bash 29 | pip install kiner 30 | ``` 31 | 32 | ## Usage 33 | 34 | To use Kiner, you'll need to have AWS authentication credentials configured 35 | as stated in the [`boto3` documentation](https://boto3.readthedocs.io/en/latest/guide/quickstart.html#configuration) 36 | 37 | ```python 38 | from kiner.producer import KinesisProducer 39 | 40 | p = KinesisProducer('stream-name', batch_size=500, max_retries=5, threads=10) 41 | 42 | for i in range(10000): 43 | p.put_record(i) 44 | 45 | p.close() 46 | ``` 47 | 48 | To be notified when data is flushed to AWS Kinesis, provide a flush_callback 49 | ```python 50 | from uuid import uuid4 51 | from kiner.producer import KinesisProducer 52 | 53 | def on_flush(count, last_flushed_at, Data=b'', PartitionKey='', Metadata=()): 54 | print(f""" 55 | Flushed {count} messages at timestamp {last_flushed_at} 56 | Last message was {Metadata['id']} paritioned by {PartitionKey} ({len(Data)} bytes) 57 | """) 58 | 59 | p = KinesisProducer('stream-name', flush_callback=on_flush) 60 | 61 | for i in range(10000): 62 | p.put_record(i, metadata={'id': uuid4()}, partition_key=f"{i % 2}") 63 | 64 | p.close() 65 | 66 | ``` 67 | ## Contributions 68 | 69 | - Logo design by [@area55git](https://github.com/area55git) 70 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | name: "CodeQL" 7 | 8 | on: 9 | push: 10 | branches: [main] 11 | pull_request: 12 | # The branches below must be a subset of the branches above 13 | branches: [main] 14 | schedule: 15 | - cron: '0 15 * * 0' 16 | 17 | jobs: 18 | analyze: 19 | name: Analyze 20 | runs-on: ubuntu-latest 21 | 22 | strategy: 23 | fail-fast: false 24 | matrix: 25 | # Override automatic language detection by changing the below list 26 | # Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python'] 27 | language: ['python'] 28 | # Learn more... 29 | # https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection 30 | 31 | steps: 32 | - name: Checkout repository 33 | uses: actions/checkout@v2 34 | with: 35 | # We must fetch at least the immediate parents so that if this is 36 | # a pull request then we can checkout the head. 37 | fetch-depth: 2 38 | 39 | # If this run was triggered by a pull request event, then checkout 40 | # the head of the pull request instead of the merge commit. 41 | - run: git checkout HEAD^2 42 | if: ${{ github.event_name == 'pull_request' }} 43 | 44 | # Initializes the CodeQL tools for scanning. 45 | - name: Initialize CodeQL 46 | uses: github/codeql-action/init@v1 47 | with: 48 | languages: ${{ matrix.language }} 49 | # If you wish to specify custom queries, you can do so here or in a config file. 50 | # By default, queries listed here will override any specified in a config file. 51 | # Prefix the list here with "+" to use these queries and those in the config file. 52 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 53 | 54 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 55 | # If this step fails, then you should remove it and run the build manually (see below) 56 | - name: Autobuild 57 | uses: github/codeql-action/autobuild@v1 58 | 59 | # ℹ️ Command-line programs to run using the OS shell. 60 | # 📚 https://git.io/JvXDl 61 | 62 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 63 | # and modify them (or add more) to build your code if your project 64 | # uses a compiled language 65 | 66 | #- run: | 67 | # make bootstrap 68 | # make release 69 | 70 | - name: Perform CodeQL Analysis 71 | uses: github/codeql-action/analyze@v1 72 | -------------------------------------------------------------------------------- /tests/test_producer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import time 3 | 4 | from mock import Mock, patch, ANY 5 | from moto import mock_kinesis 6 | from kiner.producer import encode_data 7 | from kiner.producer import KinesisProducer 8 | 9 | 10 | BATCH_SIZE=50 11 | 12 | @pytest.fixture 13 | def flush_callback(): 14 | return Mock() 15 | 16 | @pytest.fixture 17 | def producer(flush_callback): 18 | producer = KinesisProducer('test_stream', batch_size=BATCH_SIZE, 19 | batch_time=1, threads=3, flush_callback=flush_callback) 20 | return producer 21 | 22 | 23 | @pytest.fixture 24 | def client(producer): 25 | return producer.kinesis_client 26 | 27 | 28 | @pytest.mark.parametrize('data', [1, '1', b'1']) 29 | def test_encode_data(data): 30 | assert encode_data(data) == b'1' 31 | 32 | 33 | @mock_kinesis 34 | @pytest.mark.parametrize('n', [1, 101, 179, 234, 399]) 35 | def test_send_records(producer, client, n, flush_callback): 36 | client.create_stream(StreamName=producer.stream_name, ShardCount=1) 37 | 38 | with patch('kiner.producer.time') as mock_time: 39 | # Put records in the stream 40 | for i in range(n - 1): 41 | producer.put_record(i, metadata={'i': i, 'n': n}) 42 | producer.put_record(n - 1, metadata={'i': n - 1, 'n': n}, partition_key='some-partition-key') 43 | 44 | producer.close() 45 | 46 | # Assert flush callback called for at least as many batches were sent 47 | assert flush_callback.call_count >= n // BATCH_SIZE + 1 48 | # Assert the final record was flushed 49 | flush_callback.assert_any_call( 50 | ANY, mock_time.time(), Data=str(n - 1).encode(), Metadata={'i': n-1, 'n': n}, PartitionKey='some-partition-key' 51 | ) 52 | # Assert we flushed n records 53 | assert sum(call[1][0] for call in flush_callback.mock_calls) == n 54 | 55 | response = client.describe_stream(StreamName=producer.stream_name) 56 | shard_id = response['StreamDescription']['Shards'][0]['ShardId'] 57 | shard_iterator = client.get_shard_iterator( 58 | StreamName=producer.stream_name, 59 | ShardId=shard_id, 60 | ShardIteratorType='TRIM_HORIZON' 61 | ).get('ShardIterator') 62 | 63 | records = client.get_records(ShardIterator=shard_iterator, Limit=n)['Records'] 64 | 65 | assert len(records) == n 66 | 67 | 68 | @mock_kinesis 69 | @pytest.mark.parametrize('n', [49, 141]) 70 | def test_send_records_without_close(producer, client, n): 71 | client.create_stream(StreamName=producer.stream_name, ShardCount=1) 72 | 73 | # Put records in the stream 74 | for i in range(n): 75 | producer.put_record(i) 76 | 77 | time.sleep(2) 78 | 79 | assert producer.queue.empty() 80 | 81 | producer.close() 82 | 83 | response = client.describe_stream(StreamName=producer.stream_name) 84 | shard_id = response['StreamDescription']['Shards'][0]['ShardId'] 85 | shard_iterator = client.get_shard_iterator( 86 | StreamName=producer.stream_name, 87 | ShardId=shard_id, 88 | ShardIteratorType='TRIM_HORIZON' 89 | ).get('ShardIterator') 90 | 91 | records = client.get_records(ShardIterator=shard_iterator, Limit=n)['Records'] 92 | assert len(records) == n 93 | -------------------------------------------------------------------------------- /kiner/producer.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from concurrent.futures import ThreadPoolExecutor 3 | import logging 4 | from queue import Queue 5 | import sys 6 | import threading 7 | import time 8 | import uuid 9 | import atexit 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | def encode_data(data, encoding='utf_8'): 15 | if isinstance(data, bytes): 16 | return data 17 | else: 18 | return str(data).encode(encoding) 19 | 20 | 21 | class KinesisProducer: 22 | """Basic Kinesis Producer. 23 | 24 | Parameters 25 | ---------- 26 | stream_name : string 27 | Name of the stream to send the records. 28 | batch_size : int 29 | Numbers of records to batch before flushing the queue. 30 | batch_time : int 31 | Maximum of seconds to wait before flushing the queue. 32 | max_retries: int 33 | Maximum number of times to retry the put operation. 34 | kinesis_client: boto3.client 35 | Kinesis client. 36 | flush_callback: [int, float, Data=bytes, PartitionKey=str, Metadata=Any] -> None 37 | An optional callback to be invoked upon flushing to kinesis, 38 | called with the number of records flushed, time.time() when flush occurred 39 | and kwargs of the last record flushed 40 | 41 | Attributes 42 | ---------- 43 | records : array 44 | Queue of formated records. 45 | pool: concurrent.futures.ThreadPoolExecutor 46 | Pool of threads handling client I/O. 47 | """ 48 | 49 | def __init__(self, stream_name, batch_size=500, 50 | batch_size_bytes=1024, 51 | batch_time=5, max_retries=5, threads=10, 52 | kinesis_client=None, flush_callback=None): 53 | self.stream_name = stream_name 54 | self.queue = Queue() 55 | self.queue_size_bytes = 0 # size of queue content 56 | self.batch_size = batch_size 57 | self.batch_size_bytes = batch_size_bytes 58 | self.batch_time = batch_time 59 | self.max_retries = max_retries 60 | if kinesis_client is None: 61 | kinesis_client = boto3.client('kinesis') 62 | self.kinesis_client = kinesis_client 63 | self.flush_callback = flush_callback 64 | self.pool = ThreadPoolExecutor(threads) 65 | self.last_flush = time.time() 66 | self.monitor_running = threading.Event() 67 | self.monitor_running.set() 68 | self.pool.submit(self.monitor) 69 | 70 | atexit.register(self.close) 71 | 72 | def monitor(self): 73 | """Flushes the queue periodically.""" 74 | while self.monitor_running.is_set(): 75 | if time.time() - self.last_flush > self.batch_time: 76 | if not self.queue.empty(): 77 | logger.info("Queue Flush: time without flush exceeded") 78 | self.flush_queue() 79 | time.sleep(self.batch_time) 80 | 81 | def put_records(self, records, partition_key=None): 82 | """Add a list of data records to the record queue in the proper format. 83 | Convinience method that calls self.put_record for each element. 84 | 85 | Parameters 86 | ---------- 87 | records : list 88 | Lists of records to send. 89 | partition_key: str 90 | Hash that determines which shard a given data record belongs to. 91 | 92 | """ 93 | for record in records: 94 | self.put_record(record, partition_key) 95 | 96 | def put_record(self, data, metadata=None, partition_key=None): 97 | """Add data to the record queue in the proper format. 98 | 99 | Parameters 100 | ---------- 101 | data : str|bytes 102 | Data to send. 103 | metadata: dict 104 | Metadata associated with the record. 105 | partition_key: str 106 | Hash that determines which shard a given data record belongs to. 107 | 108 | """ 109 | # Byte encode the data 110 | data = encode_data(data) 111 | 112 | # Create a random partition key if not provided 113 | if not partition_key: 114 | partition_key = uuid.uuid4().hex 115 | 116 | # Build the record 117 | record = { 118 | 'Data': data, 119 | 'PartitionKey': partition_key, 120 | 'Metadata': metadata 121 | } 122 | record_bytes = sys.getsizeof(record) / 1048576 # 1048576=1024**2 123 | 124 | # Flush the queue if it reaches the batch size 125 | if self.queue.qsize() >= self.batch_size or self.queue_size_bytes >= self.batch_size_bytes: 126 | logger.info("Queue Flush: batch size reached") 127 | self.pool.submit(self.flush_queue) 128 | 129 | # Append the record 130 | logger.debug('Putting record "{}"'.format(record['Data'][:100])) 131 | self.queue.put(record) 132 | self.queue_size_bytes += record_bytes 133 | 134 | 135 | def close(self): 136 | """Flushes the queue and waits for the executor to finish.""" 137 | logger.info('Closing producer') 138 | self.flush_queue() 139 | self.monitor_running.clear() 140 | self.pool.shutdown() 141 | logger.info('Producer closed') 142 | 143 | def flush_queue(self): 144 | """Grab all the current records in the queue and send them.""" 145 | records = [] 146 | records_size_bytes = 0 147 | 148 | while \ 149 | not self.queue.empty() and \ 150 | len(records) < self.batch_size and \ 151 | records_size_bytes < self.batch_size_bytes: 152 | record = self.queue.get() 153 | 154 | record_size = sys.getsizeof(record) 155 | records_size_bytes += record_size 156 | self.queue_size_bytes -= record_size 157 | 158 | # Pop metadata from the queued record for notifying the callback 159 | last_metadata = record.pop('Metadata') 160 | records.append(record) 161 | 162 | if records: 163 | self.send_records(records) 164 | self.last_flush = time.time() 165 | if self.flush_callback: 166 | self.flush_callback(len(records), self.last_flush, Metadata=last_metadata, **records[-1]) 167 | 168 | def send_records(self, records, attempt=0): 169 | """Send records to the Kinesis stream. 170 | 171 | Falied records are sent again with an exponential backoff decay. 172 | 173 | Parameters 174 | ---------- 175 | records : array 176 | Array of formated records to send. 177 | attempt: int 178 | Number of times the records have been sent without success. 179 | """ 180 | 181 | # If we already tried more times than we wanted, save to a file 182 | if attempt > self.max_retries: 183 | logger.warning('Writing {} records to file'.format(len(records))) 184 | with open('failed_records.dlq', 'ab') as f: 185 | for r in records: 186 | f.write(r.get('Data')) 187 | return 188 | 189 | # Sleep before retrying 190 | if attempt: 191 | time.sleep(2 ** attempt * .1) 192 | 193 | response = self.kinesis_client.put_records(StreamName=self.stream_name, 194 | Records=records) 195 | failed_record_count = response['FailedRecordCount'] 196 | 197 | # Grab failed records 198 | if failed_record_count: 199 | logger.warning('Retrying failed records') 200 | failed_records = [] 201 | for i, record in enumerate(response['Records']): 202 | if record.get('ErrorCode'): 203 | failed_records.append(records[i]) 204 | 205 | # Recursive call 206 | attempt += 1 207 | self.send_records(failed_records, attempt=attempt) 208 | --------------------------------------------------------------------------------