├── kiner
    ├── __init__.py
    └── producer.py
├── .editorconfig
├── Pipfile
├── .travis.yml
├── setup.py
├── LICENSE
├── .gitignore
├── README.md
├── .github
    └── workflows
    │   └── codeql-analysis.yml
└── tests
    └── test_producer.py


/kiner/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | logging.getLogger(__name__).addHandler(logging.NullHandler())
4 | 


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | root = true
 2 | 
 3 | [*]
 4 | indent_style = space
 5 | indent_size = 4
 6 | end_of_line = lf
 7 | charset = utf-8
 8 | trim_trailing_whitespace = true
 9 | insert_final_newline = true
10 | 
11 | [*.md]
12 | trim_trailing_whitespace = true
13 | 


--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | "boto3" = "*"
 8 | 
 9 | [dev-packages]
10 | "moto" = "*"
11 | "pytest" = "*"
12 | "kiner" = {editable = true, path = "."}
13 | 
14 | [requires]
15 | python_version = "3.7"
16 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | 
 3 | python: 3.6
 4 | 
 5 | env:
 6 |   - AWS_ACCESS_KEY_ID=1234 AWS_DEFAULT_REGION=us-west-1 AWS_SECRET_ACCESS_KEY=shhh
 7 | 
 8 | install:
 9 |   - pip install pipenv
10 |   - pipenv install --dev --skip-lock
11 | 
12 | script: pipenv run pytest
13 | 
14 | deploy:
15 |   provider: pypi
16 |   user: $PYPI_USERNAME
17 |   password: $PYPI_PASSWORD
18 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/env python
 2 | 
 3 | from setuptools import setup
 4 | 
 5 | setup(
 6 |     name="kiner",
 7 |     packages=["kiner"],
 8 |     version="0.8.1",
 9 |     description="Python AWS Kinesis Producer",
10 |     author="David Gasquez",
11 |     license="MIT",
12 |     author_email="davidgasquez@buffer.com",
13 |     url="https://github.com/bufferapp/kiner",
14 |     keywords=["kinesis", "producer", "aws"],
15 |     install_requires=["boto3", "future"],
16 | )
17 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Buffer
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | Pipfile.lock
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .coverage
 43 | .coverage.*
 44 | .cache
 45 | nosetests.xml
 46 | coverage.xml
 47 | *.cover
 48 | .hypothesis/
 49 | .pytest_cache
 50 | 
 51 | # Translations
 52 | *.mo
 53 | *.pot
 54 | 
 55 | # Django stuff:
 56 | *.log
 57 | local_settings.py
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # dotenv
 85 | .env
 86 | 
 87 | # virtualenv
 88 | .venv
 89 | venv/
 90 | ENV/
 91 | 
 92 | # Spyder project settings
 93 | .spyderproject
 94 | .spyproject
 95 | 
 96 | # Rope project settings
 97 | .ropeproject
 98 | 
 99 | # mkdocs documentation
100 | /site
101 | 
102 | # mypy
103 | .mypy_cache/
104 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <p align="center">
 2 |     <img width=35% src="https://user-images.githubusercontent.com/1682202/40414884-c84a4248-5e79-11e8-9df4-e7d1da89ed92.png">
 3 | </p>
 4 | 
 5 | <p align="center">
 6 |     <i>A simple Python AWS Kinesis Producer.</i>
 7 | </p>
 8 | 
 9 | &nbsp;&nbsp;&nbsp;&nbsp; 
10 | 
11 | [![Build Status](https://travis-ci.org/bufferapp/kiner.svg?branch=master)](https://travis-ci.org/bufferapp/kiner)
12 | [![PyPI version](https://badge.fury.io/py/kiner.svg)](https://badge.fury.io/py/kiner)
13 | [![License](https://img.shields.io/github/license/mashape/apistatus.svg)](LICENSE)
14 | 
15 | 
16 | ### Features
17 | 
18 | - Error handling and retrying with exponential backoff
19 | - Automatic batching and flush callbacks
20 | - Threaded execution
21 | 
22 | Inspired by the AWS blog post [Implementing Efficient and Reliable Producers with the Amazon Kinesis Producer Library](https://aws.amazon.com/blogs/big-data/implementing-efficient-and-reliable-producers-with-the-amazon-kinesis-producer-library/).
23 | 
24 | ## Installation
25 | 
26 | You can use `pip` to install Kiner.
27 | 
28 | ```bash
29 | pip install kiner
30 | ```
31 | 
32 | ## Usage
33 | 
34 | To use Kiner, you'll need to have AWS authentication credentials configured
35 | as stated in the [`boto3` documentation](https://boto3.readthedocs.io/en/latest/guide/quickstart.html#configuration)
36 | 
37 | ```python
38 | from kiner.producer import KinesisProducer
39 | 
40 | p = KinesisProducer('stream-name', batch_size=500, max_retries=5, threads=10)
41 | 
42 | for i in range(10000):
43 |     p.put_record(i)
44 | 
45 | p.close()
46 | ```
47 | 
48 | To be notified when data is flushed to AWS Kinesis, provide a flush_callback
49 | ```python
50 | from uuid import uuid4
51 | from kiner.producer import KinesisProducer
52 | 
53 | def on_flush(count, last_flushed_at, Data=b'', PartitionKey='', Metadata=()):
54 |     print(f"""
55 |         Flushed {count} messages at timestamp {last_flushed_at}
56 |         Last message was {Metadata['id']} paritioned by {PartitionKey} ({len(Data)} bytes)
57 |     """)
58 | 
59 | p = KinesisProducer('stream-name', flush_callback=on_flush)
60 | 
61 | for i in range(10000):
62 |     p.put_record(i, metadata={'id': uuid4()}, partition_key=f"{i % 2}")
63 | 
64 | p.close()
65 | 
66 | ```
67 | ## Contributions
68 | 
69 | - Logo design by [@area55git](https://github.com/area55git)
70 | 


--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
 1 | # For most projects, this workflow file will not need changing; you simply need
 2 | # to commit it to your repository.
 3 | #
 4 | # You may wish to alter this file to override the set of languages analyzed,
 5 | # or to provide custom queries or build logic.
 6 | name: "CodeQL"
 7 | 
 8 | on:
 9 |   push:
10 |     branches: [main]
11 |   pull_request:
12 |     # The branches below must be a subset of the branches above
13 |     branches: [main]
14 |   schedule:
15 |     - cron: '0 15 * * 0'
16 | 
17 | jobs:
18 |   analyze:
19 |     name: Analyze
20 |     runs-on: ubuntu-latest
21 | 
22 |     strategy:
23 |       fail-fast: false
24 |       matrix:
25 |         # Override automatic language detection by changing the below list
26 |         # Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python']
27 |         language: ['python']
28 |         # Learn more...
29 |         # https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection
30 | 
31 |     steps:
32 |     - name: Checkout repository
33 |       uses: actions/checkout@v2
34 |       with:
35 |         # We must fetch at least the immediate parents so that if this is
36 |         # a pull request then we can checkout the head.
37 |         fetch-depth: 2
38 | 
39 |     # If this run was triggered by a pull request event, then checkout
40 |     # the head of the pull request instead of the merge commit.
41 |     - run: git checkout HEAD^2
42 |       if: ${{ github.event_name == 'pull_request' }}
43 | 
44 |     # Initializes the CodeQL tools for scanning.
45 |     - name: Initialize CodeQL
46 |       uses: github/codeql-action/init@v1
47 |       with:
48 |         languages: ${{ matrix.language }}
49 |         # If you wish to specify custom queries, you can do so here or in a config file.
50 |         # By default, queries listed here will override any specified in a config file. 
51 |         # Prefix the list here with "+" to use these queries and those in the config file.
52 |         # queries: ./path/to/local/query, your-org/your-repo/queries@main
53 | 
54 |     # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
55 |     # If this step fails, then you should remove it and run the build manually (see below)
56 |     - name: Autobuild
57 |       uses: github/codeql-action/autobuild@v1
58 | 
59 |     # ℹ️ Command-line programs to run using the OS shell.
60 |     # 📚 https://git.io/JvXDl
61 | 
62 |     # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
63 |     #    and modify them (or add more) to build your code if your project
64 |     #    uses a compiled language
65 | 
66 |     #- run: |
67 |     #   make bootstrap
68 |     #   make release
69 | 
70 |     - name: Perform CodeQL Analysis
71 |       uses: github/codeql-action/analyze@v1
72 | 


--------------------------------------------------------------------------------
/tests/test_producer.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import time
 3 | 
 4 | from mock import Mock, patch, ANY
 5 | from moto import mock_kinesis
 6 | from kiner.producer import encode_data
 7 | from kiner.producer import KinesisProducer
 8 | 
 9 | 
10 | BATCH_SIZE=50
11 | 
12 | @pytest.fixture
13 | def flush_callback():
14 |     return Mock()
15 | 
16 | @pytest.fixture
17 | def producer(flush_callback):
18 |     producer = KinesisProducer('test_stream', batch_size=BATCH_SIZE,
19 |                                batch_time=1, threads=3, flush_callback=flush_callback)
20 |     return producer
21 | 
22 | 
23 | @pytest.fixture
24 | def client(producer):
25 |     return producer.kinesis_client
26 | 
27 | 
28 | @pytest.mark.parametrize('data', [1, '1', b'1'])
29 | def test_encode_data(data):
30 |     assert encode_data(data) == b'1'
31 | 
32 | 
33 | @mock_kinesis
34 | @pytest.mark.parametrize('n', [1, 101, 179, 234, 399])
35 | def test_send_records(producer, client, n, flush_callback):
36 |     client.create_stream(StreamName=producer.stream_name, ShardCount=1)
37 | 
38 |     with patch('kiner.producer.time') as mock_time:
39 |         # Put records in the stream
40 |         for i in range(n - 1):
41 |             producer.put_record(i, metadata={'i': i, 'n': n})
42 |         producer.put_record(n - 1, metadata={'i': n - 1, 'n': n}, partition_key='some-partition-key')
43 | 
44 |         producer.close()
45 | 
46 |         # Assert flush callback called for at least as many batches were sent
47 |         assert flush_callback.call_count >= n // BATCH_SIZE + 1
48 |         # Assert the final record was flushed
49 |         flush_callback.assert_any_call(
50 |             ANY, mock_time.time(), Data=str(n - 1).encode(), Metadata={'i': n-1, 'n': n}, PartitionKey='some-partition-key'
51 |         )
52 |         # Assert we flushed n records
53 |         assert sum(call[1][0] for call in flush_callback.mock_calls) == n
54 | 
55 |     response = client.describe_stream(StreamName=producer.stream_name)
56 |     shard_id = response['StreamDescription']['Shards'][0]['ShardId']
57 |     shard_iterator = client.get_shard_iterator(
58 |         StreamName=producer.stream_name,
59 |         ShardId=shard_id,
60 |         ShardIteratorType='TRIM_HORIZON'
61 |     ).get('ShardIterator')
62 | 
63 |     records = client.get_records(ShardIterator=shard_iterator, Limit=n)['Records']
64 | 
65 |     assert len(records) == n
66 | 
67 | 
68 | @mock_kinesis
69 | @pytest.mark.parametrize('n', [49, 141])
70 | def test_send_records_without_close(producer, client, n):
71 |     client.create_stream(StreamName=producer.stream_name, ShardCount=1)
72 | 
73 |     # Put records in the stream
74 |     for i in range(n):
75 |         producer.put_record(i)
76 | 
77 |     time.sleep(2)
78 | 
79 |     assert producer.queue.empty()
80 | 
81 |     producer.close()
82 | 
83 |     response = client.describe_stream(StreamName=producer.stream_name)
84 |     shard_id = response['StreamDescription']['Shards'][0]['ShardId']
85 |     shard_iterator = client.get_shard_iterator(
86 |         StreamName=producer.stream_name,
87 |         ShardId=shard_id,
88 |         ShardIteratorType='TRIM_HORIZON'
89 |     ).get('ShardIterator')
90 | 
91 |     records = client.get_records(ShardIterator=shard_iterator, Limit=n)['Records']
92 |     assert len(records) == n
93 | 


--------------------------------------------------------------------------------
/kiner/producer.py:
--------------------------------------------------------------------------------
  1 | import boto3
  2 | from concurrent.futures import ThreadPoolExecutor
  3 | import logging
  4 | from queue import Queue
  5 | import sys
  6 | import threading
  7 | import time
  8 | import uuid
  9 | import atexit
 10 | 
 11 | logger = logging.getLogger(__name__)
 12 | 
 13 | 
 14 | def encode_data(data, encoding='utf_8'):
 15 |     if isinstance(data, bytes):
 16 |         return data
 17 |     else:
 18 |         return str(data).encode(encoding)
 19 | 
 20 | 
 21 | class KinesisProducer:
 22 |     """Basic Kinesis Producer.
 23 | 
 24 |     Parameters
 25 |     ----------
 26 |     stream_name : string
 27 |         Name of the stream to send the records.
 28 |     batch_size : int
 29 |         Numbers of records to batch before flushing the queue.
 30 |     batch_time : int
 31 |         Maximum of seconds to wait before flushing the queue.
 32 |     max_retries: int
 33 |         Maximum number of times to retry the put operation.
 34 |     kinesis_client: boto3.client
 35 |         Kinesis client.
 36 |     flush_callback: [int, float, Data=bytes, PartitionKey=str, Metadata=Any] -> None
 37 |         An optional callback to be invoked upon flushing to kinesis,
 38 |         called with the number of records flushed, time.time() when flush occurred
 39 |         and kwargs of the last record flushed
 40 | 
 41 |     Attributes
 42 |     ----------
 43 |     records : array
 44 |         Queue of formated records.
 45 |     pool: concurrent.futures.ThreadPoolExecutor
 46 |         Pool of threads handling client I/O.
 47 |     """
 48 | 
 49 |     def __init__(self, stream_name, batch_size=500,
 50 |                  batch_size_bytes=1024,
 51 |                  batch_time=5, max_retries=5, threads=10,
 52 |                  kinesis_client=None, flush_callback=None):
 53 |         self.stream_name = stream_name
 54 |         self.queue = Queue()
 55 |         self.queue_size_bytes = 0  # size of queue content
 56 |         self.batch_size = batch_size
 57 |         self.batch_size_bytes = batch_size_bytes
 58 |         self.batch_time = batch_time
 59 |         self.max_retries = max_retries
 60 |         if kinesis_client is None:
 61 |             kinesis_client = boto3.client('kinesis')
 62 |         self.kinesis_client = kinesis_client
 63 |         self.flush_callback = flush_callback
 64 |         self.pool = ThreadPoolExecutor(threads)
 65 |         self.last_flush = time.time()
 66 |         self.monitor_running = threading.Event()
 67 |         self.monitor_running.set()
 68 |         self.pool.submit(self.monitor)
 69 | 
 70 |         atexit.register(self.close)
 71 | 
 72 |     def monitor(self):
 73 |         """Flushes the queue periodically."""
 74 |         while self.monitor_running.is_set():
 75 |             if time.time() - self.last_flush > self.batch_time:
 76 |                 if not self.queue.empty():
 77 |                     logger.info("Queue Flush: time without flush exceeded")
 78 |                     self.flush_queue()
 79 |             time.sleep(self.batch_time)
 80 | 
 81 |     def put_records(self, records, partition_key=None):
 82 |         """Add a list of data records to the record queue in the proper format.
 83 |         Convinience method that calls self.put_record for each element.
 84 | 
 85 |         Parameters
 86 |         ----------
 87 |         records : list
 88 |             Lists of records to send.
 89 |         partition_key: str
 90 |             Hash that determines which shard a given data record belongs to.
 91 | 
 92 |         """
 93 |         for record in records:
 94 |             self.put_record(record, partition_key)
 95 | 
 96 |     def put_record(self, data, metadata=None, partition_key=None):
 97 |         """Add data to the record queue in the proper format.
 98 | 
 99 |         Parameters
100 |         ----------
101 |         data : str|bytes
102 |             Data to send.
103 |         metadata: dict
104 |             Metadata associated with the record.
105 |         partition_key: str
106 |             Hash that determines which shard a given data record belongs to.
107 | 
108 |         """
109 |         # Byte encode the data
110 |         data = encode_data(data)
111 | 
112 |         # Create a random partition key if not provided
113 |         if not partition_key:
114 |             partition_key = uuid.uuid4().hex
115 | 
116 |         # Build the record
117 |         record = {
118 |             'Data': data,
119 |             'PartitionKey': partition_key,
120 |             'Metadata': metadata
121 |         }
122 |         record_bytes = sys.getsizeof(record) / 1048576 # 1048576=1024**2
123 | 
124 |         # Flush the queue if it reaches the batch size
125 |         if self.queue.qsize() >= self.batch_size or self.queue_size_bytes >= self.batch_size_bytes:
126 |             logger.info("Queue Flush: batch size reached")
127 |             self.pool.submit(self.flush_queue)
128 | 
129 |         # Append the record
130 |         logger.debug('Putting record "{}"'.format(record['Data'][:100]))
131 |         self.queue.put(record)
132 |         self.queue_size_bytes += record_bytes
133 | 
134 | 
135 |     def close(self):
136 |         """Flushes the queue and waits for the executor to finish."""
137 |         logger.info('Closing producer')
138 |         self.flush_queue()
139 |         self.monitor_running.clear()
140 |         self.pool.shutdown()
141 |         logger.info('Producer closed')
142 | 
143 |     def flush_queue(self):
144 |         """Grab all the current records in the queue and send them."""
145 |         records = []
146 |         records_size_bytes = 0
147 | 
148 |         while \
149 |             not self.queue.empty() and \
150 |                 len(records) < self.batch_size and \
151 |                 records_size_bytes < self.batch_size_bytes:
152 |             record = self.queue.get()
153 | 
154 |             record_size = sys.getsizeof(record)
155 |             records_size_bytes += record_size
156 |             self.queue_size_bytes -= record_size
157 | 
158 |             # Pop metadata from the queued record for notifying the callback
159 |             last_metadata = record.pop('Metadata')
160 |             records.append(record)
161 | 
162 |         if records:
163 |             self.send_records(records)
164 |             self.last_flush = time.time()
165 |             if self.flush_callback:
166 |                 self.flush_callback(len(records), self.last_flush, Metadata=last_metadata, **records[-1])
167 | 
168 |     def send_records(self, records, attempt=0):
169 |         """Send records to the Kinesis stream.
170 | 
171 |         Falied records are sent again with an exponential backoff decay.
172 | 
173 |         Parameters
174 |         ----------
175 |         records : array
176 |             Array of formated records to send.
177 |         attempt: int
178 |             Number of times the records have been sent without success.
179 |         """
180 | 
181 |         # If we already tried more times than we wanted, save to a file
182 |         if attempt > self.max_retries:
183 |             logger.warning('Writing {} records to file'.format(len(records)))
184 |             with open('failed_records.dlq', 'ab') as f:
185 |                 for r in records:
186 |                     f.write(r.get('Data'))
187 |             return
188 | 
189 |         # Sleep before retrying
190 |         if attempt:
191 |             time.sleep(2 ** attempt * .1)
192 | 
193 |         response = self.kinesis_client.put_records(StreamName=self.stream_name,
194 |                                                    Records=records)
195 |         failed_record_count = response['FailedRecordCount']
196 | 
197 |         # Grab failed records
198 |         if failed_record_count:
199 |             logger.warning('Retrying failed records')
200 |             failed_records = []
201 |             for i, record in enumerate(response['Records']):
202 |                 if record.get('ErrorCode'):
203 |                     failed_records.append(records[i])
204 | 
205 |             # Recursive call
206 |             attempt += 1
207 |             self.send_records(failed_records, attempt=attempt)
208 | 


--------------------------------------------------------------------------------