├── .circleci └── config.yml ├── .gitignore ├── LICENSE ├── README.md ├── asyncio_buffered_pipeline.py ├── setup.py └── test.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | workflows: 3 | version: 2 4 | test: 5 | jobs: 6 | - test-3.8.3 7 | - test-3.8.2 8 | - test-3.8.1 9 | - test-3.8.0 10 | - test-3.7.7 11 | - test-3.7.4 12 | - test-3.7.3 13 | - test-3.7.2 14 | - test-3.7.1 15 | jobs: 16 | test-3.8.3: 17 | docker: 18 | - image: python:3.8.3 19 | steps: 20 | - checkout 21 | - run: 22 | name: Install coverage and reporter 23 | command: | 24 | pip install coverage==4.5.2 25 | curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter 26 | chmod +x ./cc-test-reporter 27 | - run: 28 | name: Run tests 29 | command: | 30 | ./cc-test-reporter before-build 31 | coverage run -m unittest -vb 32 | coverage xml -i 33 | ./cc-test-reporter after-build --exit-code $? 34 | test-3.8.2: &template 35 | docker: 36 | - image: python:3.8.2 37 | steps: 38 | - checkout 39 | - run: 40 | name: Run tests 41 | command: | 42 | python -m unittest -vb 43 | test-3.8.1: 44 | <<: *template 45 | docker: 46 | - image: python:3.8.1 47 | test-3.8.0: 48 | <<: *template 49 | docker: 50 | - image: python:3.8.0 51 | test-3.7.7: 52 | <<: *template 53 | docker: 54 | - image: python:3.7.7 55 | test-3.7.6: 56 | <<: *template 57 | docker: 58 | - image: python:3.7.6 59 | test-3.7.5: 60 | <<: *template 61 | docker: 62 | - image: python:3.7.5 63 | test-3.7.4: 64 | <<: *template 65 | docker: 66 | - image: python:3.7.4 67 | test-3.7.3: 68 | <<: *template 69 | docker: 70 | - image: python:3.7.3 71 | test-3.7.2: 72 | <<: *template 73 | docker: 74 | - image: python:3.7.2 75 | test-3.7.1: 76 | <<: *template 77 | docker: 78 | - image: python:3.7.1 79 | test-3.6.10: 80 | <<: *template 81 | docker: 82 | - image: python:3.6.10 83 | test-3.6.9: 84 | <<: *template 85 | docker: 86 | - image: 
python:3.6.9 87 | test-3.6.8: 88 | <<: *template 89 | docker: 90 | - image: python:3.6.8 91 | test-3.6.7: 92 | <<: *template 93 | docker: 94 | - image: python:3.6.7 95 | test-3.6.6: 96 | <<: *template 97 | docker: 98 | - image: python:3.6.6 99 | test-3.6.5: 100 | <<: *template 101 | docker: 102 | - image: python:3.6.5 103 | test-3.6.4: 104 | <<: *template 105 | docker: 106 | - image: python:3.6.4 107 | test-3.6.3: 108 | <<: *template 109 | docker: 110 | - image: python:3.6.3 111 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Michal Charemza 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # asyncio-buffered-pipeline [![CircleCI](https://circleci.com/gh/michalc/asyncio-buffered-pipeline.svg?style=shield)](https://circleci.com/gh/michalc/asyncio-buffered-pipeline) [![Test Coverage](https://api.codeclimate.com/v1/badges/defb145849be2214e381/test_coverage)](https://codeclimate.com/github/michalc/asyncio-buffered-pipeline/test_coverage) 2 | 3 | Parallelise pipelines of Python async iterables/generators. 4 | 5 | ## Installation 6 | 7 | ```bash 8 | pip install asyncio-buffered-pipeline 9 | ``` 10 | 11 | ## Usage / What problem does this solve? 12 | 13 | If you have a chain of async generators, even though each is async, only one runs at any given time. For example, the below runs in (just over) 30 seconds. 14 | 15 | ```python 16 | import asyncio 17 | 18 | async def gen_1(): 19 | for value in range(0, 10): 20 | await asyncio.sleep(1) # Could be a slow HTTP request 21 | yield value 22 | 23 | async def gen_2(it): 24 | async for value in it: 25 | await asyncio.sleep(1) # Could be a slow HTTP request 26 | yield value * 2 27 | 28 | async def gen_3(it): 29 | async for value in it: 30 | await asyncio.sleep(1) # Could be a slow HTTP request 31 | yield value + 3 32 | 33 | async def main(): 34 | it_1 = gen_1() 35 | it_2 = gen_2(it_1) 36 | it_3 = gen_3(it_2) 37 | 38 | async for val in it_3: 39 | print(val) 40 | 41 | asyncio.run(main()) 42 | ``` 43 | 44 | The `buffered_pipeline` function allows you to make to a small change, passing each generator through its return value, to parallelise the generators to reduce this to (just over) 12 seconds. 
45 | 46 | ```python 47 | import asyncio 48 | from asyncio_buffered_pipeline import buffered_pipeline 49 | 50 | async def gen_1(): 51 | for value in range(0, 10): 52 | await asyncio.sleep(1) # Could be a slow HTTP request 53 | yield value 54 | 55 | async def gen_2(it): 56 | async for value in it: 57 | await asyncio.sleep(1) # Could be a slow HTTP request 58 | yield value * 2 59 | 60 | async def gen_3(it): 61 | async for value in it: 62 | await asyncio.sleep(1) # Could be a slow HTTP request 63 | yield value + 3 64 | 65 | async def main(): 66 | buffer_iterable = buffered_pipeline() 67 | it_1 = buffer_iterable(gen_1()) 68 | it_2 = buffer_iterable(gen_2(it_1)) 69 | it_3 = buffer_iterable(gen_3(it_2)) 70 | 71 | async for val in it_3: 72 | print(val) 73 | 74 | asyncio.run(main()) 75 | ``` 76 | 77 | The `buffered_pipeline` ensures internal tasks are cancelled on any exception. 78 | 79 | ### Buffer size 80 | 81 | The default buffer size is 1. This is suitable if each iteration takes approximately the same amount of time. If this is not the case, you may wish to change it using the `buffer_size` parameter of `buffer_iterable`. 82 | 83 | ```python 84 | it = buffer_iterable(gen(), buffer_size=2) 85 | ``` 86 | 87 | ## Features 88 | 89 | - Only one task is created for each `buffer_iterable`, in which the iterable is iterated over, with its values stored in an internal buffer. 90 | 91 | - All the tasks of the pipeline are cancelled if any of the generators raise an exception. 92 | 93 | - If a generator raises an exception, the exception is propagated to calling code. 94 | 95 | - The buffer size of each step in the pipeline is configurable. 96 | 97 | - The "chaining" is not abstracted away. You still have full control over the arguments passed to each step, and you don't need to buffer each iterable in the pipeline if you don't want to: just don't pass those through `buffer_iterable`. 
import asyncio
import collections


def buffered_pipeline():
    """Return a ``buffer_iterable`` callable used to parallelise a chain of
    async iterables.

    Each call to the returned callable wraps an async iterable in a task
    that eagerly iterates it into a small internal buffer, so successive
    pipeline stages run concurrently instead of strictly in turn. All tasks
    created through the same ``buffered_pipeline()`` instance are cancelled
    together if any stage raises, and the exception is re-raised in the
    calling code.
    """
    tasks = []

    def queue(size):
        # The regular asyncio.Queue doesn't have a function to wait for space
        # in the queue without also immediately putting an item into it, which
        # would mean the effective minimum buffer_size is 2: an item in the
        # queue and one in memory waiting to be put into it. To allow a
        # buffer_size of 1, we need to check there is space _before_ fetching
        # the item from upstream. This requires a custom queue implementation.
        #
        # We can also guarantee there will be at most one getter and one
        # putter at any one time, and that _put won't be called until there is
        # space in the queue, so we can have much simpler code than
        # asyncio.Queue.
        _queue = collections.deque()
        at_least_one_in_queue = asyncio.Event()
        until_space = asyncio.Event()
        until_space.set()

        async def _space():
            # Wait until the queue has room for at least one more item.
            await until_space.wait()

        async def _get():
            nonlocal at_least_one_in_queue
            await at_least_one_in_queue.wait()
            value = _queue.popleft()
            until_space.set()
            if not _queue:
                # Replace (rather than clear) the event: a fresh Event means
                # any subsequent waiter blocks until the next _put.
                at_least_one_in_queue = asyncio.Event()
            return value

        def _put(item):
            nonlocal until_space
            _queue.append(item)
            at_least_one_in_queue.set()
            if len(_queue) >= size:
                # Buffer is full: replace the event so the producer blocks in
                # _space until a _get frees a slot.
                until_space = asyncio.Event()

        return _space, _get, _put

    async def _buffer_iterable(iterable, buffer_size=1):
        nonlocal tasks
        queue_space, queue_get, queue_put = queue(buffer_size)
        iterator = iterable.__aiter__()

        async def _iterate():
            # Eagerly pull from upstream into the buffer. Any exception --
            # including StopAsyncIteration at exhaustion and CancelledError --
            # is forwarded through the queue so the consumer re-raises it.
            try:
                while True:
                    await queue_space()
                    value = await iterator.__anext__()
                    queue_put((None, value))
                    value = None  # So value can be garbage collected
            except BaseException as exception:
                queue_put((exception, None))

        task = asyncio.create_task(_iterate())
        tasks.append(task)

        try:
            # The loop exits via the StopAsyncIteration forwarded from
            # _iterate once the upstream iterable is exhausted. (The original
            # condition `queue_has_items() or task` was always true, since
            # `task` is a Task object and therefore truthy.)
            while True:
                exception, value = await queue_get()
                if exception is not None:
                    raise exception from None
                yield value
                value = None  # So value can be garbage collected
        except StopAsyncIteration:
            pass
        except BaseException:
            # Any other error (or cancellation of the consumer): tear down
            # every task in the pipeline, swallowing only the resulting
            # cancellations, then propagate the original exception.
            for task_to_cancel in tasks:
                task_to_cancel.cancel()
            all_tasks = tasks
            tasks = []
            for cancelled_task in all_tasks:
                try:
                    await cancelled_task
                except asyncio.CancelledError:
                    pass
            raise

    return _buffer_iterable
import (
    TestCase,
)

from asyncio_buffered_pipeline import buffered_pipeline


def async_test(func):
    # Decorator turning an async test coroutine function into a synchronous
    # test method runnable by unittest.
    # NOTE(review): the wrapper discards func's return value, is not wrapped
    # with functools.wraps, and asyncio.get_event_loop() is deprecated for
    # this use in newer Python versions -- confirm against supported versions.
    def wrapper(*args, **kwargs):
        future = func(*args, **kwargs)
        loop = asyncio.get_event_loop()
        loop.run_until_complete(future)
    return wrapper


class TestBufferIterable(TestCase):

    @async_test
    async def test_chain_all_buffered(self):
        """A three-stage pipeline with every stage buffered yields the same
        values as the unbuffered chain would."""
        async def gen_1():
            for value in range(0, 10):
                yield value

        async def gen_2(it):
            async for value in it:
                yield value * 2

        async def gen_3(it):
            async for value in it:
                yield value + 3

        buffer_iterable = buffered_pipeline()
        it_1 = buffer_iterable(gen_1())
        it_2 = buffer_iterable(gen_2(it_1))
        it_3 = buffer_iterable(gen_3(it_2))

        values = [value async for value in it_3]
        self.assertEqual(values, [3, 5, 7, 9, 11, 13, 15, 17, 19, 21])

    @async_test
    async def test_chain_some_buffered(self):
        """Stages not passed through buffer_iterable still compose correctly
        with buffered ones."""
        async def gen_1():
            for value in range(0, 10):
                yield value

        async def gen_2(it):
            async for value in it:
                yield value * 2

        async def gen_3(it):
            async for value in it:
                yield value + 3

        buffer_iterable = buffered_pipeline()
        it_1 = buffer_iterable(gen_1())
        it_2 = gen_2(it_1)  # deliberately unbuffered middle stage
        it_3 = buffer_iterable(gen_3(it_2))

        values = [value async for value in it_3]
        self.assertEqual(values, [3, 5, 7, 9, 11, 13, 15, 17, 19, 21])

    @async_test
    async def test_chain_parallel(self):
        """Counters observed at each downstream step show upstream stages
        running ahead of the consumer, i.e. the stages run in parallel."""
        num_gen_1 = 0
        num_gen_2 = 0
        num_gen_3 = 0

        async def gen_1():
            nonlocal num_gen_1
            for value in range(0, 10):
                num_gen_1 += 1
                yield

        async def gen_2(it):
            nonlocal num_gen_2
            async for value in it:
                num_gen_2 += 1
                yield

        async def gen_3(it):
            nonlocal num_gen_3
            async for value in it:
                num_gen_3 += 1
                yield

        buffer_iterable = buffered_pipeline()
        it_1 = buffer_iterable(gen_1())
        it_2 = buffer_iterable(gen_2(it_1))
        it_3 = buffer_iterable(gen_3(it_2))

        num_done = []
        async for _ in it_3:
            # Slight hack to wait for buffers to be full
            await asyncio.sleep(0.02)
            num_done.append((num_gen_1, num_gen_2, num_gen_3))

        self.assertEqual(num_done, [
            (4, 3, 2), (5, 4, 3), (6, 5, 4), (7, 6, 5), (8, 7, 6),
            (9, 8, 7), (10, 9, 8), (10, 10, 9), (10, 10, 10), (10, 10, 10),
        ])

    @async_test
    async def test_num_tasks(self):
        """Exactly one task is created per buffered stage, and each finishes
        as its upstream iterable is exhausted."""
        async def gen_1():
            for value in range(0, 10):
                yield

        async def gen_2(it):
            async for value in it:
                yield

        async def gen_3(it):
            async for value in it:
                yield

        buffer_iterable = buffered_pipeline()
        it_1 = buffer_iterable(gen_1())
        it_2 = buffer_iterable(gen_2(it_1))
        it_3 = buffer_iterable(gen_3(it_2))

        num_tasks = []
        async for _ in it_3:
            # Slight hack to wait for buffers to be full
            await asyncio.sleep(0.02)
            num_tasks.append(len(asyncio.all_tasks()))

        # 3 pipeline tasks + the test's own task, winding down to just the
        # test task as stages complete
        self.assertEqual(num_tasks, [4, 4, 4, 4, 4, 4, 4, 3, 2, 1])

    @async_test
    async def test_exception_propagates(self):
        """An exception raised mid-pipeline reaches the consumer, and all
        pipeline tasks are cleaned up afterwards."""
        class MyException(Exception):
            pass

        async def gen_1():
            for value in range(0, 10):
                yield

        async def gen_2(it):
            async for value in it:
                yield

        async def gen_3(it):
            async for value in it:
                yield
            raise MyException()

        async def gen_4(it):
            async for value in it:
                yield

        buffer_iterable = buffered_pipeline()
        it_1 = buffer_iterable(gen_1())
        it_2 = buffer_iterable(gen_2(it_1))
        it_3 = buffer_iterable(gen_3(it_2))
        it_4 = buffer_iterable(gen_4(it_3))

        with self.assertRaises(MyException):
            async for _ in it_4:
                pass

        # Only the test's own task should remain
        self.assertEqual(1, len(asyncio.all_tasks()))

    @async_test
    async def test_cancellation_propagates(self):
        """Cancelling the consuming task cancels every internal pipeline
        task, leaving no stray tasks behind."""
        event = asyncio.Event()

        async def gen_1():
            for value in range(0, 10):
                yield

        async def gen_2(it):
            async for value in it:
                yield

        async def gen_3(it):
            async for value in it:
                yield
                event.set()
                await asyncio.Future()  # block forever until cancelled

        async def gen_4(it):
            async for value in it:
                yield

        async def pipeline():
            buffer_iterable = buffered_pipeline()
            it_1 = buffer_iterable(gen_1())
            it_2 = buffer_iterable(gen_2(it_1))
            it_3 = buffer_iterable(gen_3(it_2))
            it_4 = buffer_iterable(gen_4(it_3))
            [value async for value in it_4]

        task = asyncio.create_task(pipeline())
        await event.wait()
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass

        await asyncio.sleep(0)
        self.assertEqual(1, len(asyncio.all_tasks()))

    @async_test
    async def test_default_bufsize(self):
        """With the default buffer_size of 1, the producer stays exactly one
        item ahead of the consumer."""
        num_gen = 0

        def get_value():
            nonlocal num_gen
            num_gen += 1
            return 1

        async def gen_1():
            for _ in range(0, 10):
                yield get_value()

        num_gens = []
        buffer_iterable = buffered_pipeline()
        async for _ in buffer_iterable(gen_1()):
            # Slight hack to wait for buffers to be full
            await asyncio.sleep(0.02)
            num_gens.append(num_gen)

        self.assertEqual(num_gens, [2, 3, 4, 5, 6, 7, 8, 9, 10, 10])

    @async_test
    async def test_bigger_bufsize(self):
        """With buffer_size=2 the producer stays two items ahead of the
        consumer."""
        num_gen = 0

        def get_value():
            nonlocal num_gen
            num_gen += 1
            return 1

        async def gen_1():
            for _ in range(0, 10):
                yield get_value()

        num_gens = []
        buffer_iterable = buffered_pipeline()
        async for _ in buffer_iterable(gen_1(), buffer_size=2):
            # Slight hack to wait for buffers to be full
            await asyncio.sleep(0.02)
            num_gens.append(num_gen)

        self.assertEqual(num_gens, [3, 4, 5, 6, 7, 8, 9, 10, 10, 10])