├── .gitignore ├── LICENSE ├── README.md ├── chunkify.py ├── requirements.txt └── tests.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Churkin Oleg 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Efficient celery tasks chunkification 2 | 3 | This library allows to chunkify a huge bunch of celery tasks into several numbers of chunks which will be executed periodically until the initial queue is not empty. 4 | 5 | In other words you may split a huge amount of created tasks in the small chunks of predefined length and distribute your task creation routine among several periodic tasks. 6 | 7 | **Real life example:** you need to send a push-notification for a zillion of your users. 
If you put every notification into individual task, you have to execute a zillion tasks. How to execute such amount of tasks and do not consume a lot of memory/CPU? How to avoid tasks flooding when we put all the messages in a queue at once and celery workers start producing a high load on our external/internal services? 8 | 9 | [celery.chunks](http://docs.celeryproject.org/en/latest/userguide/canvas.html#chunks) is not an option, because it still creates all tasks in a memory and attaches a huge blob of data to a message. 10 | 11 | Here is the example how to execute 1000 of tasks every 5 seconds: 12 | 13 | ```python 14 | from django.db.models import Min, Max 15 | from chunkify import chunkify_task, Chunk 16 | 17 | users_queryset = User.objects.active() 18 | 19 | def get_initial_chunk(*args, **kwargs): 20 | """ 21 | Create an chunk of integers based on max and min primary keys. 22 | """ 23 | result = users_queryset.aggregate(Min('pk'), Max('pk')) 24 | chunk = Chunk( 25 | start=result['pk__min'] or 0, 26 | size=1000, 27 | max=result['pk__max'] or 0, 28 | ) 29 | return chunk 30 | 31 | 32 | @task 33 | @chunkify_task( 34 | sleep_timeout=5, 35 | initial_chunk=get_initial_chunk 36 | ) 37 | def send_push_notifications(chunk: Chunk): 38 | """Create several tasks based on provided chunk and re-schedule their execution""" 39 | chunked_qs = ( 40 | users_queryset 41 | .filter(pk__range=chunk.range) 42 | .values_list('pk', flat=True) 43 | .order_by('pk') 44 | ) 45 | 46 | for user_id in chunked_qs: 47 | send_push_notifications_for_user.delay(user_id) 48 | ``` 49 | 50 | Then the task function will be re-scheduled to run in `sleep_timeout` seconds with a next chunk. 51 | 52 | ## chunkify_task 53 | 54 | The decorator accepts 3 parameters: 55 | 56 | * `sleep_timeout` – seconds between processing each chunk of tasks 57 | * `initial_chunk` – either `Chunk`, `DateChunk` or `DateTimeChunk` instance or a callable which returns one of the specified instances. 
import datetime
from functools import wraps
from typing import Any, Callable, NamedTuple, Optional, Type, Union

import structlog
from celery import current_task


logger = structlog.get_logger(__name__)
seconds = int  # semantic alias: durations expressed in whole seconds


class ChunkMode:
    """How a chunk's boundaries are consumed by the task function."""
    RANGE = 'range'  # inclusive start..end range (e.g. `pk__range`)
    SLICE = 'slice'  # python slice semantics, end-exclusive


class BaseChunk:
    """Abstraction above chunk objects.

    :param start: start point of a chunk
    :param size: size of a chunk
    :param max: maximum value `start` parameter can get for a chunk
    :param ChunkMode mode: declares how this chunk will be used
    """
    start: Any
    size: Any
    max: Optional[Any]
    mode: str = ChunkMode.RANGE

    @property
    def end(self):
        """End point of the chunk, clamped to ``max`` when one is set."""
        end = self.start + self.size

        # BUG FIX: compare against None explicitly. The previous truthiness
        # check (`if self.max`) never clamped a falsy maximum such as 0 —
        # which the README example produces via `pk__max or 0`. This also
        # matches how `is_exhausted` already treats `max`.
        if self.max is not None and end > self.max:
            return self.max

        return end

    @property
    def range(self):
        """``(start, end)`` tuple, suitable for e.g. ``pk__range`` filters."""
        return self.start, self.end

    @property
    def slice(self):
        """``slice(start, end)`` object for indexing into sequences."""
        return slice(self.start, self.end)

    @property
    def is_exhausted(self):
        """True when no further chunk should be scheduled.

        SLICE mode is end-exclusive, so ``start == max`` already means
        there is nothing left; RANGE mode is inclusive, hence ``>``.
        A chunk with ``max=None`` is never exhausted.
        """
        if self.mode == ChunkMode.SLICE:
            return bool(self.max is not None and self.start >= self.max)

        return bool(self.max is not None and self.start > self.max)

    @property
    def shift_getter(self) -> Callable:
        """Return a callable converting an int shift into this chunk's step type."""
        raise NotImplementedError()

    def next(self, shift: Optional[int] = None) -> 'BaseChunk':
        """Return the next chunk of the same size.

        :param shift: optional extra offset added to the next start;
            defaults to 0 for SLICE mode (contiguous, end-exclusive)
            and 1 for RANGE mode (inclusive ranges must not overlap).
        """
        if shift is None:
            if self.mode == ChunkMode.SLICE:
                shift = self.shift_getter(0)
            else:
                shift = self.shift_getter(1)

        return type(self)(
            start=self.start + self.size + shift,
            size=self.size,
            max=self.max,
            mode=self.mode,
        )


class IntChunkMixin(NamedTuple):
    start: int
    size: int
    max: Optional[int]
    mode: str = ChunkMode.RANGE


class IntChunk(IntChunkMixin, BaseChunk):
    """Chunk over integer values (e.g. primary keys)."""

    @property
    def shift_getter(self):
        return int


# backward compatibility
Chunk = IntChunk


class DateChunkMixin(NamedTuple):
    start: datetime.date
    size: datetime.timedelta
    max: Optional[datetime.date]
    mode: str = ChunkMode.RANGE


class DateChunk(DateChunkMixin, BaseChunk):
    """Chunk over dates; shifts are expressed in whole days."""

    @property
    def shift_getter(self):
        return lambda shift: datetime.timedelta(days=shift)


class DatetimeChunkMixin(NamedTuple):
    start: datetime.datetime
    size: datetime.timedelta
    max: Optional[datetime.datetime]
    mode: str = ChunkMode.RANGE


class DateTimeChunk(DatetimeChunkMixin, BaseChunk):
    """Chunk over datetimes; shifts are expressed in seconds."""

    @property
    def shift_getter(self):
        return lambda shift: datetime.timedelta(seconds=shift)


class chunkify_task:
    """
    This decorator allows to chunkify a huge bunch of celery tasks into several
    number of chunks which will be executed periodically until the initial queue
    is empty.

    In other words you may split a huge amount of created tasks in the small
    chunks of predefined length and distribute your task creation routine
    among several periodic tasks.

    How to use:

    @celery_app.task(...params...)
    @chunkify_task(sleep_timeout=10, initial_chunk=Chunk(0, 100, 1000))
    def sync_provider_accounts(chunk: Chunk=None):
        # create several tasks based on provided chunk
        # schedule their execution
        return ...

    Then the task function will be re-scheduled to run in `sleep_timeout`
    seconds with a next chunk.

    Where:

    * sleep_timeout – seconds between processing each chunk of tasks.
    * initial_chunk – either `Chunk` object or callable which should return one.
      Allows to specify first value in a chunk, chunk size and maximum
      number of tasks.
    * chunk_class – chunk type used to deserialize a chunk received as a
      plain dict/list/tuple from the message payload.
    """
    def __init__(self, *,
                 sleep_timeout: seconds,
                 initial_chunk: Union[BaseChunk, Callable],
                 chunk_class: Type[BaseChunk] = Chunk):
        self.sleep_timeout = sleep_timeout
        self.initial_chunk = initial_chunk
        self.logger = logger
        self.task_function = None
        self.chunk_class = chunk_class

    def __call__(self, task_function, *args, **kwargs):
        self.task_function = task_function

        @wraps(task_function)
        def _wrapper(*args, **kwargs):
            return self._task_function_wrapper(*args, **kwargs)
        return _wrapper

    def _task_function_wrapper(self, *args, **kwargs):
        """Run one chunk of work, then re-schedule the task with the next chunk."""
        chunk = self._get_or_create_chunk(args, kwargs)

        log = self.logger.bind(
            chunk=chunk,
            name=current_task.name,
        )

        log.info('Executing chunked task')

        kwargs['chunk'] = chunk

        result = self.task_function(*args, **kwargs)

        next_chunk = chunk.next()
        if next_chunk.is_exhausted:
            log.info('Chunk is exhausted, iteration stopped',
                     next_chunk=next_chunk)

            return result

        kwargs['chunk'] = next_chunk
        # BUG FIX: forward positional args too — previously only kwargs were
        # re-sent, so any positional arguments were silently dropped on every
        # iteration after the first.
        current_task.apply_async(
            args=args, kwargs=kwargs, countdown=self.sleep_timeout
        )

        log.info('Next chunk is scheduled',
                 next_chunk=next_chunk,
                 sleep_timeout=self.sleep_timeout)

        return result

    def _get_or_create_chunk(self, args, kwargs):
        """Build the chunk for this invocation from kwargs, deserializing if needed."""
        chunk = kwargs.get('chunk')

        # first task execution – we should create an initial chunk
        # and pass it to a task function
        if not chunk:
            return self._get_initial_chunk(args, kwargs)

        if isinstance(chunk, BaseChunk):
            return chunk
        elif isinstance(chunk, dict):
            return self.chunk_class(**chunk)
        elif isinstance(chunk, (list, tuple)):
            return self.chunk_class(*chunk)

        raise TypeError(f'Unexpected chunk type "{type(chunk)}"')

    def _get_initial_chunk(self, args, kwargs) -> BaseChunk:
        """Resolve ``initial_chunk``: call it with the task args if it's a callable."""
        if callable(self.initial_chunk):
            return self.initial_chunk(*args, **kwargs)
        return self.initial_chunk
| (Chunk(1, 11, None), False), 46 | (Chunk(100, 11, None), False), 47 | (Chunk(100, 10, 100), False), 48 | (Chunk(100, 10, 100, ChunkMode.SLICE), True), 49 | (Chunk(100, 10, 91), True), 50 | (Chunk(100, 10, 91, ChunkMode.SLICE), True), 51 | ] 52 | ) 53 | def test_exhausted(self, chunk, is_exhausted): 54 | assert chunk.is_exhausted == is_exhausted 55 | 56 | def test_full_iteration(self): 57 | values = list(range(1, 100, 2)) 58 | 59 | chunk = Chunk(0, 10, len(values), ChunkMode.SLICE) 60 | 61 | count = 0 62 | iterated_values = [] 63 | 64 | while not chunk.is_exhausted: 65 | iterated_values.extend(values[chunk.slice]) 66 | chunk = chunk.next() 67 | count += 1 68 | 69 | assert iterated_values == values 70 | assert count == len(values) / chunk.size 71 | 72 | 73 | items = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 74 | items_len = len(items) 75 | 76 | 77 | @celery_app.task 78 | def remove_item(index): 79 | items[index] = None 80 | 81 | 82 | @celery_app.task 83 | @chunkify_task( 84 | sleep_timeout=10, 85 | initial_chunk=Chunk(0, 3, 10, ChunkMode.SLICE), 86 | ) 87 | def runner_list(chunk: Chunk): 88 | for i in list(range(items_len))[chunk.slice]: 89 | remove_item.delay(i) 90 | 91 | 92 | class TestChunkificator: 93 | 94 | def test_list_chunkificator(self): 95 | runner_list.delay() 96 | assert [i for i in items if i is not None] == [11, 12] 97 | --------------------------------------------------------------------------------