├── .gitignore ├── LICENSE ├── README.md ├── chunkify.py ├── requirements.txt └── tests.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Churkin Oleg 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Efficient celery tasks chunkification 2 | 3 | This library allows to chunkify a huge bunch of celery tasks into several numbers of chunks which will be executed periodically until the initial queue is not empty. 4 | 5 | In other words you may split a huge amount of created tasks in the small chunks of predefined length and distribute your task creation routine among several periodic tasks. 6 | 7 | **Real life example:** you need to send a push-notification for a zillion of your users. 
If you put every notification into individual task, you have to execute a zillion tasks. How to execute such amount of tasks and do not consume a lot of memory/CPU? How to avoid tasks flooding when we put all the messages in a queue at once and celery workers start producing a high load on our external/internal services? 8 | 9 | [celery.chunks](http://docs.celeryproject.org/en/latest/userguide/canvas.html#chunks) is not an option, because it still creates all tasks in a memory and attaches a huge blob of data to a message. 10 | 11 | Here is the example how to execute 1000 of tasks every 5 seconds: 12 | 13 | ```python 14 | from django.db.models import Min, Max 15 | from chunkify import chunkify_task, Chunk 16 | 17 | users_queryset = User.objects.active() 18 | 19 | def get_initial_chunk(*args, **kwargs): 20 | """ 21 | Create an chunk of integers based on max and min primary keys. 22 | """ 23 | result = users_queryset.aggregate(Min('pk'), Max('pk')) 24 | chunk = Chunk( 25 | start=result['pk__min'] or 0, 26 | size=1000, 27 | max=result['pk__max'] or 0, 28 | ) 29 | return chunk 30 | 31 | 32 | @task 33 | @chunkify_task( 34 | sleep_timeout=5, 35 | initial_chunk=get_initial_chunk 36 | ) 37 | def send_push_notifications(chunk: Chunk): 38 | """Create several tasks based on provided chunk and re-schedule their execution""" 39 | chunked_qs = ( 40 | users_queryset 41 | .filter(pk__range=chunk.range) 42 | .values_list('pk', flat=True) 43 | .order_by('pk') 44 | ) 45 | 46 | for user_id in chunked_qs: 47 | send_push_notifications_for_user.delay(user_id) 48 | ``` 49 | 50 | Then the task function will be re-scheduled to run in `sleep_timeout` seconds with a next chunk. 51 | 52 | ## chunkify_task 53 | 54 | The decorator accepts 3 parameters: 55 | 56 | * `sleep_timeout` – seconds between processing each chunk of tasks 57 | * `initial_chunk` – either `Chunk`, `DateChunk` or `DateTimeChunk` instance or a callable which returns one of the specified instances. 
import datetime
from functools import wraps
from typing import Any, Callable, NamedTuple, Optional, Type, Union

import structlog
from celery import current_task


logger = structlog.get_logger(__name__)
seconds = int  # semantic alias: durations expressed in whole seconds


class ChunkMode:
    """How a chunk's boundaries are consumed by the task function."""
    RANGE = 'range'  # inclusive start..end range (e.g. `pk__range`)
    SLICE = 'slice'  # python slice semantics, end-exclusive


class BaseChunk:
    """Abstraction above chunk objects.

    :param start: start point of a chunk
    :param size: size of a chunk
    :param max: maximum value `start` parameter can get for a chunk
    :param ChunkMode mode: declares how this chunk will be used
    """
    start: Any
    size: Any
    max: Optional[Any]
    mode: str = ChunkMode.RANGE

    @property
    def end(self):
        """End point of the chunk, clamped to ``max`` when one is set."""
        end = self.start + self.size

        # BUG FIX: compare against None explicitly. The previous truthiness
        # check (`if self.max`) never clamped a falsy maximum such as 0 —
        # which the README example produces via `pk__max or 0`. This also
        # matches how `is_exhausted` already treats `max`.
        if self.max is not None and end > self.max:
            return self.max

        return end

    @property
    def range(self):
        """``(start, end)`` tuple, suitable for e.g. ``pk__range`` filters."""
        return self.start, self.end

    @property
    def slice(self):
        """``slice(start, end)`` object for indexing into sequences."""
        return slice(self.start, self.end)

    @property
    def is_exhausted(self):
        """True when no further chunk should be scheduled.

        SLICE mode is end-exclusive, so ``start == max`` already means
        there is nothing left; RANGE mode is inclusive, hence ``>``.
        A chunk with ``max=None`` is never exhausted.
        """
        if self.mode == ChunkMode.SLICE:
            return bool(self.max is not None and self.start >= self.max)

        return bool(self.max is not None and self.start > self.max)

    @property
    def shift_getter(self) -> Callable:
        """Return a callable converting an int shift into this chunk's step type."""
        raise NotImplementedError()

    def next(self, shift: Optional[int] = None) -> 'BaseChunk':
        """Return the next chunk of the same size.

        :param shift: optional extra offset added to the next start;
            defaults to 0 for SLICE mode (contiguous, end-exclusive)
            and 1 for RANGE mode (inclusive ranges must not overlap).
        """
        if shift is None:
            if self.mode == ChunkMode.SLICE:
                shift = self.shift_getter(0)
            else:
                shift = self.shift_getter(1)

        return type(self)(
            start=self.start + self.size + shift,
            size=self.size,
            max=self.max,
            mode=self.mode,
        )


class IntChunkMixin(NamedTuple):
    start: int
    size: int
    max: Optional[int]
    mode: str = ChunkMode.RANGE


class IntChunk(IntChunkMixin, BaseChunk):
    """Chunk over integer values (e.g. primary keys)."""

    @property
    def shift_getter(self):
        return int


# backward compatibility
Chunk = IntChunk


class DateChunkMixin(NamedTuple):
    start: datetime.date
    size: datetime.timedelta
    max: Optional[datetime.date]
    mode: str = ChunkMode.RANGE


class DateChunk(DateChunkMixin, BaseChunk):
    """Chunk over dates; shifts are expressed in whole days."""

    @property
    def shift_getter(self):
        return lambda shift: datetime.timedelta(days=shift)


class DatetimeChunkMixin(NamedTuple):
    start: datetime.datetime
    size: datetime.timedelta
    max: Optional[datetime.datetime]
    mode: str = ChunkMode.RANGE


class DateTimeChunk(DatetimeChunkMixin, BaseChunk):
    """Chunk over datetimes; shifts are expressed in seconds."""

    @property
    def shift_getter(self):
        return lambda shift: datetime.timedelta(seconds=shift)


class chunkify_task:
    """
    This decorator allows to chunkify a huge bunch of celery tasks into several
    number of chunks which will be executed periodically until the initial queue
    is empty.

    In other words you may split a huge amount of created tasks in the small
    chunks of predefined length and distribute your task creation routine
    among several periodic tasks.

    How to use:

    @celery_app.task(...params...)
    @chunkify_task(sleep_timeout=10, initial_chunk=Chunk(0, 100, 1000))
    def sync_provider_accounts(chunk: Chunk=None):
        # create several tasks based on provided chunk
        # schedule their execution
        return ...

    Then the task function will be re-scheduled to run in `sleep_timeout`
    seconds with a next chunk.

    Where:

    * sleep_timeout – seconds between processing each chunk of tasks.
    * initial_chunk – either `Chunk` object or callable which should return one.
      Allows to specify first value in a chunk, chunk size and maximum
      number of tasks.
    * chunk_class – chunk type used to deserialize a chunk received as a
      plain dict/list/tuple from the message payload.
    """
    def __init__(self, *,
                 sleep_timeout: seconds,
                 initial_chunk: Union[BaseChunk, Callable],
                 chunk_class: Type[BaseChunk] = Chunk):
        self.sleep_timeout = sleep_timeout
        self.initial_chunk = initial_chunk
        self.logger = logger
        self.task_function = None
        self.chunk_class = chunk_class

    def __call__(self, task_function, *args, **kwargs):
        self.task_function = task_function

        @wraps(task_function)
        def _wrapper(*args, **kwargs):
            return self._task_function_wrapper(*args, **kwargs)
        return _wrapper

    def _task_function_wrapper(self, *args, **kwargs):
        """Run one chunk of work, then re-schedule the task with the next chunk."""
        chunk = self._get_or_create_chunk(args, kwargs)

        log = self.logger.bind(
            chunk=chunk,
            name=current_task.name,
        )

        log.info('Executing chunked task')

        kwargs['chunk'] = chunk

        result = self.task_function(*args, **kwargs)

        next_chunk = chunk.next()
        if next_chunk.is_exhausted:
            log.info('Chunk is exhausted, iteration stopped',
                     next_chunk=next_chunk)

            return result

        kwargs['chunk'] = next_chunk
        # BUG FIX: forward positional args too — previously only kwargs were
        # re-sent, so any positional arguments were silently dropped on every
        # iteration after the first.
        current_task.apply_async(
            args=args, kwargs=kwargs, countdown=self.sleep_timeout
        )

        log.info('Next chunk is scheduled',
                 next_chunk=next_chunk,
                 sleep_timeout=self.sleep_timeout)

        return result

    def _get_or_create_chunk(self, args, kwargs):
        """Build the chunk for this invocation from kwargs, deserializing if needed."""
        chunk = kwargs.get('chunk')

        # first task execution – we should create an initial chunk
        # and pass it to a task function
        if not chunk:
            return self._get_initial_chunk(args, kwargs)

        if isinstance(chunk, BaseChunk):
            return chunk
        elif isinstance(chunk, dict):
            return self.chunk_class(**chunk)
        elif isinstance(chunk, (list, tuple)):
            return self.chunk_class(*chunk)

        raise TypeError(f'Unexpected chunk type "{type(chunk)}"')

    def _get_initial_chunk(self, args, kwargs) -> BaseChunk:
        """Resolve ``initial_chunk``: call it with the task args if it's a callable."""
        if callable(self.initial_chunk):
            return self.initial_chunk(*args, **kwargs)
        return self.initial_chunk
| (Chunk(1, 11, None), False), 46 | (Chunk(100, 11, None), False), 47 | (Chunk(100, 10, 100), False), 48 | (Chunk(100, 10, 100, ChunkMode.SLICE), True), 49 | (Chunk(100, 10, 91), True), 50 | (Chunk(100, 10, 91, ChunkMode.SLICE), True), 51 | ] 52 | ) 53 | def test_exhausted(self, chunk, is_exhausted): 54 | assert chunk.is_exhausted == is_exhausted 55 | 56 | def test_full_iteration(self): 57 | values = list(range(1, 100, 2)) 58 | 59 | chunk = Chunk(0, 10, len(values), ChunkMode.SLICE) 60 | 61 | count = 0 62 | iterated_values = [] 63 | 64 | while not chunk.is_exhausted: 65 | iterated_values.extend(values[chunk.slice]) 66 | chunk = chunk.next() 67 | count += 1 68 | 69 | assert iterated_values == values 70 | assert count == len(values) / chunk.size 71 | 72 | 73 | items = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 74 | items_len = len(items) 75 | 76 | 77 | @celery_app.task 78 | def remove_item(index): 79 | items[index] = None 80 | 81 | 82 | @celery_app.task 83 | @chunkify_task( 84 | sleep_timeout=10, 85 | initial_chunk=Chunk(0, 3, 10, ChunkMode.SLICE), 86 | ) 87 | def runner_list(chunk: Chunk): 88 | for i in list(range(items_len))[chunk.slice]: 89 | remove_item.delay(i) 90 | 91 | 92 | class TestChunkificator: 93 | 94 | def test_list_chunkificator(self): 95 | runner_list.delay() 96 | assert [i for i in items if i is not None] == [11, 12] 97 | --------------------------------------------------------------------------------