├── .gitignore ├── LICENSE ├── README.md ├── assets ├── banner.png ├── banner.pptx ├── diagram.png ├── diagram.pptx ├── multiprocessing_parallel1.png ├── multiprocessing_parallel2.png ├── multiprocessing_parallel3.png ├── multiprocessing_serial1.png └── optimal_cpu.png ├── notebooks └── tqdm_batch_example.ipynb ├── pyproject.toml ├── requirements-notebook.txt ├── requirements.txt ├── setup.py └── src ├── testconf.py └── tqdm_batch ├── __init__.py ├── batch_process.py ├── progress_bar.py └── task_wrapper.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Dennis Bakhuis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tqdm_batch 2 | Batch processing using joblib including tqdm progress bars 3 | 4 | Add batch processing to joblib, including tqdm progress bars. 5 | 6 | [I wrote a blog post about this](https://towardsdatascience.com/parallel-batch-processing-in-python-8dcce607d226) 7 | 8 | ## Install 9 | ```bash 10 | pip install tqdm_batch 11 | ``` 12 | 13 | ## Usage 14 | Process a list of `items` using a `function`. 15 | 16 | ```python 17 | from tqdm_batch import batch_process 18 | import random 19 | import time 20 | 21 | def batch_process_function(row, some_var): 22 | time.sleep(0.01) 23 | return row + random.randint(0, some_var) 24 | 25 | N = 1_000 26 | items = range(N) 27 | 28 | result = batch_process( 29 | items, 30 | batch_process_function, 31 | some_var=42, 32 | n_workers=6, 33 | sep_progress=True, 34 | ) 35 | ``` 36 | 37 | ![Multi batch processing with progress bars](assets/multiprocessing_parallel3.png?raw=true "Multi batch processing with progress bars") 38 | -------------------------------------------------------------------------------- /assets/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennisbakhuis/tqdm_batch/d13e1e6c966a2eccfde61db1bf3044210262dd86/assets/banner.png -------------------------------------------------------------------------------- /assets/banner.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennisbakhuis/tqdm_batch/d13e1e6c966a2eccfde61db1bf3044210262dd86/assets/banner.pptx -------------------------------------------------------------------------------- /assets/diagram.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dennisbakhuis/tqdm_batch/d13e1e6c966a2eccfde61db1bf3044210262dd86/assets/diagram.png -------------------------------------------------------------------------------- /assets/diagram.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennisbakhuis/tqdm_batch/d13e1e6c966a2eccfde61db1bf3044210262dd86/assets/diagram.pptx -------------------------------------------------------------------------------- /assets/multiprocessing_parallel1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennisbakhuis/tqdm_batch/d13e1e6c966a2eccfde61db1bf3044210262dd86/assets/multiprocessing_parallel1.png -------------------------------------------------------------------------------- /assets/multiprocessing_parallel2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennisbakhuis/tqdm_batch/d13e1e6c966a2eccfde61db1bf3044210262dd86/assets/multiprocessing_parallel2.png -------------------------------------------------------------------------------- /assets/multiprocessing_parallel3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennisbakhuis/tqdm_batch/d13e1e6c966a2eccfde61db1bf3044210262dd86/assets/multiprocessing_parallel3.png -------------------------------------------------------------------------------- /assets/multiprocessing_serial1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennisbakhuis/tqdm_batch/d13e1e6c966a2eccfde61db1bf3044210262dd86/assets/multiprocessing_serial1.png -------------------------------------------------------------------------------- /assets/optimal_cpu.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dennisbakhuis/tqdm_batch/d13e1e6c966a2eccfde61db1bf3044210262dd86/assets/optimal_cpu.png -------------------------------------------------------------------------------- /notebooks/tqdm_batch_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "caa1dc0f-59fd-45ca-b67b-bcdd8ad7d3b6", 6 | "metadata": {}, 7 | "source": [ 8 | "# Batch processing using Joblib and Tqdm" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "0fdfdbae-cded-4bbc-81e5-11f902ed0b3c", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from math import ceil\n", 19 | "import random\n", 20 | "import datetime\n", 21 | "from joblib import Parallel, delayed\n", 22 | "\n", 23 | "from tqdm.auto import tqdm\n", 24 | "\n", 25 | "import numpy as np\n", 26 | "import pandas as pd\n", 27 | "\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "import seaborn as sns\n", 30 | "\n", 31 | "from tqdm_batch import batch_process\n", 32 | "\n", 33 | "sns.set_context('poster')" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "id": "be3e7155-af9e-40ee-b0e2-5d2c3bc354fe", 39 | "metadata": {}, 40 | "source": [ 41 | "Simple batch processing function" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "id": "92bc363d-1bfc-4606-baec-1ef9bcbcd601", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "def batch_process_function(row, order, payload):\n", 52 | " \"\"\"\n", 53 | " Simulate process function\n", 54 | " \n", 55 | " Row and payload are ignored.\n", 56 | " \n", 57 | " Approximate pi\n", 58 | " \"\"\"\n", 59 | " k, pi = 1, 0\n", 60 | " for i in range(10**6):\n", 61 | " if i % 2 == 0: # even\n", 62 | " pi += 4 / k\n", 63 | " else: # odd \n", 64 | " pi -= 4 / k \n", 65 | " k += 2\n", 66 | " return pi" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "id": 
"da6277d8-7093-4010-9866-2dbdcb8568fe", 72 | "metadata": {}, 73 | "source": [ 74 | "This function just calculates Pi:" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 3, 80 | "id": "a05dbf0e-f28c-4859-9374-16eec6d5958e", 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "3.1415916535897743" 87 | ] 88 | }, 89 | "execution_count": 3, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "batch_process_function('x', 6, None)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "id": "fc3bf7d5-4dd3-4885-8f4b-e0026821f752", 101 | "metadata": {}, 102 | "source": [ 103 | "Lets have some demo settings:" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 4, 109 | "id": "28cc0946-7528-42b5-8e5a-2884f1495cc4", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "order=6\n", 114 | "N = 1_000\n", 115 | "items = range(N)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "id": "d24cb841-77e1-461a-b26f-311003913e51", 121 | "metadata": {}, 122 | "source": [ 123 | "Process serially:" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "id": "01ff0f46-849c-48bf-853d-e08bc0a0af42", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "%%time\n", 134 | "result = [batch_process_function(row, order, None) for row in items]" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "904ac6ab-fb3f-4d9d-a142-dd293ff62fdd", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "%%time\n", 145 | "result = Parallel(n_jobs=8)(\n", 146 | " delayed(batch_process_function)\n", 147 | " (row, order, None) \n", 148 | " for row in tqdm(items)\n", 149 | ")" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "id": "28cc9a7f-1eb1-4bfd-ac4c-58ae7674f70e", 155 | "metadata": {}, 156 | "source": [ 157 | "Serialization can increase the 
overhead such that it takes much longer than in a serial fashion:" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "id": "f31b3b88-b5a9-49a1-b392-c9eabeb5030e", 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "matrix = np.random.normal(size=(500, 500, 100))" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "id": "0527b6e7-524c-4d10-9606-0bf2cf7a7ae6", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "%%time\n", 178 | "result = Parallel(n_jobs=8)(\n", 179 | " delayed(batch_process_function)\n", 180 | " (row, order, matrix) \n", 181 | " for row in tqdm(items)\n", 182 | ")" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "id": "0dfe2973-87c6-4f42-814c-2dcae9614e9c", 188 | "metadata": {}, 189 | "source": [ 190 | "But working in batches, minimizing IO can bring us back on track:" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "id": "78d4a3db-67c7-4737-b7a0-38e8fba926d9", 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "%%time\n", 201 | "\n", 202 | "n_workers = 8\n", 203 | "\n", 204 | "# Create a batch function\n", 205 | "def proc_batch(batch, order, matrix):\n", 206 | " return [\n", 207 | " batch_process_function(row, order, matrix)\n", 208 | " for row in batch\n", 209 | " ]\n", 210 | "\n", 211 | "# Divide data in batches\n", 212 | "batch_size = ceil(len(items) / n_workers)\n", 213 | "batches = [\n", 214 | " items[ix:ix+batch_size]\n", 215 | " for ix in range(0, len(items), batch_size)\n", 216 | "]\n", 217 | "\n", 218 | "# divide the work\n", 219 | "result = Parallel(n_jobs=8)(\n", 220 | " delayed(proc_batch)\n", 221 | " (batch, order, matrix) \n", 222 | " for batch in tqdm(batches)\n", 223 | ")" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "id": "ef284343-f7ab-4cac-8ff8-533d46c34d3b", 229 | "metadata": {}, 230 | "source": [ 231 | "This is all wrapped into the 
tqdm_batch package:" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "id": "ff07a6c7-78d0-4f2d-babf-27044b78fb9f", 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "result = batch_process(\n", 242 | " items,\n", 243 | " batch_process_function,\n", 244 | " order=6,\n", 245 | " n_workers=6,\n", 246 | " payload=matrix,\n", 247 | " sep_progress=True,\n", 248 | ")" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "id": "c1ec3ea0-788c-4cdb-b88f-d97cad497058", 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "id": "16436a51-5cd0-4a39-98b4-310eb87af29b", 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "id": "ce32f468-6353-4b9f-addd-b3ef80e7bc88", 270 | "metadata": {}, 271 | "source": [ 272 | "## How many CPUs are optimal?" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 5, 278 | "id": "2ec518bc-9e8b-4e31-9648-9404560606dc", 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "data": { 283 | "application/vnd.jupyter.widget-view+json": { 284 | "model_id": "4b530caa96b0426384b71cf9f7d5b4d2", 285 | "version_major": 2, 286 | "version_minor": 0 287 | }, 288 | "text/plain": [ 289 | " 0%| | 0/1000 [00:00" 498 | ] 499 | }, 500 | "metadata": { 501 | "needs_background": "light" 502 | }, 503 | "output_type": "display_data" 504 | } 505 | ], 506 | "source": [ 507 | "fig, ax = plt.subplots(figsize=(12, 8))\n", 508 | "sns.lineplot(x='workers', y='dt', data=df, ax=ax)\n", 509 | "ax.plot([1, 12], [df.dt.min(), df.dt.min()], 'k--')\n", 510 | "_ = ax.set_ylabel('time [s]')\n", 511 | "sns.despine()" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 8, 517 | "id": "a616b986-2251-46ae-86aa-fec341361e92", 518 | "metadata": {}, 519 | "outputs": [], 520 | "source": [ 521 | 
"fig.savefig('../assets/optimal_cpu.png', bbox_inches='tight')" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "id": "1fa6fbf5-7c86-482d-b448-845a1927add9", 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "id": "1d9ba8ad-9b07-45cf-aa5b-a9989fd9ed80", 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "id": "232e6cf6-ac2a-438d-8741-9b71fe28cb71", 544 | "metadata": {}, 545 | "outputs": [], 546 | "source": [] 547 | } 548 | ], 549 | "metadata": { 550 | "kernelspec": { 551 | "display_name": "Python 3 (ipykernel)", 552 | "language": "python", 553 | "name": "python3" 554 | }, 555 | "language_info": { 556 | "codemirror_mode": { 557 | "name": "ipython", 558 | "version": 3 559 | }, 560 | "file_extension": ".py", 561 | "mimetype": "text/x-python", 562 | "name": "python", 563 | "nbconvert_exporter": "python", 564 | "pygments_lexer": "ipython3", 565 | "version": "3.10.0" 566 | } 567 | }, 568 | "nbformat": 4, 569 | "nbformat_minor": 5 570 | } 571 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | -------------------------------------------------------------------------------- /requirements-notebook.txt: -------------------------------------------------------------------------------- 1 | ipywidgets==7.6.5 2 | jupyterlab==3.2.5 3 | matplotlib==3.5.1 4 | numpy==1.21.5 5 | pandas==1.3.5 6 | seaborn==0.11.2 7 | tqdm==4.62.3 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | 
joblib==1.1.0 2 | tqdm==4.62.3 3 | 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="tqdm_batch", 8 | version="0.1.0", 9 | author="Dennis Bakhuis", 10 | author_email="pypi@bakhuis.nu", 11 | description="Wrapper for tqdm and joblib to have a progressbar while batch processing", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/dennisbakhuis/tqdm_batch", 15 | classifiers=[ 16 | "Programming Language :: Python :: 3", 17 | "License :: OSI Approved :: MIT License", 18 | "Operating System :: OS Independent", 19 | ], 20 | package_dir={"": "src"}, 21 | packages=setuptools.find_packages(where="src"), 22 | python_requires=">=3.6", 23 | ) 24 | -------------------------------------------------------------------------------- /src/testconf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennisbakhuis/tqdm_batch/d13e1e6c966a2eccfde61db1bf3044210262dd86/src/testconf.py -------------------------------------------------------------------------------- /src/tqdm_batch/__init__.py: -------------------------------------------------------------------------------- 1 | from .batch_process import batch_process 2 | from .progress_bar import progress_bar 3 | from .task_wrapper import task_wrapper 4 | 5 | 6 | __all__ = [ 7 | 'batch_process', 8 | 'progress_bar', 9 | 'task_wrapper', 10 | ] 11 | -------------------------------------------------------------------------------- /src/tqdm_batch/batch_process.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Union, Callable 2 | from math import ceil 3 | from threading 
import Thread 4 | from multiprocessing import Manager 5 | 6 | from joblib import Parallel, delayed 7 | 8 | from .progress_bar import progress_bar 9 | from .task_wrapper import task_wrapper 10 | 11 | 12 | def batch_process( 13 | items: list, 14 | function: Callable, 15 | n_workers: int=8, 16 | sep_progress: bool=False, 17 | *args, 18 | **kwargs, 19 | ) -> List[Dict[str, Union[str, List[str]]]]: 20 | """ 21 | Batch process a list of items 22 | 23 | The list will be divided into n_workers batches which process 24 | the list individually using joblib. When done, all results are 25 | collected and returned as a list. 26 | 27 | Parameters: 28 | ----------- 29 | items : list 30 | List of items to batch process. This list will be divided in 31 | n_workers batches and processed by the function. 32 | function : Callable 33 | Function used to process each row. Format needs to be: 34 | callable(item, *args, **kwargs). 35 | n_workers : int (Default: 8) 36 | Number of processes to start (processes). Generally there is 37 | an optimum between 1 <= n_workers <= total_cpus as there is 38 | an overhead for creating separate processes. 39 | sep_progress : bool (Default: False) 40 | Show a separate progress bar for each worker. 41 | *args, **kwargs : - 42 | (named) arguments to pass to batch process function. 43 | 44 | Returns: 45 | -------- 46 | results : list 47 | List of processed items, one result per input item, collected 48 | from all workers and flattened into a single list. 
49 | """ 50 | # Divide data in batches 51 | batch_size = ceil(len(items) / n_workers) 52 | batches = [ 53 | items[ix:ix+batch_size] 54 | for ix in range(0, len(items), batch_size) 55 | ] 56 | 57 | # Check single or multiple progress bars 58 | if sep_progress: 59 | totals = [len(batch) for batch in batches] 60 | else: 61 | totals = len(items) 62 | 63 | # Start progress bar in separate thread 64 | manager = Manager() 65 | queue = manager.Queue() 66 | try: 67 | progproc = Thread(target=progress_bar, args=(totals, queue)) 68 | progproc.start() 69 | 70 | # Parallel process the batches 71 | result = Parallel(n_jobs=n_workers)( 72 | delayed(task_wrapper) 73 | (pid, function, batch, queue, *args, **kwargs) 74 | for pid, batch in enumerate(batches) 75 | ) 76 | 77 | finally: 78 | # Stop the progress bar thread 79 | queue.put('done') 80 | progproc.join() 81 | 82 | # Flatten result 83 | flattened = [item for sublist in result for item in sublist] 84 | 85 | return flattened 86 | -------------------------------------------------------------------------------- /src/tqdm_batch/progress_bar.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | from multiprocessing import Queue 3 | 4 | from tqdm.auto import tqdm 5 | 6 | 7 | def progress_bar( 8 | totals: Union[int, List[int]], 9 | queue : Queue, 10 | ) -> None: 11 | """ 12 | Progress bar Thread 13 | 14 | A separate thread to manage the progress of all 15 | workers. When totals is an integer value a 16 | single progress bar is created and all updates 17 | in the queue update this single bar. To have 18 | a progress bar for each worker, totals should 19 | be a list with totals for each worker. 20 | 21 | Parameters: 22 | ----------- 23 | totals : Union[int, List[int]] 24 | Totals for the single bar or for each worker, 25 | depending on whether it is a List of int or a single 26 | int. 27 | queue : multiprocessing.Queue 28 | Queue to receive progress updates. 
progress_bar 29 | expects an 'update' string to update a single 30 | bar or a string with the pid of the worker 31 | (i.e. f'update{pid}'). When finished, send a 32 | 'done' to terminate the Thread. 33 | """ 34 | if isinstance(totals, list): 35 | splitted = True 36 | pbars = [ 37 | tqdm( 38 | desc=f'Worker {pid + 1}', 39 | total=total, 40 | position=pid, 41 | ) 42 | for pid, total in enumerate(totals) 43 | ] 44 | else: 45 | splitted = False 46 | pbars = [ 47 | tqdm(total=totals) 48 | ] 49 | 50 | while True: 51 | try: 52 | message = queue.get() 53 | if message.startswith('update'): 54 | if splitted: 55 | pid = int(message[6:]) 56 | pbars[pid].update(1) 57 | else: 58 | pbars[0].update(1) 59 | elif message == 'done': 60 | break 61 | except: 62 | pass 63 | for pbar in pbars: 64 | pbar.close() 65 | -------------------------------------------------------------------------------- /src/tqdm_batch/task_wrapper.py: -------------------------------------------------------------------------------- 1 | def task_wrapper(pid, function, batch, queue, *args, **kwargs): 2 | """ 3 | Wrapper to add progress bar update 4 | """ 5 | result = [] 6 | for example in batch: 7 | result.append(function(example, *args, **kwargs)) 8 | queue.put(f'update{pid}') 9 | return result 10 | --------------------------------------------------------------------------------