├── pyrallel ├── tests │ ├── __init__.py │ ├── test_map_reduce.py │ ├── test_queue.py │ └── test_parallel_processor.py ├── __version__.py ├── __init__.py ├── paralleller.py ├── map_reduce.py ├── parallel_processor.py └── queue.py ├── .coveragerc ├── requirements-dev.txt ├── requirements.txt ├── MANIFEST.in ├── docs ├── _static │ ├── logo.png │ ├── logo_barcode.png │ └── style.css ├── modules.rst ├── queue.rst ├── map_reduce.rst ├── parallel_processor.rst ├── index.rst ├── Makefile ├── installation.rst └── conf.py ├── requirements-docs.txt ├── .readthedocs.yml ├── Makefile ├── .github └── workflows │ ├── deploy.yml │ └── tests.yml ├── LICENSE ├── .travis.yml.bak ├── README.rst ├── .gitignore └── setup.py /pyrallel/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = pyrallel/tests/* -------------------------------------------------------------------------------- /pyrallel/__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.0.10' 2 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-cov<2.6 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | typing>=3.6 2 | multiprocess>=0.70 3 | dill>=0.3 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include VERSION 3 | include requirements.txt -------------------------------------------------------------------------------- /docs/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/usc-isi-i2/pyrallel/HEAD/docs/_static/logo.png -------------------------------------------------------------------------------- /docs/_static/logo_barcode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/usc-isi-i2/pyrallel/HEAD/docs/_static/logo_barcode.png -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | Modules 2 | ======= 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | parallel_processor.rst 8 | map_reduce.rst 9 | queue.rst 10 | -------------------------------------------------------------------------------- /docs/queue.rst: -------------------------------------------------------------------------------- 1 | Queue 2 | ===== 3 | 4 | .. 
automodule:: pyrallel.queue 5 | :members: 6 | :special-members: 7 | :exclude-members: __dict__, __weakref__, __init__ 8 | -------------------------------------------------------------------------------- /requirements-docs.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | alabaster>=0.7.9 3 | Sphinx>=1.5.6 4 | sphinx-autobuild>=0.6.0 5 | sphinxcontrib-napoleon>=0.6.0 6 | nbsphinx>=0.3.4 7 | pandoc>=1.0.2 8 | -------------------------------------------------------------------------------- /docs/map_reduce.rst: -------------------------------------------------------------------------------- 1 | MapReduce 2 | ========= 3 | 4 | .. automodule:: pyrallel.map_reduce 5 | :members: 6 | :special-members: 7 | :exclude-members: __dict__, __weakref__, __init__ 8 | -------------------------------------------------------------------------------- /docs/parallel_processor.rst: -------------------------------------------------------------------------------- 1 | ParallelProcessor 2 | ================= 3 | 4 | .. automodule:: pyrallel.parallel_processor 5 | :members: 6 | :special-members: 7 | :exclude-members: __dict__, __weakref__, __init__ 8 | -------------------------------------------------------------------------------- /pyrallel/__init__.py: -------------------------------------------------------------------------------- 1 | from pyrallel.queue import * 2 | from pyrallel.paralleller import Paralleller 3 | from pyrallel.parallel_processor import ParallelProcessor, Mapper, ProgressThread 4 | from pyrallel.map_reduce import MapReduce 5 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | sphinx: 7 | configuration: docs/conf.py 8 | 9 | python: 10 | version: 3.8 11 | install: 12 | - requirements: requirements-docs.txt 13 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Pyrallel 2 | ======== 3 | 4 | .. include:: ./../README.rst 5 | :start-after: begin-intro 6 | :end-before: end-intro 7 | 8 | Installation 9 | ------------ 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | installation.rst 15 | 16 | Modules 17 | ------- 18 | 19 | .. toctree:: 20 | :maxdepth: 3 21 | 22 | modules.rst 23 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: docs 2 | 3 | docs: 4 | @cd docs && make html 5 | 6 | release: 7 | @VERSION=$$(python -c "from pyrallel.__version__ import __version__;print(__version__)") && git tag $$VERSION 8 | 9 | # locate all the files in this directory or below: 10 | FILES=`find . -name '*.py'` 11 | 12 | # The command for running mypy: 13 | lint: 14 | python3 -m mypy $(FILES) 15 | 16 | # Run the unit tests. 
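# Each test module is run in its own pytest invocation; -s disables output capturing.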
17 | test: 18 | python3 -m pytest -s pyrallel/tests/test_map_reduce.py 19 | python3 -m pytest -s pyrallel/tests/test_parallel_processor.py 20 | python3 -m pytest -s pyrallel/tests/test_queue.py 21 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = pyrallel 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy 2 | on: 3 | push: 4 | tags: 5 | - '*' 6 | jobs: 7 | deploy-to-pypi: 8 | name: Deploy to pypi 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: checkout code 12 | uses: actions/checkout@v2 13 | - name: Set up Python 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: '3.6' 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install setuptools wheel twine 21 | - name: Build and publish 22 | env: 23 | TWINE_USERNAME: usc_isi_i2_admin 24 | TWINE_PASSWORD: ${{ secrets.PYPI }} 25 | run: | 26 | python setup.py sdist bdist_wheel 27 | twine upload dist/* 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 USC ISI I2 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: push 3 | jobs: 4 | run-tests: 5 | name: Run pytest 6 | runs-on: ubuntu-latest 7 | strategy: 8 | matrix: 9 | python-version: [3.6, 3.7, 3.8, 3.9] 10 | steps: 11 | - name: Checkout code 12 | uses: actions/checkout@v2 13 | - name: Set up Python ${{ matrix.python-version }} 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: ${{ matrix.python-version }} 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install -r requirements.txt 21 | pip install -r requirements-dev.txt 22 | pip install -e . 23 | pip install coverage coveralls 24 | - name: Test with pytest 25 | run: | 26 | python -m pytest -v --color=yes --cov pyrallel pyrallel/tests/test_* 27 | - name: Coverage 28 | env: 29 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 30 | run: | 31 | # coverage run --omit pyrallel/tests/* -m pytest pyrallel/tests/test_* 32 | # coverage report 33 | coveralls --service=github 34 | 35 | -------------------------------------------------------------------------------- /pyrallel/paralleller.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class Paralleller(ABC): 5 | """ 6 | Paralleller is an abstract class defines common methods for concrete Parallellers. 7 | """ 8 | 9 | @abstractmethod 10 | def start(self): 11 | """ 12 | Start processes and / or threads. 13 | """ 14 | raise NotImplementedError 15 | 16 | @abstractmethod 17 | def add_task(self, *args, **kwargs): 18 | """ 19 | Add new task. 20 | """ 21 | raise NotImplementedError 22 | 23 | @abstractmethod 24 | def task_done(self): 25 | """ 26 | All tasks are added. 27 | """ 28 | raise NotImplementedError 29 | 30 | @abstractmethod 31 | def join(self): 32 | """ 33 | Wait until all processes (threads) finish. 34 | """ 35 | raise NotImplementedError 36 | 37 | def map(self, tasks: iter): 38 | """ 39 | Syntactic sugar for adding task from an iterable object. 40 | 41 | Args: 42 | tasks (iter): Any iterable object. 
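        Example (an illustrative sketch; ``mapper`` and ``ParallelProcessor`` here are the ones from the
        ``pyrallel.parallel_processor`` module docstring)::

            pp = ParallelProcessor(2, mapper)
            pp.start()

            pp.map(['file1', 'file2', 'file3', 'file4'])  # same as calling add_task() once per item

            pp.task_done()
            pp.join()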
43 | """ 44 | for task in tasks: 45 | self.add_task(task) 46 | -------------------------------------------------------------------------------- /docs/_static/style.css: -------------------------------------------------------------------------------- 1 | @import url("https://fonts.googleapis.com/css?family=Ubuntu+Mono"); 2 | @import url("https://fonts.googleapis.com/css?family=Open+Sans"); 3 | 4 | pre, code { 5 | font-family: "Ubuntu Mono", "Consolas", "Menlo", "DejaVu Sans Mono", "Bitstream Vera Sans Mono", monospace; 6 | font-size: 15px; 7 | } 8 | 9 | h1, h2, h3, h4, h5, h6, p.admonition-title, div.sphinxsidebar input, body { 10 | font-family: "Open Sans", "Helvetica", "Arial", sans-serif; 11 | } 12 | 13 | div.sphinxsidebar ul li.toctree-l1 > a { 14 | font-size: 100%; 15 | } 16 | 17 | div.sphinxsidebar ul li.toctree-l2 > a { 18 | font-size: 100%; 19 | } 20 | 21 | div.sphinxsidebar ul li.toctree-l3 > a { 22 | font-size: 100%; 23 | } 24 | 25 | div.body { 26 | max-width: 100%; /* overwrite basic.css */ 27 | } 28 | 29 | table.dataframe { 30 | border-collapse: collapse; 31 | /*width: 100%;*/ 32 | } 33 | 34 | table.dataframe th, table.dataframe td { 35 | text-align: left; 36 | padding: 8px; 37 | } 38 | 39 | table.dataframe tr:nth-child(even) { 40 | background-color: #f2f2f2; 41 | } 42 | 43 | blockquote { 44 | border-left: 5px solid #eeeeee; 45 | padding: 10px 20px; 46 | } 47 | 48 | div.sphinxsidebarwrapper p.logo { 49 | margin-bottom: 30px; 50 | } 51 | -------------------------------------------------------------------------------- /.travis.yml.bak: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - '3.9' 5 | - '3.8' 6 | - '3.7' 7 | - '3.6' 8 | - '3.5' 9 | 10 | install: 11 | - pip install -r requirements.txt 12 | - pip install -r requirements-dev.txt 13 | - pip install -e . 14 | - pip install coveralls 15 | 16 | script: 17 | - py.test -v --color=yes --cov pyrallel 18 | 19 | after_success: 20 | - coveralls 21 | 22 | notifications: 23 | email: false 24 | 25 | deploy: 26 | - provider: pypi 27 | user: usc_isi_i2_admin 28 | password: 29 | secure: QwVcmGEN4dJN1vi7HM0E4ZIgGM/kCKWJ323AoXDUtSwbZIUYX5sZsNoh+buJzfJR94geZqckf9ABSk22cazuXzrDBUeh73sOReILcCciEMxYWkrBDrvtr2rBBq2GOC8B8Xc3BzNcZGG1pVhoNFjr6/Co0rOIn6JmxPRBLjCoyT33bQGHchuXPbozhMNDtG1+p+j5+lrGZetdD6sSl8O3BCOkJtfor50LvgxLoYcqcOd6jj9DgY9r6fo7if43xESj07UfneZ+Eo+xVQ9NRsItD4sc2toC5wPcdggqVQ+cy/mc9A3SAbD/Y36Jz6RX1hM5LrnPEDRi/URlmBriwf59VygwSXaypfaex8aEsx5W7CPuexNbRg/qWojoZASE9GzQcAw6aamWIzJy6EOvKI5NmGzVLDUqU3U5Ow/7vLhb/iQi+09Du+5bPSmR2qZvO+uyIHjW8ZGgZQ7Q4uldokYjwca8jUWvx5HrT0B1gJPjV0PjNFrwVuA40TvodoHQJ+Ief0cE9ALsBH6VLEAduC+lWOV1B/bvEBGUgAgD2l1Gb7QKpq1bl1izoGB6H3V0qqXnHdIstMC+0rYwD19FDXPxThrIpuwtsx2KsgXbeFL/wpcABd40rVgLRt6Ol/dBR60DzAGn+L5fV9ahPZY/UNYZwlblPbDsZDP4kUDXYrhW6T4= 30 | on: 31 | tags: true 32 | condition: $TRAVIS_PYTHON_VERSION = 3.6 33 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Pyrallel 2 | ======== 3 | 4 | .. begin-intro 5 | 6 | Yet another easy-to-use python parallel library for humans. 7 | 8 | .. image:: https://img.shields.io/badge/license-MIT-blue.svg 9 | :target: https://raw.githubusercontent.com/usc-isi-i2/pyrallel/master/LICENSE 10 | :alt: License 11 | 12 | .. 
image:: https://github.com/usc-isi-i2/pyrallel/workflows/Tests/badge.svg?branch=master 13 | :target: https://github.com/usc-isi-i2/pyrallel/actions 14 | :alt: Github actions 15 | 16 | .. image:: https://coveralls.io/repos/github/usc-isi-i2/pyrallel/badge.svg?branch=master 17 | :target: https://coveralls.io/github/usc-isi-i2/pyrallel?branch=master 18 | :alt: Coveralls 19 | 20 | .. image:: https://badge.fury.io/py/pyrallel.lib.svg 21 | :target: https://badge.fury.io/py/pyrallel.lib 22 | :alt: pypi 23 | 24 | .. image:: https://readthedocs.org/projects/pyrallel/badge/?version=latest 25 | :target: http://pyrallel.readthedocs.io/en/latest 26 | :alt: Documents 27 | 28 | - ParallelProcessor: Newbie-friendly process-based parallel computing API. 29 | - MapReduce: Ultimately simple map and reduce computing model. 30 | - ShmQueue: Extremely fast shared memory driven general purpose multiprocessing queue. 31 | 32 | .. end-intro 33 | 34 | Installation 35 | ------------ 36 | :: 37 | 38 | pip install pyrallel.lib 39 | 40 | 41 | Documentation 42 | ------------- 43 | 44 | `Read the Doc `_ 45 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | .. note:: 5 | 6 | Pyrallel supports Python 3 and it's tested under Python 3.4+. 7 | 8 | pip 9 | ---- 10 | 11 | Using pip to install:: 12 | 13 | pip install pyrallel.lib 14 | 15 | If you want to update an installed Pyrallel:: 16 | 17 | pip install -U pyrallel.lib 18 | 19 | Development installation 20 | ------------------------ 21 | 22 | Install from source 23 | ``````````````````` 24 | 25 | :: 26 | 27 | git clone https://github.com/usc-isi-i2/pyrallel.git 28 | cd pyrallel 29 | 30 | virtualenv pyrallel_env 31 | source activate pyrallel_env 32 | pip install -r requirements.txt 33 | pip install -r requirements-dev.txt 34 | pip install -e . 35 | 36 | Run tests 37 | ````````` 38 | 39 | Pyrallel uses `pytest `_ for unit tests. To run them, simply run the following command from the root of the Pyrallel package:: 40 | 41 | pytest 42 | 43 | If you need more detailed information, run:: 44 | 45 | pytest -v --color=yes 46 | 47 | Build documentation 48 | ------------------- 49 | 50 | Additional dependencies for building documentation should be installed first:: 51 | 52 | pip install -r requirements-docs.txt 53 | 54 | Documentation is powered by `Sphinx `_. To generate it locally, run:: 55 | 56 | cd docs 57 | make html # the generated doc is located at _build/html/index.html 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .idea/ 107 | local/ -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from distutils.core import setup 3 | from setuptools import find_packages 4 | 5 | 6 | with open('README.rst', 'r') as fh: 7 | long_description = fh.read() 8 | 9 | with open('requirements.txt', 'r') as f: 10 | install_requires = list() 11 | dependency_links = list() 12 | for line in f: 13 | re = line.strip() 14 | if re: 15 | if re.startswith('git+') or re.startswith('svn+') or re.startswith('hg+'): 16 | dependency_links.append(re) 17 | else: 18 | install_requires.append(re) 19 | 20 | about = {} 21 | with open( 22 | os.path.join(os.path.abspath(os.path.dirname(__file__)), 'pyrallel', '__version__.py'), 23 | 'r', encoding='utf-8') as f: 24 | exec(f.read(), about) 25 | 26 | packages = find_packages() 27 | 28 | setup( 29 | name='pyrallel.lib', 30 | version=about['__version__'], 31 | packages=packages, 32 | url='https://github.com/usc-isi-i2/pyrallel', 33 | project_urls={ 34 | "Bug Tracker": "https://github.com/usc-isi-i2/pyrallel/issues", 35 | "Documentation": "https://pyrallel.readthedocs.io", 36 | "Source Code": "https://github.com/usc-isi-i2/pyrallel", 37 | }, 38 | license='MIT', 39 | author='USC/ISI', 40 | author_email='yixiangy@isi.edu', 41 | description='Yet another easy-to-use python3 parallel library for humans.', 42 | long_description=long_description, 43 | long_description_content_type='text/x-rst', 44 | include_package_data=True, 45 | install_requires=install_requires, 46 | dependency_links=dependency_links, 47 | classifiers=( 48 | "Programming Language :: Python :: 3", 49 | "Natural Language :: English", 50 | "License :: OSI Approved :: MIT License", 51 | "Operating System :: OS Independent", 52 | "Topic :: Software Development :: Libraries", 53 | "Topic :: Software Development :: Libraries :: Python Modules", 54 | ) 55 | ) 56 | -------------------------------------------------------------------------------- /pyrallel/tests/test_map_reduce.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | 3 | from pyrallel.map_reduce import MapReduce 4 | 5 | 6 | NUM_OF_PROCESSOR = max(2, int(mp.cpu_count() / 2)) 7 | 8 | 9 | def test_map_reduce_number(): 10 | 11 | def mapper(x): 12 | return x 13 | 14 
| def reducer(r1, r2): 15 | return r1 + r2 16 | 17 | mr = MapReduce(3, mapper, reducer) 18 | mr.start() 19 | mr.add_task(1) 20 | mr.task_done() 21 | assert mr.join() == 1 22 | 23 | mr = MapReduce(NUM_OF_PROCESSOR, mapper, reducer) 24 | mr.start() 25 | mr.add_task(1) 26 | mr.task_done() 27 | assert mr.join() == 1 28 | 29 | mr = MapReduce(1, mapper, reducer) 30 | mr.start() 31 | for i in range(1, 101): 32 | mr.add_task(i) 33 | mr.task_done() 34 | assert mr.join() == 5050 35 | 36 | mr = MapReduce(NUM_OF_PROCESSOR, mapper, reducer) 37 | mr.start() 38 | for i in range(1, 101): 39 | mr.add_task(i) 40 | mr.task_done() 41 | assert mr.join() == 5050 42 | 43 | mr = MapReduce(NUM_OF_PROCESSOR, mapper, reducer) 44 | mr.start() 45 | for i in range(1, 100001): 46 | mr.add_task(i) 47 | mr.task_done() 48 | assert mr.join() == 5000050000 49 | 50 | 51 | def test_map_reduce_object(): 52 | 53 | def mapper(k, v): 54 | return {k: v} 55 | 56 | def reducer(r1, r2): 57 | for k1, v1 in r1.items(): 58 | if k1 in r2: 59 | r2[k1] += v1 60 | else: 61 | r2[k1] = v1 62 | return r2 63 | 64 | mr = MapReduce(1, mapper, reducer) 65 | mr.start() 66 | for i in range(100): 67 | if i % 2 == 0: 68 | mr.add_task('a', i) 69 | else: 70 | mr.add_task('b', i) 71 | mr.task_done() 72 | assert mr.join() == {'a': 2450, 'b': 2500} 73 | 74 | mr = MapReduce(NUM_OF_PROCESSOR, mapper, reducer) 75 | mr.start() 76 | for i in range(100): 77 | if i % 2 == 0: 78 | mr.add_task('a', i) 79 | else: 80 | mr.add_task('b', i) 81 | mr.task_done() 82 | assert mr.join() == {'a': 2450, 'b': 2500} 83 | -------------------------------------------------------------------------------- /pyrallel/tests/test_queue.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import queue 3 | import pyrallel 4 | import os 5 | 6 | 7 | # 30 bytes each 8 | # CONTENT = os.urandom(30) 9 | CONTENT = b'\xaa' * 10 + b'\xbb' * 10 + b'\xcc' * 10 10 | 11 | 12 | def sender(sq): 13 | for _ in range(10): 14 | sq.put(CONTENT) 15 | 16 | 17 | def receiver(sq, q): 18 | try: 19 | while True: 20 | content = sq.get(timeout=2) 21 | q.put(content) 22 | except queue.Empty: 23 | return 24 | 25 | 26 | class DummySerializer(object): 27 | def dumps(self, o): 28 | return o 29 | 30 | def loads(self, d): 31 | return d 32 | 33 | 34 | def test_shmqueue(): 35 | if not hasattr(pyrallel, 'ShmQueue'): 36 | return 37 | 38 | params = [ # chunk size, maxsize 39 | [50, 100], # chunk size > content, maxsize is enough 40 | [10, 100], # chunk size < content, maxsize is enough 41 | # [50, 1], # chunk size > content, maxsize is limited 42 | # [10, 1], # chunk size < content, maxsize is limited 43 | ] 44 | 45 | for mode in ['fork', 'spawn']: 46 | mp.set_start_method(mode, force=True) 47 | ShmQueueCls = getattr(pyrallel, 'ShmQueue') 48 | for param in params: 49 | sq = ShmQueueCls(chunk_size=param[0], maxsize=param[1], serializer=DummySerializer()) 50 | q = mp.Queue() 51 | # 3 senders and 2 receivers 52 | # each sender process add 10 content, in total 30 * 10 = 300 bytes 53 | p_senders = [mp.Process(target=sender, args=(sq,)) for _ in range(3)] 54 | p_receivers = [mp.Process(target=receiver, args=(sq, q)) for _ in range(2)] 55 | 56 | for p in p_senders: 57 | p.start() 58 | for p in p_receivers: 59 | p.start() 60 | 61 | for p in p_senders: 62 | p.join() 63 | for p in p_receivers: 64 | p.join() 65 | sq.close() 66 | 67 | total_put = 30 # there should be in total 30 elements 68 | while True: 69 | try: 70 | r = q.get(timeout=2) 71 | total_put -= 1 72 | assert r 
== CONTENT 73 | except queue.Empty: 74 | break 75 | 76 | assert total_put == 0 77 | -------------------------------------------------------------------------------- /pyrallel/tests/test_parallel_processor.py: -------------------------------------------------------------------------------- 1 | import time 2 | import multiprocessing as mp 3 | 4 | from pyrallel.parallel_processor import ParallelProcessor, Mapper 5 | 6 | 7 | NUM_OF_PROCESSOR = max(2, int(mp.cpu_count() / 2)) 8 | 9 | 10 | def test_basic(): 11 | def dummy_computation(): 12 | time.sleep(0.0001) 13 | 14 | pp = ParallelProcessor(NUM_OF_PROCESSOR, dummy_computation) 15 | pp.start() 16 | 17 | for i in range(1000): 18 | pp.add_task() 19 | 20 | pp.task_done() 21 | pp.join() 22 | 23 | class MyMapper(Mapper): 24 | def enter(self): 25 | self.i = 0 26 | 27 | def process(self): 28 | dummy_computation() 29 | self.i += 1 30 | 31 | pp = ParallelProcessor(NUM_OF_PROCESSOR, MyMapper) 32 | pp.start() 33 | 34 | for i in range(1000): 35 | pp.add_task() 36 | 37 | pp.task_done() 38 | pp.join() 39 | 40 | 41 | def test_with_input(): 42 | def dummy_computation_with_input(x, _idx): 43 | time.sleep(0.0001) 44 | 45 | pp = ParallelProcessor(NUM_OF_PROCESSOR, dummy_computation_with_input, enable_process_id=True) 46 | pp.start() 47 | 48 | for i in range(1000): 49 | pp.add_task(i) 50 | 51 | pp.map(range(1000)) 52 | 53 | pp.task_done() 54 | pp.join() 55 | 56 | class MyMapper(Mapper): 57 | def process(self, x): 58 | dummy_computation_with_input(x, _idx=self._idx) 59 | 60 | pp = ParallelProcessor(NUM_OF_PROCESSOR, MyMapper) 61 | pp.start() 62 | 63 | for i in range(1000): 64 | pp.add_task(i) 65 | 66 | pp.task_done() 67 | pp.join() 68 | 69 | 70 | def test_with_multiple_input(): 71 | def dummy_computation_with_input(x, y): 72 | assert x * 2 == y 73 | time.sleep(0.0001) 74 | 75 | pp = ParallelProcessor(NUM_OF_PROCESSOR, dummy_computation_with_input) 76 | pp.start() 77 | 78 | for i in range(1000): 79 | pp.add_task(i, y=i*2) 80 | 81 | pp.map([(i, i*2) for i in range(1000)]) 82 | 83 | pp.task_done() 84 | pp.join() 85 | 86 | 87 | def test_with_output(): 88 | result = [] 89 | 90 | def dummy_computation_with_input(x): 91 | time.sleep(0.0001) 92 | return x * x 93 | 94 | def collector(r): 95 | result.append(r) 96 | 97 | pp = ParallelProcessor(NUM_OF_PROCESSOR, dummy_computation_with_input, collector=collector) 98 | pp.start() 99 | 100 | for i in range(8): 101 | pp.add_task(i) 102 | 103 | pp.task_done() 104 | pp.join() 105 | 106 | for i in [0, 1, 4, 9, 16, 25, 36, 49]: 107 | assert i in result 108 | 109 | 110 | def test_with_multiple_output(): 111 | result = [] 112 | 113 | def dummy_computation_with_input(x): 114 | time.sleep(0.0001) 115 | return x * x, x * x 116 | 117 | def collector(r1, r2): 118 | result.append(r1) 119 | 120 | pp = ParallelProcessor(NUM_OF_PROCESSOR, dummy_computation_with_input, collector=collector) 121 | pp.start() 122 | 123 | for i in range(8): 124 | pp.add_task(i) 125 | 126 | pp.task_done() 127 | pp.join() 128 | 129 | for i in [0, 1, 4, 9, 16, 25, 36, 49]: 130 | assert i in result 131 | 132 | 133 | def test_with_progress(): 134 | def dummy_computation(): 135 | time.sleep(0.0001) 136 | 137 | def progress(p): 138 | assert p['total'] >= p['added'] >= p['loaded'] >= p['processed'] 139 | 140 | pp = ParallelProcessor(NUM_OF_PROCESSOR, dummy_computation, progress=progress, progress_total=10) 141 | pp.start() 142 | for i in range(10): 143 | pp.add_task() 144 | pp.task_done() 145 | pp.join() 146 | 
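# Illustrative sketch (not part of the test suite): parallel_processor.py also documents a
# single_mapper_queue option; this shows how it could be combined with a collector, using
# only API calls already exercised by the tests above.
def example_single_mapper_queue():
    result = []

    def dummy_computation_with_input(x):
        time.sleep(0.0001)
        return x * 2

    def collector(r):
        result.append(r)

    # One shared mapper queue lets senders block when it is full, instead of the
    # round-robin polling used with per-process mapper queues.
    pp = ParallelProcessor(NUM_OF_PROCESSOR, dummy_computation_with_input,
                           collector=collector, single_mapper_queue=True)
    pp.start()

    for i in range(8):
        pp.add_task(i)

    pp.task_done()
    pp.join()

    assert sorted(result) == [i * 2 for i in range(8)]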
-------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # rltk documentation build configuration file, created by 4 | # sphinx-quickstart on Thu Feb 23 13:46:31 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | import os 20 | import sys 21 | import datetime 22 | sys.path.insert(0, os.path.abspath('../pyrallel')) 23 | sys.path.insert(0, os.path.abspath('../')) 24 | 25 | 26 | # -- General configuration ------------------------------------------------ 27 | 28 | # If your documentation needs a minimal Sphinx version, state it here. 29 | # 30 | # needs_sphinx = '1.0' 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = ['sphinx.ext.autodoc', 'sphinxcontrib.napoleon', 'nbsphinx'] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # The suffix(es) of source filenames. 41 | # You can specify multiple suffix as a list of string: 42 | # 43 | source_suffix = '.rst' 44 | 45 | # The master toctree document. 46 | master_doc = 'index' 47 | 48 | # General information about the project. 49 | project = 'Pyrallel' 50 | copyright = '{}, USC/ISI'.format(datetime.datetime.now().year) 51 | author = 'USC/ISI' 52 | 53 | # The version info for the project you're documenting, acts as replacement for 54 | # |version| and |release|, also used in various other places throughout the 55 | # built documents. 56 | # 57 | from pyrallel.__version__ import __version__ 58 | # The short X.Y version. 59 | version = '.'.join(__version__.split('.')[:2]) 60 | # The full version, including alpha/beta/rc tags. 61 | release = __version__ 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | # 66 | # This is also used if you do content translation via gettext catalogs. 67 | # Usually you set "language" from the command line for these cases. 68 | language = None 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | # This patterns also effect to html_static_path and html_extra_path 73 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints'] 74 | 75 | # The name of the Pygments (syntax highlighting) style to use. 76 | pygments_style = 'sphinx' 77 | 78 | # If true, `todo` and `todoList` produce output, else they produce nothing. 79 | todo_include_todos = False 80 | 81 | 82 | # -- Options for HTML output ---------------------------------------------- 83 | 84 | # The theme to use for HTML and HTML Help pages. See the documentation for 85 | # a list of builtin themes. 
86 | # 87 | html_theme = 'alabaster' # default, alabaster, pyramid, bizstyle 88 | 89 | # Theme options are theme-specific and customize the look and feel of a theme 90 | # further. For a list of options available for each theme, see the 91 | # documentation. 92 | # 93 | html_theme_options = { 94 | 'logo': 'logo.png', 95 | 'page_width': '1380px', 96 | 'sidebar_width': '220px', 97 | 'github_user': 'usc-isi-i2', 98 | 'github_repo': 'pyrallel', 99 | 'github_banner': 'true', 100 | 'github_type': 'star', 101 | 'extra_nav_links': { 102 | 'pyrallel @ GitHub': 'https://github.com/usc-isi-i2/pyrallel', 103 | 'pyrallel @ PyPI': 'https://pypi.org/project/pyrallel', 104 | 'Issue Tracker': 'https://github.com/usc-isi-i2/pyrallel/issues', 105 | 'USC/ISI CKG': 'http://usc-isi-i2.github.io/' 106 | }, 107 | 'show_powered_by': False 108 | } 109 | 110 | html_show_sourcelink = False 111 | 112 | html_sidebars = { 113 | '**': [ 114 | 'about.html', 115 | 'localtoc.html', 116 | 'navigation.html', 117 | # 'relations.html', 118 | 'searchbox.html', 119 | # 'donate.html', 120 | ] 121 | } 122 | 123 | # Add any paths that contain custom static files (such as style sheets) here, 124 | # relative to this directory. They are copied after the builtin static files, 125 | # so a file named "default.css" will overwrite the builtin "default.css". 126 | html_static_path = ['_static'] 127 | 128 | 129 | # -- Options for HTMLHelp output ------------------------------------------ 130 | 131 | # Output file base name for HTML help builder. 132 | htmlhelp_basename = 'pyralleldoc' 133 | 134 | 135 | # -- Options for LaTeX output --------------------------------------------- 136 | 137 | latex_elements = { 138 | # The paper size ('letterpaper' or 'a4paper'). 139 | # 140 | # 'papersize': 'letterpaper', 141 | 142 | # The font size ('10pt', '11pt' or '12pt'). 143 | # 144 | # 'pointsize': '10pt', 145 | 146 | # Additional stuff for the LaTeX preamble. 147 | # 148 | # 'preamble': '', 149 | 150 | # Latex figure (float) alignment 151 | # 152 | # 'figure_align': 'htbp', 153 | } 154 | 155 | # Grouping the document tree into LaTeX files. List of tuples 156 | # (source start file, target name, title, 157 | # author, documentclass [howto, manual, or own class]). 158 | latex_documents = [ 159 | (master_doc, 'pyrallel.tex', 'Pyrallel Documentation', 160 | u'USC/ISI', 'manual'), 161 | ] 162 | 163 | 164 | # -- Options for manual page output --------------------------------------- 165 | 166 | # One entry per manual page. List of tuples 167 | # (source start file, name, description, authors, manual section). 168 | man_pages = [ 169 | (master_doc, 'Pyrallel', 'Pyrallel Documentation', 170 | [author], 1) 171 | ] 172 | 173 | 174 | # -- Options for Texinfo output ------------------------------------------- 175 | 176 | # Grouping the document tree into Texinfo files. List of tuples 177 | # (source start file, target name, title, author, 178 | # dir menu entry, description, category) 179 | texinfo_documents = [ 180 | (master_doc, 'Pyrallel', 'Pyrallel Documentation', 181 | author, 'Pyrallel', 'Yet another easy-to-use python3 parallel library for humans.', 182 | 'Miscellaneous'), 183 | ] 184 | 185 | 186 | def setup(app): 187 | app.add_stylesheet('style.css') -------------------------------------------------------------------------------- /pyrallel/map_reduce.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a multiprocessing-based map reduce computing model. 
3 | 4 | It's different from normal MapReduce model: 5 | 6 | - Manager fires up mapper and reducer processes simultaneously: Output of mapper is identical to reducer, \ 7 | so reducers don't need to wait until all mappers finish. 8 | - Data can be passed to mapper gradually: Mappers are waiting to consume data until user tells them no more new data \ 9 | will be added. 10 | - Reducing is not between two mapper's output (though the api to user is as this) \ 11 | but output and context: Data pickling (serialization) and unpickling \ 12 | (unserialization) for IPC are time consuming. As an alternation, each reducer process holds a context \ 13 | which aggregates output in reducing step. \ 14 | Once all output is reduced, reducing will be among contexts. 15 | - It doesn't support shuffling and reduce-by-key. 16 | 17 | Example:: 18 | 19 | def mapper(x): 20 | time.sleep(0.0001) 21 | return x 22 | 23 | def reducer(r1, r2): 24 | return r1 + r2 25 | 26 | mr = MapReduce(8, mapper, reducer) 27 | mr.start() 28 | 29 | for i in range(10000): 30 | mr.add_task(i) 31 | 32 | mr.task_done() 33 | result = mr.join() 34 | 35 | print(result) 36 | 37 | """ 38 | __all__ = ['MapReduce'] 39 | 40 | import multiprocess as mp 41 | import multiprocess.queues as mpq 42 | import queue 43 | from typing import Callable 44 | import sys 45 | import logging 46 | import uuid 47 | import pickle 48 | import math 49 | 50 | from pyrallel import Paralleller 51 | 52 | 53 | logger = logging.getLogger('MapReduce') 54 | logger.setLevel(logging.ERROR) 55 | stdout_handler = logging.StreamHandler(sys.stdout) 56 | stdout_handler.setFormatter(logging.Formatter('%(asctime)-15s %(name)s [%(levelname)s] %(message)s')) 57 | logger.addHandler(stdout_handler) 58 | 59 | 60 | class ChunkedQueue(mpq.Queue): 61 | CHUNK_SIZE = 512 * 1024 * 1024 62 | 63 | def __init__(self, *args, **kwargs): 64 | ctx = mp.get_context() 65 | super().__init__(*args, **kwargs, ctx=ctx) 66 | self.buff = {} 67 | 68 | def put(self, obj, block=True, timeout=None): 69 | if not block: 70 | return super().put(obj=obj, block=False, timeout=timeout) 71 | 72 | chunk_size = self.__class__.CHUNK_SIZE 73 | msg_id = uuid.uuid4() 74 | msg_bytes = pickle.dumps(obj) 75 | num_of_chunks = math.ceil(len(msg_bytes) / chunk_size) 76 | logger.debug('putting data: #%s [%d], size: %d', msg_id, num_of_chunks, len(msg_bytes)) 77 | for i in range(num_of_chunks): 78 | msg_obj = { 79 | 'b': msg_bytes[i * chunk_size : (i + 1) * chunk_size], # body 80 | 'u': msg_id, # msg id 81 | 'i': i + 1, # chunk id 82 | 'n': num_of_chunks # total number of chunks 83 | } 84 | super().put(obj=msg_obj, block=block, timeout=timeout) 85 | 86 | def get(self, block=True, timeout=None): 87 | if not block: 88 | return super().get(block=False, timeout=timeout) 89 | 90 | while True: 91 | msg_obj = super().get(block=block, timeout=timeout) 92 | logger.debug('getting data: #%s [%d/%d]', msg_obj['u'], msg_obj['i'], msg_obj['n']) 93 | # small message 94 | if msg_obj['u'] not in self.buff and msg_obj['i'] == msg_obj['n']: 95 | return pickle.loads(msg_obj['b']) 96 | 97 | # chunked message 98 | if msg_obj['u'] not in self.buff: 99 | self.buff[msg_obj['u']] = [None] * msg_obj['n'] 100 | self.buff[msg_obj['u']][msg_obj['i']-1] = msg_obj['b'] 101 | if msg_obj['i'] == msg_obj['n']: 102 | msg = pickle.loads(b''.join(self.buff[msg_obj['u']])) 103 | del self.buff[msg_obj['u']] 104 | return msg 105 | 106 | 107 | class MapReduce(Paralleller): 108 | """ 109 | Args: 110 | num_of_process (int): Number of process for both mappers and reducers. 
111 | mapper (Callable): Mapper function. The signature is `mapper(*args, **kwargs) -> object`. 112 | reducer (Callable): Reducer function. The signature is `reduce(object, object) -> object`. 113 | `object` arguments are the returns from `mapper` s. 114 | mapper_queue_size (int, optional): Maximum size of mapper queue, 0 by default means unlimited. 115 | reducer_queue_size (int, optional): Maximum size of reduce queue, 0 by default means unlimited. 116 | """ 117 | 118 | CMD_NO_NEW_DATA = 1 # no more new user data 119 | CMD_MAPPER_FINISH = 2 # mapper finished 120 | CMD_REDUCER_WAITING = 3 # reducer is waiting 121 | CMD_NO_RUNNING_MAPPER = 4 # no mapper is running 122 | CMD_REDUCER_AWAKE = 5 # awake a reducer 123 | CMD_REDUCER_KILL = 6 # kill a reducer 124 | CMD_REDUCER_FINISH = 7 # reducer finished 125 | 126 | def __init__(self, num_of_process: int, mapper: Callable, reducer: Callable, 127 | mapper_queue_size: int = 0, reducer_queue_size: int = 0): 128 | self._mapper_queue = mp.Queue(maxsize=mapper_queue_size) 129 | self._reducer_queue = ChunkedQueue(maxsize=reducer_queue_size) 130 | self._result_queue = ChunkedQueue() 131 | self._mapper_cmd_queue = [mp.Queue() for _ in range(num_of_process)] 132 | self._reducer_cmd_queue = [mp.Queue() for _ in range(num_of_process)] 133 | self._manager_cmd_queue = mp.Queue() 134 | 135 | self._manager_process = mp.Process(target=self._run_manager) 136 | self._mapper_process = [mp.Process(target=self._run_mapper, args=(i, )) 137 | for i in range(num_of_process)] 138 | self._reducer_process = [mp.Process(target=self._run_reducer, args=(i, )) 139 | for i in range(num_of_process)] 140 | 141 | self._mapper = mapper 142 | self._reducer = reducer 143 | self._num_of_process = num_of_process 144 | 145 | def start(self): 146 | """ 147 | Start all child processes. 148 | """ 149 | # start manager, mapper and reducer processes 150 | self._manager_process.start() 151 | for m in self._mapper_process: 152 | m.start() 153 | for r in self._reducer_process: 154 | r.start() 155 | 156 | def add_task(self, *args, **kwargs): 157 | """ 158 | Add data. 159 | 160 | Args: 161 | args: Same to args in `mapper` function. 162 | kwargs: Same to kwargs in `mapper` function. 163 | """ 164 | self._mapper_queue.put( (args, kwargs) ) 165 | 166 | def task_done(self): 167 | """ 168 | No more new task. 169 | """ 170 | # no more user data 171 | self._manager_cmd_queue.put( (self.__class__.CMD_NO_NEW_DATA,) ) 172 | 173 | def join(self): 174 | """ 175 | This method blocks until all mappers and reducers finish. 176 | 177 | Returns: 178 | object: The final reduced object. 
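        Example (a sketch continuing the module-level example above)::

            mr.task_done()      # all tasks have been added
            result = mr.join()  # blocks until the final reduced value is available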
179 | """ 180 | # reduced result 181 | result = self._result_queue.get() 182 | 183 | # make sure all child processes exited 184 | # (do this after clean up all queues to avoid deadlock 185 | # https://docs.python.org/3.6/library/multiprocessing.html?highlight=process#all-start-methods 186 | # "Joining processes that use queues") 187 | for m in self._mapper_process: 188 | m.join() 189 | for r in self._reducer_process: 190 | r.join() 191 | self._manager_process.join() 192 | 193 | return result 194 | 195 | def _run_manager(self): 196 | running_mapper = [1 for _ in range(self._num_of_process)] # running mappers, 1 is running 197 | running_reducer = [1 for _ in range(self._num_of_process)] # running reducers, 1 is running 198 | waiting_reducer = [0 for _ in range(self._num_of_process)] # waiting reducers, 1 is waiting 199 | killing_reducer = [0 for _ in range(self._num_of_process)] # killing reducers, 1 is asked to kill 200 | 201 | # only return the index where mask shows 1 202 | def apply_mask(mask): 203 | for idx, m in enumerate(mask): 204 | if m == 1: 205 | yield idx 206 | 207 | while True: 208 | try: 209 | cmd = self._manager_cmd_queue.get(timeout=0.1) 210 | 211 | # no more user data, notify all mappers 212 | if cmd[0] == self.__class__.CMD_NO_NEW_DATA: 213 | for q in self._mapper_cmd_queue: 214 | q.put( (self.__class__.CMD_NO_NEW_DATA,) ) 215 | 216 | # a mapper finished 217 | elif cmd[0] == self.__class__.CMD_MAPPER_FINISH: 218 | idx = cmd[1] 219 | running_mapper[idx] = 0 220 | # notify reducers if all mappers are finished 221 | if sum(running_mapper) == 0: 222 | for r in self._reducer_cmd_queue: 223 | r.put( (self.__class__.CMD_NO_RUNNING_MAPPER,) ) 224 | 225 | # a reducer is waiting 226 | # if all reducers are waiting, 227 | # ask half of them to kill themselves and release held resources (context), 228 | # after being killed, wake up rest of the reducers 229 | elif cmd[0] == self.__class__.CMD_REDUCER_WAITING: 230 | idx = cmd[1] 231 | waiting_reducer[idx] = 1 232 | logger.info('waiting reducer #%d', idx) 233 | 234 | # total num of running reducers 235 | running_reducer_num = len(list(apply_mask(running_reducer))) 236 | logger.info('running reducer num %d', running_reducer_num) 237 | 238 | # only one reducer and nothing to reduce anymore 239 | if running_reducer_num == 1: 240 | # kill last reducer 241 | idx = next(apply_mask(running_reducer)) 242 | self._reducer_cmd_queue[idx].put( (self.__class__.CMD_REDUCER_KILL,) ) 243 | # return result to main process 244 | self._result_queue.put(self._reducer_queue.get()) 245 | return 246 | 247 | # total num of waiting reducers 248 | waiting_reducer_num = len(list(filter(lambda x: x > 0, 249 | [waiting_reducer[idx] for idx in apply_mask(running_reducer)]))) 250 | logger.info('waiting reducer num %d', waiting_reducer_num) 251 | logger.info('waiting reducer status %s', str(waiting_reducer)) 252 | 253 | # need to kill half of the reducers and release resources 254 | if running_reducer_num == waiting_reducer_num: 255 | # reset waiting reducer (for next round) 256 | waiting_reducer = [0 for _ in range(self._num_of_process)] 257 | # pick half of them to kill, notify these reducers 258 | kill_reducer_num = running_reducer_num - int(running_reducer_num / 2) 259 | notified_kill_reducer_num = 0 260 | for idx in apply_mask(running_reducer): 261 | self._reducer_cmd_queue[idx].put( (self.__class__.CMD_REDUCER_KILL,) ) 262 | killing_reducer[idx] = 1 263 | notified_kill_reducer_num += 1 264 | logging.info('killing reducer #%d', idx) 265 | if kill_reducer_num == 
notified_kill_reducer_num: 266 | break 267 | 268 | # make sure these reducers are killed 269 | while True: 270 | cmd = self._manager_cmd_queue.get() 271 | # other command, put it back 272 | if cmd[0] != self.__class__.CMD_REDUCER_FINISH: 273 | self._manager_cmd_queue.put(cmd) 274 | else: 275 | idx = cmd[1] 276 | # reset state for killed reducer 277 | running_reducer[idx] = 0 278 | killing_reducer[idx] = 0 279 | logger.info('reducer killed #%d', idx) 280 | 281 | # all killed, wake up rest of the reducers 282 | if sum(killing_reducer) == 0: 283 | for idx in apply_mask(running_reducer): 284 | logger.info('awaking reducer #%d', idx) 285 | self._reducer_cmd_queue[idx].put( (self.__class__.CMD_REDUCER_AWAKE,) ) 286 | break 287 | 288 | except queue.Empty: 289 | continue 290 | 291 | def _run_mapper(self, idx): 292 | no_new_data = False 293 | 294 | while True: 295 | # cmd 296 | try: 297 | cmd = self._mapper_cmd_queue[idx].get_nowait() 298 | if cmd[0] == self.__class__.CMD_NO_NEW_DATA: 299 | no_new_data = True 300 | except queue.Empty: 301 | pass 302 | 303 | # data 304 | try: 305 | data = self._mapper_queue.get(timeout=0.1) 306 | args, kwargs = data[0], data[1] 307 | result = self._mapper(*args, **kwargs) 308 | self._reducer_queue.put(result) 309 | except queue.Empty: 310 | # no more new data, mapper finishes 311 | if no_new_data: 312 | self._manager_cmd_queue.put( (self.__class__.CMD_MAPPER_FINISH, idx) ) 313 | return 314 | continue 315 | 316 | def _run_reducer(self, idx): 317 | no_running_mapper = False 318 | context = None # it holds result of last reducing, and can be used in next reducing 319 | 320 | while True: 321 | # cmd 322 | try: 323 | cmd = self._reducer_cmd_queue[idx].get_nowait() 324 | if cmd[0] == self.__class__.CMD_NO_RUNNING_MAPPER: 325 | no_running_mapper = True 326 | except queue.Empty: 327 | pass 328 | 329 | # data 330 | try: 331 | if context is None: # can't use "not" operator here, context could be empty object (list, dict, ...) 332 | context = self._reducer_queue.get(timeout=0.1) 333 | 334 | m = self._reducer_queue.get(timeout=0.1) 335 | context = self._reducer(context, m) 336 | except queue.Empty: 337 | # there are still some alive mappers, wait for their output 338 | if not no_running_mapper: 339 | continue 340 | 341 | # no data in reducer queue, ask manager and wait for further action 342 | self._manager_cmd_queue.put( (self.__class__.CMD_REDUCER_WAITING, idx) ) 343 | cmd = self._reducer_cmd_queue[idx].get() 344 | # awake 345 | if cmd[0] == self.__class__.CMD_REDUCER_AWAKE: 346 | continue 347 | # kill itself, put context back to reducer queue 348 | elif cmd[0] == self.__class__.CMD_REDUCER_KILL: 349 | if context is not None: 350 | self._reducer_queue.put(context) 351 | self._manager_cmd_queue.put( (self.__class__.CMD_REDUCER_FINISH, idx) ) 352 | return 353 | -------------------------------------------------------------------------------- /pyrallel/parallel_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | ParallelProcessor utilizes multiple CPU cores to process compute-intensive tasks. 3 | 4 | 5 | If you have a some time-consuming statements in a for-loop and no state is shared among loops, you can map these 6 | statements to different processes. 
Assume you need to process a couple of files, you can do this in parallel:: 7 | 8 | def mapper(filename): 9 | with open(filename) as f_in, open(filename + '.out') as f_out: 10 | f_out.write(process_a_file(f_in.read())) 11 | 12 | pp = ParallelProcessor(2, mapper) 13 | pp.start() 14 | 15 | for fname in ['file1', 'file2', 'file3', 'file4']: 16 | pp.add_task(fname) 17 | 18 | pp.task_done() 19 | pp.join() 20 | 21 | It's not required to write a cumbersome loop statement if you have iterable object or type (list, generator, etc). 22 | Instead, you could use `map`:: 23 | 24 | pp = ParallelProcessor(2, mapper) 25 | pp.start() 26 | 27 | pp.map(['file1', 'file2', 'file3', 'file4']) 28 | 29 | pp.task_done() 30 | pp.join() 31 | 32 | Usually, some files are small and some are big, it would be better if it can keep all cores busy. 33 | One way is to send content line by line to each process (assume content is line-separated):: 34 | 35 | def mapper(line, _idx): 36 | with open('processed_{}.out'.format(_idx), 'a') as f_out: 37 | f_out.write(process_a_line(line)) 38 | 39 | pp = ParallelProcessor(2, mapper, enable_process_id=True) 40 | pp.start() 41 | 42 | for fname in ['file1', 'file2', 'file3', 'file4']: 43 | with open(fname) as f_in: 44 | for line in f_in: 45 | pp.add_task(line) 46 | 47 | pp.task_done() 48 | pp.join() 49 | 50 | One problem here is you need to acquire file descriptor every time the mapper is called. 51 | To avoid this, use Mapper class to replace mapper function. 52 | It allows user to define how the process is constructed and deconstructed:: 53 | 54 | class MyMapper(Mapper): 55 | def enter(self): 56 | self.f = open('processed_{}.out'.format(self._idx), 'w') 57 | 58 | def exit(self, *args, **kwargs): 59 | self.f.close() 60 | 61 | def process(self, line): 62 | self.f.write(process_a_line(line)) 63 | 64 | pp = ParallelProcessor(..., mapper=MyMapper, ...) 65 | 66 | In some situations, you may need to use `collector` to collect data back from child processes to main process:: 67 | 68 | processed = [] 69 | 70 | def mapper(line): 71 | return process_a_line(line) 72 | 73 | def collector(data): 74 | processed.append(data) 75 | 76 | pp = ParallelProcessor(2, mapper, collector=collector) 77 | pp.start() 78 | 79 | for fname in ['file1', 'file2', 'file3', 'file4']: 80 | with open(fname) as f_in: 81 | for line in f_in: 82 | pp.add_task(line) 83 | 84 | pp.task_done() 85 | pp.join() 86 | 87 | print(processed) 88 | 89 | You can count the executions in `collector` to estimate the progress. To get the progress of mapper, \ 90 | create a progress function and set it in `ParallelProcessor`:: 91 | 92 | def progress(p): 93 | 94 | # print('Total task: {}, Added to queue: {}, Mapper Loaded: {}, Mapper Processed {}'.format( 95 | # p['total'], p['added'], p['loaded'], p['processed'])) 96 | if p['processed'] % 10 == 0: 97 | print('Progress: {}%'.format(100.0 * p['processed'] / p['total'])) 98 | 99 | pp = ParallelProcessor(8, mapper=mapper, progress=progress, progress_total=len(tasks)) 100 | pp.start() 101 | 102 | for t in tasks: 103 | pp.add_task(t) 104 | 105 | """ 106 | 107 | import multiprocess as mp 108 | import threading 109 | import queue 110 | import inspect 111 | import sys 112 | import typing 113 | from typing import Callable, Iterable 114 | 115 | from pyrallel import Paralleller 116 | 117 | if sys.version_info >= (3, 8): 118 | from pyrallel import ShmQueue 119 | 120 | 121 | class Mapper(object): 122 | """ 123 | Mapper class. 124 | 125 | This defines how mapper works. 
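    A minimal subclass only needs to override ``process``; for example (an illustrative sketch,
    ``process_a_line`` being the same placeholder used in the module docstring)::

        class LineMapper(Mapper):
            def process(self, line):
                return process_a_line(line)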
126 | 127 | The methods will be called in following order:: 128 | 129 | enter (one time) -> process (many times) -> exit (one time) 130 | """ 131 | def __init__(self, idx): 132 | self._idx = idx 133 | self._progress_info = ProgressThread.init_mapper_progress_info() 134 | 135 | def __enter__(self): 136 | self.enter() 137 | return self 138 | 139 | def __exit__(self, exc_type, exc_val, exc_tb): 140 | self.exit(exc_type, exc_val, exc_tb) 141 | 142 | def enter(self): 143 | """ 144 | Invoked when subprocess is created and listening the queue. 145 | """ 146 | pass 147 | 148 | def exit(self, *args, **kwargs): 149 | """ 150 | Invoked when subprocess is going to exit. Arguments will be set if exception occurred. 151 | """ 152 | pass 153 | 154 | def process(self, *args, **kwargs): 155 | """ 156 | Same as mapper function, but `self` argument can provide additional context (e.g., `self._idx`). 157 | """ 158 | raise NotImplementedError 159 | 160 | 161 | class CollectorThread(threading.Thread): 162 | """ 163 | Handle collector in main process. 164 | Create a thread and call ParallelProcessor.collect(). 165 | """ 166 | 167 | def __init__(self, instance, collector): 168 | super(CollectorThread, self).__init__() 169 | self.collector = collector 170 | self.instance = instance 171 | 172 | def run(self): 173 | for batched_collector in self.instance.collect(): 174 | for o in batched_collector: 175 | self.collector(*o) 176 | 177 | 178 | class ProgressThread(threading.Thread): 179 | """ 180 | Progress information in main process. 181 | """ 182 | 183 | P_ADDED = 0 184 | P_LOADED = 1 185 | P_PROCESSED = 2 186 | P_TOTAL = 3 187 | 188 | def __init__(self, instance, progress, progress_total, num_of_processor): 189 | super(ProgressThread, self).__init__() 190 | self.progress_info = { 191 | ProgressThread.P_ADDED: 0, 192 | ProgressThread.P_LOADED: 0, 193 | ProgressThread.P_PROCESSED: 0, 194 | ProgressThread.P_TOTAL: progress_total 195 | } 196 | self.mapper_progress_info = [ProgressThread.init_mapper_progress_info() for _ in range(num_of_processor)] 197 | self.instance = instance 198 | self.progress = progress 199 | 200 | @staticmethod 201 | def init_mapper_progress_info(): 202 | return {ProgressThread.P_LOADED: 0, ProgressThread.P_PROCESSED: 0} 203 | 204 | def refresh_progress_info(self): 205 | self.progress_info[ProgressThread.P_LOADED] \ 206 | = sum([p[ProgressThread.P_LOADED] for p in self.mapper_progress_info]) 207 | self.progress_info[ProgressThread.P_PROCESSED] \ 208 | = sum([p[ProgressThread.P_PROCESSED] for p in self.mapper_progress_info]) 209 | 210 | def run(self): 211 | for idx, mapper_progress_info in self.instance.get_progress(): 212 | self.mapper_progress_info[idx] = mapper_progress_info 213 | self.refresh_progress_info() 214 | progress_info = { 215 | 'added': self.progress_info[ProgressThread.P_ADDED], 216 | 'loaded': self.progress_info[ProgressThread.P_LOADED], 217 | 'processed': self.progress_info[ProgressThread.P_PROCESSED], 218 | 'total': self.progress_info[ProgressThread.P_TOTAL], 219 | } 220 | self.progress(progress_info) 221 | 222 | 223 | class ParallelProcessor(Paralleller): 224 | """ 225 | Args: 226 | num_of_processor (int): Number of processes to use. 227 | mapper (Callable / Mapper): Function or subclass of `Mapper` class. 228 | max_size_per_mapper_queue (int, optional): Maximum size of mapper queue for one process. 229 | If it's full, the corresponding process will be blocked. 230 | 0 by default means unlimited. 
231 | collector (Callable, optional): If the collector data needs to be get in main process (another thread), 232 | set this handler, the arguments are same to the return from mapper. 233 | The return result is one by one, order is arbitrary. 234 | max_size_per_collector_queue (int, optional): Maximum size of collector queue for one process. 235 | If it's full, the corresponding process will be blocked. 236 | 0 by default means unlimited. 237 | enable_process_id (bool, optional): If it's true, an additional argument `_idx` (process id) will be 238 | passed to `mapper` function. This has no effect for `Mapper` class. 239 | It defaults to False. 240 | batch_size (int, optional): Batch size, defaults to 1. 241 | progress (Callable, optional): Progress function, which takes a dictionary as input. 242 | The dictionary contains following keys: `total` can be set by `progress_total`, 243 | `added` indicates the number of tasks has been added to the queue, 244 | `loaded` indicates the number of tasks has been loaded to worker processes, 245 | `processed` indicates the number of tasks has been processed by worker processes. 246 | Defaults to None. 247 | progress_total (int, optional): Total number of tasks. Defaults to None. 248 | use_shm (bool, optional): When True, and when running on Python version 3.8 or later, 249 | use ShmQueue for higher performance. Defaults to False. 250 | enable_collector_queues (bool, optional): When True, create a collector queue for each 251 | processor. When False, do not allocate collector queues, saving 252 | resources. Defaults to True. 253 | single_mapper_queue (bool, optional): When True, allocate a single mapper queue that will 254 | be shared between the worker processes. Sending processes can 255 | go to sleep when the mapper queue is full. When False, each process 256 | gets its own mapper queue, and CPU-intensive polling may be needed to 257 | find a mapper queue which can accept a new request. 258 | 259 | Note: 260 | - Do NOT implement heavy compute-intensive operations in collector, they should be in mapper. 261 | - Tune the value for queue size and batch size will optimize performance a lot. 262 | - `collector` only collects returns from `mapper` or `Mapper.process`. 263 | - The frequency of executing `progress` function depends on CPU. 264 | """ 265 | 266 | # Command format in queue. Represent in tuple. 267 | # The first element of tuple will be command, the rests are arguments or data. 268 | # (CMD_XXX, args...) 
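    # For example (illustrative): _add_task() sends (CMD_DATA, [((args...), {kwargs...}), ...]),
    # i.e. a batch of (args, kwargs) pairs, and task_done() sends (CMD_STOP,) once per worker.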
269 | CMD_DATA = 0 270 | CMD_STOP = 1 271 | 272 | QSTATS_ON = 0 273 | QSTATS_OFF = 1 274 | 275 | def __init__(self, num_of_processor: int, mapper: Callable, max_size_per_mapper_queue: int = 0, 276 | collector: Callable = None, max_size_per_collector_queue: int = 0, 277 | enable_process_id: bool = False, batch_size: int = 1, progress=None, progress_total = None, 278 | use_shm=False, enable_collector_queues=True, 279 | single_mapper_queue: bool = False): 280 | self.num_of_processor = num_of_processor 281 | self.single_mapper_queue = single_mapper_queue 282 | if sys.version_info >= (3, 8): 283 | self.collector_queues: typing.Optional[typing.Union[ShmQueue, mp.Queue]] 284 | else: 285 | self.collector_queues: typing.Optional[mp.Queue] 286 | if use_shm: 287 | if sys.version_info >= (3, 8): 288 | if single_mapper_queue: 289 | self.mapper_queues = [ShmQueue(maxsize=max_size_per_mapper_queue * num_of_processor)] 290 | else: 291 | self.mapper_queues = [ShmQueue(maxsize=max_size_per_mapper_queue) for _ in range(num_of_processor)] 292 | if enable_collector_queues: 293 | self.collector_queues = [ShmQueue(maxsize=max_size_per_collector_queue) for _ in range(num_of_processor)] 294 | else: 295 | self.collector_queues = None 296 | else: 297 | raise ValueError("shm not available in this version of Python.") 298 | else: 299 | if single_mapper_queue: 300 | self.mapper_queues = [mp.Queue(maxsize=max_size_per_mapper_queue * num_of_processor)] 301 | else: 302 | self.mapper_queues = [mp.Queue(maxsize=max_size_per_mapper_queue) for _ in range(num_of_processor)] 303 | if enable_collector_queues: 304 | self.collector_queues = [mp.Queue(maxsize=max_size_per_collector_queue) for _ in range(num_of_processor)] 305 | self.collector_qstats = [self.QSTATS_ON for _ in range(num_of_processor)] 306 | else: 307 | self.collector_queues = None 308 | 309 | if self.collector_queues is not None: 310 | if single_mapper_queue: 311 | self.processes = [mp.Process(target=self._run, args=(i, self.mapper_queues[0], self.collector_queues[i])) 312 | for i in range(num_of_processor)] 313 | else: 314 | self.processes = [mp.Process(target=self._run, args=(i, self.mapper_queues[i], self.collector_queues[i])) 315 | for i in range(num_of_processor)] 316 | else: 317 | if single_mapper_queue: 318 | self.processes = [mp.Process(target=self._run, args=(i, self.mapper_queues[0], None)) 319 | for i in range(num_of_processor)] 320 | else: 321 | self.processes = [mp.Process(target=self._run, args=(i, self.mapper_queues[i], None)) 322 | for i in range(num_of_processor)] 323 | if progress is not None: 324 | if sys.version_info >= (3, 8): 325 | self.progress_queues: typing.Optional[typing.Union[ShmQueue, mp.Queue]] 326 | else: 327 | self.progress_queues: typing.Optional[mp.Queue] 328 | if use_shm: 329 | if sys.version_info >= (3, 8): 330 | self.progress_queues = [ShmQueue(maxsize=1) for _ in range(num_of_processor)] 331 | else: 332 | raise ValueError("shm not available in this version of Python.") 333 | else: 334 | self.progress_queues = [mp.Queue(maxsize=1) for _ in range(num_of_processor)] 335 | self.progress_qstats = [self.QSTATS_ON for _ in range(num_of_processor)] 336 | else: 337 | self.progress_queues = None 338 | self.progress = progress 339 | 340 | ctx = self 341 | if not inspect.isclass(mapper) or not issubclass(mapper, Mapper): 342 | class DefaultMapper(Mapper): 343 | def process(self, *args, **kwargs): 344 | if ctx.enable_process_id: 345 | kwargs['_idx'] = self._idx 346 | return mapper(*args, **kwargs) 347 | self.mapper = DefaultMapper 348 | 
else: 349 | self.mapper = mapper 350 | 351 | self.collector = collector 352 | self.mapper_queue_index = 0 353 | self.enable_process_id = enable_process_id 354 | self.batch_size = batch_size 355 | self.batch_data = [] 356 | 357 | # collector can be handled in each process or in main process after merging (collector needs to be set) 358 | # if collector is set, it needs to be handled in main process; 359 | # otherwise, it assumes there's no collector. 360 | if collector: 361 | self.collector_thread = CollectorThread(self, collector) 362 | 363 | if progress: 364 | self.progress_thread = ProgressThread(self, progress, progress_total, num_of_processor) 365 | 366 | def start(self): 367 | """ 368 | Start processes and threads. 369 | """ 370 | if self.collector: 371 | self.collector_thread.start() 372 | if self.progress: 373 | self.progress_thread.start() 374 | for p in self.processes: 375 | p.start() 376 | 377 | def join(self): 378 | """ 379 | Block until processes and threads return. 380 | """ 381 | if self.collector: 382 | self.collector_thread.join() 383 | if self.progress: 384 | self.progress_thread.join() 385 | for p in self.processes: 386 | p.join() 387 | for q in self.mapper_queues: 388 | q.close() 389 | q.join_thread() 390 | if self.collector_queues is not None: 391 | for q in self.collector_queues: 392 | q.close() 393 | q.join_thread() 394 | if self.progress_queues is not None: 395 | for q in self.progress_queues: 396 | q.close() 397 | q.join_thread() 398 | pass 399 | 400 | def task_done(self): 401 | """ 402 | Indicate that all resources which need to add_task are added to processes. 403 | (main process, blocked) 404 | """ 405 | if len(self.batch_data) > 0: 406 | self._add_task(self.batch_data) 407 | self.batch_data = [] 408 | 409 | for i in range(self.num_of_processor): 410 | if self.single_mapper_queue: 411 | self.mapper_queues[0].put((ParallelProcessor.CMD_STOP,)) 412 | else: 413 | self.mapper_queues[i].put((ParallelProcessor.CMD_STOP,)) 414 | 415 | def add_task(self, *args, **kwargs): 416 | """ 417 | Add data to one a mapper queue. 418 | 419 | When a single mapper queue is in use, put the process to sleep if the 420 | queue is full. When multiple mapper queues are in use (one per process), 421 | use CPU-intensive polling (round-robin processing) to find the next available 422 | queue. (main process, blocked or unblocked depending upon single_mapper_queue) 423 | """ 424 | self.batch_data.append((args, kwargs)) 425 | if self.progress: 426 | self.progress_thread.progress_info[ProgressThread.P_ADDED] += 1 427 | 428 | if len(self.batch_data) == self.batch_size: 429 | self._add_task(self.batch_data) 430 | self.batch_data = [] # reset buffer 431 | 432 | def _add_task(self, batched_args): 433 | if self.single_mapper_queue: 434 | self.mapper_queues[0].put((ParallelProcessor.CMD_DATA, batched_args)) 435 | else: 436 | while True: 437 | q = self.mapper_queues[self.mapper_queue_index] 438 | self.mapper_queue_index = (self.mapper_queue_index + 1) % self.num_of_processor 439 | try: 440 | q.put_nowait((ParallelProcessor.CMD_DATA, batched_args)) 441 | return # put in 442 | except queue.Full: 443 | continue # find next available 444 | 445 | def _run(self, idx: int, mapper_queue: mp.Queue, collector_queue: typing.Optional[mp.Queue]): 446 | """ 447 | Process's activity. It handles queue IO and invokes user's mapper handler. 
448 | (subprocess, blocked, only two queues can be used to communicate with main process) 449 | """ 450 | with self.mapper(idx) as mapper: 451 | while True: 452 | data = mapper_queue.get() 453 | if data[0] == ParallelProcessor.CMD_STOP: 454 | # print(idx, 'stop') 455 | self._update_progress(mapper, finish=True) 456 | if self.collector and collector_queue is not None: 457 | collector_queue.put((ParallelProcessor.CMD_STOP,)) 458 | return 459 | elif data[0] == ParallelProcessor.CMD_DATA: 460 | batch_result = [] 461 | for d in data[1]: 462 | args, kwargs = d[0], d[1] 463 | # print(idx, 'data') 464 | self._update_progress(mapper, type_=ProgressThread.P_LOADED) 465 | result = mapper.process(*args, **kwargs) 466 | self._update_progress(mapper, type_=ProgressThread.P_PROCESSED) 467 | if collector_queue is not None: 468 | if self.collector: 469 | if not isinstance(result, tuple): # collector must represent as tuple 470 | result = (result,) 471 | batch_result.append(result) 472 | if collector_queue is not None and len(batch_result) > 0: 473 | collector_queue.put((ParallelProcessor.CMD_DATA, batch_result)) 474 | batch_result = [] # reset buffer 475 | 476 | def _update_progress(self, mapper, type_=None, finish=False): 477 | if self.progress: 478 | try: 479 | if not finish: 480 | # No need to ensure the status will be pulled from main process 481 | # so if queue is full just skip this update 482 | mapper._progress_info[type_] += 1 483 | self.progress_queues[mapper._idx].put_nowait( (ParallelProcessor.CMD_DATA, mapper._progress_info) ) 484 | else: 485 | # update the last progress of each mapper 486 | self.progress_queues[mapper._idx].put( (ParallelProcessor.CMD_STOP, mapper._progress_info) ) 487 | except queue.Full: 488 | pass 489 | 490 | def collect(self): 491 | """ 492 | Get data from collector queue sequentially. 493 | (main process, unblocked, using round robin to find next available queue) 494 | """ 495 | if not self.collector: 496 | return 497 | 498 | idx = 0 499 | while True: 500 | # all queues finished 501 | if sum([int(s == self.QSTATS_OFF) for s in self.collector_qstats]) == self.num_of_processor: 502 | return 503 | 504 | # get next unfinished queue 505 | while self.collector_qstats[idx] == self.QSTATS_OFF: 506 | idx = (idx + 1) % self.num_of_processor 507 | q = self.collector_queues[idx] 508 | 509 | try: 510 | data = q.get_nowait() # get out 511 | if data[0] == ParallelProcessor.CMD_STOP: 512 | self.collector_qstats[idx] = self.QSTATS_OFF 513 | elif data[0] == ParallelProcessor.CMD_DATA: 514 | yield data[1] 515 | except queue.Empty: 516 | continue # find next available 517 | finally: 518 | idx = (idx + 1) % self.num_of_processor 519 | 520 | def get_progress(self): 521 | """ 522 | Get progress information from each mapper. 
523 | (main process) 524 | """ 525 | if not self.progress: 526 | return 527 | 528 | idx = 0 529 | while True: 530 | # all queues finished 531 | if sum([int(s == self.QSTATS_OFF) for s in self.progress_qstats]) == self.num_of_processor: 532 | return 533 | 534 | # get next unfinished queue 535 | while self.progress_qstats[idx] == self.QSTATS_OFF: 536 | idx = (idx + 1) % self.num_of_processor 537 | q = self.progress_queues[idx] 538 | 539 | try: 540 | data = q.get_nowait() 541 | if data[0] == ParallelProcessor.CMD_STOP: 542 | self.progress_qstats[idx] = self.QSTATS_OFF 543 | elif data[0] == ParallelProcessor.CMD_DATA: 544 | pass 545 | yield idx, data[1] 546 | except queue.Empty: 547 | continue # find next available 548 | finally: 549 | idx = (idx + 1) % self.num_of_processor 550 | -------------------------------------------------------------------------------- /pyrallel/queue.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import multiprocessing as mp 3 | import multiprocessing.queues as mpq 4 | from queue import Full, Empty 5 | import pickle 6 | import math 7 | # import uuid 8 | import os 9 | import struct 10 | import sys 11 | import time 12 | import typing 13 | import dill # type: ignore 14 | import zlib 15 | 16 | 17 | if sys.version_info >= (3, 8): 18 | from multiprocessing.shared_memory import SharedMemory 19 | __all__ = ['ShmQueue'] 20 | else: 21 | from typing import TypeVar 22 | SharedMemory = TypeVar('SharedMemory') 23 | __all__ = [] 24 | 25 | 26 | class ShmQueue(mpq.Queue): 27 | """ShmQueue depends on shared memory instead of pipe to efficiently exchange data among processes. 28 | Shared memory is "System V style" memory blocks which can be shared and accessed directly by processes. 29 | This implementation is based on `multiprocessing.shared_memory.SharedMemory` hence requires Python >= 3.8. 30 | Its interface is almost identical to `multiprocessing.queue `_. 31 | But it allows one to specify the serializer, which by default is pickle. 32 | 33 | This implementation maintains two lists: a free buffer list, and a ready message list. 34 | The list heads for both lists are stored in a single shared memory area. 35 | 36 | The free buffer list is linked by the next_block_id field in each shared 37 | buffer's metadata area. 38 | 39 | Messages are built out of chunks. Each chunk occupies a single buffer. 40 | Each chunk contains a pointer (an integer identifier) to the next chunk's 41 | buffer using the next_chunk_block_id field in the shared buffer's metadata 42 | area. The list of ready messages links the first chunk of each ready 43 | message using the next_block_id field in the shared buffer's metadata 44 | area. 45 | 46 | Messages are serialized for transfer from the sender to the receiver. 47 | The serialized size of a message may not exceed the chunk size times 48 | the maximum queue size. If the deadlock_immanent_check is enabled 49 | (which is True by default), a ValueError will be raised on an attempt 50 | to put a message that is too large. 51 | 52 | Args: 53 | chunk_size (int, optional): Size of each chunk. By default, it is `ShmQueue.DEFAULT_CHUNK_SIZE` (1*1024*1024). \ 54 | If it is 0, it will be set to `ShmQueue.MAX_CHUNK_SIZE` (512*1024*1024). 55 | maxsize (int, optional): Maximum queue size, e.g. the maximum number of chunks available to a queue. \ 56 | If it is 0 (default), it will be set to `ShmQueue.DEFAULT_MAXSIZE` (2). 57 | serializer (obj, optional): Serializer to serialize and deserialize data. 
\ 58 | If it is None (default), pickle will be used. \ 59 | The serializer should implement `loads(bytes data) -> object` \ 60 | and `dumps(object obj) -> bytes`. 61 | integrity_check (bool, optional): When True, perform certain integrity checks on messages. 62 | 1) After serializing a message, immediately deserialize it to check for validity. 63 | 2) Save the length of a message after serialization. 64 | 3) Compute a checksum of each chunk of the message. 65 | 4) Include the total message size and chunk checksum in the metadata for each chunk. 66 | 5) When pulling a chunk from the queue, verify the chunk checksum. 67 | 6) After reassembling a message out of chunks, verify the total message size. 68 | deadlock_check (bool, optional): When fetching a writable block, print a message if two or more 69 | loops are needed to get a free block. (default is False) 70 | deadlock_immanent_check (bool, optional): Raise a ValueError if a message submitted to 71 | put(...) is too large to process. (Default is True) 72 | watermark_check (bool, optional): When true, prit a mesage with the largest message size so far in chunks. 73 | use_semaphores (bool, optional): When true, use semaphores to control access to the free list and the 74 | message list. The system will sleep when accessing these shared resources, 75 | instead of entering a polling loop. 76 | 77 | Note: 78 | - `close` needs to be invoked once to release memory and avoid a memory leak. 79 | - `qsize`, `empty` and `full` are implemented but may block. 80 | - Each shared queue consumes one shared memory area for the shared list heads 81 | and one shared memory area for each shared buffer. The underlying code in 82 | multiprocessing.shared_memory.SharedMemory consumes one process file descriptor 83 | for each shared memory area. There is a limit on the number of file descriptors 84 | that a process may have open. 85 | - Thus, there is a tradeoff between the chunk_size and maxsize: smaller chunks 86 | use memory more effectively with some overhead cost, but may run into the limit 87 | on the number of open file descriptors to process large messages and avoid blocking. 88 | Larger chunks waste unused space, but are less likely to run into the open file descriptor 89 | limit or to block waiting for a free buffer. 90 | 91 | Example:: 92 | 93 | def run(q): 94 | e = q.get() 95 | print(e) 96 | 97 | if __name__ == '__main__': 98 | q = ShmQueue(chunk_size=1024 * 4, maxsize=10) 99 | p = Process(target=run, args=(q,)) 100 | p.start() 101 | q.put(100) 102 | p.join() 103 | q.close() 104 | 105 | """ 106 | 107 | MAX_CHUNK_SIZE: int = 512 * 1024 * 1024 108 | """int: The maximum allowable size for a buffer chunk. 512MB should be a large enough 109 | value.""" 110 | 111 | DEFAULT_CHUNK_SIZE: int = 1 * 1024 * 1024 112 | """int: The default size for a buffer chunk.""" 113 | 114 | DEFAULT_MAXSIZE: int = 2 115 | """int: The default maximum size for a queue.""" 116 | 117 | RESERVED_BLOCK_ID: int = 0xffffffff 118 | """int: RESERVED_BLOCK_ID is stored in the list head pointer and next chunk 119 | block id fields to indicate that thee is no next block. This value is intended 120 | to simplify debugging by removing stale next-block values. 
It is not used to 121 | test for blok chain termination; counters are used for that purpose, instead.""" 122 | 123 | META_STRUCT: typing.Mapping[str, typing.Tuple[int, int, str]] = { 124 | 'msg_id': (0, 12, '12s'), 125 | 'msg_size': (12, 16, 'I'), 126 | 'chunk_id': (16, 20, 'I'), 127 | 'total_chunks': (20, 24, 'I'), 128 | 'total_msg_size': (24, 28, 'I'), 129 | 'checksum': (28, 32, 'I'), 130 | 'src_pid': (32, 36, 'I'), 131 | 'next_chunk_block_id': (36, 40, 'I'), 132 | 'next_block_id': (40, 44, 'I') 133 | } 134 | """The per-buffer metadata structure parameters for struct.pack(...) and 135 | struct.unpack(...).""" 136 | 137 | META_BLOCK_SIZE: int = 44 138 | """int: The length of the buffer metadata structure in bytes.""" 139 | 140 | LIST_HEAD_STRUCT: typing.Mapping[str, typing.Tuple[int, int, str]] = { 141 | 'first_block': (0, 4, 'I'), 142 | 'last_block': (4, 8, 'I'), 143 | 'block_count': (8, 12, 'I') 144 | } 145 | """The list head structure parameters for struct.pack(...) and 146 | struct.unpack(...). The list header structure maintains a block 147 | count in addition to first_block and last_block pointers.""" 148 | 149 | LIST_HEAD_SIZE: int = 12 150 | """int: The length of a list head structure in bytes.""" 151 | 152 | FREE_LIST_HEAD: int = 0 153 | """int: The index of the free buffer list head in the SharedMemory segment for 154 | sharing message queue list heads between processes.""" 155 | 156 | MSG_LIST_HEAD: int = 1 157 | """int: The index of the queued message list head in the SharedMemory segment for 158 | sharing message queue list heads between processes.""" 159 | 160 | qid_counter: int = 0 161 | """int: Each message queue has a queue ID (qid) that identifies the queue for 162 | debugging messages. This mutable class counter is used to create new queue ID 163 | values for newly-created queue. Implicitly, this assumes that message queues 164 | will be created by a single initialization process, then distributed to worker 165 | process. If shared message queues will be created by multiple processes, then 166 | the queue ID should be altered to incorporate the process ID (pid) of the 167 | process that created the shared message queue, or an additional field should 168 | be created and presented with the shared message queue's creator's pid..""" 169 | 170 | def __init__(self, 171 | chunk_size: int=DEFAULT_CHUNK_SIZE, 172 | maxsize: int=DEFAULT_MAXSIZE, 173 | serializer=None, 174 | integrity_check: bool=False, 175 | deadlock_check: bool=False, 176 | deadlock_immanent_check: bool=True, 177 | watermark_check: bool = False, 178 | use_semaphores: bool = True, 179 | verbose: bool=False): 180 | ctx = mp.get_context() # TODO: What is the proper type hint here? 181 | 182 | super().__init__(maxsize, ctx=ctx) 183 | 184 | self.qid: int = self.__class__.qid_counter 185 | self.__class__.qid_counter += 1 186 | 187 | self.verbose: bool = verbose 188 | if self.verbose: 189 | print("Starting ShmQueue qid=%d pid=%d chunk_size=%d maxsize=%d." 
% (self.qid, os.getpid(), chunk_size, maxsize), file=sys.stderr, flush=True) # *** 190 | 191 | self.chunk_size: int = min(chunk_size, self.__class__.MAX_CHUNK_SIZE) \ 192 | if chunk_size > 0 else self.__class__.MAX_CHUNK_SIZE 193 | 194 | self.maxsize: int = maxsize if maxsize > 0 else self.__class__.DEFAULT_MAXSIZE 195 | 196 | self.serializer = serializer or pickle 197 | 198 | self.integrity_check: bool = integrity_check 199 | self.deadlock_check: bool = deadlock_check 200 | self.deadlock_immanent_check: bool = deadlock_immanent_check 201 | self.watermark_check: bool = watermark_check 202 | self.chunk_watermark: int = 0 203 | 204 | self.mid_counter: int = 0 205 | 206 | self.producer_lock = ctx.Lock() 207 | self.free_list_lock = ctx.Lock() 208 | self.msg_list_lock = ctx.Lock() 209 | 210 | self.use_semaphores: bool = use_semaphores 211 | if not use_semaphores: 212 | # Put the None case first to make mypy happier. 213 | self.free_list_semaphore: typing.Optional[typing.Any] = None # TODO: what is the type returned by ctx.Semaphore(0)? 214 | self.msg_list_semaphore: typing.Optional[typing.Any] = None 215 | else: 216 | self.free_list_semaphore = ctx.Semaphore(0) 217 | self.msg_list_semaphore = ctx.Semaphore(0) 218 | 219 | self.list_heads: SharedMemory = SharedMemory(create=True, size=self.__class__.LIST_HEAD_SIZE * 2) 220 | self.init_list_head(self.__class__.FREE_LIST_HEAD) 221 | self.init_list_head(self.__class__.MSG_LIST_HEAD) 222 | 223 | self.block_locks: typing.List[typing.Any] = [ctx.Lock()] * maxsize # TODO: what is the type returned by ctx.Lock()? 224 | self.data_blocks: typing.List[SharedMemory] = [] 225 | block_id: int 226 | for block_id in range(maxsize): 227 | self.data_blocks.append(SharedMemory(create=True, size=self.__class__.META_BLOCK_SIZE + self.chunk_size)) 228 | self.add_free_block(block_id) 229 | 230 | def __getstate__(self): 231 | """This routine retrieves queue information when forking a new process.""" 232 | return (self.qid, 233 | self.verbose, 234 | self.chunk_size, 235 | self.maxsize, 236 | dill.dumps(self.serializer), 237 | self.integrity_check, 238 | self.deadlock_check, 239 | self.deadlock_immanent_check, 240 | self.watermark_check, 241 | self.chunk_watermark, 242 | self.mid_counter, 243 | self.producer_lock, 244 | self.free_list_lock, 245 | self.msg_list_lock, 246 | self.use_semaphores, 247 | self.free_list_semaphore, 248 | self.msg_list_semaphore, 249 | dill.dumps(self.list_heads), 250 | self.block_locks, 251 | dill.dumps(self.data_blocks)) 252 | 253 | def __setstate__(self, state): 254 | """This routine saves queue information when forking a new process.""" 255 | (self.qid, 256 | self.verbose, 257 | self.chunk_size, 258 | self.maxsize, 259 | self.serializer, 260 | self.integrity_check, 261 | self.deadlock_check, 262 | self.deadlock_immanent_check, 263 | self.watermark_check, 264 | self.chunk_watermark, 265 | self.mid_counter, 266 | self.producer_lock, 267 | self.free_list_lock, 268 | self.msg_list_lock, 269 | self.use_semaphores, 270 | self.free_list_semaphore, 271 | self.msg_list_semaphore, 272 | self.list_heads, 273 | self.block_locks, 274 | self.data_blocks) = state 275 | 276 | self.list_heads = dill.loads(self.list_heads) 277 | self.data_blocks = dill.loads(self.data_blocks) 278 | self.serializer = dill.loads(self.serializer) 279 | 280 | def get_list_head_field(self, lh: int, type_: str)->int: 281 | """int: Get a field from a list head. 282 | 283 | Args: 284 | lh (int): The index of the list head in the list head shared memory. 
285 | type (str): The name of the list head field.""" 286 | addr_s: typing.Optional[int] 287 | addr_e: typing.Optional[int] 288 | ctype: typing.Optional[str] 289 | addr_s, addr_e, ctype = self.__class__.LIST_HEAD_STRUCT.get(type_, (None, None, None)) 290 | if addr_s is None or addr_e is None or ctype is None: 291 | raise ValueError("get_list_head_field: unrecognized %s" % repr(type_)) 292 | return struct.unpack(ctype, self.list_heads.buf[(self.__class__.LIST_HEAD_SIZE * lh) + addr_s : (self.__class__.LIST_HEAD_SIZE * lh) + addr_e])[0] 293 | 294 | def set_list_head_field(self, lh: int, data: int, type_: str): 295 | addr_s: typing.Optional[int] 296 | addr_e: typing.Optional[int] 297 | ctype: typing.Optional[str] 298 | addr_s, addr_e, ctype = self.__class__.LIST_HEAD_STRUCT.get(type_, (None, None, None)) 299 | if addr_s is None or addr_e is None or ctype is None: 300 | raise ValueError("get_list_head_field: unrecognized %s" % repr(type_)) 301 | 302 | # TODO: find a better way to calm mypy's annoyance at the following: 303 | self.list_heads.buf[(self.__class__.LIST_HEAD_SIZE * lh) + addr_s : (self.__class__.LIST_HEAD_SIZE * lh) + addr_e] = struct.pack(ctype, data) #type: ignore 304 | 305 | def get_meta(self, block: SharedMemory, type_: str)->typing.Union[bytes, int]: 306 | """typing.Union[bytes, int]: Get a field from a block's metadata area in shared memory. 307 | 308 | Args: 309 | block (SharedMemory): The shared memory for the data block. 310 | type_ (str): The name of the metadata field to extract.""" 311 | addr_s: typing.Optional[int] 312 | addr_e: typing.Optional[int] 313 | ctype: typing.Optional[str] 314 | addr_s, addr_e, ctype = self.__class__.META_STRUCT.get(type_, (None, None, None)) 315 | if addr_s is None or addr_e is None or ctype is None: 316 | raise ValueError("get_meta: unrecognized %s" % repr(type_)) 317 | return struct.unpack(ctype, block.buf[addr_s : addr_e])[0] 318 | 319 | def set_meta(self, block: SharedMemory, data, type_: str): 320 | addr_s: typing.Optional[int] 321 | addr_e: typing.Optional[int] 322 | ctype: typing.Optional[str] 323 | addr_s, addr_e, ctype = self.__class__.META_STRUCT.get(type_, (None, None, None)) 324 | if addr_s is None or addr_e is None or ctype is None: 325 | raise ValueError("set_meta: unrecognized %s" % repr(type_)) 326 | 327 | # TODO: find a better way to calm mypy's annoyance at the following: 328 | block.buf[addr_s : addr_e] = struct.pack(ctype, data) #type: ignore 329 | 330 | def get_data(self, block: SharedMemory, data_size: int)->bytes: 331 | """bytes: Get a memoryview of the a shared memory data block. 332 | 333 | Args: 334 | block (SharedMemory): The chared memory block. 335 | data_size (int): The number of bytes in the returned memoryview slice.""" 336 | return block.buf[self.__class__.META_BLOCK_SIZE:self.__class__.META_BLOCK_SIZE+data_size] 337 | 338 | def set_data(self, block: SharedMemory, data: bytes, data_size: int): 339 | # TODO: find a better way to calm mypy's annoyance at the following: 340 | block.buf[self.__class__.META_BLOCK_SIZE:self.__class__.META_BLOCK_SIZE+data_size] = data # type: ignore 341 | 342 | def init_list_head(self, lh: int): 343 | """Initialize a block list, clearing the block count and setting the first_block 344 | and last_block fields to the reserved value that indicates that they are 345 | void pointers. 
346 | 347 | Args: 348 | lh (int): The index of the list head in the list head shared memory area.""" 349 | self.set_list_head_field(lh, 0, 'block_count') 350 | self.set_list_head_field(lh, self.__class__.RESERVED_BLOCK_ID, 'first_block') 351 | self.set_list_head_field(lh, self.__class__.RESERVED_BLOCK_ID, 'last_block') 352 | 353 | def get_block_count(self, lh: int)->int: 354 | """int: Get the count of blocks queued in a block list. 355 | 356 | Args: 357 | lh (int): The index of the list head in the list head shared memory area. 358 | """ 359 | return self.get_list_head_field(lh, 'block_count') 360 | 361 | def get_first_block(self, lh: int)->typing.Optional[int]: 362 | """Get the first block on a block list, updating the list head fields. 363 | 364 | Args: 365 | lh (int): The index of the list head in the list head shared memory area. 366 | 367 | Returns: 368 | None: No block is available 369 | int: The block_id of the first available block. 370 | """ 371 | 372 | block_count: int = self.get_block_count(lh) 373 | if block_count == 0: 374 | return None 375 | 376 | block_id: int = self.get_list_head_field(lh, 'first_block') 377 | 378 | block_count -= 1 379 | if block_count == 0: 380 | self.init_list_head(lh) 381 | else: 382 | with self.block_locks[block_id]: 383 | maybe_next_block_id: typing.Union[bytes, int] = self.get_meta(self.data_blocks[block_id], 'next_block_id') 384 | if isinstance(maybe_next_block_id, int): 385 | next_block_id: int = maybe_next_block_id 386 | else: 387 | raise ValueError("get_first_block internal error: next_block_id is not int.") 388 | self.set_list_head_field(lh, next_block_id, 'first_block') 389 | self.set_list_head_field(lh, block_count, 'block_count') 390 | return block_id 391 | 392 | def add_block(self, lh: int, block_id: int): 393 | """Add a block to a block list. 394 | 395 | Args: 396 | lh (int): The index of the list head in the list head shared memory area. 397 | """ 398 | block_count: int = self.get_list_head_field(lh, 'block_count') 399 | if block_count == 0: 400 | self.set_list_head_field(lh, block_id, 'first_block') 401 | self.set_list_head_field(lh, block_id, 'last_block') 402 | self.set_list_head_field(lh, 1, 'block_count') 403 | 404 | else: 405 | last_block: int = self.get_list_head_field(lh, 'last_block') 406 | with self.block_locks[last_block]: 407 | self.set_meta(self.data_blocks[last_block], block_id, 'next_block_id') 408 | self.set_list_head_field(lh, block_id, 'last_block') 409 | self.set_list_head_field(lh, block_count + 1, 'block_count') 410 | 411 | def get_free_block_count(self)->int: 412 | """int: Get the number of free blocks.""" 413 | with self.free_list_lock: 414 | return self.get_block_count(self.__class__.FREE_LIST_HEAD) 415 | 416 | def get_first_free_block(self, block: bool, timeout: typing.Optional[float])->typing.Optional[int]: 417 | """Get the first free block. 418 | 419 | When using semaphores, optionally block with an optional timeout. If 420 | you choose to block without a timeout, the method will not return until 421 | a free block is available. 422 | 423 | Args: 424 | block (bool): When True, and when using semaphores, wait until an 425 | free block is available or a timeout occurs. 426 | timeout (typing.Optional[float]): When block is True and timeout is 427 | positive, block for at most timeout seconds attempting to acquire 428 | the free block. 429 | 430 | Returns: 431 | None: No block is available 432 | int: The block_id of the first available block. 
433 | """ 434 | if self.free_list_semaphore is not None: 435 | self.free_list_semaphore.acquire(block=block, timeout=timeout) 436 | with self.free_list_lock: 437 | return self.get_first_block(self.__class__.FREE_LIST_HEAD) 438 | 439 | def add_free_block(self, block_id: int): 440 | """Return a block to the free block list. 441 | 442 | Args: 443 | block_id (int): The identifier of the block being returned. 444 | """ 445 | with self.free_list_lock: 446 | self.add_block(self.__class__.FREE_LIST_HEAD, block_id) 447 | if self.free_list_semaphore is not None: 448 | self.free_list_semaphore.release() 449 | 450 | def get_msg_count(self)->int: 451 | """int: Get the number of messages on the message list.""" 452 | with self.msg_list_lock: 453 | return self.get_block_count(self.__class__.MSG_LIST_HEAD) 454 | 455 | def get_first_msg(self, block: bool, timeout: typing.Optional[float])->typing.Optional[int]: 456 | """Take the first available message, if any, from the available message list. 457 | 458 | When using semaphores, optionally block with an optional timeout. If 459 | you choose to block without a timeout, the method will not return until 460 | a free block is available. 461 | 462 | Args: 463 | block (bool): When True, and when using semaphores, wait until an 464 | message is available or a timeout occurs. 465 | timeout (typing.Optional[float]): When block is True and timeout is 466 | positive, block for at most timeout seconds attempting to acquire 467 | the message. 468 | 469 | Returns: 470 | None: No message is available 471 | int: The block_id of the first chunk of the first available message. 472 | """ 473 | if self.msg_list_semaphore is not None: 474 | self.msg_list_semaphore.acquire(block=block, timeout=timeout) 475 | with self.msg_list_lock: 476 | return self.get_first_block(self.__class__.MSG_LIST_HEAD) 477 | 478 | def add_msg(self, block_id: int): 479 | """Add a message to the available message list 480 | 481 | Args: 482 | block_id (int): The block identifier of the first chunk of the message. 483 | """ 484 | with self.msg_list_lock: 485 | self.add_block(self.__class__.MSG_LIST_HEAD, block_id) 486 | if self.msg_list_semaphore is not None: 487 | self.msg_list_semaphore.release() 488 | 489 | def generate_msg_id(self)->bytes: 490 | """bytes: Generate the next message identifier, but do not consume it. 491 | 492 | Note: 493 | Message IDs are assigned independenyly by each process using the queue. 494 | They need to be paired with the source process ID to be used to identify 495 | a message for debugging. 496 | """ 497 | return ("%012x" % (self.mid_counter + 1)).encode('utf-8') 498 | 499 | def consume_msg_id(self): 500 | """Consume a message identifier. 501 | 502 | Note: 503 | Message identifiers are consumed when we are certain that we can process 504 | the message. They will not be consumed if we start to process a message 505 | but fail due to a conition such as insufficient free buffers. 506 | """ 507 | self.mid_counter += 1 508 | 509 | def next_writable_block_id(self, block: bool, timeout: typing.Optional[float], msg_id: bytes, src_pid: int)->int: 510 | """int: Get the block ID of the first free block. 511 | 512 | Get the block ID of the first free block, supporting 513 | blocking/nonblocking modes and timeouts when blocking, even when 514 | semaphores are not being used. Store int he block's metadata area the 515 | message ID for the message we are building and the pid of the process 516 | acquiring the block. 
517 | 518 | Args: 519 | block (bool): When True, and when using semaphores, wait until an 520 | free block is available or a timeout occurs. 521 | timeout (typing.Optional[float]): When block is True and timeout is 522 | positive, block for at most timeout seconds attempting to acquire 523 | the free block. 524 | msg_id (bytes): The message ID assigned to the message being built. 525 | src_pid: The process ID (pid) of the process that is acquiring the block. 526 | 527 | Raises: 528 | queue.Full: No block is available. Full is raised immediately in nonblocking 529 | mode, or after the timeout in blocking mode when a timeout is specified. 530 | 531 | """ 532 | looped: bool = False 533 | loop_cnt: int = 0 534 | time_start = time.time() 535 | while True: 536 | remaining_timeout: typing.Optional[float] = timeout 537 | if remaining_timeout is not None: 538 | remaining_timeout -= (time.time() - time_start) 539 | if remaining_timeout <= 0: 540 | if self.verbose: 541 | print("next_writable_block_id: qid=%d src_pid=%d: queue FULL (timeout)" % (self.qid, src_pid), file=sys.stderr, flush=True) # *** 542 | raise Full 543 | 544 | block_id: typing.Optional[int] = self.get_first_free_block(block, remaining_timeout) 545 | if block_id is not None: 546 | break 547 | 548 | if not block: 549 | if self.verbose: 550 | print("next_writable_block_id: qid=%d src_pid=%d: FULL (nonblocking)" % (self.qid, src_pid), file=sys.stderr, flush=True) # *** 551 | raise Full 552 | 553 | if self.deadlock_check or self.verbose: 554 | loop_cnt += 1 555 | if (self.verbose and loop_cnt == 2) or (self.deadlock_check and loop_cnt % 10000 == 0): 556 | looped = True 557 | print("next_writable_block_id: qid=%d src_pid=%d: looping (%d loops)" % (self.qid, src_pid, loop_cnt), file=sys.stderr, flush=True) # *** 558 | 559 | if looped: 560 | print("next_writable_block_id: qid=%d src_pid=%d: looping ended after %d loops." % (self.qid, src_pid, loop_cnt), file=sys.stderr, flush=True) # *** 561 | 562 | with self.block_locks[block_id]: 563 | data_block = self.data_blocks[block_id] 564 | self.set_meta(data_block, msg_id, 'msg_id') 565 | self.set_meta(data_block, src_pid, 'src_pid') 566 | 567 | return block_id 568 | 569 | def next_readable_msg(self, block: bool, timeout: typing.Optional[float]=None)->typing.Tuple[int, bytes, int, int, int]: 570 | """Get the next available message, with blocking and timeouts. 571 | 572 | This method returns a 5-tuple: the data block and certain metadata. 573 | The reason for this complexity is to 574 | retrieve the metadata under a single access lock. 575 | 576 | Args: 577 | block (bool): When True, and when using semaphores, wait until an 578 | free block is available or a timeout occurs. 579 | timeout (typing.Optional[float]): When block is True and timeout is 580 | positive, block for at most timeout seconds attempting to acquire 581 | the free block. 582 | 583 | Returns: 584 | src_pid (int): The process iodentifier of the process that originated the message. 585 | msg_id (bytes): The messag identifier. 586 | block_id (int): The identifier for the first chunk in the message. 587 | total_chunks (int): The total number of chunks in the message. 588 | next_chunk_block_id (int): The identifier for the next chunk in the message. 589 | 590 | Raises: 591 | queue.Empty: no messages are available and either nonblocking mode or a timeout occured. 592 | ValueError: An internal error occured in accessing the message's metadata. 
593 |         """
594 |         i = 0
595 |         time_start = time.time()
596 |         while True:
597 |             remaining_timeout: typing.Optional[float] = timeout
598 |             if remaining_timeout is not None:
599 |                 remaining_timeout -= (time.time() - time_start)
600 |                 if remaining_timeout <= 0:
601 |                     raise Empty
602 |             block_id: typing.Optional[int] = self.get_first_msg(block=block, timeout=remaining_timeout)
603 |             if block_id is not None:
604 |                 break
605 | 
606 |             if not block:
607 |                 raise Empty
608 | 
609 |         with self.block_locks[block_id]:
610 |             data_block = self.data_blocks[block_id]
611 |             src_pid: typing.Union[bytes, int] = self.get_meta(data_block, 'src_pid')
612 |             msg_id: typing.Union[bytes, int] = self.get_meta(data_block, 'msg_id')
613 |             total_chunks: typing.Union[bytes, int] = self.get_meta(data_block, 'total_chunks')
614 |             next_chunk_block_id: typing.Union[bytes, int] = self.get_meta(data_block, 'next_chunk_block_id')
615 |             if isinstance(src_pid, int) and isinstance(msg_id, bytes) and isinstance(total_chunks, int) and isinstance(next_chunk_block_id, int):
616 |                 return src_pid, msg_id, block_id, total_chunks, next_chunk_block_id
617 |             else:
618 |                 raise ValueError("next_readable_msg: internal error extracting data block metadata.")
619 | 
620 |     # def debug_data_block(self):
621 |     #     for b in self.data_blocks:
622 |     #         print(bytes(b.buf[0:24]))
623 | 
624 |     def put(self, msg: typing.Any, block: bool=True, timeout: typing.Optional[float]=None):
625 | 
626 |         """
627 |         Put an object into a shared memory queue.
628 | 
629 |         Args:
630 |             msg (obj): The object which is to be put into the queue.
631 |             block (bool, optional): If it is set to True (default), it will return after an item is put into the queue.
632 |             timeout (float, optional): A positive number for the timeout duration in seconds, which is only effective when `block` is set to True.
633 | 
634 |         Raises:
635 |             queue.Full: Raised if the call times out or the queue is full when `block` is False.
636 |             ValueError: An internal error occurred in accessing the message's metadata.
637 |             ValueError: A request was made to send a message that, when serialized, exceeds the capacity of the queue.
638 |             PicklingError: This exception is raised when the serializer is pickle and
639 |                 an error occurred in serializing the message.
640 |             UnpicklingError: This exception is raised when the serializer is pickle and
641 |                 an error occurred in deserializing the message for an integrity check.
642 | 
643 |         Note:
644 |             - Errors other than PicklingError might be raised if a serializer other than
645 |                 pickle is specified.
646 |         """
647 |         if timeout is not None:
648 |             if not block:
649 |                 raise ValueError("A timeout is allowed only when blocking.")
650 |             if timeout < 0:
651 |                 raise Full
652 | 
653 |         msg_id: bytes = self.generate_msg_id()
654 |         src_pid: int = os.getpid()
655 |         msg_body: bytes = self.serializer.dumps(msg) # type: ignore[union-attr]
656 |         if self.integrity_check:
657 |             total_msg_size: int = len(msg_body)
658 |             msg2: typing.Any = self.serializer.loads(msg_body) # type: ignore[union-attr]
659 |             if self.verbose:
660 |                 print("put: qid=%d src_pid=%d msg_id=%r: serialization integrity check is OK."
% (self.qid, src_pid, msg_id), file=sys.stderr, flush=True) # *** 661 | 662 | total_chunks: int = math.ceil(len(msg_body) / self.chunk_size) 663 | if self.verbose: 664 | print("put: qid=%d src_pid=%d msg_id=%r: total_chunks=%d len(msg_body)=%d chunk_size=%d" % (self.qid, src_pid, msg_id, total_chunks, len(msg_body), self.chunk_size), file=sys.stderr, flush=True) # *** 665 | if self.watermark_check or self.verbose: 666 | if total_chunks > self.chunk_watermark: 667 | print("put: qid=%d src_pid=%d msg_id=%r: total_chunks=%d maxsize=%d new watermark" % (self.qid, src_pid, msg_id, total_chunks, self.maxsize), file=sys.stderr, flush=True) # *** 668 | self.chunk_watermark = total_chunks 669 | 670 | if self.deadlock_immanent_check and total_chunks > self.maxsize: 671 | raise ValueError("DEADLOCK IMMANENT: qid=%d src_pid=%d: total_chunks=%d > maxsize=%d" % (self.qid, src_pid, total_chunks, self.maxsize)) 672 | 673 | time_start: float = time.time() 674 | 675 | # We acquire the producer lock to avoid deadlock if multiple 676 | # producers need multiple chunks each. 677 | lock_acquired: bool = self.producer_lock.acquire(timeout=timeout) 678 | if not lock_acquired: 679 | # We must have timed out. 680 | if self.verbose: 681 | print("put: qid=%d src_pid=%d msg_id=%r: queue FULL" % (self.qid, src_pid, msg_id), file=sys.stderr, flush=True) # *** 682 | raise Full 683 | 684 | block_id: int 685 | block_id_list: typing.List[int] = [ ] 686 | try: 687 | # In case we will process more than one chunk and this is a 688 | # nonblocking or timed out request, start by reserving all the 689 | # blocks that we will need. 690 | i: int 691 | for i in range(total_chunks): 692 | try: 693 | remaining_timeout: typing.Optional[float] = timeout 694 | if remaining_timeout is not None: 695 | remaining_timeout -= (time.time() - time_start) 696 | if remaining_timeout <= 0: 697 | if self.verbose: 698 | print("put: qid=%d src_pid=%d msg_id=%r: queue FULL" % (self.qid, src_pid, msg_id), file=sys.stderr, flush=True) # *** 699 | raise Full 700 | 701 | block_id = self.next_writable_block_id(block, remaining_timeout, msg_id, src_pid) 702 | block_id_list.append(block_id) 703 | 704 | except Full: 705 | # We failed to find a free block and/or a timeout occured. 706 | # Release the reserved blocks. 707 | if self.verbose: 708 | print("put: qid=%d src_pid=%d msg_id=%r: releasing %d blocks" % (self.qid, src_pid, msg_id, len(block_id_list)), file=sys.stderr, flush=True) # *** 709 | for block_id in block_id_list: 710 | self.add_free_block(block_id) 711 | raise 712 | 713 | finally: 714 | # Now that we have acquired the full set of chunks, we can release 715 | # the producer lock. We don't want to hold it while we transfer 716 | # data into the blocks. 717 | if self.verbose: 718 | print("put: qid=%d src_pid=%d msg_id=%r: releasing producer lock" % (self.qid, src_pid, msg_id), file=sys.stderr, flush=True) # *** 719 | self.producer_lock.release() 720 | 721 | # Consume this message ID. 
722 | self.consume_msg_id() 723 | 724 | if self.verbose: 725 | print("put: qid=%d src_pid=%d msg_id=%r: acquired %d blocks" % (self.qid, src_pid, msg_id, total_chunks), file=sys.stderr, flush=True) # *** 726 | 727 | # Now that we have a full set of blocks, build the 728 | # chunks: 729 | block_idx: int 730 | for block_idx, block_id in enumerate(block_id_list): 731 | chunk_id = block_idx + 1 732 | if self.verbose: 733 | print("put: qid=%d src_pid=%d msg_id=%r: chunk_id=%d of total_chunks=%d" % (self.qid, src_pid, msg_id, chunk_id, total_chunks), file=sys.stderr, flush=True) # *** 734 | 735 | data_block: SharedMemory = self.data_blocks[block_id] 736 | chunk_data: bytes = msg_body[block_idx * self.chunk_size: (block_idx + 1) * self.chunk_size] 737 | msg_size: int = len(chunk_data) 738 | if self.verbose: 739 | print("put: qid=%d src_pid=%d msg_id=%r: chunk_id=%d: block_id=%d msg_size=%d." % (self.qid, src_pid, msg_id, chunk_id, block_id, msg_size), file=sys.stderr, flush=True) # *** 740 | if self.integrity_check: 741 | checksum: int = zlib.adler32(chunk_data) 742 | if self.verbose: 743 | print("put: qid=%d src_pid=%d msg_id=%r: chunk_id=%d: checksum=%x total_msg_size=%d" % (self.qid, src_pid, msg_id, chunk_id, checksum, total_msg_size), file=sys.stderr, flush=True) # *** 744 | 745 | with self.block_locks[block_id]: 746 | self.set_meta(data_block, msg_id, 'msg_id') 747 | self.set_meta(data_block, msg_size, 'msg_size') 748 | self.set_meta(data_block, chunk_id, 'chunk_id') 749 | self.set_meta(data_block, total_chunks, 'total_chunks') 750 | if self.integrity_check: 751 | self.set_meta(data_block, total_msg_size, 'total_msg_size') 752 | self.set_meta(data_block, checksum, 'checksum') 753 | if chunk_id == total_chunks: 754 | # No more chunks, store a reserved value to simplify debugging. 755 | self.set_meta(data_block, self.__class__.RESERVED_BLOCK_ID, 'next_chunk_block_id') 756 | else: 757 | # Store the block ID of the next chunk. 758 | self.set_meta(data_block, block_id_list[block_idx + 1], 'next_chunk_block_id') 759 | self.set_data(data_block, chunk_data, msg_size) 760 | 761 | # Now that the entire message has built, queue it: 762 | self.add_msg(block_id_list[0]) 763 | if self.verbose: 764 | print("put: qid=%d src_pid=%d msg_id=%r: message sent" % (self.qid, src_pid, msg_id), file=sys.stderr, flush=True) # *** 765 | 766 | def get(self, block: bool=True, timeout: typing.Optional[float]=None)->typing.Any: 767 | """ 768 | Get the next available message from the queue. 769 | 770 | Args: 771 | block (bool, optional): If it is set to True (default), it will only return when an item is available. 772 | timeout (int, optional): A positive integer for the timeout duration in seconds, which is only effective when `block` is set to True. 773 | 774 | Returns: 775 | object: A message object retrieved from the queue. 776 | 777 | Raises: 778 | queue.Empty: This exception will be raised if it times out or queue is empty when `block` is False. 779 | ValueError: An internal error occured in accessing the message's metadata. 780 | UnpicklingError: This exception is raised when the serializer is pickle and 781 | an error occured in deserializing the message. 782 | 783 | Note: 784 | - Errors other then UnpicklingError might be raised if a serialized other then 785 | pickle is specified. 786 | """ 787 | time_start: float = time.time() 788 | 789 | # We will build a list of message chunks. We can't 790 | # release them until after we deserialize the data. 
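        # (The reason: get_data() below returns a memoryview into the shared buffer rather
        # than a copy, so a block freed too early could be overwritten by a producer before
        # the message body has been deserialized.)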
791 |         block_id: int
792 |         chunk_id: int
793 |         msg_block_ids: typing.List[int] = [ ]
794 |         data_block: SharedMemory
795 | 
796 |         try:
797 |             remaining_timeout: typing.Optional[float] = timeout
798 |             if remaining_timeout is not None:
799 |                 remaining_timeout -= (time.time() - time_start)
800 |                 if remaining_timeout <= 0:
801 |                     if self.verbose:
802 |                         print("get: qid=%d: queue EMPTY (timeout)" % self.qid, file=sys.stderr, flush=True) # ***
803 |                     raise Empty
804 | 
805 |             src_pid: int
806 |             msg_id: bytes
807 |             total_chunks: int
808 |             next_chunk_block_id: int
809 |             src_pid, msg_id, block_id, total_chunks, next_chunk_block_id = self.next_readable_msg(block, remaining_timeout) # This call might raise Empty.
810 |             if self.verbose:
811 |                 print("get: qid=%d src_pid=%d msg_id=%r: total_chunks=%d next_chunk_block_id=%d." % (self.qid, src_pid, msg_id, total_chunks, next_chunk_block_id), file=sys.stderr, flush=True) # ***
812 |             msg_block_ids.append(block_id)
813 | 
814 |             # Acquire the chunks for the rest of the message:
815 |             i: int
816 |             for i in range(1, total_chunks):
817 |                 chunk_id = i + 1
818 |                 if self.verbose:
819 |                     print("get: qid=%d src_pid=%d msg_id=%r: chunk_id=%d: block_id=%d." % (self.qid, src_pid, msg_id, chunk_id, next_chunk_block_id), file=sys.stderr, flush=True) # ***
820 |                 msg_block_ids.append(next_chunk_block_id)
821 |                 data_block = self.data_blocks[next_chunk_block_id]
822 |                 with self.block_locks[next_chunk_block_id]:
823 |                     maybe_next_chunk_block_id: typing.Union[bytes, int] = self.get_meta(data_block, 'next_chunk_block_id')
824 |                     if isinstance(maybe_next_chunk_block_id, int):
825 |                         next_chunk_block_id = maybe_next_chunk_block_id
826 |                     else:
827 |                         raise ValueError("get: internal error getting next_chunk_block_id")
828 | 
829 |         except Exception:
830 |             # Release the data blocks (losing the message) if we get an
831 |             # unexpected exception:
832 |             if self.verbose:
833 |                 print("get: qid=%d: releasing data blocks due to Exception" % self.qid, file=sys.stderr, flush=True) # ***
834 |             for block_id in msg_block_ids:
835 |                 self.add_free_block(block_id)
836 |             msg_block_ids.clear()
837 |             raise
838 | 
839 |         buf_msg_body: typing.List[bytes] = []
840 |         try:
841 |             block_idx: int
842 |             for block_idx, block_id in enumerate(msg_block_ids):
843 |                 chunk_id = block_idx + 1
844 |                 data_block = self.data_blocks[block_id]
845 |                 with self.block_locks[block_id]:
846 |                     maybe_msg_size: typing.Union[bytes, int] = self.get_meta(data_block, 'msg_size')
847 |                     if isinstance(maybe_msg_size, int):
848 |                         msg_size: int = maybe_msg_size
849 |                     else:
850 |                         raise ValueError("get: internal error getting msg_size")
851 |                     if self.integrity_check:
852 |                         if block_idx == 0:
853 |                             maybe_total_msg_size: typing.Union[bytes, int] = self.get_meta(data_block, 'total_msg_size')
854 |                             if isinstance(maybe_total_msg_size, int):
855 |                                 total_msg_size: int = maybe_total_msg_size
856 |                             else:
857 |                                 raise ValueError("get: internal error getting total_msg_size")
858 |                         maybe_checksum: typing.Union[bytes, int] = self.get_meta(data_block, 'checksum')
859 |                         if isinstance(maybe_checksum, int):
860 |                             checksum: int = maybe_checksum
861 |                         else:
862 |                             raise ValueError("get: internal error getting checksum")
863 |                     chunk_data: bytes = self.get_data(data_block, msg_size) # This may make a reference, not a deep copy.
864 |                     if self.verbose:
865 |                         print("get: qid=%d src_pid=%d msg_id=%r: chunk_id=%d: block_id=%d msg_size=%d total_chunks=%d."
% (self.qid, src_pid, msg_id, chunk_id, block_id, msg_size, total_chunks), file=sys.stderr, flush=True) # *** 866 | if self.integrity_check: 867 | checksum2: int = zlib.adler32(chunk_data) 868 | if checksum == checksum2: 869 | if self.verbose: 870 | print("get: qid=%d src_pid=%d msg_id=%r: chunk_id=%d: checksum=%x is OK" % (self.qid, src_pid, msg_id, chunk_id, checksum), file=sys.stderr, flush=True) # *** 871 | else: 872 | raise ValueError("ShmQueue.get: qid=%d src_pid=%d msg_id=%r: chunk_id=%d: block_id=%d checksum=%x != checksum2=%x -- FAIL!" % (self.qid, src_pid, msg_id, chunk_id, block_id, checksum, checksum2)) # TODO: use a better exception 873 | 874 | buf_msg_body.append(chunk_data) # This may copy the reference. 875 | 876 | msg_body: bytes = b''.join(buf_msg_body) # Even this might copy the references. 877 | if self.integrity_check: 878 | if total_msg_size == len(msg_body): 879 | if self.verbose: 880 | print("get: qid=%d src_pid=%d msg_id=%r: total_msg_size=%d is OK" % (self.qid, src_pid, msg_id, total_msg_size), file=sys.stderr, flush=True) # *** 881 | else: 882 | raise ValueError("get: qid=%d src_pid=%d msg_id=%r: total_msg_size=%d != len(msg_body)=%d -- FAIL!" % (self.qid, src_pid, msg_id, total_msg_size, len(msg_body))) # TODO: use a beter exception. 883 | 884 | try: 885 | # Finally, we are guaranteed to copy the data. 886 | msg: typing.Any = self.serializer.loads(msg_body) # type: ignore[union-attr] 887 | 888 | # We could release the blocks here, but then we'd have to 889 | # release them in the except clause, too. 890 | 891 | return msg 892 | 893 | except pickle.UnpicklingError as e: 894 | print("get: Fail: qid=%d src_pid=%d msg_id=%r: msg_size=%d chunk_id=%d total_chunks=%d." % (self.qid, src_pid, msg_id, msg_size, chunk_id, total_chunks), file=sys.stderr, flush=True) # *** 895 | if self.integrity_check: 896 | print("get: Fail: qid=%d src_pid=%d msg_id=%r: total_msg_size=%d checksum=%x" % (self.qid, src_pid, msg_id, total_msg_size, checksum), file=sys.stderr, flush=True) # *** 897 | raise 898 | 899 | finally: 900 | # It is now safe to release the data blocks. This is a good place 901 | # to release them, because it covers error paths as well as the main return. 902 | if self.verbose: 903 | print("get: qid=%d src_pid=%d msg_id=%r: releasing %d blocks." % (self.qid, src_pid, msg_id, len(msg_block_ids)), file=sys.stderr, flush=True) # *** 904 | for block_id in msg_block_ids: 905 | self.add_free_block(block_id) 906 | msg_block_ids.clear() 907 | buf_msg_body.clear() 908 | 909 | def get_nowait(self)->typing.Any: 910 | """ 911 | Equivalent to `get(False)`. 912 | """ 913 | return self.get(False) 914 | 915 | def put_nowait(self, msg: typing.Any): 916 | """ 917 | Equivalent to `put(obj, False)`. 918 | """ 919 | return self.put(msg, False) 920 | 921 | def qsize(self)->int: 922 | """int: Return the number of ready messages.""" 923 | return self.get_msg_count() 924 | 925 | def empty(self)->bool: 926 | """bool: True when no messages are ready.""" 927 | return self.get_msg_count() == 0 928 | 929 | def full(self)->bool: 930 | """bool: True when no free blocks are available.""" 931 | return self.get_free_block_count() == 0 932 | 933 | def close(self): 934 | """ 935 | Indicate no more new data will be added and release the shared memory areas. 
936 | """ 937 | block: SharedMemory 938 | for block in self.data_blocks: 939 | block.close() 940 | block.unlink() 941 | 942 | self.list_heads.close() 943 | self.list_heads.unlink() 944 | 945 | def __del__(self): 946 | pass 947 | --------------------------------------------------------------------------------
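A minimal end-to-end sketch of how the two classes above fit together. It is not taken from the repository; the `double`/`collect` functions and the input values are illustrative, and `use_shm=True` merely selects the ShmQueue transport documented in the ParallelProcessor arguments (so it requires Python 3.8+)::

    from pyrallel import ParallelProcessor

    results = []

    def double(x):
        return x * 2              # heavy work belongs in the mapper, not in the collector

    def collect(value):
        results.append(value)     # runs in a CollectorThread inside the main process

    if __name__ == '__main__':
        pp = ParallelProcessor(2, double, collector=collect, use_shm=True)
        pp.start()                # spawn worker processes and helper threads
        for i in range(10):
            pp.add_task(i)        # buffered by batch_size, then queued as (CMD_DATA, batch)
        pp.task_done()            # flush the batch buffer and send (CMD_STOP,) to each worker
        pp.join()                 # wait for workers, the collector thread and the queues
        print(sorted(results))    # [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]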