├── .coveragerc ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── question.md └── workflows │ ├── docs.yml │ ├── publish.yml │ └── test.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docs ├── .gitignore ├── Gemfile ├── _config.yml └── src │ ├── 404.md │ ├── assets │ └── img │ │ ├── diagram1.png │ │ ├── diagram2.png │ │ ├── favicon.ico │ │ ├── hint1.png │ │ ├── hint2.png │ │ ├── hint3.png │ │ └── pyper.png │ ├── docs │ ├── ApiReference │ │ ├── AsyncPipeline.md │ │ ├── Pipeline.md │ │ ├── index.md │ │ └── task.md │ ├── Examples │ │ ├── ChessDataAnalysis.md │ │ └── index.md │ ├── Resources │ │ ├── Contributing.md │ │ └── index.md │ └── UserGuide │ │ ├── AdvancedConcepts.md │ │ ├── BasicConcepts.md │ │ ├── ComposingPipelines.md │ │ ├── CreatingPipelines.md │ │ └── index.md │ └── index.md ├── examples └── ChessDataAnalysis │ ├── main.py │ └── requirements.txt ├── pyproject.toml ├── src └── pyper │ ├── __init__.py │ └── _core │ ├── async_helper │ ├── output.py │ ├── queue_io.py │ └── stage.py │ ├── decorators.py │ ├── pipeline.py │ ├── sync_helper │ ├── output.py │ ├── queue_io.py │ └── stage.py │ ├── task.py │ └── util │ ├── asynchronize.py │ ├── sentinel.py │ ├── task_group.py │ └── worker_pool.py ├── tests ├── Dockerfile ├── docker-compose.yaml ├── entrypoint.sh ├── requirements.txt ├── test_async.py ├── test_sync.py └── test_task.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | exclude_also = 3 | # pragma: no cover 4 | if TYPE_CHECKING: 5 | if t.TYPE_CHECKING: 6 | raise NotImplementedError 7 | 8 | [run] 9 | omit = **/pyper/_core/util/task_group.py 10 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Help us find and resolve bugs 4 | title: "[BUG] " 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Quick Check** 11 | - [ ] I've checked the [documentation](https://pyper-dev.github.io/pyper/) and verified that the behaviour is unintended 12 | - [ ] I've checked existing issues for similar bug reports 13 | 14 | **Description** 15 | Please describe what the bug is and when it occurs (mention package version and OS if relevant) 16 | 17 | **Code Sample** 18 | Please provide a minimal reproducible example of the bug 19 | 20 | **Expected behavior** 21 | Please briefly describe what you expected to happen 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEATURE] " 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Quick Check** 11 | - [ ] I've checked the [documentation](https://pyper-dev.github.io/pyper/) and verified that this feature does not exist 12 | - [ ] I've checked existing issues for similar feature requests 13 | 14 | **Use Case** 15 | Please describe: 16 | * The problem you are trying to solve 17 | * Your feature idea and its intended usage 18 | * How this feature will solve your use case 19 | 20 | **Code Sample** 21 | If relevant, please provide and describe any code illustrating the need for your suggested feature 22 | 23 | **Alternatives** 24 | Please describe the other approaches you have tried and 
why they do not work or why they are not optimal 25 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Need some help? 4 | title: "[QUESTION] " 5 | labels: question 6 | assignees: RichardZhu2 7 | 8 | --- 9 | 10 | **Quick Check** 11 | - [ ] I've checked the [documentation](https://pyper-dev.github.io/pyper/) and have not found what I'm looking for 12 | - [ ] I've checked existing issues for similar questions 13 | 14 | **Question** 15 | Please ask away! 16 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Docs 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | paths: 7 | - "docs/**" 8 | 9 | workflow_dispatch: 10 | 11 | permissions: 12 | contents: read 13 | pages: write 14 | id-token: write 15 | 16 | concurrency: 17 | group: "pages" 18 | cancel-in-progress: false 19 | 20 | jobs: 21 | build: 22 | runs-on: ubuntu-latest 23 | defaults: 24 | run: 25 | working-directory: docs 26 | steps: 27 | - name: Checkout 28 | uses: actions/checkout@v4 29 | - name: Setup Ruby 30 | uses: ruby/setup-ruby@v1 31 | with: 32 | ruby-version: '3.3' 33 | bundler-cache: true 34 | cache-version: 0 35 | working-directory: '${{ github.workspace }}/docs' 36 | - name: Setup Pages 37 | id: pages 38 | uses: actions/configure-pages@v5 39 | - name: Build with Jekyll 40 | # Outputs to the './_site' directory by default 41 | run: bundle exec jekyll build --baseurl "${{ steps.pages.outputs.base_path }}" 42 | env: 43 | JEKYLL_ENV: production 44 | - name: Upload artifact 45 | uses: actions/upload-pages-artifact@v3 46 | with: 47 | path: "docs/_site/" 48 | 49 | deploy: 50 | environment: 51 | name: github-pages 52 | url: ${{ steps.deployment.outputs.page_url }} 53 | runs-on: ubuntu-latest 54 | needs: build 55 | steps: 56 | - name: Deploy to GitHub Pages 57 | id: deployment 58 | uses: actions/deploy-pages@v4 59 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package to PyPI 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | pypi-publish: 9 | name: Publish release to PyPI 10 | runs-on: ubuntu-latest 11 | environment: 12 | name: pypi 13 | url: https://pypi.org/p/python-pyper 14 | permissions: 15 | id-token: write 16 | env: 17 | FORCE_JAVASCRIPT_ACTIONS_TO_NODE20: true 18 | steps: 19 | - uses: actions/checkout@v4 20 | with: 21 | fetch-depth: 0 22 | - name: Set up Python 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: "3.11" 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install build setuptools wheel 30 | - name: Build package 31 | run: | 32 | python -m build 33 | - name: Publish package distributions to PyPI 34 | uses: pypa/gh-action-pypi-publish@release/v1 35 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - "src/pyper/**" 9 | - "tests/**" 10 | 11 | workflow_dispatch: 12 | 13 | 14 | jobs: 15 | test: 16 | runs-on: ubuntu-latest 17 | 18 | 
strategy: 19 | matrix: 20 | python-version: [ 21 | "3.8", 22 | "3.9", 23 | "3.10", 24 | "3.11", 25 | "3.12", 26 | "3.13" 27 | ] 28 | 29 | steps: 30 | - name: Checkout code 31 | uses: actions/checkout@v3 32 | 33 | - name: Set up Python 34 | uses: actions/setup-python@v4 35 | with: 36 | python-version: ${{ matrix.python-version }} 37 | 38 | - name: Install dependencies 39 | run: | 40 | python -m pip install --upgrade pip 41 | pip install tox 42 | 43 | - name: Run tests with Tox 44 | run: tox -e ${{ matrix.python-version }} 45 | 46 | coverage: 47 | environment: 48 | name: test 49 | env: 50 | COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }} 51 | needs: test 52 | runs-on: ubuntu-latest 53 | 54 | steps: 55 | - name: Checkout code 56 | uses: actions/checkout@v3 57 | 58 | - name: Set up Python 59 | uses: actions/setup-python@v4 60 | with: 61 | python-version: '3.12' 62 | 63 | - name: Install dependencies 64 | run: | 65 | python -m pip install --upgrade pip 66 | pip install tox coveralls 67 | 68 | - name: Run coverage with Tox 69 | run: tox -e coverage 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .vscode/ 3 | 4 | __pycache__/ 5 | .pytest_cache/ 6 | .venv/ 7 | src/*.egg-info/ 8 | 9 | .tox/ -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guide 2 | 3 | Please read the [Contributing Guide](https://pyper-dev.github.io/pyper/docs/Resources/Contributing) on the docs site -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Richard Zhu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | Pyper 3 | 4 | 
5 | Concurrent Python made simple 6 | 7 | 8 | 
9 | 10 | Test 11 | 12 | 13 | Coverage 14 | 15 | 16 | Package version 17 | 18 | 19 | Supported Python versions 20 | 21 | 

22 | 23 | --- 24 | 25 | Pyper is a flexible framework for concurrent and parallel data-processing, based on functional programming patterns. Used for 🔀 **ETL Systems**, ⚙️ **Data Microservices**, and 🌐 **Data Collection** 26 | 27 | See the [Documentation](https://pyper-dev.github.io/pyper/) 28 | 29 | **Key features:** 30 | 31 | * 💡**Intuitive API**: Easy to learn, easy to think about. Implements clean abstractions to seamlessly unify threaded, multiprocessed, and asynchronous work. 32 | * 🚀 **Functional Paradigm**: Python functions are the building blocks of data pipelines. Let's you write clean, reusable code naturally. 33 | * 🛡️ **Safety**: Hides the heavy lifting of underlying task execution and resource clean-up. No more worrying about race conditions, memory leaks, or thread-level error handling. 34 | * ⚡ **Efficiency**: Designed from the ground up for lazy execution, using queues, workers, and generators. 35 | * ✨ **Pure Python**: Lightweight, with zero sub-dependencies. 36 | 37 | ## Installation 38 | 39 | Install the latest version using `pip`: 40 | 41 | ```console 42 | $ pip install python-pyper 43 | ``` 44 | 45 | Note that `python-pyper` is the [pypi](https://pypi.org/project/python-pyper) registered package. 46 | 47 | ## Usage 48 | 49 | In Pyper, the `task` decorator is used to transform functions into composable pipelines. 50 | 51 | Let's simulate a pipeline that performs a series of transformations on some data. 52 | 53 | ```python 54 | import asyncio 55 | import time 56 | 57 | from pyper import task 58 | 59 | 60 | def get_data(limit: int): 61 | for i in range(limit): 62 | yield i 63 | 64 | 65 | async def step1(data: int): 66 | await asyncio.sleep(1) 67 | print("Finished async wait", data) 68 | return data 69 | 70 | 71 | def step2(data: int): 72 | time.sleep(1) 73 | print("Finished sync wait", data) 74 | return data 75 | 76 | 77 | def step3(data: int): 78 | for i in range(10_000_000): 79 | _ = i*i 80 | print("Finished heavy computation", data) 81 | return data 82 | 83 | 84 | async def main(): 85 | # Define a pipeline of tasks using `pyper.task` 86 | pipeline = task(get_data, branch=True) \ 87 | | task(step1, workers=20) \ 88 | | task(step2, workers=20) \ 89 | | task(step3, workers=20, multiprocess=True) 90 | 91 | # Call the pipeline 92 | total = 0 93 | async for output in pipeline(limit=20): 94 | total += output 95 | print("Total:", total) 96 | 97 | 98 | if __name__ == "__main__": 99 | asyncio.run(main()) 100 | ``` 101 | 102 | Pyper provides an elegant abstraction of the execution of each task, allowing you to focus on building out the **logical** functions of your program. In the `main` function: 103 | 104 | * `pipeline` defines a function; this takes the parameters of its first task (`get_data`) and yields each output from its last task (`step3`) 105 | * Tasks are piped together using the `|` operator (motivated by Unix's pipe operator) as a syntactic representation of passing inputs/outputs between tasks. 106 | 107 | In the pipeline, we are executing three different types of work: 108 | 109 | * `task(step1, workers=20)` spins up 20 `asyncio.Task`s to handle asynchronous IO-bound work 110 | 111 | * `task(step2, workers=20)` spins up 20 `threads` to handle synchronous IO-bound work 112 | 113 | * `task(step3, workers=20, multiprocess=True)` spins up 20 `processes` to handle synchronous CPU-bound work 114 | 115 | `task` acts as one intuitive API for unifying the execution of each different type of function. 
116 | 117 | Each task has workers that submit outputs to the next task within the pipeline via queue-based data structures; this is the mechanism underpinning how concurrency and parallelism are achieved. See the [docs](https://pyper-dev.github.io/pyper/docs/UserGuide/BasicConcepts) for a breakdown of what a pipeline looks like under the hood. 118 | 119 | --- 120 | 121 | 122 | 123 |
124 | See a non-async example 125 | 126 |
127 | 128 | Pyper pipelines are by default non-async, as long as their tasks are defined as synchronous functions. For example: 129 | 130 | ```python 131 | import time 132 | 133 | from pyper import task 134 | 135 | 136 | def get_data(limit: int): 137 | for i in range(limit): 138 | yield i 139 | 140 | def step1(data: int): 141 | time.sleep(1) 142 | print("Finished sync wait", data) 143 | return data 144 | 145 | def step2(data: int): 146 | for i in range(10_000_000): 147 | _ = i*i 148 | print("Finished heavy computation", data) 149 | return data 150 | 151 | 152 | def main(): 153 | pipeline = task(get_data, branch=True) \ 154 | | task(step1, workers=20) \ 155 | | task(step2, workers=20, multiprocess=True) 156 | total = 0 157 | for output in pipeline(limit=20): 158 | total += output 159 | print("Total:", total) 160 | 161 | 162 | if __name__ == "__main__": 163 | main() 164 | ``` 165 | 166 | A pipeline consisting of _at least one asynchronous function_ becomes an `AsyncPipeline`, which exposes the same usage API, provided `async` and `await` syntax in the obvious places. This makes it effortless to combine synchronously defined and asynchronously defined functions where need be. 167 | 168 |
169 | 170 | ## Examples 171 | 172 | To explore more of Pyper's features, see some further [examples](https://pyper-dev.github.io/pyper/docs/Examples) 173 | 174 | ## Dependencies 175 | 176 | Pyper is implemented in pure Python, with no sub-dependencies. It is built on top of the well-established built-in Python modules: 177 | * [threading](https://docs.python.org/3/library/threading.html) for thread-based concurrency 178 | * [multiprocessing](https://docs.python.org/3/library/multiprocessing.html) for parallelism 179 | * [asyncio](https://docs.python.org/3/library/asyncio.html) for async-based concurrency 180 | * [concurrent.futures](https://docs.python.org/3/library/concurrent.futures.html) for unifying threads, processes, and async code 181 | 182 | ## License 183 | 184 | This project is licensed under the terms of the MIT license. -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _site/ 2 | .sass-cache/ 3 | .jekyll-cache/ 4 | .jekyll-metadata 5 | 6 | .bundle/ 7 | vendor/ 8 | 9 | # GitHub pages does not need the lock file 10 | Gemfile.lock -------------------------------------------------------------------------------- /docs/Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gem "jekyll", "~> 4.3.4" # installed by `gem jekyll` 4 | gem "just-the-docs", "0.10.0" # pinned to the current release 5 | 6 | # Performance-booster for watching directories on Windows 7 | gem "wdm", "~> 0.1", :platforms => [:mingw, :x64_mingw, :mswin] -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | title: Pyper Docs 2 | description: Concurrent Python made simple 3 | theme: just-the-docs 4 | 5 | url: https://pyper-dev.github.io/pyper 6 | source: src 7 | 8 | color_scheme: dark 9 | 10 | logo: "/assets/img/pyper.png" 11 | favicon_ico: "/assets/img/favicon.ico" 12 | search_enabled: true 13 | nav_enabled: true 14 | 15 | search: 16 | heading_level: 2 17 | previews: 3 18 | preview_words_before: 5 19 | preview_words_after: 10 20 | tokenizer_separator: /[\s/]+/ 21 | rel_url: true 22 | button: false 23 | focus_shortcut_key: 'k' 24 | 25 | # Aux links for the upper right navigation 26 | aux_links: 27 | "GitHub": 28 | - "//github.com/pyper-dev/pyper" 29 | aux_links_new_tab: true 30 | 31 | callouts: 32 | info: 33 | title: Info 34 | color: blue 35 | warning: 36 | title: Warning 37 | color: yellow -------------------------------------------------------------------------------- /docs/src/404.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | permalink: /404.html 4 | --- 5 | 6 | # 404 7 | 8 | This page wasn't found -------------------------------------------------------------------------------- /docs/src/assets/img/diagram1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyper-dev/pyper/c958bafab029ed966a2d0f2468621870a9407235/docs/src/assets/img/diagram1.png -------------------------------------------------------------------------------- /docs/src/assets/img/diagram2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pyper-dev/pyper/c958bafab029ed966a2d0f2468621870a9407235/docs/src/assets/img/diagram2.png -------------------------------------------------------------------------------- /docs/src/assets/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyper-dev/pyper/c958bafab029ed966a2d0f2468621870a9407235/docs/src/assets/img/favicon.ico -------------------------------------------------------------------------------- /docs/src/assets/img/hint1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyper-dev/pyper/c958bafab029ed966a2d0f2468621870a9407235/docs/src/assets/img/hint1.png -------------------------------------------------------------------------------- /docs/src/assets/img/hint2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyper-dev/pyper/c958bafab029ed966a2d0f2468621870a9407235/docs/src/assets/img/hint2.png -------------------------------------------------------------------------------- /docs/src/assets/img/hint3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyper-dev/pyper/c958bafab029ed966a2d0f2468621870a9407235/docs/src/assets/img/hint3.png -------------------------------------------------------------------------------- /docs/src/assets/img/pyper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyper-dev/pyper/c958bafab029ed966a2d0f2468621870a9407235/docs/src/assets/img/pyper.png -------------------------------------------------------------------------------- /docs/src/docs/ApiReference/AsyncPipeline.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: AsyncPipeline 3 | parent: API Reference 4 | layout: default 5 | nav_order: 3 6 | permalink: /docs/ApiReference/AsyncPipeline 7 | --- 8 | 9 | # pyper.AsyncPipeline 10 | {: .no_toc } 11 | 12 | `AsyncPipeline` is a sublass of [Pipeline](Pipeline) and exposes the same API. 13 | 14 | [Example](../UserGuide/CreatingPipelines#asynchronous-code) -------------------------------------------------------------------------------- /docs/src/docs/ApiReference/Pipeline.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Pipeline 3 | parent: API Reference 4 | layout: default 5 | nav_order: 2 6 | permalink: /docs/ApiReference/Pipeline 7 | --- 8 | 9 | # pyper.Pipeline 10 | {: .no_toc } 11 | 12 | * TOC 13 | {:toc} 14 | 15 | ## Pipeline 16 | 17 | ```python 18 | def __new__(cls, tasks: List[Task]) -> Pipeline: 19 | ``` 20 | 21 | An object that represents a data flow consisting of a series of (at least one) tasks. 22 | 23 | {: .warning} 24 | It is not recommended to instantiate a `Pipeline` directly. Use the [task](task) class 25 | 26 | ## Pipeline.\__call__ 27 | 28 | ```python 29 | def __call__(self, *args, **kwargs) -> Generator[Any, None, None]: 30 | ``` 31 | 32 | A `Pipeline` is a callable object with the parameter specification of its first task which generates each output from its last task. 
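For instance, a minimal sketch (using a trivial `add_one` function purely for illustration) of calling a two-task pipeline and iterating over its generated outputs:

```python
from pyper import task

def add_one(x: int):
    return x + 1

if __name__ == "__main__":
    # The pipeline takes the parameters of its first task (`x: int`)
    # and lazily generates each output from its last task
    pipeline = task(add_one) | task(add_one)
    for output in pipeline(10):
        print(output)
    #> 12
```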
33 | 34 | [Example](../UserGuide/CreatingPipelines#pipeline-usage) 35 | 36 | ## Pipeline.pipe 37 | 38 | ```python 39 | def pipe(self, other: Pipeline) -> Pipeline: 40 | ``` 41 | 42 | Allows two `Pipeline` objects to be composed together, returning a new pipeline with a combined list of tasks. 43 | 44 | [Example](../UserGuide/ComposingPipelines#piping-and-the--operator) 45 | 46 | ## Pipeline.\__or__ 47 | 48 | ```python 49 | def __or__(self, other: Pipeline) -> Pipeline: 50 | ``` 51 | 52 | Allows the use of the operator `|` as syntactic sugar for `Pipeline.pipe`. 53 | 54 | ## Pipeline.consume 55 | 56 | ```python 57 | def consume(self, other: Callable) -> Callable: 58 | ``` 59 | 60 | Allows a consumer function to be attached to a `Pipeline`. 61 | 62 | [Example](../UserGuide/ComposingPipelines#consumer-functions-and-the--operator) 63 | 64 | 65 | ## Pipeline.\__gt__ 66 | 67 | ```python 68 | def __gt__(self, other: Callable) -> Callable: 69 | ``` 70 | 71 | Allows the use of the operator `>` as syntactic sugar for `Pipeline.consume`. 72 | -------------------------------------------------------------------------------- /docs/src/docs/ApiReference/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: API Reference 3 | nav_order: 4 4 | layout: page 5 | --- 6 | 7 | # API Reference -------------------------------------------------------------------------------- /docs/src/docs/ApiReference/task.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: task 3 | parent: API Reference 4 | layout: default 5 | nav_order: 1 6 | permalink: /docs/ApiReference/task 7 | --- 8 | 9 | # pyper.task 10 | {: .no_toc } 11 | 12 | * TOC 13 | {:toc} 14 | 15 | > For convenience, we will use the following terminology on this page: 16 | > * **Producer**: The _first_ task within a pipeline 17 | > * **Producer-consumer**: Any task after the first task within a pipeline 18 | 19 | ## task 20 | 21 | ```python 22 | def __new__( 23 | cls, 24 | func: Optional[Callable] = None, 25 | /, 26 | *, 27 | branch: bool = False, 28 | join: bool = False, 29 | workers: int = 1, 30 | throttle: int = 0, 31 | multiprocess: bool = False, 32 | bind: Optional[Tuple[Tuple[Any], Dict[str, Any]]] = None): 33 | ``` 34 | 35 | Used to initialize a [Pipeline](Pipeline) object, consisting of one 'task' (one functional operation). 36 | 37 | Pipelines created this way can be [composed](../UserGuide/ComposingPipelines) into new pipelines that contain multiple tasks. 38 | 39 | --- 40 | 41 | {: .text-green-200 .text-gamma} 42 | **Parameters** 43 | 44 | {: .text-beta} 45 | ### `func` 46 | 47 | * **type:** `Optional[Callable]` 48 | * **default:** `None` 49 | 50 | The function or callable object defining the logic of the task. This is a positional-only parameter. 51 | 52 | ```python 53 | from pyper import task 54 | 55 | def add_one(x: int): 56 | return x + 1 57 | 58 | pipeline = task(add_one) 59 | ``` 60 | 61 | {: .text-beta} 62 | ### `branch` 63 | 64 | * **type:** `bool` 65 | * **default:** `False` 66 | 67 | When `branch` is `False`, the output of the task is the value it returns. 68 | Setting `branch` to `True` allows a task to generate multiple outputs. This requires the task to return an `Iterable` (or `AsyncIterable`). 
69 | 70 | ```python 71 | from pyper import task 72 | 73 | def create_data(x: int): 74 | return [x + 1, x + 2, x + 3] 75 | 76 | if __name__ == "__main__": 77 | pipeline1 = task(create_data) 78 | for output in pipeline1(0): 79 | print(output) 80 | #> [1, 2, 3] 81 | 82 | pipeline2 = task(create_data, branch=True) 83 | for output in pipeline2(0): 84 | print(output) 85 | #> 1 86 | #> 2 87 | #> 3 88 | ``` 89 | 90 | This can be applied to generator functions (or async generator functions) to submit outputs lazily: 91 | 92 | ```python 93 | from pyper import task 94 | 95 | def create_data(x: int): 96 | yield 1 97 | yield 2 98 | yield 3 99 | 100 | if __name__ == "__main__": 101 | pipeline = task(create_data, branch=True) 102 | for output in pipeline(0): 103 | print(output) 104 | #> 1 105 | #> 2 106 | #> 3 107 | ``` 108 | 109 | {: .text-beta} 110 | ### `join` 111 | 112 | * **type:** `bool` 113 | * **default:** `False` 114 | 115 | When `join` is `False`, a producer-consumer takes each individual output from the previous task as input. When `True`, a producer-consumer takes a stream of inputs from the previous task. 116 | 117 | ```python 118 | from typing import Iterable 119 | from pyper import task 120 | 121 | def create_data(x: int): 122 | return [x + 1, x + 2, x + 3] 123 | 124 | def running_total(data: Iterable[int]): 125 | total = 0 126 | for item in data: 127 | total += item 128 | yield total 129 | 130 | if __name__ == "__main__": 131 | pipeline = ( 132 | task(create_data, branch=True) 133 | | task(running_total, branch=True, join=True) 134 | ) 135 | for output in pipeline(0): 136 | print(output) 137 | #> 1 138 | #> 3 139 | #> 6 140 | ``` 141 | 142 | {: .warning} 143 | A producer _cannot_ have `join` set as `True` 144 | 145 | A task with `join=True` can also be run with multiple workers, which will pull from the previous task in a thread-safe/process-safe way. 146 | Note, however, that the order of outputs cannot be maintained consistently when a joined task is run with more than one worker. 147 | 148 | {: .text-beta} 149 | ### `workers` 150 | 151 | * **type:** `int` 152 | * **default:** `1` 153 | 154 | The parameter `workers` takes a `int` value which determines the number of workers executing the task concurrently or in parallel. 155 | 156 | ```python 157 | import time 158 | from pyper import task 159 | 160 | def slow_func(data: int): 161 | time.sleep(2) 162 | return data 163 | 164 | if __name__ == "__main__": 165 | pipeline = task(range, branch=True) | task(slow_func, workers=20) 166 | # Runs in ~2 seconds 167 | for output in pipeline(20): 168 | print(output) 169 | ``` 170 | 171 | {: .warning} 172 | A producer _cannot_ have `workers` set greater than `1` 173 | 174 | {: .text-beta} 175 | ### `throttle` 176 | 177 | * **type:** `int` 178 | * **default:** `0` 179 | 180 | The parameter `throttle` determines the maximum size of a task's output queue. 
The purpose of this parameter is to give finer control over memory in situations where: 181 | 182 | * A producer/producer-consumer generates data very quickly 183 | * A producer-consumer/consumer processes that data very slowly 184 | 185 | ```python 186 | import time 187 | from pyper import task 188 | 189 | def fast_producer(): 190 | for i in range(1_000_000): 191 | yield i 192 | 193 | def slow_consumer(data: int): 194 | time.sleep(10) 195 | return data 196 | 197 | pipeline = ( 198 | task(fast_producer, branch=True, throttle=5000) 199 | | task(slow_consumer) 200 | ) 201 | ``` 202 | 203 | In the example above, workers on `fast_producer` are paused after `5000` values have been generated, until workers for `slow_consumer` are ready to start processing again. 204 | If no throttle were specified, workers for `fast_producer` would quickly flood its output queue with up to `1_000_000` values, which all have to be allocated in memory. 205 | 206 | {: .text-beta} 207 | ### `multiprocess` 208 | 209 | * **type:** `bool` 210 | * **default:** `False` 211 | 212 | By default, synchronous tasks are run in `threading.Thread` workers and asynchronous tasks are run in `asyncio.Task` workers. 213 | The `multiprocess` parameter allows synchronous tasks be be run with `multiprocessing.Process` instead, benefitting heavily CPU-bound tasks. 214 | 215 | ```python 216 | from pyper import task 217 | 218 | def slow_func(data: int): 219 | for i in range(1, 10_000_000): 220 | i *= i 221 | return data 222 | 223 | if __name__ == "__main__": 224 | pipeline = ( 225 | task(range, branch=True) 226 | | task(slow_func, workers=20, multiprocess=True) 227 | ) 228 | for output in pipeline(20): 229 | print(output) 230 | ``` 231 | 232 | {: .warning} 233 | An asynchronous task cannot set `multiprocessing` as `True` 234 | 235 | See some [considerations](../UserGuide/AdvancedConcepts#cpu-bound-work) for when to set this parameter. 236 | 237 | Note, also, that normal Python multiprocessing restrictions apply: 238 | 239 | * Only [picklable](https://docs.python.org/3/library/pickle.html#module-pickle) functions can be multiprocessed, which excludes certain types of functions like lambdas and closures. 240 | * Arguments and return values of multiprocessed tasks must also be picklable, which excludes objects like file handles, connections, and (on Windows) generators. 241 | 242 | {: .text-beta} 243 | ### `bind` 244 | 245 | * **type:** `Optional[Tuple[Tuple[Any], Dict[str, Any]]]` 246 | * **default:** `None` 247 | 248 | The parameter `bind` allows additional `args` and `kwargs` to be bound to a task when creating a pipeline. 249 | 250 | ```python 251 | from pyper import task 252 | 253 | def apply_multiplier(data: int, multiplier: int): 254 | return data * multiplier 255 | 256 | if __name__ == "__main__": 257 | pipeline = ( 258 | task(range, branch=True) 259 | | task(apply_multiplier, bind=task.bind(multiplier=10)) 260 | ) 261 | for output in pipeline(1, 4): 262 | print(output) 263 | #> 10 264 | #> 20 265 | #> 30 266 | ``` 267 | 268 | Given that each producer-consumer expects to be given one input argument, the purpose of the `bind` parameter is to allow functions to be defined flexibly in terms of the inputs they wish to take, as well as allowing tasks to access external states, like contexts. 
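As a rough sketch of binding an external state (the `fetch_user` function and `shared_cache` below are hypothetical, for illustration only), a task can take a shared object as an extra argument while still receiving one pipeline input per call:

```python
from pyper import task

def get_user_ids(limit: int):
    for i in range(limit):
        yield i

def fetch_user(user_id: int, cache: dict):
    # `cache` is an external state bound at pipeline-definition time;
    # each call still receives a single pipeline input (`user_id`)
    if user_id not in cache:
        cache[user_id] = {"id": user_id}  # stand-in for a real lookup
    return cache[user_id]

if __name__ == "__main__":
    shared_cache = {}
    pipeline = (
        task(get_user_ids, branch=True)
        | task(fetch_user, bind=task.bind(cache=shared_cache))
    )
    for user in pipeline(limit=3):
        print(user)
    #> {'id': 0}
    #> {'id': 1}
    #> {'id': 2}
```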
269 | 270 | ## task.bind 271 | 272 | ```python 273 | @staticmethod 274 | def bind(*args, **kwargs): 275 | ``` 276 | 277 | `task.bind` is the utility method that can be used to supply arguments to the `bind` parameter, which uses `functools.partial` under the hood. 278 | 279 | The method accepts normal valid `*args` and `**kwargs`. 280 | -------------------------------------------------------------------------------- /docs/src/docs/Examples/ChessDataAnalysis.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Chess Data Analysis 3 | parent: Examples 4 | layout: default 5 | nav_order: 1 6 | permalink: /docs/Examples/ChessDataAnalysis 7 | --- 8 | 9 | # Chess Data Analysis 10 | {: .no_toc } 11 | 12 | * TOC 13 | {:toc} 14 | 15 | ## Problem Statement 16 | 17 | Let's look at a very simple example of collecting some data and doing something with it. We will: 18 | 19 | * Build a pipeline to download a player's game data for the past few months from the [chess.com API](https://www.chess.com/news/view/published-data-api) 20 | * Use the `python-chess` package to parse the PGN game data 21 | * Use `pandas` to do some basic opening win-rate analysis 22 | 23 | ## Setup 24 | 25 | This is a standalone script. Python package requirements are specified in `requirements.txt` 26 | 27 | **See the [source code](https://github.com/pyper-dev/pyper/tree/main/examples/ChessDataAnalysis) for this example** _(always review code before running it on your own machine)_ 28 | 29 | ## Implementation 30 | 31 | To collect the data we need, we will use the chess.com API's monthly multigame PGN download endpoint, which has the url format: 32 | 33 | ``` 34 | https://api.chess.com/pub/player/player-name/games/YYYY/MM/pgn 35 | ``` 36 | 37 | Firstly, we define a helper function to generate these urls for the most recent months: 38 | 39 | ```python 40 | def generate_urls_by_month(player: str, num_months: int): 41 | """Define a series of pgn game resource urls for a player, for num_months recent months.""" 42 | today = datetime.date.today() 43 | for i in range(num_months): 44 | d = today - relativedelta(months=i) 45 | yield f"https://api.chess.com/pub/player/{player}/games/{d.year}/{d.month:02}/pgn" 46 | ``` 47 | 48 | We also need a function to fetch the raw data from each url. 49 | 50 | ```python 51 | def fetch_text_data(url: str, session: requests.Session): 52 | """Fetch text data from a url.""" 53 | r = session.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}) 54 | return r.text 55 | ``` 56 | 57 | Each PGN dataset consists of data for multiple games. We'll create a function called `read_game_data` to extract individual game details as dictionaries. 
58 | 59 | ```python 60 | def _clean_opening_name(eco_url: str): 61 | """Get a rough opening name from the chess.com ECO url.""" 62 | name = eco_url.removeprefix("https://www.chess.com/openings/") 63 | return " ".join(name.split("-")[:2]) 64 | 65 | 66 | def read_game_data(pgn_text: str, player: str): 67 | """Read PGN data and generate game details (each PGN contains details for multiple games).""" 68 | pgn = io.StringIO(pgn_text) 69 | while (headers := chess.pgn.read_headers(pgn)) is not None: 70 | color = 'W' if headers["White"].lower() == player else 'B' 71 | 72 | if headers["Result"] == "1/2-1/2": 73 | score = 0.5 74 | elif (color == 'W' and headers["Result"] == "1-0") or (color == 'B' and headers["Result"] == "0-1"): 75 | score = 1 76 | else: 77 | score = 0 78 | 79 | yield { 80 | "color": color, 81 | "score": score, 82 | "opening": _clean_opening_name(headers["ECOUrl"]) 83 | } 84 | ``` 85 | 86 | Finally, we need some logic to handle the data analysis (which we're keeping very barebones). 87 | Let's dump the data into a pandas dataframe and print a table showing: 88 | 89 | * average score grouped by chess opening 90 | * where the player plays the white pieces 91 | * ordered by total games 92 | 93 | ```python 94 | def build_df(data: typing.Iterable[dict]) -> pd.DataFrame: 95 | df = pd.DataFrame(data) 96 | df = df[df["color"] == 'W'] 97 | df = df.groupby("opening").agg(total_games=("score", "count"), average_score=("score", "mean")) 98 | df = df.sort_values(by="total_games", ascending=False) 99 | return df 100 | ``` 101 | 102 | All that's left is to piece everything together. 103 | 104 | Note that the Pyper framework hasn't placed any particular restrictions on the way our 'business logic' is implemented. We can use Pyper to simply compose together these logical functions into a concurrent pipeline, with minimal code coupling. 105 | 106 | In the pipeline, we will: 107 | 108 | 1. Set `branch=True` for `generate_urls_by_month`, to allow this task to generate multiple outputs 109 | 2. Create 3 workers for `fetch_text_data`, so that we can wait on requests concurrently 110 | 3. Set `branch=True` for `read_game_data` also, as this generates multiple dictionaries 111 | 4. Let the `build_df` function consume all output generated by this pipeline 112 | 113 | ```python 114 | def main(): 115 | player = "hikaru" 116 | num_months = 6 # Keep this number low, or add sleeps for etiquette 117 | 118 | with requests.Session() as session: 119 | run = ( 120 | task(generate_urls_by_month, branch=True) 121 | | task( 122 | fetch_text_data, 123 | workers=3, 124 | bind=task.bind(session=session)) 125 | | task( 126 | read_game_data, 127 | branch=True, 128 | bind=task.bind(player=player)) 129 | > build_df 130 | ) 131 | df = run(player, num_months) 132 | print(df.head(10)) 133 | ``` 134 | 135 | With no more lines of code than it would have taken to define a series of sequential for-loops, we've defined a concurrently executable data flow! 
136 | 137 | We can now run everything to see the result of our analysis: 138 | 139 | ``` 140 | opening total_games average_score 141 | 142 | Nimzowitsch Larsen 244 0.879098 143 | Closed Sicilian 205 0.924390 144 | Caro Kann 157 0.882166 145 | Bishops Opening 156 0.900641 146 | French Defense 140 0.846429 147 | Sicilian Defense 127 0.877953 148 | Reti Opening 97 0.819588 149 | Vienna Game 71 0.929577 150 | English Opening 61 0.868852 151 | Scandinavian Defense 51 0.862745 152 | ``` -------------------------------------------------------------------------------- /docs/src/docs/Examples/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Examples 3 | nav_order: 3 4 | layout: page 5 | --- 6 | 7 | # Examples -------------------------------------------------------------------------------- /docs/src/docs/Resources/Contributing.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Contributing 3 | parent: Resources 4 | layout: default 5 | nav_order: 1 6 | permalink: /docs/Resources/Contributing 7 | --- 8 | 9 | # Contributing 10 | {: .no_toc } 11 | 12 | * TOC 13 | {:toc} 14 | 15 | We welcome developers to contribute to this project! 16 | 17 | This guide will take you through our contribution guidelines and best practices. 18 | 19 | The Pyper repository is hosted on [GitHub](https://github.com/pyper-dev/pyper). 20 | We will assume familiarity with Git-based version control and making [pull requests](https://docs.github.com/get-started/exploring-projects-on-github/contributing-to-a-project). 21 | 22 | ## Typos 23 | 24 | All typo fixes are welcome. Feel free to simply open a pull request. 25 | 26 | ## Bugs and Features 27 | 28 | ### Issues 29 | 30 | We track all ongoing improvements through GitHub Issues. Start by opening a [new issue](https://github.com/pyper-dev/pyper/issues/new/choose) if one doesn't exist already. 31 | 32 | After you have created the issue, or you have found another unassigned issue you would like to work on, please: 33 | * Assign yourself to the issue if you are a collaborator 34 | * OR post a comment asking to be assigned to the issue 35 | 36 | ### Testing 37 | 38 | Assuming you have: 39 | 40 | * Been assigned to an issue 41 | * Forked and cloned a copy of the repo 42 | * Made your changes to the source code 43 | 44 | We'll want to make sure that tests are passing before pushing and merging these changes. 45 | If the changes you've made warrant writing additional tests, please consider doing so. 46 | 47 | To set up the testing environment, install the test dependencies (within a virtual environment): 48 | 49 | ```console 50 | $ pip install -r tests/requirements.txt 51 | ``` 52 | 53 | Test coverage is measured against Python 3.12 -- we are aiming for 100% code coverage at all times. 54 | Use `tox` to run all tests within a 3.12 environment and generate a coverage report: 55 | 56 | ```console 57 | $ tox -e 3.12 58 | ``` 59 | 60 | Please also make sure that tests pass successfully for all supported versions. 61 | You can do this without configuring additional Python virtual environments by using [Docker](https://docs.docker.com/): 62 | 63 | ```console 64 | $ cd tests 65 | $ docker-compose up --build --detach 66 | ``` 67 | 68 | You can verify that all tests have passed succesfully if each container exits with a status code of 0, or by inspecting the Docker logs. 69 | 70 | ### Documentation 71 | 72 | If relevant, please update the documentation appropriately. 
Documentation source files are found at `/docs/src`. These consist of markdown files, served with Jekyll on Github Pages, using the [just-the-docs](https://github.com/just-the-docs/just-the-docs) theme. 73 | 74 | If you would like to serve the documentation locally, you will need to: 75 | 76 | * Install [Ruby](https://www.ruby-lang.org/en/documentation/installation/) 77 | * Install [bundler](https://bundler.io/) (environment manager for Ruby) 78 | 79 | Install dependencies with: 80 | 81 | ```console 82 | $ cd docs 83 | $ bundle install 84 | ``` 85 | 86 | Then run: 87 | 88 | ```console 89 | $ bundle exec jekyll serve 90 | ``` 91 | 92 | This serves the documentation site locally at `http://localhost:4000`, where you can inspect your changes. 93 | 94 | 95 | -------------------------------------------------------------------------------- /docs/src/docs/Resources/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Resources 3 | nav_order: 5 4 | layout: page 5 | --- 6 | 7 | # Resources -------------------------------------------------------------------------------- /docs/src/docs/UserGuide/AdvancedConcepts.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Advanced Concepts 3 | parent: User Guide 4 | layout: default 5 | nav_order: 4 6 | permalink: /docs/UserGuide/AdvancedConcepts 7 | --- 8 | 9 | # Advanced Concepts 10 | {: .no_toc } 11 | 12 | * TOC 13 | {:toc} 14 | 15 | ## Threads vs Processes vs Async 16 | 17 | Whereas threading and asynchronous code are Python's way of achieving concurrency, multiprocessing is the answer for parallelism. 18 | 19 | Pyper supports all three modes of execution by coordinating different types of workers: 20 | 21 | * Synchronous tasks by default are handled by [threads](https://docs.python.org/3/library/threading.html) 22 | * Synchronous tasks set with `multiprocess=True` are handled by [processes](https://docs.python.org/3/library/multiprocessing.html) 23 | * Asynchronous tasks are handled by [asyncio Tasks](https://docs.python.org/3/library/asyncio-task.html) 24 | 25 | 26 | Concurrency and parallelism are powerful constructs that allow us to squeeze the best possible performance out of our code. 27 | To leverage these mechanisms optimally, however, we need to consider the type of work being done by each task; primarily, whether this work is [io-bound or cpu-bound](https://stackoverflow.com/questions/868568). 28 | 29 | 30 | ### IO-bound work 31 | 32 | An IO-bound task is one that can make progress off the CPU after releasing the [GIL](https://wiki.python.org/moin/GlobalInterpreterLock), by doing something that doesn't require computation. For example by: 33 | 34 | * Performing a sleep 35 | * Sending a network request 36 | * Reading from a database 37 | 38 | IO-bound tasks benefit from both concurrent and parallel execution. 39 | However, to avoid the overhead costs of creating processes, it is generally preferable to use either threading or async code. 40 | 41 | {: .info} 42 | Threads incur a higher overhead cost compared to async coroutines, but are suitable if the task prefers or requires a synchronous implementation 43 | 44 | Note that asynchronous functions need to `await` or `yield` something in order to benefit from concurrency. 
45 | Any long-running call in an async task which does not yield execution will prevent other tasks from making progress: 46 | 47 | ```python 48 | # Okay 49 | def slow_func(): 50 | time.sleep(5) 51 | 52 | # Okay 53 | async def slow_func(): 54 | await asyncio.sleep(5) 55 | 56 | # Bad -- cannot benefit from concurrency 57 | async def slow_func(): 58 | time.sleep(5) 59 | ``` 60 | 61 | ### CPU-bound work 62 | 63 | A CPU-bound function is one that hogs the CPU intensely, without releasing the GIL. This includes all 'heavy-computation' type operations like: 64 | 65 | * Crunching numbers 66 | * Parsing text data 67 | * Sorting and searching 68 | 69 | {: .warning} 70 | Executing CPU-bound tasks concurrently does not improve performance, as CPU-bound tasks do not make progress while not holding the GIL 71 | 72 | The correct way to optimize the performance of CPU-bound tasks is through parallel execution, using multiprocessing. 73 | 74 | ```python 75 | def long_computation(data: int): 76 | for i in range(1, 1_000_000): 77 | data *= i 78 | return data 79 | 80 | # Okay 81 | pipeline = task(long_computation, workers=10, multiprocess=True) 82 | 83 | # Bad -- cannot benefit from concurrency 84 | pipeline = task(long_computation, workers=10) 85 | ``` 86 | 87 | Note, however, that processes incur a very high overhead cost (performance cost in creation and memory cost in inter-process communication). Specific cases should be benchmarked to fine-tune the task parameters for your program / your machine. 88 | 89 | ### Summary 90 | 91 | | | Threading | Multiprocessing | Async | 92 | |:----------------------|:----------|:----------------|:--------| 93 | | Overhead costs | Moderate | High | Low | 94 | | Synchronous execution | ✅ | ✅ | ❌ | 95 | | IO-bound work | ⬆️ | ⬆️ | ⬆️ | 96 | | CPU-bound work | ❌ | ⬆️ | ❌ | 97 | 98 | {: .text-green-200} 99 | **Key Considerations:** 100 | 101 | * If a task is doing expensive CPU-bound work, define it synchronously and set `multiprocess=True` 102 | * If a task is doing expensive IO-bound work, consider implementing it asynchronously, or use threads 103 | * Do _not_ put expensive, blocking work in an async task, as this clogs up the async event loop 104 | 105 | ## Functional Design 106 | 107 | ### Logical Separation 108 | 109 | Writing clean code is partly about defining functions with single, clear responsibilities. 110 | 111 | In Pyper, it is especially important to separate out different types of work into different tasks if we want to optimize their performance. For example, consider a task which performs an IO-bound network request along with a CPU-bound function to parse the data. 112 | 113 | ```python 114 | # Bad -- functions not separated 115 | def get_data(endpoint: str): 116 | # IO-bound work 117 | r = requests.get(endpoint) 118 | data = r.json() 119 | 120 | # CPU-bound work 121 | for item in data["results"]: 122 | yield process_data(item) 123 | 124 | pipeline = task(get_data, branch=True, workers=20) 125 | ``` 126 | 127 | Whilst it makes sense to handle the network request concurrently, the call to `process_data` within the same task requires holding onto the GIL and will harm concurrency. 128 | Instead, `process_data` should be implemented as a separate function: 129 | 130 | ```python 131 | def get_data(endpoint: str): 132 | # IO-bound work 133 | r = requests.get(endpoint) 134 | data = r.json() 135 | return data["results"] 136 | 137 | def process_data(data): 138 | # CPU-bound work 139 | return ... 
140 | 141 | pipeline = ( 142 | task(get_data, branch=True, workers=20) 143 | | task(process_data, workers=10, multiprocess=True) 144 | ) 145 | ``` 146 | 147 | ### Resource Management 148 | 149 | It is often useful to share resources between different tasks, like http sessions or database connections. 150 | The correct pattern is generally to define functions which take these resources as arguments. 151 | 152 | ```python 153 | from aiohttp import ClientSession 154 | from pyper import task 155 | 156 | async def list_user_ids(session: ClientSession) -> list[int]: 157 | async with session.get("/users") as r: 158 | return await r.json() 159 | 160 | async def fetch_user_data(user_id: int, session: ClientSession) -> dict: 161 | async with session.get(f"/users/{user_id}") as r: 162 | return await r.json() 163 | ``` 164 | 165 | When defining a pipeline, these additional arguments are plugged into tasks using `task.bind`. For example: 166 | 167 | ```python 168 | async def main(): 169 | async with ClientSession("http://localhost:8000/api") as session: 170 | user_data_pipeline = ( 171 | task(list_user_ids, branch=True) 172 | | task(fetch_user_data, workers=10, bind=task.bind(session=session)) 173 | ) 174 | async for output in user_data_pipeline(session): 175 | print(output) 176 | ``` 177 | 178 | This is preferable to defining custom set-up and tear-down mechanisms, because it relies on Python's intrinsic mechanism for set-up and tear-down: using `with` syntax. 179 | However, this requires us to define and run the pipeline within the resource's context, which means it can't be used modularly in other data flows. 180 | 181 | If we want `user_data_pipeline` to be reusable, a simple solution is to create a factory function or factory class which uses the session resource internally. For example: 182 | 183 | ```python 184 | from aiohttp import ClientSession 185 | from pyper import task, AsyncPipeline 186 | 187 | def user_data_pipeline(session: ClientSession) -> AsyncPipeline: 188 | 189 | async def list_user_ids() -> list[int]: 190 | async with session.get("/users") as r: 191 | return await r.json() 192 | 193 | async def fetch_user_data(user_id: int) -> dict: 194 | async with session.get(f"/users/{user_id}") as r: 195 | return await r.json() 196 | 197 | return ( 198 | task(list_user_ids, branch=True) 199 | | task(fetch_user_data, workers=10) 200 | ) 201 | ``` 202 | 203 | Now `user_data_pipeline` constructs a self-contained data-flow, which can be reused without having to define its internal pipeline everytime. 204 | 205 | ```python 206 | async def main(): 207 | async with ClientSession("http://localhost:8000/api") as session: 208 | run = ( 209 | user_data_pipeline(session) 210 | | task(write_to_file, join=True) 211 | > copy_to_db 212 | ) 213 | await run() 214 | ``` 215 | 216 | ## Generators 217 | 218 | ### Usage 219 | 220 | Generators in Python are a mechanism for _lazy execution_, whereby results in an iterable are returned one by one (via underlying calls to `__next__`) instead of within a data structure, like a `list`, which requires all of its elements to be allocated in memory. 221 | 222 | Using generators is an indispensible approach for processing large volumes of data in a memory-friendly way. 
We can define generator functions by using the `yield` keyword within a normal `def` block: 223 | 224 | ```python 225 | import typing 226 | from pyper import task 227 | 228 | # Okay 229 | def generate_values_lazily() -> typing.Iterable[dict]: 230 | for i in range(10_000_000): 231 | yield {"data": i} 232 | 233 | # Bad -- this creates 10 million values in memory 234 | # Within a pipeline, subsequent tasks also cannot start executing until the entire list is created 235 | def create_values_in_list() -> typing.List[dict]: 236 | return [{"data": i} for i in range(10_000_000)] 237 | ``` 238 | 239 | {: .info} 240 | Generator `functions` return immediately. They return `generator` objects, which are iterable 241 | 242 | Using the `branch` task parameter in Pyper allows generators to generate multiple outputs, which get picked up by subsequent tasks as soon as the data is available. 243 | 244 | Using a generator function without `branch=True` is also possible; this just means the task submits `generator` objects as output, instead of each generated value. 245 | 246 | ```python 247 | from pyper import task 248 | 249 | def get_data(): 250 | yield 1 251 | yield 2 252 | yield 3 253 | 254 | if __name__ == "__main__": 255 | branched_pipeline = task(get_data, branch=True) 256 | for output in branched_pipeline(): 257 | print(output) 258 | #> 1 259 | #> 2 260 | #> 3 261 | 262 | non_branched_pipeline = task(get_data) 263 | for output in non_branched_pipeline(): 264 | print(output) 265 | #> 266 | ``` 267 | 268 | ### Limitations 269 | 270 | Implementing generator objects in a pipeline can also come with some caveats that are important to keep in mind. 271 | 272 | {: .text-green-200} 273 | **Synchronous Generators with Asynchronous Code** 274 | 275 | Synchronous generators in an `AsyncPipeline` do not benefit from threading or multiprocessing. 276 | 277 | This is because, in order to be scheduled in an async event loop, each synchronous task is run by a thread/process, and then wrapped in an `asyncio.Task`. 278 | 279 | Generator functions, which return _immediately_, do most of their work outside of the thread/process and this synchronous work will therefore not benefit from multiple workers in an async context. 280 | 281 | The alternatives are to: 282 | 283 | 1. Refactor your functions. If you find that one function is repeating a computation multiple times, it may be possible to [separate out responsibilities](#logical-separation) into separate functions 284 | 285 | 2. Use a synchronous generator anyway (if its performance is unlikely to be a bottleneck) 286 | 287 | 3. Use a normal synchronous function, and return an iterable data structure (if memory is unlikely to be a bottleneck) 288 | 289 | 4. Use an async generator (if an async implementation of the function is appropriate) 290 | 291 | {: .text-green-200} 292 | **Multiprocessing and Pickling** 293 | 294 | In Python, anything that goes into and comes out of a process must be picklable. 295 | 296 | On Windows, generator objects cannot be pickled, so cannot be passed as inputs and outputs when multiprocessing. 297 | 298 | Note that, for example, using `branch=True` to pass individual outputs from a generator into a multiprocessed task is still fine, because the task input would not be a `generator` object. 
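As an illustrative sketch (with module-level, picklable functions), each multiprocessed worker below receives a plain `int` generated by the producer, never the `generator` object itself:

```python
from pyper import task

def generate_numbers(limit: int):
    for i in range(limit):
        yield i

def heavy_computation(data: int):
    # Receives an individual (picklable) int as input
    total = 0
    for i in range(1_000_000):
        total += i * data
    return total

if __name__ == "__main__":
    pipeline = (
        task(generate_numbers, branch=True)
        | task(heavy_computation, workers=4, multiprocess=True)
    )
    for output in pipeline(10):
        print(output)
```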
299 | -------------------------------------------------------------------------------- /docs/src/docs/UserGuide/BasicConcepts.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Basic Concepts 3 | parent: User Guide 4 | layout: page 5 | nav_order: 1 6 | permalink: /docs/UserGuide/BasicConcepts 7 | --- 8 | 9 | # Basic Concepts 10 | {: .no_toc } 11 | 12 | * TOC 13 | {:toc} 14 | 15 | ## Pipeline Design 16 | 17 | Pyper follows the [functional paradigm](https://docs.python.org/3/howto/functional.html), which by design maximizes the modularity and composability of data flows. This pattern takes effect in the usage API on two levels: 18 | 19 | * Python functions are the building blocks used to create `Pipeline` objects 20 | * `Pipeline` objects can themselves be thought of as functions 21 | 22 | For example, to create a simple pipeline, we can wrap a function in the `task` class: 23 | 24 | ```python 25 | from pyper import task 26 | 27 | def len_strings(x: str, y: str) -> int: 28 | return len(x) + len(y) 29 | 30 | pipeline = task(len_strings) 31 | ``` 32 | 33 | This defines `pipeline` as a pipeline consisting of a single task. It takes the parameters `(x: str, y: str)` and generates `int` outputs from an output queue: 34 | 35 | Diagram 36 | 37 | **Key Concepts** 38 | 39 | * A Pipeline is a representation of data-flow _(Pyper API)_ 40 | * A **task** represents a single functional operation within a pipeline _(user defined)_ 41 | * Under the hood, tasks pass data along via workers and queues _(Pyper internal)_ 42 | 43 | Pipelines are composable components; to create a pipeline which runs multiple tasks, we can 'pipe' pipelines together using the `|` operator: 44 | 45 | ```python 46 | import time 47 | from pyper import task 48 | 49 | def len_strings(x: str, y: str) -> int: 50 | return len(x) + len(y) 51 | 52 | def sleep(data: int) -> int: 53 | time.sleep(data) 54 | return data 55 | 56 | def calculate(data: int) -> bool: 57 | time.sleep(data) 58 | return data % 2 == 0 59 | 60 | pipeline = ( 61 | task(len_strings) 62 | | task(sleep, workers=3) 63 | | task(calculate, workers=2) 64 | ) 65 | ``` 66 | 67 | This defines `pipeline` as a series of tasks, taking the parameters `(x: str, y: str)` and generating `bool` outputs: 68 | 69 | Diagram 70 | 71 | We can think of this pipeline as one function. 72 | 73 | The internal behaviour handles, intuitively, taking the outputs of each task and passing them as inputs to the next, where tasks communicate with each other via queue-based data structures. Running a task with multiple workers is the key mechanism underpinning how concurrency and parallelism are achieved. 74 | 75 | ## Next Steps 76 | 77 | In the next few sections, we'll go over some more details on pipeline usage. 
Skip ahead to see: 78 | 79 | * [More on creating pipelines](CreatingPipelines) 80 | * [More on composing pipelines](ComposingPipelines) 81 | * [Advanced concepts](AdvancedConcepts) -------------------------------------------------------------------------------- /docs/src/docs/UserGuide/ComposingPipelines.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Composing Pipelines 3 | parent: User Guide 4 | layout: page 5 | nav_order: 3 6 | permalink: /docs/UserGuide/ComposingPipelines 7 | --- 8 | 9 | # Composing Pipelines 10 | {: .no_toc } 11 | 12 | * TOC 13 | {:toc} 14 | 15 | ## Piping and the `|` Operator 16 | 17 | The `|` operator (inspired by UNIX syntax) is used to pipe one pipeline into another. This is syntactic sugar for the `Pipeline.pipe` method. 18 | 19 | ```python 20 | from pyper import task, Pipeline 21 | 22 | p1 = task(lambda x: x + 1) 23 | p2 = task(lambda x: 2 * x) 24 | p3 = task(lambda x: x - 1) 25 | 26 | new_pipeline = p1 | p2 | p3 27 | assert isinstance(new_pipeline, Pipeline) 28 | # OR 29 | new_pipeline = p1.pipe(p2).pipe(p3) 30 | assert isinstance(new_pipeline, Pipeline) 31 | ``` 32 | 33 | This represents defining a new function that: 34 | 35 | 1. takes the inputs of the first task 36 | 2. takes the outputs of each task and passes them as the inputs of the next task 37 | 3. finally, generates each output from the last task 38 | 39 | ```python 40 | if __name__ == "__main__": 41 | for output in new_pipeline(4): 42 | print(output) 43 | #> 9 44 | ``` 45 | 46 | ## Consumer Functions and the `>` Operator 47 | 48 | It is often useful to define reusable functions that process the results of a pipeline, which we'll call a 'consumer'. For example: 49 | 50 | ```python 51 | import json 52 | from typing import Dict, Iterable 53 | 54 | from pyper import task 55 | 56 | def step1(limit: int): 57 | for i in range(limit): 58 | yield {"data": i} 59 | 60 | def step2(data: Dict): 61 | return data | {"hello": "world"} 62 | 63 | class JsonFileWriter: 64 | def __init__(self, filepath): 65 | self.filepath = filepath 66 | 67 | def __call__(self, data: Iterable[Dict]): 68 | data_list = list(data) 69 | with open(self.filepath, 'w', encoding='utf-8') as f: 70 | json.dump(data_list, f, indent=4) 71 | 72 | if __name__ == "__main__": 73 | pipeline = task(step1, branch=True) | task(step2) # The pipeline 74 | writer = JsonFileWriter("data.json") # A consumer 75 | writer(pipeline(limit=10)) # Run 76 | ``` 77 | 78 | The `>` operator (again inspired by UNIX syntax) is used to pipe a `Pipeline` into a consumer function (any callable that takes an `Iterable` of inputs), returning a function that handles the 'run' operation. This is syntactic sugar for the `Pipeline.consume` method. 79 | ```python 80 | if __name__ == "__main__": 81 | run = ( 82 | task(step1, branch=True) 83 | | task(step2) 84 | > JsonFileWriter("data.json") 85 | ) 86 | run(limit=10) 87 | # OR 88 | run = ( 89 | task(step1, branch=True).pipe( 90 | task(step2)).consume( 91 | JsonFileWriter("data.json")) 92 | ) 93 | run(limit=10) 94 | ``` 95 | 96 | {: .info} 97 | Pyper comes with fantastic intellisense support which understands these operators and preserves parameter/return type hints from user-defined functions 98 | 99 | Type Hint 100 | 101 | Type Hint 102 | 103 | Type Hint 104 | 105 | ## Nested Pipelines 106 | 107 | Just like functions, we can also call pipelines from other pipelines, which facilitates defining data flows of arbitrary complexity.
108 | 109 | For example, let's say we have a theoretical pipeline which takes `(source: str)` as input, downloads some files from a source, and generates `str` outputs representing filepaths. 110 | 111 | ```python 112 | download_files_from_source = ( 113 | task(list_files, branch=True) # Return a list of file info 114 | | task(download_file, workers=20) # Return a filepath 115 | | task(decrypt_file, workers=5, multiprocess=True) # Return a filepath 116 | ) 117 | ``` 118 | 119 | This is a function which generates multiple outputs per source. But we may wish to process _batches of filepaths_ downstream, after waiting for a single source to finish downloading. This means a piping approach, where we pass each _individual_ filepath along to subsequent tasks, won't work. 120 | 121 | Instead, we can define `download_files_from_source` as a task within an outer pipeline, which is as simple as wrapping it in `task` like we would with any other function. 122 | 123 | ```python 124 | download_and_merge_files = ( 125 | task(get_sources, branch=True) # Return a list of sources 126 | | task(download_files_from_source) # Return a batch of filepaths (as a generator) 127 | | task(sync_files, workers=5) # Do something with each batch 128 | ) 129 | ``` 130 | 131 | * `download_files_from_source` takes a source as input, and returns a generator of filepaths (note that we are _not_ setting `branch=True`; a batch of filepaths is being passed along per source) 132 | * `sync_files` takes each batch of filepaths as input, and works on them concurrently 133 | 134 | ## Asynchronous Code 135 | 136 | Recall that an `AsyncPipeline` is created from an asynchronous function: 137 | 138 | ```python 139 | from pyper import task, AsyncPipeline 140 | 141 | async def func(): 142 | return 1 143 | 144 | assert isinstance(task(func), AsyncPipeline) 145 | ``` 146 | 147 | When piping pipelines together, the following rules apply: 148 | 149 | * `Pipeline` + `Pipeline` = `Pipeline` 150 | * `Pipeline` + `AsyncPipeline` = `AsyncPipeline` 151 | * `AsyncPipeline` + `Pipeline` = `AsyncPipeline` 152 | * `AsyncPipeline` + `AsyncPipeline` = `AsyncPipeline` 153 | 154 | In other words: 155 | 156 | {: .info} 157 | A pipeline that contains _at least one_ asynchronous task becomes asynchronous 158 | 159 | This reflects a (sometimes awkward) trait of Python, which is that `async` and `await` syntax bleeds everywhere -- as soon as one function is defined asynchronously, you often find that many other parts of the program need to become asynchronous. Hence, the sync vs async decision is usually one made at the start of designing an application. 160 | 161 | The Pyper framework eases this somewhat by unifying synchronous and asynchronous execution under the hood. This allows the user to define functions in the way that makes the most sense, relying on Pyper to understand both synchronous and asynchronous tasks within an `AsyncPipeline`. 162 | 163 | Consumer functions will, however, need to adapt to asynchronous output.
For example: 164 | 165 | ```python 166 | import asyncio 167 | import json 168 | from typing import AsyncIterable, Dict 169 | 170 | from pyper import task 171 | 172 | async def step1(limit: int): 173 | for i in range(limit): 174 | yield {"data": i} 175 | 176 | def step2(data: Dict): 177 | return data | {"hello": "world"} 178 | 179 | class AsyncJsonFileWriter: 180 | def __init__(self, filepath): 181 | self.filepath = filepath 182 | 183 | async def __call__(self, data: AsyncIterable[Dict]): 184 | data_list = [row async for row in data] 185 | with open(self.filepath, 'w', encoding='utf-8') as f: 186 | json.dump(data_list, f, indent=4) 187 | 188 | async def main(): 189 | run = ( 190 | task(step1, branch=True) 191 | | task(step2) 192 | > AsyncJsonFileWriter("data.json") 193 | ) 194 | await run(limit=10) 195 | 196 | if __name__ == "__main__": 197 | asyncio.run(main()) 198 | ``` -------------------------------------------------------------------------------- /docs/src/docs/UserGuide/CreatingPipelines.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Creating Pipelines 3 | parent: User Guide 4 | layout: default 5 | nav_order: 2 6 | permalink: /docs/UserGuide/CreatingPipelines 7 | --- 8 | 9 | # Creating Pipelines 10 | {: .no_toc } 11 | 12 | * TOC 13 | {:toc} 14 | 15 | ## The `task` Decorator 16 | 17 | Pyper's `task` decorator is the means by which we instantiate pipelines and control their behaviour: 18 | 19 | ```python 20 | from pyper import task, Pipeline 21 | 22 | def func(x: int): 23 | return x + 1 24 | 25 | pipeline = task(func) 26 | 27 | assert isinstance(pipeline, Pipeline) 28 | ``` 29 | 30 | This creates a `Pipeline` object consisting of one 'task' (one step of data transformation). 31 | 32 | In addition to functions, anything `callable` in Python can be wrapped in `task` in the same way: 33 | 34 | ```python 35 | from pyper import task 36 | 37 | class Doubler: 38 | def __call__(self, x: int): 39 | return 2 * x 40 | 41 | pipeline1 = task(Doubler()) 42 | pipeline2 = task(lambda x: x - 1) 43 | pipeline3 = task(range) 44 | ``` 45 | 46 | {: .info} 47 | The internal behaviour of a pipeline (e.g number of workers) is controlled by the different parameters for `task`. Refer to the [API Reference](../ApiReference/task) 48 | 49 | ## Pipeline Usage 50 | 51 | Recall that a `Pipeline` is itself essentially a function. Pipelines return a [Generator](https://wiki.python.org/moin/Generators) object (Python's mechanism for lazily iterating through data). 
52 | 53 | ```python 54 | from pyper import task 55 | 56 | def func(x: int): 57 | return x + 1 58 | 59 | if __name__ == "__main__": 60 | pipeline = task(func) 61 | for output in pipeline(x=0): 62 | print(output) 63 | #> 1 64 | ``` 65 | 66 | {: .info} 67 | A Pipeline always takes the input of its first task, and yields each output from its last task 68 | 69 | A pipeline that generates _multiple_ outputs can be created using the `branch` parameter: 70 | 71 | ```python 72 | from pyper import task 73 | 74 | def func(x: int): 75 | yield x + 1 76 | yield x + 2 77 | yield x + 3 78 | 79 | if __name__ == "__main__": 80 | pipeline = task(func, branch=True) 81 | for output in pipeline(x=0): 82 | print(output) 83 | #> 1 84 | #> 2 85 | #> 3 86 | ``` 87 | 88 | ## Asynchronous Code 89 | 90 | Asynchronous functions/callables are used to create `AsyncPipeline` objects, which behave in an intuitively analogous way to `Pipeline`: 91 | 92 | ```python 93 | import asyncio 94 | from pyper import task 95 | 96 | async def func(x: int): 97 | return x + 1 98 | 99 | async def main(): 100 | pipeline = task(func) 101 | async for output in pipeline(x=0): 102 | print(output) 103 | #> 1 104 | 105 | if __name__ == "__main__": 106 | asyncio.run(main()) 107 | ``` 108 | 109 | Note that `AsyncPipeline` objects return an `AsyncGenerator` which is iterated over with `async for` syntax. -------------------------------------------------------------------------------- /docs/src/docs/UserGuide/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: User Guide 3 | nav_order: 2 4 | layout: page 5 | --- 6 | 7 | # User Guide -------------------------------------------------------------------------------- /docs/src/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Home 3 | description: "Concurrent Python made simple" 4 | layout: home 5 | nav_order: 1 6 | permalink: / 7 | --- 8 | 9 |

10 | Pyper 13 | Concurrent Python made simple 17 | Badges: Test, Coverage, Package version, Supported Python versions 29 |

30 | 31 | --- 32 | 33 | ## Introduction 34 | 35 | Concurrency and parallelism are really hard to get right. 36 | 37 | The Python package space has great support for achieving 38 | 39 | * concurrency in web applications (Django, Flask, FastAPI, etc.) 40 | * parallelism for distributed computing (Ray, Dask, etc.) 41 | 42 | However, the solutions for _general-purpose_ data processing are less established. 43 | 44 | {: .text-green-200} 45 | **Pyper aims to offer a flexible framework for concurrent and parallel data-processing in Python** 46 | 47 | It is designed with the following goals in mind: 48 | 49 | * **Unified API**: Combine threads, processes and async code using one intuitive pattern 50 | * **Functional Paradigm**: Data pipelines compose together straightforwardly as functions 51 | * **Lazy Execution**: Built from the ground up to support generators, and provides mechanisms for fine-grained memory control 52 | * **Error Handling**: Data flows fail fast, even in long-running threads, and propagate their errors cleanly 53 | * **Complex Data Flows**: Data pipelines support branching/joining data flows, as well as sharing contexts/resources between tasks 54 | 55 | In addition, Pyper enables developers to write code in an extensible way that can be integrated naturally with other frameworks like those aforementioned. 56 | 57 | ## Installation 58 | 59 | Install the latest version using `pip`: 60 | 61 | ```console 62 | $ pip install python-pyper 63 | ``` 64 | 65 | ## Where Next? 66 | 67 | * Check out the 📖 **[User Guide](./docs/UserGuide/BasicConcepts)** to get started with Pyper 68 | 69 | * See some 🎯 **[Examples](./docs/Examples/)** of possible use cases 70 | -------------------------------------------------------------------------------- /examples/ChessDataAnalysis/main.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from dateutil.relativedelta import relativedelta 3 | import io 4 | import typing 5 | 6 | import chess.pgn 7 | import pandas as pd 8 | from pyper import task 9 | import requests 10 | 11 | 12 | def generate_urls_by_month(player: str, num_months: int): 13 | """Define a series of pgn game resource urls for a player, for num_months recent months.""" 14 | today = datetime.date.today() 15 | for i in range(num_months): 16 | d = today - relativedelta(months=i) 17 | yield f"https://api.chess.com/pub/player/{player}/games/{d.year}/{d.month:02}/pgn" 18 | 19 | 20 | def fetch_text_data(url: str, session: requests.Session): 21 | """Fetch text data from a url.""" 22 | r = session.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}) 23 | return r.text 24 | 25 | 26 | def _clean_opening_name(eco_url: str): 27 | """Get a rough opening name from the chess.com ECO url.""" 28 | name = eco_url.removeprefix("https://www.chess.com/openings/") 29 | return " ".join(name.split("-")[:2]) 30 | 31 | 32 | def read_game_data(pgn_text: str, player: str): 33 | """Read PGN data and generate game details (each PGN contains details for multiple games).""" 34 | pgn = io.StringIO(pgn_text) 35 | while (headers := chess.pgn.read_headers(pgn)) is not None: 36 | color = 'W' if headers["White"].lower() == player else 'B' 37 | 38 | if headers["Result"] == "1/2-1/2": 39 | score = 0.5 40 | elif (color == 'W' and headers["Result"] == "1-0") or (color == 'B' and headers["Result"] == "0-1"): 41 | score = 1 42 | else: 43 | score = 0 44 | 45 | yield { 46 | "color": color, 47 | "score": score, 48 | "opening": 
_clean_opening_name(headers["ECOUrl"]) 49 | } 50 | 51 | 52 | def build_df(data: typing.Iterable[dict]) -> pd.DataFrame: 53 | df = pd.DataFrame(data) 54 | df = df[df["color"] == 'W'] 55 | df = df.groupby("opening").agg(total_games=("score", "count"), average_score=("score", "mean")) 56 | df = df.sort_values(by="total_games", ascending=False) 57 | return df 58 | 59 | 60 | def main(): 61 | player = "hikaru" 62 | num_months = 6 # Keep this number low, or add sleeps for etiquette 63 | 64 | with requests.Session() as session: 65 | run = ( 66 | task(generate_urls_by_month, branch=True) 67 | | task( 68 | fetch_text_data, 69 | workers=3, 70 | bind=task.bind(session=session)) 71 | | task( 72 | read_game_data, 73 | branch=True, 74 | bind=task.bind(player=player)) 75 | > build_df 76 | ) 77 | df = run(player, num_months) 78 | print(df.head(10)) 79 | 80 | 81 | if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /examples/ChessDataAnalysis/requirements.txt: -------------------------------------------------------------------------------- 1 | chess>=1.0 2 | pandas>=2.0 3 | python-pyper 4 | requests>=2.0 -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "setuptools-scm", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.setuptools_scm] 6 | 7 | [project] 8 | name = "python-pyper" 9 | dynamic = ["version"] 10 | description = "Concurrent Python made simple" 11 | readme = "README.md" 12 | requires-python = ">=3.8" 13 | authors = [ 14 | { name = "Richard Zhu", email = "richard.zhu2@gmail.com" }, 15 | ] 16 | classifiers = [ 17 | "Intended Audience :: Developers", 18 | "Operating System :: OS Independent", 19 | "Programming Language :: Python :: 3", 20 | "Programming Language :: Python", 21 | "Topic :: Software Development :: Libraries", 22 | "Topic :: Software Development", 23 | "Framework :: AsyncIO", 24 | "License :: OSI Approved :: MIT License", 25 | "Programming Language :: Python :: 3 :: Only", 26 | "Programming Language :: Python :: 3.8", 27 | "Programming Language :: Python :: 3.9", 28 | "Programming Language :: Python :: 3.10", 29 | "Programming Language :: Python :: 3.11", 30 | "Programming Language :: Python :: 3.12", 31 | "Programming Language :: Python :: 3.13" 32 | ] 33 | dependencies = [ 34 | # Typing support dependency only for Python versions < 3.10 35 | "typing_extensions; python_version<'3.10'" 36 | ] 37 | 38 | [project.urls] 39 | Homepage = "https://pyper-dev.github.io/pyper/" 40 | Documentation = "https://pyper-dev.github.io/pyper/" 41 | Repository = "https://github.com/pyper-dev/pyper" 42 | Issues = "https://github.com/pyper-dev/pyper/issues" 43 | 44 | [tool.pytest.ini_options] 45 | asyncio_default_fixture_loop_scope = "function" -------------------------------------------------------------------------------- /src/pyper/__init__.py: -------------------------------------------------------------------------------- 1 | from ._core.decorators import task 2 | from ._core.pipeline import Pipeline, AsyncPipeline 3 | -------------------------------------------------------------------------------- /src/pyper/_core/async_helper/output.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | from concurrent.futures import ProcessPoolExecutor, 
ThreadPoolExecutor 5 | import sys 6 | from typing import TYPE_CHECKING 7 | 8 | from .stage import AsyncProducer, AsyncProducerConsumer 9 | from ..util.asynchronize import asynchronize 10 | from ..util.sentinel import StopSentinel 11 | 12 | if sys.version_info < (3, 11): # pragma: no cover 13 | from ..util.task_group import TaskGroup, ExceptionGroup 14 | else: 15 | from asyncio import TaskGroup 16 | 17 | if TYPE_CHECKING: 18 | from ..pipeline import AsyncPipeline 19 | 20 | 21 | class AsyncPipelineOutput: 22 | def __init__(self, pipeline: AsyncPipeline): 23 | self.pipeline = pipeline 24 | 25 | def _get_q_out(self, tg: TaskGroup, tp: ThreadPoolExecutor, pp: ProcessPoolExecutor, *args, **kwargs) -> asyncio.Queue: 26 | """Feed forward each stage to the next, returning the output queue of the final stage.""" 27 | q_out = None 28 | for task, next_task in zip(self.pipeline.tasks, self.pipeline.tasks[1:] + [None]): 29 | task = asynchronize(task, tp=tp, pp=pp) 30 | if q_out is None: 31 | stage = AsyncProducer(task=task, next_task=next_task) 32 | stage.start(tg, *args, **kwargs) 33 | else: 34 | stage = AsyncProducerConsumer(q_in=q_out, task=task, next_task=next_task) 35 | stage.start(tg) 36 | q_out = stage.q_out 37 | 38 | return q_out 39 | 40 | async def __call__(self, *args, **kwargs): 41 | """Iterate through the pipeline, taking the inputs to the first task, and yielding each output from the last task. 42 | 43 | Unify async, threaded, and multiprocessed work by: 44 | 1. using TaskGroup to execute asynchronous tasks 45 | 2. using ThreadPoolExecutor to execute threaded synchronous tasks 46 | 3. using ProcessPoolExecutor to execute multiprocessed synchronous tasks 47 | """ 48 | try: 49 | async with TaskGroup() as tg: 50 | with ThreadPoolExecutor() as tp, ProcessPoolExecutor() as pp: 51 | q_out = self._get_q_out(tg, tp, pp, *args, **kwargs) 52 | while (data := await q_out.get()) is not StopSentinel: 53 | yield data 54 | except ExceptionGroup as eg: 55 | raise eg.exceptions[0] from None 56 | -------------------------------------------------------------------------------- /src/pyper/_core/async_helper/queue_io.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | from collections.abc import AsyncIterable, Iterable 5 | from typing import TYPE_CHECKING 6 | 7 | from ..util.sentinel import StopSentinel 8 | 9 | if TYPE_CHECKING: 10 | import asyncio 11 | from ..task import Task 12 | 13 | 14 | def AsyncDequeueFactory(q_in: asyncio.Queue, task: Task): 15 | return _JoiningAsyncDequeue(q_in=q_in) if task.join \ 16 | else _SingleAsyncDequeue(q_in=q_in) 17 | 18 | 19 | class _AsyncDequeue: 20 | """Pulls data from an input queue.""" 21 | def __init__(self, q_in: asyncio.Queue): 22 | self.q_in = q_in 23 | 24 | async def _input_stream(self): 25 | while (data := await self.q_in.get()) is not StopSentinel: 26 | yield data 27 | 28 | def __call__(self): 29 | raise NotImplementedError 30 | 31 | 32 | class _SingleAsyncDequeue(_AsyncDequeue): 33 | async def __call__(self): 34 | async for data in self._input_stream(): 35 | yield data 36 | 37 | 38 | class _JoiningAsyncDequeue(_AsyncDequeue): 39 | async def __call__(self): 40 | yield self._input_stream() 41 | 42 | 43 | def AsyncEnqueueFactory(q_out: asyncio.Queue, task: Task): 44 | return _BranchingAsyncEnqueue(q_out=q_out, task=task) if task.branch \ 45 | else _SingleAsyncEnqueue(q_out=q_out, task=task) 46 | 47 | 48 | class _AsyncEnqueue: 49 | """Puts output from a task onto an 
output queue.""" 50 | def __init__(self, q_out: asyncio.Queue, task: Task): 51 | self.q_out = q_out 52 | self.task = task 53 | 54 | async def __call__(self, *args, **kwargs): 55 | raise NotImplementedError 56 | 57 | 58 | class _SingleAsyncEnqueue(_AsyncEnqueue): 59 | async def __call__(self, *args, **kwargs): 60 | await self.q_out.put(await self.task.func(*args, **kwargs)) 61 | 62 | 63 | class _BranchingAsyncEnqueue(_AsyncEnqueue): 64 | async def __call__(self, *args, **kwargs): 65 | result = self.task.func(*args, **kwargs) 66 | if isinstance(result, AsyncIterable): 67 | async for output in result: 68 | await self.q_out.put(output) 69 | await asyncio.sleep(0) 70 | elif isinstance(result := await result, Iterable): 71 | for output in result: 72 | await self.q_out.put(output) 73 | await asyncio.sleep(0) 74 | else: 75 | raise TypeError(f"got object of type {type(result)} from branching task {self.task.func} which could not be iterated over" 76 | " (the task should be a generator, or return an iterable)") 77 | -------------------------------------------------------------------------------- /src/pyper/_core/async_helper/stage.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | import sys 5 | from typing import TYPE_CHECKING 6 | 7 | from .queue_io import AsyncDequeueFactory, AsyncEnqueueFactory 8 | from ..util.sentinel import StopSentinel 9 | 10 | if sys.version_info < (3, 11): # pragma: no cover 11 | from ..util.task_group import TaskGroup 12 | else: 13 | from asyncio import TaskGroup 14 | 15 | if TYPE_CHECKING: 16 | from ..task import Task 17 | 18 | 19 | class AsyncProducer: 20 | def __init__(self, task: Task, next_task: Task): 21 | if task.workers > 1: 22 | raise RuntimeError(f"The first task in a pipeline ({task.func}) cannot have more than 1 worker") 23 | if task.join: 24 | raise RuntimeError(f"The first task in a pipeline ({task.func}) cannot join previous results") 25 | self.task = task 26 | self.q_out = asyncio.Queue(maxsize=task.throttle) 27 | 28 | self._n_consumers = 1 if next_task is None else next_task.workers 29 | self._enqueue = AsyncEnqueueFactory(self.q_out, self.task) 30 | 31 | async def _worker(self, *args, **kwargs): 32 | await self._enqueue(*args, **kwargs) 33 | 34 | for _ in range(self._n_consumers): 35 | await self.q_out.put(StopSentinel) 36 | 37 | def start(self, tg: TaskGroup, /, *args, **kwargs): 38 | tg.create_task(self._worker(*args, **kwargs)) 39 | 40 | 41 | class AsyncProducerConsumer: 42 | def __init__(self, q_in: asyncio.Queue, task: Task, next_task: Task): 43 | self.q_out = asyncio.Queue(maxsize=task.throttle) 44 | 45 | self._n_workers = task.workers 46 | self._n_consumers = 1 if next_task is None else next_task.workers 47 | self._dequeue = AsyncDequeueFactory(q_in, task) 48 | self._enqueue = AsyncEnqueueFactory(self.q_out, task) 49 | self._workers_done = 0 50 | 51 | async def _worker(self): 52 | async for output in self._dequeue(): 53 | await self._enqueue(output) 54 | 55 | self._workers_done += 1 56 | if self._workers_done == self._n_workers: 57 | for _ in range(self._n_consumers): 58 | await self.q_out.put(StopSentinel) 59 | 60 | def start(self, tg: TaskGroup, /): 61 | for _ in range(self._n_workers): 62 | tg.create_task(self._worker()) 63 | -------------------------------------------------------------------------------- /src/pyper/_core/decorators.py: -------------------------------------------------------------------------------- 1 | from __future__ import 
annotations 2 | 3 | import functools 4 | import sys 5 | import typing as t 6 | 7 | from .pipeline import AsyncPipeline, Pipeline 8 | from .task import Task 9 | 10 | if sys.version_info < (3, 10): # pragma: no cover 11 | from typing_extensions import ParamSpec 12 | else: 13 | from typing import ParamSpec 14 | 15 | 16 | _P = ParamSpec('P') 17 | _R = t.TypeVar('R') 18 | _Default = t.TypeVar('T', bound=t.NoReturn) # Matches to no type hints 19 | ArgsKwargs: t.TypeAlias = t.Optional[t.Tuple[t.Tuple[t.Any], t.Dict[str, t.Any]]] 20 | 21 | 22 | class task: 23 | """Decorator class to initialize a `Pipeline` consisting of one task. 24 | 25 | Args: 26 | func (callable): A positional-only param defining the task function (can be omitted when using `@task`) 27 | branch (bool): Allows the task to submit multiple outputs 28 | join (bool): Allows the task to take all previous results as input, instead of single results 29 | workers (int): Defines the number of workers to run the task 30 | throttle (int): Limits the number of results the task is able to produce when all consumers are busy 31 | multiprocess (bool): Allows the task to be multiprocessed (cannot be `True` for async tasks) 32 | bind (tuple[tuple, dict]): Additional args and kwargs to bind to the task when defining a pipeline 33 | 34 | Returns: 35 | A `Pipeline` instance consisting of one task. 36 | 37 | Examples: 38 | ```python 39 | def spam(x: int): 40 | return x + 1 41 | 42 | p = task(spam) 43 | 44 | def ham(x: int): 45 | return [x, x + 1, x + 2] 46 | 47 | p = task(ham, branch=True, workers=10) 48 | 49 | async def eggs(x: int): 50 | yield x 51 | yield x + 1 52 | yield x + 2 53 | 54 | p = task(eggs, branch=True, throttle=1) 55 | ``` 56 | """ 57 | @t.overload 58 | def __new__( 59 | cls, 60 | func: t.Callable[_P, _Default], 61 | /, 62 | *, 63 | branch: bool = False, 64 | join: bool = False, 65 | workers: int = 1, 66 | throttle: int = 0, 67 | multiprocess: bool = False, 68 | bind: ArgsKwargs = None) -> Pipeline[_P, _Default]: ... 69 | 70 | @t.overload 71 | def __new__( 72 | cls, 73 | func: None = None, 74 | /, 75 | *, 76 | branch: t.Literal[True], 77 | join: bool = False, 78 | workers: int = 1, 79 | throttle: int = 0, 80 | multiprocess: bool = False, 81 | bind: ArgsKwargs = None) -> t.Type[_branched_partial_task]: ... 82 | 83 | @t.overload 84 | def __new__( 85 | cls, 86 | func: None = None, 87 | /, 88 | *, 89 | branch: bool = False, 90 | join: bool = False, 91 | workers: int = 1, 92 | throttle: int = 0, 93 | multiprocess: bool = False, 94 | bind: ArgsKwargs = None) -> t.Type[task]: ... 95 | 96 | @t.overload 97 | def __new__( 98 | cls, 99 | func: t.Callable[_P, t.Union[t.Awaitable[t.Iterable[_R]], t.AsyncGenerator[_R]]], 100 | /, 101 | *, 102 | branch: t.Literal[True], 103 | join: bool = False, 104 | workers: int = 1, 105 | throttle: int = 0, 106 | multiprocess: bool = False, 107 | bind: ArgsKwargs = None) -> AsyncPipeline[_P, _R]: ... 108 | 109 | @t.overload 110 | def __new__( 111 | cls, 112 | func: t.Callable[_P, t.Awaitable[_R]], 113 | /, 114 | *, 115 | branch: bool = False, 116 | join: bool = False, 117 | workers: int = 1, 118 | throttle: int = 0, 119 | multiprocess: bool = False, 120 | bind: ArgsKwargs = None) -> AsyncPipeline[_P, _R]: ... 
121 | 122 | @t.overload 123 | def __new__( 124 | cls, 125 | func: t.Callable[_P, t.Iterable[_R]], 126 | /, 127 | *, 128 | branch: t.Literal[True], 129 | join: bool = False, 130 | workers: int = 1, 131 | throttle: int = 0, 132 | multiprocess: bool = False, 133 | bind: ArgsKwargs = None) -> Pipeline[_P, _R]: ... 134 | 135 | @t.overload 136 | def __new__( 137 | cls, 138 | func: t.Callable[_P, _R], 139 | /, 140 | *, 141 | branch: bool = False, 142 | join: bool = False, 143 | workers: int = 1, 144 | throttle: int = 0, 145 | multiprocess: bool = False, 146 | bind: ArgsKwargs = None) -> Pipeline[_P, _R]: ... 147 | 148 | def __new__( 149 | cls, 150 | func: t.Optional[t.Callable] = None, 151 | /, 152 | *, 153 | branch: bool = False, 154 | join: bool = False, 155 | workers: int = 1, 156 | throttle: int = 0, 157 | multiprocess: bool = False, 158 | bind: ArgsKwargs = None): 159 | # Classic decorator trick: @task() means func is None, @task without parentheses means func is passed. 160 | if func is None: 161 | return functools.partial(cls, branch=branch, join=join, workers=workers, throttle=throttle, multiprocess=multiprocess, bind=bind) 162 | return Pipeline([Task(func=func, branch=branch, join=join, workers=workers, throttle=throttle, multiprocess=multiprocess, bind=bind)]) 163 | 164 | @staticmethod 165 | def bind(*args, **kwargs) -> ArgsKwargs: 166 | """Bind additional `args` and `kwargs` to a task. 167 | 168 | Example: 169 | ```python 170 | def f(x: int, y: int): 171 | return x + y 172 | 173 | p = task(f, bind=task.bind(y=1)) 174 | p(x=1) 175 | ``` 176 | """ 177 | if not args and not kwargs: 178 | return None 179 | return args, kwargs 180 | 181 | 182 | class _branched_partial_task: 183 | @t.overload 184 | def __new__(cls, func: t.Callable[_P, _Default]) -> Pipeline[_P, _Default]: ... 185 | 186 | @t.overload 187 | def __new__( 188 | cls, 189 | func: t.Callable[_P, t.Union[t.Awaitable[t.Iterable[_R]], t.AsyncGenerator[_R]]]) -> AsyncPipeline[_P, _R]: ... 190 | 191 | @t.overload 192 | def __new__(cls, func: t.Callable[_P, t.Iterable[_R]]) -> Pipeline[_P, _R]: ... 193 | 194 | @t.overload 195 | def __new__(cls, func: t.Callable[_P, _R]) -> Pipeline[_P, t.Any]: ... 196 | 197 | def __new__(cls): 198 | raise NotImplementedError 199 | -------------------------------------------------------------------------------- /src/pyper/_core/pipeline.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import inspect 4 | import sys 5 | import typing as t 6 | 7 | from .async_helper.output import AsyncPipelineOutput 8 | from .sync_helper.output import PipelineOutput 9 | 10 | if sys.version_info < (3, 10): # pragma: no cover 11 | from typing_extensions import ParamSpec 12 | else: 13 | from typing import ParamSpec 14 | 15 | if t.TYPE_CHECKING: 16 | from .task import Task 17 | 18 | 19 | _P = ParamSpec('P') 20 | _R = t.TypeVar('R') 21 | _P_Other = ParamSpec("P_Other") 22 | _R_Other = t.TypeVar("R_Other") 23 | 24 | 25 | class Pipeline(t.Generic[_P, _R]): 26 | """A sequence of at least 1 Tasks. 
27 | 28 | Two pipelines can be piped into another via: 29 | ```python 30 | new_pipeline = p1 | p2 31 | # OR 32 | new_pipeline = p1.pipe(p2) 33 | ``` 34 | """ 35 | 36 | def __new__(cls, tasks: t.List[Task]): 37 | if any(task.is_async for task in tasks): 38 | instance = object.__new__(AsyncPipeline) 39 | else: 40 | instance = object.__new__(cls) 41 | instance.__init__(tasks=tasks) 42 | return instance 43 | 44 | def __init__(self, tasks: t.List[Task]): 45 | self.tasks = tasks 46 | 47 | def __call__(self, *args: _P.args, **kwargs: _P.kwargs) -> t.Generator[_R]: 48 | """Return the pipeline output.""" 49 | output = PipelineOutput(self) 50 | return output(*args, **kwargs) 51 | 52 | @t.overload 53 | def pipe(self: AsyncPipeline[_P, _R], other: AsyncPipeline[_P_Other, _R_Other]) -> AsyncPipeline[_P, _R_Other]: ... 54 | 55 | @t.overload 56 | def pipe(self: AsyncPipeline[_P, _R], other: Pipeline[_P_Other, _R_Other]) -> AsyncPipeline[_P, _R_Other]: ... 57 | 58 | @t.overload 59 | def pipe(self, other: AsyncPipeline[_P_Other, _R_Other]) -> AsyncPipeline[_P, _R_Other]: ... 60 | 61 | @t.overload 62 | def pipe(self, other: Pipeline[_P_Other, _R_Other]) -> Pipeline[_P, _R_Other]: ... 63 | 64 | def pipe(self, other: Pipeline): 65 | """Connect two pipelines, returning a new Pipeline.""" 66 | if not isinstance(other, Pipeline): 67 | raise TypeError(f"{other} of type {type(other)} cannot be piped into a Pipeline") 68 | return Pipeline(self.tasks + other.tasks) 69 | 70 | @t.overload 71 | def __or__(self: AsyncPipeline[_P, _R], other: AsyncPipeline[_P_Other, _R_Other]) -> AsyncPipeline[_P, _R_Other]: ... 72 | 73 | @t.overload 74 | def __or__(self: AsyncPipeline[_P, _R], other: Pipeline[_P_Other, _R_Other]) -> AsyncPipeline[_P, _R_Other]: ... 75 | 76 | @t.overload 77 | def __or__(self, other: AsyncPipeline[_P_Other, _R_Other]) -> AsyncPipeline[_P, _R_Other]: ... 78 | 79 | @t.overload 80 | def __or__(self, other: Pipeline[_P_Other, _R_Other]) -> Pipeline[_P, _R_Other]: ... 
81 | 82 | def __or__(self, other: Pipeline): 83 | """Connect two pipelines, returning a new Pipeline.""" 84 | return self.pipe(other) 85 | 86 | def consume(self, other: t.Callable[..., _R_Other]) -> t.Callable[_P, _R_Other]: 87 | """Connect the pipeline to a consumer function (a callable that takes the pipeline output as input).""" 88 | if callable(other): 89 | def consumer(*args: _P.args, **kwargs: _P.kwargs) -> _R_Other: 90 | return other(self(*args, **kwargs)) 91 | return consumer 92 | raise TypeError(f"{other} must be a callable that takes a generator") 93 | 94 | def __gt__(self, other: t.Callable[..., _R_Other]) -> t.Callable[_P, _R_Other]: 95 | """Connect the pipeline to a consumer function (a callable that takes the pipeline output as input).""" 96 | return self.consume(other) 97 | 98 | def __repr__(self): 99 | return f"<{self.__class__.__name__} {[task.func for task in self.tasks]}>" 100 | 101 | 102 | class AsyncPipeline(Pipeline[_P, _R]): 103 | def __call__(self, *args: _P.args, **kwargs: _P.kwargs) -> t.AsyncGenerator[_R]: 104 | """Return the pipeline output.""" 105 | output = AsyncPipelineOutput(self) 106 | return output(*args, **kwargs) 107 | 108 | def consume(self, other: t.Callable[..., _R_Other]) -> t.Callable[_P, _R_Other]: 109 | """Connect the pipeline to a consumer function (a callable that takes the pipeline output as input).""" 110 | if callable(other) and \ 111 | (inspect.iscoroutinefunction(other) or inspect.iscoroutinefunction(other.__call__)): 112 | async def consumer(*args: _P.args, **kwargs: _P.kwargs) -> _R_Other: 113 | return await other(self(*args, **kwargs)) 114 | return consumer 115 | raise TypeError(f"{other} must be an async callable that takes an async generator") 116 | -------------------------------------------------------------------------------- /src/pyper/_core/sync_helper/output.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, Union 4 | 5 | from .stage import Producer, ProducerConsumer 6 | from ..util.sentinel import StopSentinel 7 | from ..util.worker_pool import ProcessPool, ThreadPool 8 | 9 | if TYPE_CHECKING: 10 | import multiprocessing as mp 11 | import queue 12 | from ..pipeline import Pipeline 13 | 14 | 15 | class PipelineOutput: 16 | def __init__(self, pipeline: Pipeline): 17 | self.pipeline = pipeline 18 | 19 | def _get_q_out(self, tp: ThreadPool, pp: ProcessPool, *args, **kwargs) -> Union[mp.Queue, queue.Queue]: 20 | """Feed forward each stage to the next, returning the output queue of the final stage.""" 21 | q_out = None 22 | for task, next_task in zip(self.pipeline.tasks, self.pipeline.tasks[1:] + [None]): 23 | pool = pp if task.multiprocess else tp 24 | if q_out is None: 25 | stage = Producer(task=task, next_task=next_task, manager=pp.manager, shutdown_event=pool.shutdown_event) 26 | stage.start(pool, *args, **kwargs) 27 | else: 28 | stage = ProducerConsumer(q_in=q_out, task=task, next_task=next_task, manager=pp.manager, shutdown_event=pool.shutdown_event) 29 | stage.start(pool) 30 | q_out = stage.q_out 31 | 32 | return q_out 33 | 34 | def __call__(self, *args, **kwargs): 35 | """Iterate through the pipeline, taking the inputs to the first task, and yielding each output from the last task.""" 36 | with ThreadPool() as tp, ProcessPool() as pp: 37 | q_out = self._get_q_out(tp, pp, *args, **kwargs) 38 | try: 39 | while (data := q_out.get()) is not StopSentinel: 40 | yield data 41 | except (KeyboardInterrupt, SystemExit): # 
pragma: no cover 42 | tp.shutdown_event.set() 43 | pp.shutdown_event.set() 44 | raise 45 | -------------------------------------------------------------------------------- /src/pyper/_core/sync_helper/queue_io.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections.abc import Iterable 4 | from typing import TYPE_CHECKING, Union 5 | 6 | from ..util.sentinel import StopSentinel 7 | 8 | if TYPE_CHECKING: 9 | import multiprocessing as mp 10 | import queue 11 | from ..task import Task 12 | 13 | 14 | def DequeueFactory(q_in: Union[mp.Queue, queue.Queue], task: Task): 15 | return _JoiningDequeue(q_in=q_in) if task.join \ 16 | else _SingleDequeue(q_in=q_in) 17 | 18 | 19 | class _Dequeue: 20 | """Pulls data from an input queue.""" 21 | def __init__(self, q_in: Union[mp.Queue, queue.Queue]): 22 | self.q_in = q_in 23 | 24 | def _input_stream(self): 25 | while (data := self.q_in.get()) is not StopSentinel: 26 | yield data 27 | 28 | def __call__(self): 29 | raise NotImplementedError 30 | 31 | 32 | class _SingleDequeue(_Dequeue): 33 | def __call__(self): 34 | for data in self._input_stream(): 35 | yield data 36 | 37 | 38 | class _JoiningDequeue(_Dequeue): 39 | def __call__(self): 40 | yield self._input_stream() 41 | 42 | 43 | def EnqueueFactory(q_out: Union[mp.Queue, queue.Queue], task: Task): 44 | return _BranchingEnqueue(q_out=q_out, task=task) if task.branch \ 45 | else _SingleEnqueue(q_out=q_out, task=task) 46 | 47 | 48 | class _Enqueue: 49 | """Puts output from a task onto an output queue.""" 50 | def __init__(self, q_out: Union[mp.Queue, queue.Queue], task: Task): 51 | self.q_out = q_out 52 | self.task = task 53 | 54 | def __call__(self, *args, **kwargs): 55 | raise NotImplementedError 56 | 57 | 58 | class _SingleEnqueue(_Enqueue): 59 | def __call__(self, *args, **kwargs): 60 | self.q_out.put(self.task.func(*args, **kwargs)) 61 | 62 | 63 | class _BranchingEnqueue(_Enqueue): 64 | def __call__(self, *args, **kwargs): 65 | if isinstance(result := self.task.func(*args, **kwargs), Iterable): 66 | for output in result: 67 | self.q_out.put(output) 68 | else: 69 | raise TypeError( 70 | f"got object of type {type(result)} from branching task {self.task.func} which could not be iterated over." 
71 | " (the task should be a generator, or return an iterable)") 72 | -------------------------------------------------------------------------------- /src/pyper/_core/sync_helper/stage.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import queue 4 | import threading 5 | from types import SimpleNamespace 6 | from typing import TYPE_CHECKING, Union 7 | 8 | from .queue_io import DequeueFactory, EnqueueFactory 9 | from ..util.sentinel import StopSentinel 10 | 11 | if TYPE_CHECKING: 12 | import multiprocessing as mp 13 | from multiprocessing.managers import SyncManager 14 | import multiprocessing.synchronize as mpsync 15 | from ..util.worker_pool import WorkerPool 16 | from ..task import Task 17 | 18 | 19 | class Producer: 20 | def __init__( 21 | self, 22 | task: Task, 23 | next_task: Task, 24 | manager: SyncManager, 25 | shutdown_event: Union[mpsync.Event, threading.Event]): 26 | if task.workers > 1: 27 | raise RuntimeError(f"The first task in a pipeline ({task.func}) cannot have more than 1 worker") 28 | if task.join: 29 | raise RuntimeError(f"The first task in a pipeline ({task.func}) cannot join previous results") 30 | self.q_out = manager.Queue(maxsize=task.throttle) \ 31 | if task.multiprocess or (next_task is not None and next_task.multiprocess) \ 32 | else queue.Queue(maxsize=task.throttle) 33 | 34 | self._shutdown_event = shutdown_event 35 | self._n_workers = task.workers 36 | self._n_consumers = 1 if next_task is None else next_task.workers 37 | self._enqueue = EnqueueFactory(self.q_out, task) 38 | 39 | def _worker(self, *args, **kwargs): 40 | try: 41 | self._enqueue(*args, **kwargs) 42 | except Exception: 43 | self._shutdown_event.set() 44 | raise 45 | finally: 46 | for _ in range(self._n_consumers): 47 | self.q_out.put(StopSentinel) 48 | 49 | def start(self, pool: WorkerPool, /, *args, **kwargs): 50 | pool.submit(self._worker, *args, **kwargs) 51 | 52 | 53 | class ProducerConsumer: 54 | def __init__( 55 | self, 56 | q_in: Union[mp.Queue, queue.Queue], 57 | task: Task, 58 | next_task: Task, 59 | manager: SyncManager, 60 | shutdown_event: Union[mpsync.Event, threading.Event]): 61 | # The output queue is shared between this task and the next. 
We optimize here by using queue.Queue wherever possible 62 | # and only using a multiprocess Queue when the current task or the next task are multiprocessed 63 | self.q_out = manager.Queue(maxsize=task.throttle) \ 64 | if task.multiprocess or (next_task is not None and next_task.multiprocess) \ 65 | else queue.Queue(maxsize=task.throttle) 66 | 67 | self._shutdown_event = shutdown_event 68 | self._n_workers = task.workers 69 | self._n_consumers = 1 if next_task is None else next_task.workers 70 | self._dequeue = DequeueFactory(q_in, task) 71 | self._enqueue = EnqueueFactory(self.q_out, task) 72 | self._workers_done = manager.Value('i', 0) if task.multiprocess else SimpleNamespace(value=0) 73 | self._workers_done_lock = manager.Lock() if task.multiprocess else threading.Lock() 74 | 75 | def _worker(self): 76 | try: 77 | for output in self._dequeue(): 78 | if not self._shutdown_event.is_set(): 79 | self._enqueue(output) 80 | except Exception: 81 | self._shutdown_event.set() 82 | raise 83 | finally: 84 | with self._workers_done_lock: 85 | self._workers_done.value += 1 86 | if self._workers_done.value == self._n_workers: 87 | for _ in range(self._n_consumers): 88 | self.q_out.put(StopSentinel) 89 | 90 | def start(self, pool: WorkerPool, /): 91 | for _ in range(self._n_workers): 92 | pool.submit(self._worker) 93 | -------------------------------------------------------------------------------- /src/pyper/_core/task.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import functools 4 | import inspect 5 | import pickle 6 | from typing import Callable, Dict, Optional, Tuple 7 | 8 | 9 | class Task: 10 | """The representation of a function within a Pipeline.""" 11 | 12 | __slots__ = ( 13 | "func", 14 | "branch", 15 | "join", 16 | "workers", 17 | "throttle", 18 | "multiprocess", 19 | "is_async", 20 | "is_gen" 21 | ) 22 | 23 | def __init__( 24 | self, 25 | func: Callable, 26 | branch: bool = False, 27 | join: bool = False, 28 | workers: int = 1, 29 | throttle: int = 0, 30 | multiprocess: bool = False, 31 | bind: Optional[Tuple[Tuple, Dict]] = None): 32 | if not isinstance(workers, int): 33 | raise TypeError("workers must be an integer") 34 | if workers < 1: 35 | raise ValueError("workers cannot be less than 1") 36 | if not isinstance(throttle, int): 37 | raise TypeError("throttle must be an integer") 38 | if throttle < 0: 39 | raise ValueError("throttle cannot be less than 0") 40 | if not callable(func): 41 | raise TypeError("A task function must be a callable object") 42 | 43 | self.is_gen = inspect.isgeneratorfunction(func) \ 44 | or inspect.isasyncgenfunction(func) \ 45 | or inspect.isgeneratorfunction(func.__call__) \ 46 | or inspect.isasyncgenfunction(func.__call__) 47 | self.is_async = inspect.iscoroutinefunction(func) \ 48 | or inspect.isasyncgenfunction(func) \ 49 | or inspect.iscoroutinefunction(func.__call__) \ 50 | or inspect.isasyncgenfunction(func.__call__) 51 | 52 | if multiprocess: 53 | # Asynchronous functions cannot be multiprocessed 54 | if self.is_async: 55 | raise ValueError("multiprocess cannot be True for an async task") 56 | 57 | # The function must be picklable 58 | try: 59 | pickle.dumps(func) 60 | except (pickle.PicklingError, AttributeError): 61 | raise RuntimeError(f"{func} cannot be pickled and so cannot be multiprocessed" 62 | f" -- ensure that the function is globally accessible and that its definition has not changed") from None 63 | 64 | self.func = func if bind is None else 
functools.partial(func, *bind[0], **bind[1]) 65 | self.branch = branch 66 | self.join = join 67 | self.workers = workers 68 | self.throttle = throttle 69 | self.multiprocess = multiprocess 70 | -------------------------------------------------------------------------------- /src/pyper/_core/util/asynchronize.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor 5 | import functools 6 | 7 | from ..task import Task 8 | 9 | 10 | def asynchronize(task: Task, tp: ThreadPoolExecutor, pp: ProcessPoolExecutor) -> Task: 11 | """Unify async and sync tasks as awaitable futures. 12 | 1. If the task is async already, return it. 13 | 2. Multiprocessed synchronous functions are wrapped in a call to `run_in_executor` using `ProcessPoolExecutor`. 14 | 3. Threaded synchronous functions are wrapped in a call to `run_in_executor` using `ThreadPoolExecutor`. 15 | """ 16 | if task.is_async: 17 | return task 18 | 19 | if task.is_gen: 20 | # Small optimization to convert sync generators to async functions 21 | # This saves from having to use a thread/process just to get the generator object 22 | @functools.wraps(task.func) 23 | async def wrapper(*args, **kwargs): 24 | return task.func(*args, **kwargs) 25 | else: 26 | executor = pp if task.multiprocess else tp 27 | @functools.wraps(task.func) 28 | async def wrapper(*args, **kwargs): 29 | loop = asyncio.get_running_loop() 30 | f = functools.partial(task.func, *args, **kwargs) 31 | return await loop.run_in_executor(executor=executor, func=f) 32 | return Task( 33 | func=wrapper, 34 | branch=task.branch, 35 | join=task.join, 36 | workers=task.workers, 37 | throttle=task.throttle 38 | ) 39 | 40 | 41 | # backwards compatibility 42 | ascynchronize = asynchronize 43 | -------------------------------------------------------------------------------- /src/pyper/_core/util/sentinel.py: -------------------------------------------------------------------------------- 1 | class StopSentinel: 2 | """Dummy value informing a consumer to stop pulling from its input queue.""" 3 | -------------------------------------------------------------------------------- /src/pyper/_core/util/task_group.py: -------------------------------------------------------------------------------- 1 | """Fallback implementation of TaskGroup and ExceptionGroup for < 3.11""" 2 | from __future__ import annotations 3 | 4 | import asyncio 5 | from typing import List 6 | 7 | 8 | class TaskGroup: 9 | """Asynchronous context manager for managing groups of tasks. 10 | 11 | Example use: 12 | 13 | async with asyncio.TaskGroup() as group: 14 | task1 = group.create_task(some_coroutine(...)) 15 | task2 = group.create_task(other_coroutine(...)) 16 | print("Both tasks have completed now.") 17 | 18 | All tasks are awaited when the context manager exits. 19 | 20 | Any exceptions other than `asyncio.CancelledError` raised within 21 | a task will cancel all remaining tasks and wait for them to exit. 22 | The exceptions are then combined and raised as an `ExceptionGroup`. 
23 | """ 24 | def __init__(self): 25 | self._entered = False 26 | self._exiting = False 27 | self._aborting = False 28 | self._loop = None 29 | self._parent_task = None 30 | self._parent_cancel_requested = False 31 | self._tasks = set() 32 | self._errors = [] 33 | self._base_error = None 34 | self._on_completed_fut = None 35 | 36 | def __repr__(self): 37 | info = [''] 38 | if self._tasks: 39 | info.append(f"tasks={len(self._tasks)}") 40 | if self._errors: 41 | info.append(f"errors={len(self._errors)}") 42 | if self._aborting: 43 | info.append("cancelling") 44 | elif self._entered: 45 | info.append("entered") 46 | 47 | info_str = ' '.join(info) 48 | return f"" 49 | 50 | async def __aenter__(self): 51 | if self._entered: 52 | raise RuntimeError( 53 | f"TaskGroup {self!r} has already been entered") 54 | if self._loop is None: 55 | self._loop = asyncio.get_running_loop() 56 | self._parent_task = asyncio.tasks.current_task(self._loop) 57 | if self._parent_task is None: 58 | raise RuntimeError(f"TaskGroup {self!r} cannot determine the parent task") 59 | self._entered = True 60 | 61 | return self 62 | 63 | async def __aexit__(self, et, exc, tb): 64 | self._exiting = True 65 | 66 | if (exc is not None and 67 | self._is_base_error(exc) and 68 | self._base_error is None): 69 | self._base_error = exc 70 | 71 | propagate_cancellation_error = \ 72 | exc if et is asyncio.CancelledError else None 73 | # if self._parent_cancel_requested: 74 | # if self._parent_task.uncancel() == 0: 75 | # propagate_cancellation_error = None 76 | 77 | if et is not None: 78 | if not self._aborting: 79 | # Our parent task is being cancelled: 80 | # 81 | # async with TaskGroup() as g: 82 | # g.create_task(...) 83 | # await ... # <- CancelledError 84 | # 85 | # or there's an exception in "async with": 86 | # 87 | # async with TaskGroup() as g: 88 | # g.create_task(...) 89 | # 1 / 0 90 | # 91 | self._abort() 92 | 93 | # We use while-loop here because "self._on_completed_fut" 94 | # can be cancelled multiple times if our parent task 95 | # is being cancelled repeatedly (or even once, when 96 | # our own cancellation is already in progress) 97 | while self._tasks: 98 | if self._on_completed_fut is None: 99 | self._on_completed_fut = self._loop.create_future() 100 | 101 | try: 102 | await self._on_completed_fut 103 | except asyncio.CancelledError as ex: 104 | if not self._aborting: 105 | # Our parent task is being cancelled: 106 | # 107 | # async def wrapper(): 108 | # async with TaskGroup() as g: 109 | # g.create_task(foo) 110 | # 111 | # "wrapper" is being cancelled while "foo" is 112 | # still running. 113 | propagate_cancellation_error = ex 114 | self._abort() 115 | 116 | self._on_completed_fut = None 117 | 118 | assert not self._tasks 119 | 120 | if self._base_error is not None: 121 | raise self._base_error 122 | 123 | # Propagate CancelledError if there is one, except if there 124 | # are other errors -- those have priority. 125 | if propagate_cancellation_error and not self._errors: 126 | raise propagate_cancellation_error 127 | 128 | if et is not None and et is not asyncio.CancelledError: 129 | self._errors.append(exc) 130 | 131 | if self._errors: 132 | # Exceptions are heavy objects that can have object 133 | # cycles (bad for GC); let's not keep a reference to 134 | # a bunch of them. 
135 | try: 136 | me = ExceptionGroup("unhandled errors in a TaskGroup", self._errors) 137 | raise me from None 138 | finally: 139 | self._errors = None 140 | 141 | def create_task(self, coro, *, name=None, context=None): 142 | """Create a new task in this group and return it. 143 | 144 | Similar to `asyncio.create_task`. 145 | """ 146 | if not self._entered: 147 | raise RuntimeError(f"TaskGroup {self!r} has not been entered") 148 | if self._exiting and not self._tasks: 149 | raise RuntimeError(f"TaskGroup {self!r} is finished") 150 | if self._aborting: 151 | raise RuntimeError(f"TaskGroup {self!r} is shutting down") 152 | if context is None: 153 | task = self._loop.create_task(coro) 154 | else: 155 | task = self._loop.create_task(coro, context=context) 156 | asyncio.tasks._set_task_name(task, name) 157 | if task.done(): 158 | self._on_task_done(task) 159 | else: 160 | self._tasks.add(task) 161 | task.add_done_callback(self._on_task_done) 162 | return task 163 | 164 | def _is_base_error(self, exc: BaseException) -> bool: 165 | assert isinstance(exc, BaseException) 166 | return isinstance(exc, (SystemExit, KeyboardInterrupt)) 167 | 168 | def _abort(self): 169 | self._aborting = True 170 | 171 | for t in self._tasks: 172 | if not t.done(): 173 | t.cancel() 174 | 175 | def _on_task_done(self, task): 176 | self._tasks.discard(task) 177 | 178 | if self._on_completed_fut is not None and not self._tasks: 179 | if not self._on_completed_fut.done(): 180 | self._on_completed_fut.set_result(True) 181 | 182 | if task.cancelled(): 183 | return 184 | 185 | exc = task.exception() 186 | if exc is None: 187 | return 188 | 189 | self._errors.append(exc) 190 | if self._is_base_error(exc) and self._base_error is None: 191 | self._base_error = exc 192 | 193 | if self._parent_task.done(): 194 | # Not sure if this case is possible, but we want to handle 195 | # it anyways. 196 | self._loop.call_exception_handler({ 197 | "message": f"Task {task!r} has errored out but its parent " 198 | f"task {self._parent_task} is already complete", 199 | "exception": exc, 200 | "task": task, 201 | }) 202 | return 203 | 204 | if not self._aborting and not self._parent_cancel_requested: 205 | # If parent task *is not* being cancelled, it means that we want 206 | # to manually cancel it to abort whatever is being run right now 207 | # in the TaskGroup. But we want to mark parent task as 208 | # "not cancelled" later in __aexit__. Example situation that 209 | # we need to handle: 210 | # 211 | # async def foo(): 212 | # try: 213 | # async with TaskGroup() as g: 214 | # g.create_task(crash_soon()) 215 | # await something # <- this needs to be canceled 216 | # # by the TaskGroup, e.g. 217 | # # foo() needs to be cancelled 218 | # except Exception: 219 | # # Ignore any exceptions raised in the TaskGroup 220 | # pass 221 | # await something_else # this line has to be called 222 | # # after TaskGroup is finished. 
223 | self._abort() 224 | self._parent_cancel_requested = True 225 | self._parent_task.cancel() 226 | 227 | 228 | class ExceptionGroup(Exception): 229 | def __init__(self, message: str, exceptions: List[Exception]): 230 | self.message = message 231 | self.exceptions = exceptions 232 | -------------------------------------------------------------------------------- /src/pyper/_core/util/worker_pool.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import concurrent.futures as cf 4 | import multiprocessing as mp 5 | import multiprocessing.synchronize as mpsync 6 | import threading 7 | from typing import List, Union 8 | 9 | 10 | class WorkerPool: 11 | """A unified wrapper to the ThreadPoolExecutor and ProcessPoolExecutor classes. 12 | 13 | 1. Spins up thread/process workers and maintains a reference to each 14 | 2. Ensures safe tear-down of all workers and propagates errors 15 | """ 16 | shutdown_event: Union[mpsync.Event, threading.Event] 17 | _executor: Union[cf.ProcessPoolExecutor, cf.ThreadPoolExecutor] 18 | _futures: List[cf.Future] 19 | 20 | def __enter__(self): 21 | self._executor.__enter__() 22 | return self 23 | 24 | def __exit__(self, et, ev, tb): 25 | self._executor.__exit__(et, ev, tb) 26 | for future in self._futures: 27 | # Resolve the future and raise any errors inside 28 | future.result() 29 | 30 | def submit(self, func, /, *args, **kwargs): 31 | future = self._executor.submit(func, *args, **kwargs) 32 | self._futures.append(future) 33 | return future 34 | 35 | 36 | class ThreadPool(WorkerPool): 37 | def __init__(self): 38 | self.shutdown_event = threading.Event() 39 | 40 | self._executor = cf.ThreadPoolExecutor() 41 | self._futures = [] 42 | 43 | 44 | class ProcessPool(WorkerPool): 45 | def __init__(self): 46 | self.manager = mp.Manager() 47 | self.shutdown_event = self.manager.Event() 48 | 49 | self._executor = cf.ProcessPoolExecutor() 50 | self._futures = [] 51 | -------------------------------------------------------------------------------- /tests/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PYTHON_VERSION=3.12 2 | FROM python:${PYTHON_VERSION} 3 | 4 | WORKDIR /src 5 | RUN pip install tox>=4.23 6 | 7 | COPY src/pyper src/pyper 8 | COPY tests tests 9 | COPY pyproject.toml . 10 | COPY tox.ini . 11 | COPY .coveragerc . 12 | COPY .git .git 13 | 14 | RUN chmod +x tests/entrypoint.sh 15 | 16 | ENTRYPOINT ["/bin/bash", "tests/entrypoint.sh"] -------------------------------------------------------------------------------- /tests/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | services: 4 | python3.13: 5 | container_name: test3.13 6 | build: 7 | context: .. 8 | dockerfile: tests/Dockerfile 9 | args: 10 | PYTHON_VERSION: "3.13" 11 | 12 | python3.12: 13 | container_name: test3.12 14 | build: 15 | context: .. 16 | dockerfile: tests/Dockerfile 17 | args: 18 | PYTHON_VERSION: "3.12" 19 | 20 | python3.11: 21 | container_name: test3.11 22 | build: 23 | context: .. 24 | dockerfile: tests/Dockerfile 25 | args: 26 | PYTHON_VERSION: "3.11" 27 | 28 | python3.10: 29 | container_name: test3.10 30 | build: 31 | context: .. 32 | dockerfile: tests/Dockerfile 33 | args: 34 | PYTHON_VERSION: "3.10" 35 | 36 | python3.9: 37 | container_name: test3.9 38 | build: 39 | context: .. 
40 | dockerfile: tests/Dockerfile 41 | args: 42 | PYTHON_VERSION: "3.9" 43 | 44 | python3.8: 45 | container_name: test3.8 46 | build: 47 | context: .. 48 | dockerfile: tests/Dockerfile 49 | args: 50 | PYTHON_VERSION: "3.8" 51 | -------------------------------------------------------------------------------- /tests/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | exec tox -e "${PYTHON_VERSION}" 4 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest>=8.3.3 2 | pytest_asyncio>=0.24 3 | tox>=4.23 -------------------------------------------------------------------------------- /tests/test_async.py: -------------------------------------------------------------------------------- 1 | from pyper import task 2 | import pytest 3 | 4 | class TestError(Exception): ... 5 | 6 | def f1(data): 7 | return data 8 | 9 | def f2(data): 10 | yield data 11 | 12 | def f3(data): 13 | raise TestError 14 | 15 | def f4(data): 16 | return [data] 17 | 18 | async def af1(data): 19 | return data 20 | 21 | async def af2(data): 22 | yield data 23 | 24 | async def af3(data): 25 | raise TestError 26 | 27 | async def af4(data): 28 | async for row in data: 29 | yield row 30 | 31 | async def consumer(data): 32 | total = 0 33 | async for i in data: 34 | total += i 35 | return total 36 | 37 | @pytest.mark.asyncio 38 | async def test_aiterable_branched_pipeline(): 39 | p = task(af1) | task(f2, branch=True) 40 | assert await p(1).__anext__() == 1 41 | 42 | @pytest.mark.asyncio 43 | async def test_iterable_branched_pipeline(): 44 | p = task(af1) | task(f4, branch=True) 45 | assert await p(1).__anext__() == 1 46 | 47 | @pytest.mark.asyncio 48 | async def test_joined_pipeline(): 49 | p = task(af1) | task(af2, branch=True) | task(af4, branch=True, join=True) 50 | assert await p(1).__anext__() == 1 51 | 52 | @pytest.mark.asyncio 53 | async def test_consumer(): 54 | p = task(af1) | task(af2, branch=True) > consumer 55 | assert await p(1) == 1 56 | 57 | @pytest.mark.asyncio 58 | async def test_invalid_first_stage_workers(): 59 | try: 60 | p = task(af1, workers=2) | task(af2, branch=True) > consumer 61 | await p(1) 62 | except Exception as e: 63 | assert isinstance(e, RuntimeError) 64 | else: 65 | raise AssertionError 66 | 67 | @pytest.mark.asyncio 68 | async def test_invalid_first_stage_join(): 69 | try: 70 | p = task(af1, join=True) | task(af2, branch=True) > consumer 71 | await p(1) 72 | except Exception as e: 73 | assert isinstance(e, RuntimeError) 74 | else: 75 | raise AssertionError 76 | 77 | @pytest.mark.asyncio 78 | async def test_invalid_branch_result(): 79 | try: 80 | p = task(af1, branch=True) > consumer 81 | await p(1) 82 | except Exception as e: 83 | assert isinstance(e, TypeError) 84 | else: 85 | raise AssertionError 86 | 87 | async def _try_catch_error(pipeline): 88 | try: 89 | p = task(af1) | pipeline > consumer 90 | await p(1) 91 | except Exception as e: 92 | return isinstance(e, TestError) 93 | else: 94 | return False 95 | 96 | @pytest.mark.asyncio 97 | async def test_async_error_handling(): 98 | p = task(af3) 99 | assert await _try_catch_error(p) 100 | 101 | @pytest.mark.asyncio 102 | async def test_threaded_error_handling(): 103 | p = task(f3, workers=2) 104 | assert await _try_catch_error(p) 105 | 106 | @pytest.mark.asyncio 107 | async def test_multiprocessed_error_handling(): 108 | p = task(f3, 
workers=2, multiprocess=True) 109 | assert await _try_catch_error(p) 110 | 111 | @pytest.mark.asyncio 112 | async def test_unified_pipeline(): 113 | p = task(af1) | task(f1) | task(f2, branch=True, multiprocess=True) > consumer 114 | assert await p(1) == 1 115 | -------------------------------------------------------------------------------- /tests/test_sync.py: -------------------------------------------------------------------------------- 1 | import time 2 | from pyper import task 3 | 4 | class TestError(Exception): ... 5 | 6 | def f1(data): 7 | return data 8 | 9 | def f2(data): 10 | yield data 11 | 12 | def f3(data): 13 | for row in data: 14 | yield row 15 | 16 | def f4(a1, a2, a3, data, k1, k2): 17 | return data 18 | 19 | def f5(data): 20 | # Make queue monitor timeout on main thread 21 | time.sleep(0.2) 22 | raise TestError 23 | 24 | def consumer(data): 25 | total = 0 26 | for i in data: 27 | total += i 28 | return total 29 | 30 | def test_branched_pipeline(): 31 | p = task(f1) | task(f2, branch=True) 32 | assert p(1).__next__() == 1 33 | 34 | def test_joined_pipeline(): 35 | p = task(f1) | task(f2, branch=True) | task(f3, branch=True, join=True) 36 | assert p(1).__next__() == 1 37 | 38 | def test_bind(): 39 | p = task(f1) | task(f4, bind=task.bind(1, 1, 1, k1=1, k2=2)) 40 | assert p(1).__next__() == 1 41 | 42 | def test_redundant_bind_ok(): 43 | p = task(f1) | task(f2, branch=True, bind=task.bind()) 44 | assert p(1).__next__() == 1 45 | 46 | def test_consumer(): 47 | p = task(f1) | task(f2, branch=True) > consumer 48 | assert p(1) == 1 49 | 50 | def test_invalid_first_stage_workers(): 51 | try: 52 | p = task(f1, workers=2) | task(f2) > consumer 53 | p(1) 54 | except Exception as e: 55 | assert isinstance(e, RuntimeError) 56 | else: 57 | raise AssertionError 58 | 59 | def test_invalid_first_stage_join(): 60 | try: 61 | p = task(f1, join=True) | task(f2, branch=True) > consumer 62 | p(1) 63 | except Exception as e: 64 | assert isinstance(e, RuntimeError) 65 | else: 66 | raise AssertionError 67 | 68 | def test_invalid_branch_result(): 69 | try: 70 | p = task(f1, branch=True) > consumer 71 | p(1) 72 | except Exception as e: 73 | assert isinstance(e, TypeError) 74 | else: 75 | raise AssertionError 76 | 77 | def test_threaded_error_handling(): 78 | try: 79 | p = task(f1) | task(f5, workers=2) > consumer 80 | p(1) 81 | except Exception as e: 82 | assert isinstance(e, TestError) 83 | else: 84 | raise AssertionError 85 | 86 | def test_multiprocessed_error_handling(): 87 | try: 88 | p = task(f1) | task(f5, workers=2, multiprocess=True) > consumer 89 | p(1) 90 | except Exception as e: 91 | assert isinstance(e, TestError) 92 | else: 93 | raise AssertionError 94 | -------------------------------------------------------------------------------- /tests/test_task.py: -------------------------------------------------------------------------------- 1 | from pyper import task, AsyncPipeline, Pipeline 2 | 3 | 4 | def func(x): 5 | return x 6 | 7 | def gen(x): 8 | yield x 9 | 10 | async def afunc(x): 11 | return x 12 | 13 | async def agen(x): 14 | yield x 15 | 16 | class Func: 17 | def __call__(self, x): 18 | return x 19 | 20 | class Gen: 21 | def __call__(self, x): 22 | yield x 23 | 24 | class AFunc: 25 | async def __call__(self, x): 26 | return x 27 | 28 | class AGen: 29 | async def __call__(self, x): 30 | yield x 31 | 32 | def test_as_decorator(): 33 | p = task(func) 34 | assert isinstance(p, Pipeline) 35 | 36 | def test_as_decorator_with_params(): 37 | p = task(branch=True, workers=2, 
throttle=2)(func) 38 | assert isinstance(p, Pipeline) 39 | 40 | def test_as_wrapper_with_params(): 41 | p = task(func, join=True, workers=2, throttle=2) 42 | assert isinstance(p, Pipeline) 43 | 44 | def _try_invalid_workers_value(value, exc_type): 45 | try: 46 | task(func, workers=value) 47 | except Exception as e: 48 | return isinstance(e, exc_type) 49 | return False 50 | 51 | def test_raise_for_invalid_workers(): 52 | assert _try_invalid_workers_value(0, ValueError) 53 | assert _try_invalid_workers_value(-1, ValueError) 54 | assert _try_invalid_workers_value("1",TypeError) 55 | assert _try_invalid_workers_value(1.5, TypeError) 56 | 57 | def _try_invalid_throttle(value, exc_type): 58 | try: 59 | task(func, throttle=value) 60 | except Exception as e: 61 | return isinstance(e, exc_type) 62 | return False 63 | 64 | def test_raise_for_invalid_throttle(): 65 | assert _try_invalid_throttle(-1, ValueError) 66 | assert _try_invalid_throttle("1",TypeError) 67 | assert _try_invalid_throttle(1.5, TypeError) 68 | 69 | def test_raise_for_invalid_func(): 70 | try: 71 | task(1) 72 | except Exception as e: 73 | assert isinstance(e, TypeError) 74 | else: 75 | raise AssertionError 76 | 77 | def test_raise_for_async_multiprocess(): 78 | try: 79 | task(afunc, multiprocess=True) 80 | except Exception as e: 81 | assert isinstance(e, ValueError) 82 | else: 83 | raise AssertionError 84 | 85 | def test_raise_for_lambda_multiprocess(): 86 | try: 87 | task(lambda x: x, multiprocess=True) 88 | except Exception as e: 89 | assert isinstance(e, RuntimeError) 90 | else: 91 | raise AssertionError 92 | 93 | def test_raise_for_non_global_multiprocess(): 94 | try: 95 | @task(multiprocess=True) 96 | def f(x): 97 | return x 98 | except Exception as e: 99 | assert isinstance(e, RuntimeError) 100 | else: 101 | raise AssertionError 102 | 103 | def test_async_task(): 104 | p = task(afunc) 105 | assert isinstance(p, AsyncPipeline) 106 | 107 | def test_piped_async_task(): 108 | p = task(afunc) | task(func) 109 | assert isinstance(p, AsyncPipeline) 110 | 111 | def test_invalid_pipe(): 112 | try: 113 | task(func) | 1 114 | except Exception as e: 115 | assert isinstance(e, TypeError) 116 | else: 117 | raise AssertionError 118 | 119 | def test_invalid_async_pipe(): 120 | try: 121 | task(afunc) | 1 122 | except Exception as e: 123 | assert isinstance(e, TypeError) 124 | else: 125 | raise AssertionError 126 | 127 | def test_invalid_consumer(): 128 | try: 129 | task(func) > 1 130 | except Exception as e: 131 | assert isinstance(e, TypeError) 132 | else: 133 | raise AssertionError 134 | 135 | def test_invalid_async_consumer(): 136 | try: 137 | task(afunc) > func 138 | except Exception as e: 139 | assert isinstance(e, TypeError) 140 | else: 141 | raise AssertionError 142 | 143 | def test_gen_inspect(): 144 | is_gen = lambda f: task(f).tasks[0].is_gen 145 | assert is_gen(gen) 146 | assert is_gen(agen) 147 | assert is_gen(Gen()) 148 | assert is_gen(AGen()) 149 | assert not is_gen(func) 150 | assert not is_gen(afunc) 151 | assert not is_gen(Func()) 152 | assert not is_gen(AFunc()) 153 | assert not is_gen(lambda x: x) 154 | 155 | def test_async_inspect(): 156 | is_async = lambda f: task(f).tasks[0].is_async 157 | assert is_async(afunc) 158 | assert is_async(agen) 159 | assert is_async(AFunc()) 160 | assert is_async(AGen()) 161 | assert not is_async(func) 162 | assert not is_async(Func()) 163 | assert not is_async(gen) 164 | assert not is_async(Gen()) 165 | assert not is_async(lambda x: x) 166 | 167 | def test_repr(): 168 | p = task(func) 169 | 
assert "Pipeline" in repr(p) 170 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py{3.8, 3.9, 3.10, 3.11, 3.12, 3.13}, coverage 3 | 4 | [testenv] 5 | deps = 6 | pytest 7 | pytest-asyncio 8 | pytest-cov 9 | coverage 10 | commands = 11 | coverage erase 12 | pytest --cov={envsitepackagesdir}/pyper tests 13 | coverage report -m 14 | setenv = 15 | COVERAGE_FILE = .tox/.coverage 16 | 17 | [testenv:coverage] 18 | deps = 19 | pytest 20 | pytest-asyncio 21 | pytest-cov 22 | coverage 23 | coveralls 24 | commands = 25 | coverage erase 26 | pytest --cov={envsitepackagesdir}/pyper tests 27 | coverage report -m 28 | coverage xml 29 | coveralls 30 | setenv = 31 | COVERALLS_REPO_TOKEN = {env:COVERALLS_REPO_TOKEN} 32 | 33 | [coverage] 34 | rcfile = .coveragerc 35 | --------------------------------------------------------------------------------