├── .codesandbox └── Dockerfile ├── .coveragerc ├── .github └── workflows │ ├── build.yml │ └── docs.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── docs ├── CHANGELOG.md ├── basics.md ├── caching.md ├── channel-collapse_files.png ├── channel-expand_dir.png ├── channels.md ├── cli.md ├── cloud.md ├── configurations.md ├── defining-proc.md ├── error.md ├── examples.md ├── input-output.md ├── layers.png ├── pipen-cloud1.png ├── pipen-cloud2.png ├── plugin.md ├── proc-group.md ├── requirements.txt ├── running.md ├── scheduler.md ├── script.md ├── style.css └── templating.md ├── examples ├── caching.py ├── cloudwdir.py ├── example.py ├── gbatch.py ├── input_data_callback.py ├── mako-templating.py ├── multijobs.py ├── plugin-example.py ├── python-script.py └── retry.py ├── mkdocs.yml ├── pipen.png ├── pipen ├── __init__.py ├── __main__.py ├── _job_caching.py ├── channel.py ├── cli │ ├── __init__.py │ ├── _hooks.py │ ├── _main.py │ ├── help.py │ ├── plugins.py │ ├── profile.py │ └── version.py ├── defaults.py ├── exceptions.py ├── job.py ├── pipen.py ├── pluginmgr.py ├── proc.py ├── procgroup.py ├── progressbar.py ├── py.typed ├── scheduler.py ├── template.py ├── utils.py └── version.py ├── poetry.lock ├── pyproject.toml ├── setup.py ├── tests ├── __init__.py ├── conftest.py ├── helpers.py ├── test_channel.py ├── test_cli.py ├── test_job.py ├── test_pipen.py ├── test_plugin.py ├── test_proc.py ├── test_procgroup.py ├── test_scheduler.py ├── test_template.py ├── test_utils.py └── test_xqute_pars.py └── tox.ini /.codesandbox/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10.12 2 | 3 | RUN apt-get update && apt-get install -y fish && \ 4 | pip install -U pip && \ 5 | pip install poetry && \ 6 | poetry config virtualenvs.create false && \ 7 | chsh -s /usr/bin/fish -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | # need plugins to be installed to test 4 | pipen/cli/plugins.py 5 | tests/* 6 | setup.py 7 | 8 | [report] 9 | exclude_lines = 10 | if TYPE_CHECKING: 11 | pragma: no cover 12 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build and Deploy 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | 7 | build: 8 | runs-on: ubuntu-24.04 9 | if: "! contains(github.event.head_commit.message, 'wip') && ! 
startsWith(github.ref, 'refs/tags')" 10 | strategy: 11 | matrix: 12 | # python-version: [3.8, 3.9, "3.10"] 13 | python-version: [3.9, "3.10", "3.11", "3.12"] 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Setup Python # Set Python version 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | python -m pip install poetry 25 | poetry config virtualenvs.create false 26 | poetry install -v --with dev 27 | - name: Run flake8 28 | run: flake8 pipen 29 | - uses: 'google-github-actions/auth@v2' 30 | with: 31 | credentials_json: ${{ secrets.GCP_SA_KEY }} 32 | - name: Test with pytest 33 | run: pytest tests/ --junitxml=junit/test-results-${{ matrix.python-version }}.xml 34 | - name: Upload pytest test results 35 | uses: actions/upload-artifact@v4 36 | with: 37 | name: pytest-results-${{ matrix.python-version }} 38 | path: junit/test-results-${{ matrix.python-version }}.xml 39 | # Use always() to always run this step to publish test results when there are test failures 40 | if: ${{ always() }} 41 | - name: Run codacy-coverage-reporter 42 | uses: codacy/codacy-coverage-reporter-action@master 43 | if: matrix.python-version == 3.10 44 | with: 45 | project-token: ${{ secrets.CODACY_PROJECT_TOKEN }} 46 | coverage-reports: .coverage.xml 47 | 48 | deploy: 49 | # needs: build 50 | runs-on: ubuntu-24.04 51 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') 52 | strategy: 53 | matrix: 54 | python-version: ["3.10"] 55 | steps: 56 | - uses: actions/checkout@v4 57 | - name: Setup Python # Set Python version 58 | uses: actions/setup-python@v5 59 | - name: Install dependencies 60 | run: | 61 | python -m pip install --upgrade pip 62 | python -m pip install poetry 63 | - name: Publish to PyPI 64 | run: poetry publish --build -u ${{ secrets.PYPI_USER }} -p ${{ secrets.PYPI_PASSWORD }} 65 | if: success() 66 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Build Docs 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | docs: 7 | runs-on: ubuntu-24.04 8 | # if: github.ref == 'refs/heads/master' 9 | strategy: 10 | matrix: 11 | python-version: ["3.10"] 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Setup Python # Set Python version 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: ${{ matrix.python-version }} 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | python -m pip install poetry 22 | poetry config virtualenvs.create false 23 | poetry install -v 24 | - name: Update docs 25 | run: | 26 | python -m pip install mkdocs 27 | python -m pip install -r docs/requirements.txt 28 | cd docs 29 | cp ../README.md index.md 30 | cp ../pipen.png pipen.png 31 | cd .. 
32 | mkdocs gh-deploy --clean --force 33 | if: success() 34 | 35 | # fix-index: 36 | # needs: docs 37 | # runs-on: ubuntu-latest 38 | # strategy: 39 | # matrix: 40 | # python-version: ["3.10"] 41 | # steps: 42 | # - uses: actions/checkout@v3 43 | # with: 44 | # ref: gh-pages 45 | # - name: Fix index.html 46 | # run: | 47 | # echo ':: head of index.html - before ::' 48 | # head index.html 49 | # sed -i '1,5{/^$/d}' index.html 50 | # echo ':: head of index.html - after ::' 51 | # head index.html 52 | # if: success() 53 | # - name: Commit changes 54 | # run: | 55 | # git config --local user.email "action@github.com" 56 | # git config --local user.name "GitHub Action" 57 | # git commit -m "Add changes" -a 58 | # if: success() 59 | # - name: Push changes 60 | # uses: ad-m/github-push-action@master 61 | # with: 62 | # github_token: ${{ secrets.GITHUB_TOKEN }} 63 | # branch: gh-pages 64 | # if: success() 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | .coverage.xml 46 | cov.xml 47 | *,cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # IPython Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # dotenv 81 | .env 82 | 83 | # virtualenv 84 | venv/ 85 | ENV/ 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | 90 | # Rope project settings 91 | .ropeproject 92 | 93 | workdir/ 94 | node_modules/ 95 | _book/ 96 | .vscode 97 | export/ 98 | *.svg 99 | *.dot 100 | *.queue.txt 101 | site/ 102 | 103 | # poetry 104 | # poetry.lock 105 | 106 | # backup files 107 | *.bak 108 | 109 | .history/ 110 | .xqute/ 111 | .pipen/ 112 | t-*.ipynb 113 | *-output/ 114 | *_results/ 115 | t.py 116 | 117 | nohup.out 118 | test.py 119 | test.ipynb 120 | gac_key.json 121 | examples/.pipen.toml 122 | docs/api/ 123 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | fail_fast: false 4 | exclude: '^README.rst$|^tests/|^setup.py$|^examples/|^docs/' 5 | repos: 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | rev: 5df1a4bf6f04a1ed3a643167b38d502575e29aef 8 | hooks: 9 | - id: 
trailing-whitespace 10 | - id: end-of-file-fixer 11 | - id: check-yaml 12 | - id: check-added-large-files 13 | - repo: local 14 | hooks: 15 | - id: versionchecker 16 | name: Check version agreement in pyproject and __version__ 17 | entry: bash -c 18 | language: system 19 | args: 20 | - get_ver() { echo $(egrep "^__version|^version" $1 | cut -d= -f2 | sed 's/\"\| //g'); }; 21 | v1=`get_ver pyproject.toml`; 22 | v2=`get_ver pipen/version.py`; 23 | if [[ $v1 == $v2 ]]; then exit 0; else exit 1; fi 24 | pass_filenames: false 25 | files: ^pyproject\.toml|pipen/version\.py$ 26 | - id: mypy 27 | name: Run mypy type check 28 | entry: mypy 29 | language: system 30 | args: ["-p", "pipen"] 31 | pass_filenames: false 32 | always_run: true 33 | files: ^/pipen/.+$ 34 | - id: pytest 35 | name: Run pytest 36 | entry: pytest 37 | language: system 38 | args: [tests/] 39 | pass_filenames: false 40 | files: ^tests/.+$|^pipen/.+$ 41 | - id: flake8 42 | name: Run flake8 43 | entry: flake8 44 | language: system 45 | args: [pipen] 46 | pass_filenames: false 47 | files: ^pipen/.+$ 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | **A pipeline framework for Python** 5 | 6 |
7 | 8 | ______________________________________________________________________ 9 | 10 | [![Pypi][6]][7] [![Github][8]][9] ![Building][10] [![Docs and API][11]][1] [![Codacy][12]][13] [![Codacy coverage][14]][13] [![Deps][5]][23] 11 | 12 | [Documentation][1] | [ChangeLog][2] | [Examples][3] | [API][4] 13 | 14 | ## Features 15 | 16 | - Easy to use 17 | - Nearly zero-configuration 18 | - Nice logging 19 | - Highly extendable 20 | - Cloud support naively 21 | 22 | ## Installation 23 | 24 | ```bash 25 | pip install -U pipen 26 | ``` 27 | 28 | ## Quickstart 29 | 30 | `example.py` 31 | 32 | ```python 33 | from pipen import Proc, Pipen, run 34 | 35 | class P1(Proc): 36 | """Sort input file""" 37 | input = "infile" 38 | input_data = ["/tmp/data.txt"] 39 | output = "outfile:file:intermediate.txt" 40 | script = "cat {{in.infile}} | sort > {{out.outfile}}" 41 | 42 | class P2(Proc): 43 | """Paste line number""" 44 | requires = P1 45 | input = "infile:file" 46 | output = "outfile:file:result.txt" 47 | script = "paste <(seq 1 3) {{in.infile}} > {{out.outfile}}" 48 | 49 | # class MyPipeline(Pipen): 50 | # starts = P1 51 | 52 | if __name__ == "__main__": 53 | # MyPipeline().run() 54 | run("MyPipeline", starts=P1) 55 | ``` 56 | 57 | ```shell 58 | > echo -e "3\n2\n1" > /tmp/data.txt 59 | > python example.py 60 | ``` 61 | 62 | ```log 63 | 04-17 16:19:35 I core _____________________________________ __ 64 | 04-17 16:19:35 I core ___ __ \___ _/__ __ \__ ____/__ | / / 65 | 04-17 16:19:35 I core __ /_/ /__ / __ /_/ /_ __/ __ |/ / 66 | 04-17 16:19:35 I core _ ____/__/ / _ ____/_ /___ _ /| / 67 | 04-17 16:19:35 I core /_/ /___/ /_/ /_____/ /_/ |_/ 68 | 04-17 16:19:35 I core 69 | 04-17 16:19:35 I core version: 0.17.3 70 | 04-17 16:19:35 I core 71 | 04-17 16:19:35 I core ╔═══════════════════════════ MYPIPELINE ════════════════════════════╗ 72 | 04-17 16:19:35 I core ║ My pipeline ║ 73 | 04-17 16:19:35 I core ╚═══════════════════════════════════════════════════════════════════╝ 74 | 04-17 16:19:35 I core plugins : verbose v0.14.1 75 | 04-17 16:19:35 I core # procs : 2 76 | 04-17 16:19:35 I core profile : default 77 | 04-17 16:19:35 I core outdir : 78 | /home/pwwang/github/pipen/examples/MyPipeline-output 79 | 04-17 16:19:35 I core cache : True 80 | 04-17 16:19:35 I core dirsig : 1 81 | 04-17 16:19:35 I core error_strategy : ignore 82 | 04-17 16:19:35 I core forks : 1 83 | 04-17 16:19:35 I core lang : bash 84 | 04-17 16:19:35 I core loglevel : info 85 | 04-17 16:19:35 I core num_retries : 3 86 | 04-17 16:19:35 I core scheduler : local 87 | 04-17 16:19:35 I core submission_batch: 8 88 | 04-17 16:19:35 I core template : liquid 89 | 04-17 16:19:35 I core workdir : 90 | /home/pwwang/github/pipen/examples/.pipen/MyPipeline 91 | 04-17 16:19:35 I core plugin_opts : 92 | 04-17 16:19:35 I core template_opts : filters={'realpath': >> ['P2'] 105 | 04-17 16:19:36 I verbose P1: in.infile: /tmp/data.txt 106 | 04-17 16:19:36 I verbose P1: out.outfile: 107 | /home/pwwang/github/pipen/examples/.pipen/MyPipeline/P1/0/output/intermediate 108 | .txt 109 | 04-17 16:19:38 I verbose P1: Time elapsed: 00:00:02.051s 110 | 04-17 16:19:38 I core 111 | 04-17 16:19:38 I core ╭═══════════════════════════════ P2 ════════════════════════════════╮ 112 | 04-17 16:19:38 I core ║ Paste line number ║ 113 | 04-17 16:19:38 I core ╰═══════════════════════════════════════════════════════════════════╯ 114 | 04-17 16:19:38 I core P2: Workdir: 115 | '/home/pwwang/github/pipen/examples/.pipen/MyPipeline/P2' 116 | 04-17 16:19:38 I core P2: <<< ['P1'] 117 | 04-17 
16:19:38 I core P2: >>> [END] 118 | 04-17 16:19:38 I verbose P2: in.infile: 119 | /home/pwwang/github/pipen/examples/.pipen/MyPipeline/P1/0/output/intermediate 120 | .txt 121 | 04-17 16:19:38 I verbose P2: out.outfile: 122 | /home/pwwang/github/pipen/examples/MyPipeline-output/P2/result.txt 123 | 04-17 16:19:41 I verbose P2: Time elapsed: 00:00:02.051s 124 | 04-17 16:19:41 I core 125 | 126 | 127 | MYPIPELINE: 100%|██████████████████████████████| 2/2 [00:06<00:00, 0.35 procs/s] 128 | ``` 129 | 130 | ```shell 131 | > cat ./MyPipeline-output/P2/result.txt 132 | 1 1 133 | 2 2 134 | 3 3 135 | ``` 136 | 137 | ## Examples 138 | 139 | See more examples at `examples/` and a more realcase example at: 140 | 141 | 142 | 143 | ## Plugin gallery 144 | 145 | Plugins make `pipen` even better. 146 | 147 | - [`pipen-annotate`][26]: Use docstring to annotate pipen processes 148 | - [`pipen-args`][19]: Command line argument parser for pipen 149 | - [`pipen-board`][27]: Visualize configuration and running of pipen pipelines on the web 150 | - [`pipen-diagram`][18]: Draw pipeline diagrams for pipen 151 | - [`pipen-dry`][20]: Dry runner for pipen pipelines 152 | - [`pipen-filters`][17]: Add a set of useful filters for pipen templates. 153 | - [`pipen-lock`][25]: Process lock for pipen to prevent multiple runs at the same time. 154 | - [`pipen-log2file`][28]: Save running logs to file for pipen 155 | - [`pipen-poplog`][30]: Populate logs from jobs to running log of the pipeline 156 | - [`pipen-report`][16]: Generate report for pipen 157 | - [`pipen-runinfo`][29]: Save running information to file for pipen 158 | - [`pipen-verbose`][15]: Add verbosal information in logs for pipen. 159 | - [`pipen-gcs`][32]: A plugin for pipen to handle files in Google Cloud Storage. 160 | - [`pipen-cli-init`][21]: A pipen CLI plugin to create a pipen project (pipeline) 161 | - [`pipen-cli-ref`][31]: Make reference documentation for processes 162 | - [`pipen-cli-require`][24]: A pipen cli plugin check the requirements of a pipeline 163 | - [`pipen-cli-run`][22]: A pipen cli plugin to run a process or a pipeline 164 | 165 | [1]: https://pwwang.github.io/pipen 166 | [2]: https://pwwang.github.io/pipen/CHANGELOG 167 | [3]: https://pwwang.github.io/pipen/examples 168 | [4]: https://pwwang.github.io/pipen/api/pipen 169 | [5]: https://img.shields.io/librariesio/release/pypi/pipen?style=flat-square 170 | [6]: https://img.shields.io/pypi/v/pipen?style=flat-square 171 | [7]: https://pypi.org/project/pipen/ 172 | [8]: https://img.shields.io/github/v/tag/pwwang/pipen?style=flat-square 173 | [9]: https://github.com/pwwang/pipen 174 | [10]: https://img.shields.io/github/actions/workflow/status/pwwang/pipen/build.yml?style=flat-square 175 | [11]: https://img.shields.io/github/actions/workflow/status/pwwang/pipen/docs.yml?label=docs&style=flat-square 176 | [12]: https://img.shields.io/codacy/grade/cf1c6c97e5c4480386a05b42dec10c6e?style=flat-square 177 | [13]: https://app.codacy.com/gh/pwwang/pipen 178 | [14]: https://img.shields.io/codacy/coverage/cf1c6c97e5c4480386a05b42dec10c6e?style=flat-square 179 | [15]: https://github.com/pwwang/pipen-verbose 180 | [16]: https://github.com/pwwang/pipen-report 181 | [17]: https://github.com/pwwang/pipen-filters 182 | [18]: https://github.com/pwwang/pipen-diagram 183 | [19]: https://github.com/pwwang/pipen-args 184 | [20]: https://github.com/pwwang/pipen-dry 185 | [21]: https://github.com/pwwang/pipen-cli-init 186 | [22]: https://github.com/pwwang/pipen-cli-run 187 | [23]: 
https://libraries.io/github/pwwang/pipen#repository_dependencies 188 | [24]: https://github.com/pwwang/pipen-cli-require 189 | [25]: https://github.com/pwwang/pipen-lock 190 | [26]: https://github.com/pwwang/pipen-annotate 191 | [27]: https://github.com/pwwang/pipen-board 192 | [28]: https://github.com/pwwang/pipen-log2file 193 | [29]: https://github.com/pwwang/pipen-runinfo 194 | [30]: https://github.com/pwwang/pipen-poplog 195 | [31]: https://github.com/pwwang/pipen-cli-ref 196 | [32]: https://github.com/pwwang/pipen-gcs 197 | -------------------------------------------------------------------------------- /docs/basics.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Layers of a pipeline 4 | 5 | ![Layers](./layers.png) 6 | 7 | The pipeline consists of channels and processes. A process may have many jobs. Each job uses the corresponding elements from the input channel of the process (a row of the input channel/dataframe), and generates values for output channel. 8 | Actually, what you need to do is just specify the first input channel, and then tell `pipen` the dependencies of the processes. The later processes will use the output channel of the processes they depend on. Of course, you can also modify the output channel to match the input of the next processes, using functions. 9 | 10 | ## Folder structure 11 | ``` 12 | ./ 13 | |- pipeline.py 14 | `- / 15 | `- / 16 | |- proc.name 17 | `- / 18 | |- input/ 19 | |- output/ 20 | |- job.signature.toml 21 | |- job.script 22 | |- job.rc 23 | |- job.stdout 24 | |- job.stderr 25 | |- job.status 26 | `- job.wrapped. 27 | ``` 28 | 29 | | Path | Content | Memo | 30 | |------|---------|------| 31 | |``|Where the pipeline directories of all processes of current pipeline are located.|Can be set by `workdir`| 32 | |``|The slugified name of the pipeline.|| 33 | |`/`|The job directory|Starts with `0`| 34 | |`/output/`|Where you can find all the output files|If this is an end process, it should be a link to the output directory of this process of the pipeline| 35 | |`/job.signature.toml`|The signature file of the job, used to check if job is cached|| 36 | |`/job.script`|The rendered script file|| 37 | |`/job.rc`|To file containing the return code|| 38 | |`/job.stdout`|The STDOUT of the script|| 39 | |`/job.stderr`|The STDERR of the script|| 40 | |`/job.statis`|The status of the job|| 41 | |`/job.wrapped.`|The wrapper for the scheduler to wrap the script|| 42 | 43 | -------------------------------------------------------------------------------- /docs/caching.md: -------------------------------------------------------------------------------- 1 | 2 | ## Job caching 3 | 4 | If `cache` set to `False` (detected in the sequence of configuration files, `Pipen` constructor, and process definition), the job is running anyway regardless of previous runs. 5 | 6 | If a previous run of a job fails, the job will be running anyway. 7 | 8 | If a job is done successfully, a signature file will be generated for the job. When we try to run the job again, the signature will be used to check if we can skip running the job again but to use the results generated by previous run. 9 | 10 | We can also do a force-cache for a job by setting `cache` to `"force"`. This make sure of the results of previous successful run regardless of input or script changes. 
This is useful for the cases that, for example, you make some changes to the input/script, but you don't want them to take effect immediately, especially when the job takes a long time to run.
11 | 
12 | ## Job signature
13 | 
14 | The signature of a job consists of the input types and data, the output types and data, and the latest time (`latest_time`) at which any files/directories from the script, input or output were generated/modified. So these situations will make the job-cache check fail (the job will start over):
15 | 
16 | 1. Any changes in `input` or `output` types
17 | 2. Any changes in `input` or `output` data
18 | 3. Any changes to `script`
19 | 4. Any touches to input files (since they will make the last modified time > `latest_time`)
20 | 5. Any touches to input directories
21 |    - Use `dirsig` as the depth to check the files under the directories
22 |    - Otherwise, if it is `0`, only the directories themselves are checked. Note that modifying a file inside a directory may not change the last modified time of the directory itself.
23 | 6. Any deletions of the output files/directories
24 | Note that only the files/directories specified by `output` are checked. Files or subdirectories in the output directories will NOT be checked.
25 | 
-------------------------------------------------------------------------------- /docs/channel-collapse_files.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pwwang/pipen/0f99f02de29d15bf8426805a74ce9bca99bdcc03/docs/channel-collapse_files.png
-------------------------------------------------------------------------------- /docs/channel-expand_dir.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pwwang/pipen/0f99f02de29d15bf8426805a74ce9bca99bdcc03/docs/channel-expand_dir.png
-------------------------------------------------------------------------------- /docs/channels.md: --------------------------------------------------------------------------------
1 | 
2 | Channels are used to pass data from one process to another. A channel is actually a `pandas.DataFrame` object, where each column corresponds to an input key and each row corresponds to a job.
3 | 
4 | 
5 | The values for different variables in different jobs will be:
6 | 
7 | | Job Index | v1 | v2 | v3 |
8 | |-----------|----|-----|----|
9 | | 0 | a1 | b1 | c1 |
10 | | 1 | a2 | b2 | c2 |
11 | | ... |... | ... |... |
12 | 
13 | With a process definition:
14 | 
15 | ```python
16 | class MyProcess(Proc):
17 |     input = "v1, v2, v3"
18 |     input_data = df  # The above data frame
19 | ```
20 | 
21 | Then:
22 | 
23 | |Job index|Template|Rendered to|
24 | |-|-|-|
25 | |0|`{{in.v1}}`|`a1`|
26 | |0|`{{in.v2}}`|`b1`|
27 | |0|`{{in.v3}}`|`c1`|
28 | |1|`{{in.v1}}`|`a2`|
29 | |1|`{{in.v2}}`|`b2`|
30 | |1|`{{in.v3}}`|`c2`|
31 | |...|...|...|
32 | 
33 | The column names don't have to match the exact input keys. If `pipen` finds any of the input keys present in the data, it just uses them. However, if any input keys cannot be found in the data frame, the first couple of columns will be used.
34 | 
35 | For example:
36 | ```python
37 | class MyProcess2(Proc):
38 |     input = "v4, v3"
39 |     input_data = df  # The above data frame
40 | ```
41 | 
42 | Then for job #0, `{{in.v4}}` will be rendered as `a1` (using column `v1` in the data), and `{{in.v3}}` as `c1` (using column `v3`).
43 | 
44 | 
45 | ## Creating channels
46 | 
47 | Since channels are just data frames, whatever creates a pandas data frame can be used to create a channel.
Besides, a couple of class methods are avaible to create channels: 48 | 49 | - `Channel.create(...)` 50 | 51 | This takes a list of values to create a channel. If a data frame is passed, will return that data frame. 52 | 53 | If each element in the list is a tuple, the list is used to create a data frame directly, just like: 54 | 55 | ```python 56 | from pandas import DataFrame 57 | ch = Channel.create([(1,2), (3,4)]) 58 | # ch = DataFrame([(1,2), (3,4)]) 59 | # 0 1 60 | # 61 | # 0 1 2 62 | # 1 3 4 63 | ``` 64 | 65 | If each element is not a tuple (even it is a list), it is converted to tuple: 66 | ```python 67 | ch = Channel.create([1, 2]) 68 | # equvalent to: 69 | # ch = Channel.create([(1, ), (2, )]) 70 | ``` 71 | 72 | The `input_data` is passed to this class method to create the input channel. 73 | 74 | - `Channel.from_glob(...)` 75 | 76 | This takes a glob pattern to match the files to create a single-column channel. 77 | 78 | You can also filter the types of files by `ftype`: 79 | - `any`: to match any files (default) 80 | - `link`: to mach any links 81 | - `dir`: to match any directories 82 | - `file`: to match any files 83 | 84 | You may also sort the files using `sortby`: 85 | - `name`: sort the files by their basename (default) 86 | - `mtime`: sort the files by their last modified time 87 | - `size`: sort by file size 88 | 89 | When `reverse` is True, the above sortings are reversed. 90 | 91 | - `Channel.from_pairs(...)` 92 | 93 | Like `Channel.from_glob()` but create a double-column channel. 94 | 95 | - `Channel.from_csv(...)` 96 | 97 | Uses `pandas.read_csv()` to create a channel 98 | 99 | - `Channel.from_excel(...)` 100 | 101 | Uses `pandas.read_excel()` to create a channel 102 | 103 | - `Channel.from_table(...)` 104 | 105 | Uses `pandas.read_table()` to create a channel 106 | 107 | 108 | ## Builtin verbs/functions to transform channels 109 | 110 | `pipen` uses [`pipda`][1] to create some verbs/functions to transform channels, so that you can use them with piping syntax: 111 | 112 | ```python 113 | channel >> verb(...) 114 | ``` 115 | 116 | ### Expanding a channel by directory: `expand_dir()` 117 | 118 | Sometimes we prepare files in one process (for example, split a big file into small ones in a directory), then handle these files by different jobs in another process, so that they can be processed simultaneously. 119 | 120 | ![channel.expand_dir](./channel-expand_dir.png) 121 | 122 | For example: 123 | ```python 124 | 125 | class P1(Proc): 126 | # the original file: a.txt 127 | input = "infile:file" 128 | input_data = ["a.txt"] 129 | output = "outdir:dir:outdir" 130 | script = "# the script to split a.txt to 1.txt, 2.txt, 3.txt ... to {{out.outdir}}" 131 | 132 | class P2(Proc): 133 | requires = P1 134 | # expand channel [("outdir/a/",)] to channel: 135 | # [("outdir/a/1.txt",), ("outdir/a/2.txt",), ("outdir/a/3.txt",), ...] 136 | input = "infile:file" 137 | input_data = lambda ch: ch >> expand_dir(pattern="*.txt") 138 | # outfile: 1.result, 2.result, ... 139 | output = "outfile:file:{{in.infile.split('/')[-1].split('.')[0]}}.result" 140 | script = """ 141 | # work on {{in.infile}} (1.txt, 2.txt, 3.txt, ...) 142 | # to result file {{out.outfile}} (1.result, 2.result, 3.result, ...) 143 | """ 144 | 145 | # Run 3 jobs in a batch simultaneously 146 | Pipen(forks=3).run(P1) 147 | ``` 148 | 149 | If the channel is a multi-column channel, you can also specify `col` to expand only on that column, values of other columns will be copied to the expanded rows/jobs. 
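As a concrete illustration of the `col` argument, here is a minimal sketch (not taken from this page; it assumes `expand_dir` can be imported from `pipen.channel` and that `col` accepts a column name as well as an index):

```python
from pipen.channel import Channel, expand_dir

# A two-column channel: a sample name plus the directory produced by a previous step
ch = Channel.create([("sample1", "outdir/a/")])
ch.columns = ["sample", "datadir"]

# Expand only the "datadir" column; the "sample" value is copied to every new row
expanded = ch >> expand_dir(col="datadir", pattern="*.txt")
#    sample          datadir
# 0  sample1  outdir/a/1.txt
# 1  sample1  outdir/a/2.txt
# ...
```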
150 | 151 | You can also filter and sort the expanded files using arguments `ftype`, `sortby` and `reverse`, just like when we use `Channel.from_glob(...)` 152 | 153 | !!! caution 154 | 155 | - `expand_dir(...)` only works for single-row channels, which will be expanded to `N` (number of files included). If original channel has more than 1 row, only first row will be used, and other rows will be ignored. 156 | - Only the value of the column to be expanded will be changed, values of other columns remain the same. 157 | 158 | ### Collapsing a channel by files in a common ancestor directory: `collapse_files(...)` 159 | 160 | It's basically the reverse process of `expand_dir()`. It applies when you deal with different files and in next process you need them all involved (i.e. combine the results): 161 | 162 | ![channel.collapse_files](./channel-collapse_files.png) 163 | 164 | For example: 165 | ```python 166 | 167 | class P1(Proc): 168 | input = "infile:file" 169 | input_data = ["/a/b/1.txt", "/a/b/2.txt", "/a/b/3.txt"] 170 | output = "outfile:file:{{in.infile.split('/')[-1].split('.')[0] | append: '.txt2'}}" 171 | script = """ 172 | # the script to deal with each input file: 173 | # {{in.infile}} -> {{out.outfile}} 174 | """ 175 | 176 | class P2(Proc): 177 | requires = P1 178 | # collapse channel [("/1.txt2",), ("/2.txt2",), ("/3.txt2",)] 179 | # to channel: [("/", )] 180 | input = "indir:file" 181 | input_data = lambda ch: ch >> collapse_files() 182 | output = "outfile:file:{{in.indir.split('/')[-1]}}.result" 183 | script = """ 184 | # combine 1.txt2, 2.txt2, 3.txt3 in {{in.indir}} to {{out.outfile}} 185 | """ 186 | 187 | Pipen().run(P1) 188 | ``` 189 | 190 | Similarly, if we have multiple columns, you may specify the column by index or name to collapse by: 191 | `ch >> collapse_files(col=...)` 192 | 193 | !!! caution 194 | 195 | * `os.path.dirname(os.path.commonprefix(...))` is used to detect the common ancestor directory, so the files could be `['/a/1/1.file', '/a/2/1.file']`. In this case `/a/` will be returned. 196 | * values at other columns should be the same. They will NOT be checked! The values at the first row will be used. 197 | 198 | [1]: https://github.com/pwwang/pipda 199 | -------------------------------------------------------------------------------- /docs/cli.md: -------------------------------------------------------------------------------- 1 | `pipen` has a CLI tool that you can run from command line. 2 | 3 | To run it: 4 | 5 | ```shell 6 | ❯ pipen --help 7 | Usage: pipen [-h] {version,profile,plugins,help} ... 8 | 9 | CLI Tool for pipen v0.4.2 10 | 11 | Optional Arguments: 12 | -h, --help show help message and exit 13 | 14 | Subcommands: 15 | version Print versions of pipen and its dependencies 16 | profile List available profiles. 17 | plugins List installed plugins 18 | help Print help for commands 19 | ``` 20 | 21 | ## Writing a plugin to extend the cli 22 | 23 | ### CLI plugin abstract class 24 | 25 | A CLI plugin has to be a subclass of `pipen.cli.CLIPlugin`. 26 | 27 | A CLI plugin has to define a `name` property, which also is the sub-command of the plugin. 28 | 29 | There are a couple of methods of `pipen.cli.CLIPlugin` to extend for a plugin: 30 | 31 | - `__init__(self, parser, subparser)`: initialize the plugin 32 | It takes the main parser and the subparser of the sub-command as arguments. You can add arguments to the parser or subparser here. 33 | Check [argx][1] for more information about how to define arguments. 
34 | 
35 | - `parse_args(self)`: parse the arguments
36 |   It takes no arguments. It should parse the arguments and return the parsed arguments (a `Namespace`), which are used to execute the command.
37 |   By default, `self.parser.parse_args()` is called to parse the arguments.
38 | 
39 | - `exec_command(self, args)`: execute the command
40 |   It takes the parsed arguments as its argument. It should execute the command as you wish.
41 | 
42 | ### Loading CLI plugins
43 | 
44 | Like pipen [plugins][2], [templates][3], and [schedulers][4], there are two ways to load the CLI plugins:
45 | 
46 | 1. Use the plugin directly:
47 | 
48 |     ```python
49 |     from pipen.cli import cli_plugin
50 | 
51 |     cli_plugin.register()
52 |     ```
53 | 
54 | 2. Use the entry points with the group name `pipen_cli`
55 | 
56 | 
57 | ## The `profile` subcommand
58 | 
59 | It is used to list the configurations/profiles in the current directory. Run `pipen profile` or `pipen help profile` to get more information.
60 | 
61 | ## The `plugins` subcommand
62 | 
63 | This subcommand lists the plugins for `pipen` itself, templates, schedulers and the CLI. Run `pipen plugins` or `pipen help plugins` to get more information.
64 | 
65 | ## The `version` subcommand
66 | 
67 | This command prints the versions of `pipen` and its dependencies.
68 | 
69 | ## CLI plugin gallery
70 | 
71 | - [`pipen-cli-init`][5]: A pipen CLI plugin to create a pipen project (pipeline)
72 | - [`pipen-cli-ref`][6]: Make reference documentation for processes
73 | - [`pipen-cli-require`][7]: A pipen CLI plugin to check the requirements of a pipeline
74 | - [`pipen-cli-run`][8]: A pipen CLI plugin to run a process or a pipeline
75 | 
76 | [1]: https://github.com/pwwang/argx
77 | [2]: ../plugin
78 | [3]: ../templating
79 | [4]: ../scheduler
80 | [5]: https://github.com/pwwang/pipen-cli-init
81 | [6]: https://github.com/pwwang/pipen-cli-ref
82 | [7]: https://github.com/pwwang/pipen-cli-require
83 | [8]: https://github.com/pwwang/pipen-cli-run
84 | 
-------------------------------------------------------------------------------- /docs/cloud.md: --------------------------------------------------------------------------------
1 | Since `v0.16.0`, `pipen` supports the cloud natively. There are two modes of cloud support:
2 | 
3 | - Run the pipeline locally (or via schedulers like `sge`, `slurm`, etc.) and save the files to the cloud.
4 | - Run the pipeline on the cloud.
5 | 6 | ## Run the pipeline locally and save the files to the cloud 7 | 8 | To run the pipeline locally and save the files to the cloud, you need to install `pipen` with cloud support: 9 | 10 | ```bash 11 | pip install xqute[cloudsh] 12 | # To support a specific cloud service provider 13 | pip install cloudpathlib[s3] 14 | pip install cloudpathlib[gs] 15 | pip install cloudpathlib[azure] 16 | ``` 17 | 18 | The you can directly assign a cloud path as a pipeline working directory: 19 | 20 | ```python 21 | from pipen import Pipen, Proc, run 22 | 23 | 24 | class P1(Proc): 25 | """Sort input file""" 26 | input = "in:var" 27 | input_data = ["Hello World"] 28 | output = "outfile:file:out.txt" 29 | # Note that out.outfile is on the cloud but the script is executed locally 30 | # we can use cloudsh to save the output to the cloud 31 | script = "echo {{in.in}} | cloudsh sink {{out.outfile}}" 32 | 33 | 34 | class MyPipeline(Pipen): 35 | starts = P1 36 | workdir = "gs://mybucket/mypipeline/workdir" 37 | output = "gs://mybucket/mypipeline/output" 38 | 39 | 40 | if __name__ == "__main__": 41 | MyPipeline().run() 42 | ``` 43 | 44 | Like the following figure, the pipeline is run locally but the meta information is grabbed from and saved to the cloud (workdir). 45 | No local files are generated. 46 | 47 | For the output files, if a process is a non-export process, the output files are saved to the workdir. 48 | If a process is an export process, the output files are saved to the output directory (export dir). 49 | 50 | ![pipen-cloud1](./pipen-cloud1.png) 51 | 52 | ## Run the pipeline on the cloud 53 | 54 | Currently, `pipen` only supports running the pipeline on the cloud with google batch jobs. 55 | 56 | To run the pipeline on the cloud, you need to install `pipen` with cloud support: 57 | 58 | ```bash 59 | pip install xqute[gs] 60 | ``` 61 | 62 | It is used to communicate with google cloud storage files. No `cloudsh` is needed, since operating the cloud files will be happening on the cloud (with the cloud paths mounted to the VM). You also need to have [google cloud sdk][1] installed and configured, which is used to communicate with google batch jobs (submit jobs, get job status, etc.). 63 | 64 | ```python 65 | from pipen import Pipen, Proc, run 66 | 67 | 68 | class P1(Proc): 69 | """Sort input file""" 70 | input = "in:var" 71 | input_data = ["Hello World"] 72 | output = "outfile:file:out.txt" 73 | # Note that out.outfile is on the cloud but the script is executed locally 74 | # we can use cloudsh to save the output to the cloud 75 | script = "echo {{in.in}} | cloudsh sink {{out.outfile}}" 76 | 77 | 78 | class MyPipeline(Pipen): 79 | starts = P1 80 | workdir = "gs://mybucket/mypipeline/workdir" 81 | output = "gs://mybucket/mypipeline/output" 82 | scheduler = "gbatch" 83 | 84 | 85 | if __name__ == "__main__": 86 | MyPipeline().run() 87 | ``` 88 | 89 | The only difference is that we need to set `scheduler` to `gbatch` (google batch jobs). 90 | 91 | As shown in the following figure, the pipeline is run on the cloud platform, and the workdir and export dir will be mounted to the VM. So the process script can directly access the cloud files, no `cloudsh` or `gcloud` tools are needed. 
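Because those paths are mounted, the script from the earlier example no longer needs `cloudsh` when it runs under the `gbatch` scheduler. Here is a minimal sketch of the process rewritten accordingly (illustrative only; the actual mounting is handled by the scheduler):

```python
class P1(Proc):
    """Write the greeting to a cloud output file"""
    input = "in:var"
    input_data = ["Hello World"]
    output = "outfile:file:out.txt"
    # Under gbatch, {{out.outfile}} renders to a path mounted on the VM,
    # so plain shell redirection is enough -- no cloudsh required.
    script = "echo {{in.in}} > {{out.outfile}}"
```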
92 | 93 | ![pipen-cloud2](./pipen-cloud2.png) 94 | 95 | [1]: https://cloud.google.com/sdk?hl=en 96 | -------------------------------------------------------------------------------- /docs/configurations.md: -------------------------------------------------------------------------------- 1 | 2 | ## Configuration items 3 | 4 | There are two levels of configuration items in `pipen`: pipeline level and process level. 5 | 6 | There are only 3 configuration items at pipeline level: 7 | 8 | - `loglevel`: The logging level for the logger (Default: `"info"`) 9 | - `workdir`: Where the metadata and intermediate files are saved for the pipeline (Default: `./.pipen`) 10 | - `plugins`: The plugins to be enabled or disabled for the pipeline 11 | 12 | These items cannot be set or changed at process level. 13 | 14 | Following items are at process level. They can be set changed at process level so that they can be process-specific. You may also see some of the configuration items introduced [here][1] 15 | 16 | - `cache`: Should we detect whether the jobs are cached? See also [here][2] 17 | - `dirsig`: When checking the signature for caching, whether should we walk through the content of the directory? This is sometimes time-consuming if the directory is big. 18 | - `error_strategy`: How to deal with the errors: retry, ignore or halt. See also [here][3] 19 | - `num_retries`: How many times to retry to jobs once error occurs. 20 | - `template`: efine the template engine to use. See also [here][4] 21 | - `template_opts`: Options to initialize the template engine (will inherit from pipeline level) 22 | - `forks`: How many jobs to run simultaneously? 23 | - `lang`: The language for the script to run. See also [here][5] 24 | - `plugin_opts`: Options for process-level plugins, will inherit from pipeline level 25 | - `scheduler`: The scheduler to run the jobs 26 | - `scheduler_opts`: The options for the scheduler, will inherit from pipeline level 27 | - `submission_batch`: How many jobs to be submited simultaneously 28 | 29 | ## Configuration priorities 30 | 31 | There are different places to set values for the configuration items (priorities from low to high): 32 | 33 | - The configuration files (priorities from low to high): 34 | 35 | - `~/.pipen.toml` 36 | - `./.pipen.toml` 37 | - `PIPEN.osenv` 38 | 39 | See [here][6] for how the configuration files are loaded. 40 | `pipen` uses `TOML` as configuration language, see [here][7] for more information about `toml` format. 41 | 42 | - The arguments of `Pipen` constructor 43 | - The process definition 44 | 45 | !!! note 46 | 47 | The configurations from configuration files are with profiles. If the same profile name appears in multiple configuration files, the items will be inherited from the lower-priority files. 48 | 49 | !!! note 50 | 51 | Special note for `lang`. 52 | 53 | If it is not set at process level, and there are shebang in the script, whatever you specified at pipeline level (including in the configuration files), it will be ignored and the interpreter in the shebang will be used. 54 | 55 | See also [script][5] 56 | 57 | !!! tip 58 | 59 | If you have nothing set at `Pipen` constructor or process definition for a configuration item, the `PIPEN.osenv` is useful to use a different value than the one set in other configuration files. For example, to disable cache for all processes: 60 | 61 | ``` 62 | PIPEN_DEFAULT_cache=0 python ./pipeline.py ... 
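# (Added sketch, not from the original docs.) The same PIPEN_DEFAULT_<item>
# pattern should apply to other process-level items as well; treat the exact
# variable name below as an assumption:
PIPEN_DEFAULT_forks=4 python ./pipeline.py ...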
63 | ``` 64 | 65 | ## Profiles 66 | 67 | You can have different profiles in configuration files: 68 | 69 | `~/.pipen.toml` 70 | ```toml 71 | [default] 72 | scheduler = "local" 73 | 74 | [sge] 75 | scheduler = "sge" 76 | 77 | [sge.schduler_opts] 78 | sge_q = "1-day" 79 | ``` 80 | 81 | 82 | To use the `sge` profile: 83 | 84 | ```python 85 | Pipen().run(P1, profile="sge") 86 | ``` 87 | 88 | You can also have a configuration in current directory: 89 | 90 | `./.pipen.toml` 91 | ```toml 92 | [sge.scheduler_opts] 93 | sge_q = "7-days" 94 | ``` 95 | 96 | Then the queue to run the jobs will be `7-days`. Note that we didn't specify the `scheduler` in `./.pipen.toml`, which is inherited from `~/.pipen.toml`. 97 | 98 | [1]: ../defining-proc 99 | [2]: ../caching 100 | [3]: ../error 101 | [4]: ../templating 102 | [5]: ../script 103 | [6]: https://github.com/pwwang/python-simpleconf#loading-configurations 104 | [7]: https://github.com/toml-lang/toml 105 | -------------------------------------------------------------------------------- /docs/defining-proc.md: -------------------------------------------------------------------------------- 1 | A pipeline consists of many processes, which could own multiple jobs that run in parallel. 2 | 3 | ## Defining/Creating processes 4 | 5 | `pipen` has two (preferred) ways to define processes: 6 | 7 | ### Subclassing `pipen.Proc` 8 | 9 | ```python 10 | from pipen import Proc 11 | 12 | class MyProcess(Proc): 13 | ... # process configurations 14 | ``` 15 | 16 | The configurations are specified as class variables of the class. 17 | 18 | 19 | 20 | ### Using class method `Proc.from_proc()` 21 | 22 | If you want to reuse a defined process, you can either subclass it: 23 | 24 | ```python 25 | class MyOtherProcess(MyProcess): 26 | ... # configurations inherited from MyProcess 27 | ``` 28 | 29 | Or use `Proc.from_proc()`: 30 | 31 | ```python 32 | # You can also pass the configurations you want to override 33 | MyOtherProcess = Proc.from_proc(MyProcess, ...) 34 | ``` 35 | 36 | Note that `Proc.from_proc()` cannot override all configurations/class variables, because we assume that there are some shared configurations if you want to "copy" from another process. 37 | 38 | These shared configurations are: 39 | 40 | 1. Template engine and its options (`template` and `template_opts`) 41 | 2. Script template (`script`) 42 | 3. Input keys (`input`) 43 | 4. Language/Interpreter of the script (`lang`) 44 | 5. Output keys (`output`) 45 | 46 | 47 | All other configurations can be passed to `Proc.from_proc()` to override the old ones. 48 | 49 | For all configurations/class variables for a process, see next section. 50 | 51 | You don't need to specify the new name of the new process, the variable name on the left-handle side will be used if `name` argument is not provided to `Proc.from_proc()`. For example: 52 | 53 | ```python 54 | NewProc = Proc.from_proc(OldProc) 55 | # NewProc.name == "NewProc" 56 | ``` 57 | 58 | But you are able to assign a different name to a new process if you want. For example: 59 | 60 | ```python 61 | NewProc = Proc.from_proc(OldProc, name="NewProc2") 62 | # NewProc.name = "NewProc2" 63 | ``` 64 | 65 | ### How about instantiation of `Proc` directly? 66 | 67 | You are not allowed to do that. `Proc` is an abstract class, which is designed to be subclassed. 68 | 69 | ### How about instantiation of a `Proc` subclass? 70 | 71 | Nope, in `pipen`, a process is a `Proc` subclass itself. The instances of the subcleasses are used internally, and they are singletons. 
In most cases, you don't need to use the instances, unless you want to access the computed properties of the instances, including: 72 | 73 | - `pipeline`: The pipeline, which is a `Pipen` object 74 | - `pbar`: The progress bar for the process, indicating the job status of this process 75 | - `jobs`: The jobs of this process 76 | - `xqute`: The `Xqute` object to manage the job running. 77 | - `template`: The template engine (a `pipen.template.Template` object) 78 | - `template_opts`: The template options (overwritten from config by the `template_opts` class variable) 79 | - `input`: The sanitized input keys and types 80 | - `output`: The compiled output template, ready for the jobs to render with their own data 81 | - `scheduler`: The scheduler object (inferred from the name or sheduler object from the `scheduler` class variable) 82 | - `script`: The compiled script template, ready for the jobs to render with their own data 83 | 84 | ### How about copy/deep-copy of a `Proc` subclass? 85 | 86 | Nope. Copy or deep-copy of a `Proc` subclass won't trigger `__init_subclass__()`, where consolidate the process name from the class name if not specified and connect the required processes with the current one. Copy or deep-copy keeps all properties, but disconnect the relationships between current process and the dependency processes, even with a separate assignment, such as `MyProcess.requires = ...`. 87 | 88 | ## process configurations and `Proc` class variables 89 | 90 | The configurations of a process are specified as class variables of subclasses of `Proc`. 91 | 92 | |Name|Meaning|Can be overwritten by `Proc.from_proc()`| 93 | |-|-|-| 94 | |`name`|The name of the process. Will use the class name by default.|Yes| 95 | |`desc`|The description of the process. Will use the summary from the docstring by default.|Yes| 96 | |`envs`|The env variables that are job-independent, useful for common options across jobs.|Yes, and old ones will be inherited| 97 | |`cache`|Should we detect whether the jobs are cached?|Yes| 98 | |`dirsig`|When checking the signature for caching, the depth we should walk through the content of the directory? This is sometimes time-consuming if the directory and the depth are big.|Yes| 99 | |`export`|When True, the results will be exported to `` Defaults to None, meaning only end processes will export. 
You can set it to True or False to explicitly enable or disable exporting for a process|Yes| 100 | |`error_strategy`|How to deal with the errors: retry, ignore, halt|Yes| 101 | |`num_retries`|How many times to retry the job once an error occurs|Yes| 102 | |`template`|Define the template engine to use.|No| 103 | |`template_opts`|Options to initialize the template engine.|No| 104 | |`forks`|How many jobs to run simultaneously?|Yes| 105 | |`input`|The keys and types for the input channel|No| 106 | |`input_data`|The input data (will be computed for dependent processes)|Yes| 107 | |`lang`|The language for the script to run.|No| 108 | |`order`|The execution order for the same dependency-level processes|Yes| 109 | |`output`|The output keys for the output channel|No| 110 | |`plugin_opts`|Options for process-level plugins|Yes| 111 | |`requires`|The dependency processes|Yes| 112 | |`scheduler`|The scheduler to run the jobs|Yes| 113 | |`scheduler_opts`|The options for the scheduler|Yes| 114 | |`script`|The script template for the process|No| 115 | |`submission_batch`|How many jobs to submit simultaneously|Yes| 116 | -------------------------------------------------------------------------------- /docs/error.md: -------------------------------------------------------------------------------- 1 | You can tell `pipen` what to do when a job fails to run. 2 | 3 | You can specify one of the following for `error_strategy`: 4 | 5 | - `halt`: Any failure will just halt the whole pipeline 6 | - `ignore`: Ignore the error and keep running (assuming the job runs successfully anyway) 7 | - `retry`: Retry the job 8 | - After `num_retries` retries, if the job is still failing, the pipeline is halted. 9 | 10 | `pipen` uses `xqute` to handle the errors. See also [here][1]. 11 | 12 | [1]: https://pwwang.github.io/xqute/api/xqute.defaults/#xqute.defaults.JobErrorStrategy 13 | -------------------------------------------------------------------------------- /docs/input-output.md: -------------------------------------------------------------------------------- 1 | 2 | ## Specify input of a process 3 | 4 | The input of a process is specified with `input`, the keys of the input data, and `input_data`, the actual input data. 5 | 6 | !!! tip 7 | 8 | Why separate the keys and data? 9 | 10 | Because the keys and the data are not always available together. For example, we need the keys to render the `output` and `script` templates, but the data may be deferred, to be obtained from the output of the required processes. 11 | 12 | 13 | The complete form of an input key (`input`) is `<key>:<type>`. The `<type>` could be `var`, `file`, `dir`, `files` or `dirs`. **A type of `var` can be omitted.** So `ph1, ph2` is the same as `ph1:var, ph2:var`. 14 | 15 | If a process requires other processes, the specified `input_data` will be ignored, and the output data of the required processes will be used instead: 16 | 17 | ```python 18 | class P1(Proc): 19 | input = "v1" 20 | output = "o1:{{in.v1}}" # pass v1 through as the output variable 21 | input_data = ["a"] 22 | 23 | class P2(Proc): 24 | input = "v2" 25 | output = "o2:{{in.v2}}" 26 | input_data = ["b"] 27 | 28 | class P3(Proc): 29 | requires = [P1, P2] 30 | input = "i1, i2" 31 | output = "o3:{{in.i1}}_{{in.i2}}" # will be "a_b" 32 | # input_data = [] # ignored with a warning 33 | 34 | Pipen().run(P1, P2) 35 | ``` 36 | 37 | !!! Tip 38 | 39 | The direct `input_data` is ignored, but you can use a callback to modify the input channel.
40 | For example: 41 | 42 | ```python 43 | class P4(Proc): 44 | requires = [P1, P2] 45 | input = "i1, i2" 46 | input_data = lambda ch: ch.applymap(str.upper) 47 | output = "o3:{{in.i1}}_{{in.i2}}" # will be "A_B" 48 | ``` 49 | 50 | !!! Note 51 | 52 | When the input data does not have enough columns, `None` will be used to fill them, with warnings. And when the input data has more columns than the input keys, the extra columns are dropped and ignored, also with warnings. 53 | 54 | ## Specify output of a process 55 | 56 | Different from input, you don't provide channels directly; instead, you tell `pipen` how to compute the output channel. The output can be a `list` or a `str`. If it's a `str`, a comma (`,`) is used to separate different keys: 57 | 58 | To use templating in `output`, see [`templating`][1]. 59 | 60 | ```python 61 | class P1(Proc): 62 | input = "invar, infile" 63 | input_data = [(1, "/a/b/c.txt")] 64 | output = ( 65 | "outvar:{{in.invar}}2, " 66 | "outfile:file:{{in.infile.split('/')[-1]}}2, " 67 | "outdir:dir:{{in.infile.split('/')[-1].split('.')[0]}}-dir" 68 | ) 69 | 70 | # The type 'var' is omitted in the first element. 71 | # The output channel will be: 72 | # 73 | # outvar outfile outdir 74 | # 75 | # 0 "12" "/c.txt2" "/c-dir" 76 | ``` 77 | 78 | ## Types of input and output 79 | 80 | ### Input 81 | 82 | |Type|Meaning| 83 | |----|-------| 84 | |`var`|Use the values directly| 85 | |`file`|Treat the data as a file path| 86 | |`dir`|Treat the data as a directory path| 87 | |`files`|Treat the data as a list of file paths| 88 | |`dirs`|Treat the data as a list of directory paths| 89 | 90 | For `file`/`files`, when checking whether a job is cached, their last modified time will be checked. 91 | 92 | For `dir`/`dirs`, if `dirsig > 0`, the files inside the directories will be checked. Otherwise, the directories themselves are checked for last modified time. 93 | 94 | 95 | ### Output 96 | 97 | |Type|Meaning|Memo| 98 | |----|-------|----| 99 | |`var`|Use the values directly|| 100 | |`dir`|Use the data as a directory path|The directory will be created automatically| 101 | |`file`|Use the data as a file path|| 102 | 103 | [1]: ../templating 104 | -------------------------------------------------------------------------------- /docs/layers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pwwang/pipen/0f99f02de29d15bf8426805a74ce9bca99bdcc03/docs/layers.png -------------------------------------------------------------------------------- /docs/pipen-cloud1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pwwang/pipen/0f99f02de29d15bf8426805a74ce9bca99bdcc03/docs/pipen-cloud1.png -------------------------------------------------------------------------------- /docs/pipen-cloud2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pwwang/pipen/0f99f02de29d15bf8426805a74ce9bca99bdcc03/docs/pipen-cloud2.png -------------------------------------------------------------------------------- /docs/plugin.md: -------------------------------------------------------------------------------- 1 | `pipen` uses [`simplug`][1] for plugin support. There is a rich set of hooks available for you to write your own plugins to extend `pipen`.
2 | 3 | ## Runtime plugins 4 | 5 | ### Plugin hooks 6 | 7 | To implement a function in your plugin, just simply: 8 | 9 | ```python 10 | from pipen import plugin 11 | 12 | @plugin.impl 13 | [async ]def hook(...): 14 | ... 15 | ``` 16 | 17 | Note that you have to use keyword-arguments and they have to match the hook signature. 18 | 19 | See [`simplug`][1] for more details. 20 | 21 | #### Pipeline-level hooks 22 | 23 | - `on_setup(config)` (sync): 24 | 25 | Setup for the plugin, mainly used for initalization and set the default values for the plugin configuration items. 26 | 27 | This is only called once even when you have multiple pipelines (`Pipen` objects) in a python session. 28 | 29 | - `on_init(pipen)` (async) 30 | 31 | Called when pipeline is initialized. Note that here only default configurations are loaded (from defaults.CONFIG and config files). The configurations from `Pipen` constructor and the processes are not loaded yet. It's useful for plugins to change the default configurations. 32 | 33 | - `on_start(pipen)` (async) 34 | 35 | Right before the pipeline starts to run. The process relationships are inferred here. 36 | You can access the start processes by `pipen.starts` and all processes by `pipen.procs` in the sequence of the execution order. 37 | 38 | - `on_complete(pipen, succeeded)` (async) 39 | 40 | After all processes finish. `succeeded` indicates whether all processes/jobs finish successfully. 41 | 42 | #### Process-level hooks 43 | 44 | - `on_proc_create(proc)` (sync) 45 | 46 | Called before proc get instantiated. 47 | Enables plugins to modify the default attributes of processes 48 | 49 | - `on_proc_input_computed(proc)` (sync) 50 | 51 | Called after process input data is computed. 52 | 53 | - `on_proc_script_computed(proc)` (sync) 54 | 55 | Called after process script is computed. 56 | 57 | The script is computed as a string that is about to compiled into a 58 | template. You can modify the script here. 59 | 60 | - `on_proc_init(proc)` (async) 61 | 62 | When process object is initialized. 63 | Allows plugins to modify the process attributes after initialization, but 64 | before the jobs are initialized. 65 | 66 | - `on_proc_start(proc)` (async) 67 | 68 | When process object initialization completes, including the `xqute` and job initialization. The `output_data` is also accessible here. The process is ready to run. 69 | 70 | - `on_proc_shutdown(proc, sig)` (sync) 71 | 72 | When the process is shut down (i.e. by ``). You can access the signal that shuts the process down by `sig`. Only first plugin (based on the priority) that implements this hook will get called. 73 | 74 | - `on_proc_done(proc, succeeded)` (async) 75 | 76 | When a process is done. 77 | 78 | #### Job-level hooks 79 | 80 | - `on_job_init(job)` (async) 81 | 82 | When a job is initialized 83 | 84 | - `on_job_queued(job)` (async) 85 | 86 | When a job is queued in xqute. Note it might not be queued yet in the scheduler system. 87 | 88 | - `on_job_submitting(job)` (async) 89 | 90 | When a job is submitting. 91 | 92 | The first plugin (based on priority) have this hook return `False` will cancel the submission 93 | 94 | - `on_job_submitted(job)` (async) 95 | 96 | When a job is submitted in the scheduler system. 97 | 98 | - `on_job_started(job)` (async) 99 | 100 | When a job starts to run in then scheduler system. 101 | 102 | - `on_job_polling(job)` (async) 103 | 104 | When status of a job is being polled. 105 | 106 | - `on_job_killing(job)` (async) 107 | 108 | When a job is being killed. 
109 | 110 | The first plugin (based on priority) have this hook return `False` will cancel the killing 111 | 112 | - `on_job_killed(job)` (async) 113 | 114 | When a job is killed 115 | 116 | - `on_job_succeeded(job)` (async) 117 | 118 | When a job completes successfully 119 | 120 | - `on_job_cached(job)` (async) 121 | 122 | When a job is cached 123 | 124 | - `on_job_failed(job)` (async) 125 | 126 | When a job is done but failed (i.e. return_code == 1). 127 | 128 | - `on_jobcmd_init(job) -> str` (sync) 129 | 130 | When the job command wrapper script is initialized before the prescript is run 131 | 132 | This should return a piece of bash code to be inserted in the wrapped job 133 | script (template), which is a python template string, with the following 134 | variables available: `status` and `job`. `status` is the class `JobStatus` from 135 | `xqute.defaults.py` and `job` is the `Job` instance. 136 | 137 | For multiple plugins, the code will be inserted in the order of the plugin priority. 138 | 139 | The code will replace the `#![jobcmd_init]` placeholder in the wrapped job script. 140 | See also 141 | 142 | - `on_jobcmd_prep(job) -> str` (sync) 143 | 144 | When the job command right about to be run 145 | 146 | This should return a piece of bash code to be inserted in the wrapped job 147 | script (template), which is a python template string, with the following 148 | variables available: `status` and `job`. `status` is the class `JobStatus` from 149 | `xqute.defaults.py` and `job` is the `Job` instance. 150 | 151 | The bash variable `$cmd` is accessible in the context. It is also possible to 152 | modify the `cmd` variable. Just remember to assign the modified value to `cmd`. 153 | 154 | For multiple plugins, the code will be inserted in the order of the plugin priority. 155 | Keep in mind that the `$cmd` may be modified by other plugins. 156 | 157 | The code will replace the `#![jobcmd_prep]` placeholder in the wrapped job script. 158 | See also 159 | 160 | - `on_jobcmd_end(job) -> str` (sync): 161 | 162 | When the job command finishes and after the postscript is run 163 | 164 | This should return a piece of bash code to be inserted in the wrapped job 165 | script (template), which is a python template string, with the following 166 | variables available: `status` and `job`. `status` is the class `JobStatus` from 167 | `xqute.defaults.py` and `job` is the `Job` instance. 168 | 169 | The bash variable `$rc` is accessible in the context, which is the return code 170 | of the job command. 171 | 172 | For multiple plugins, the code will be inserted in the order of the plugin priority. 173 | 174 | The code will replace the `#![jobcmd_end]` placeholder in the wrapped job script. 175 | See also 176 | 177 | ### Loading plugins 178 | 179 | You can specify the plugins to be loaded by specifying the names or the plugin itself in `plugins` configuration. With names, the plugins will be loaded from [entry points][2]. 180 | 181 | You can also disable some plugins if they are set in the lower-priority configurations. For example, you want to disable `pipen_verbose` (enabled in a configuration file) for a pipeline: 182 | 183 | ```python 184 | Pipen(..., plugins=["-pipen_verbose"]) 185 | ``` 186 | 187 | !!! note 188 | 189 | You can use `+` as prefix to enable a disabled plugin, or `-` as prefix to disable an enabled plugin. If no prefix is used, only the specified plugins will be enabled and all other plugins will be disabled. You should either use `+` or `-` for all plugins or none of them. 
If a plugin is not given as a string, it will be treated as `+plugin`. 190 | 191 | ### Writing a plugin 192 | 193 | You can write your own plugin by implementing some of the above hooks. You can import the plugin directly and add it to `Pipen(..., plugins=[...])`. For example: 194 | 195 | ```python 196 | from pipen import plugin, Pipen 197 | 198 | class PipenPlugin: 199 | 200 | @plugin.impl 201 | [async ]def hook(...): 202 | ... 203 | 204 | Pipen(..., plugins=[PipenPlugin]) 205 | ``` 206 | 207 | You can also use the entry point to register your plugin, using the group name `pipen`. 208 | 209 | For `setup.py`, you will need: 210 | ```python 211 | setup( 212 | # ... 213 | entry_points={"pipen": ["pipen_verbose = pipen_verbose"]}, 214 | # ... 215 | ) 216 | ``` 217 | 218 | For `pyproject.toml`: 219 | 220 | ```toml 221 | [tool.poetry.plugins.pipen] 222 | pipen_verbose = "pipen_verbose" 223 | ``` 224 | 225 | Then the plugin `pipen_verbose` can be loaded by `plugins=["+pipen_verbose"]` or disabled by `plugins=["-pipen_verbose"]`. 226 | 227 | #### Logging to the console from a plugin 228 | 229 | Of course you can do arbitrary logging from a plugin. However, to keep consistency with the main logger of `pipen`, the best practice is: 230 | 231 | ```python 232 | from pipen.utils import get_logger 233 | 234 | logger = get_logger("verbose", "info") 235 | 236 | # do some logging inside the hooks 237 | ``` 238 | 239 | The above code will produce some logging on the console like this: 240 | 241 | ```shell 242 | 11-04 12:00:19 I main ╭═══════════════════════════ Process ═══════════════════════════╮ 243 | 11-04 12:00:19 I main ║ Undescribed. ║ 244 | 11-04 12:00:19 I main ╰═══════════════════════════════════════════════════════════════╯ 245 | 11-04 12:00:19 I main Process: Workdir: '.pipen/process' 246 | 11-04 12:00:19 I verbose Process: size: 10 247 | 11-04 12:00:19 I verbose Process: [0/9] in.a: 0 248 | 11-04 12:00:19 I verbose Process: [0/9] out.b: pipeline-0-output/Process/0/a.txt 249 | ``` 250 | 251 | ## CLI plugins 252 | 253 | See [CLI][11] for more details. 254 | 255 | ## Plugin gallery 256 | 257 | - [`pipen-verbose`][3]: Add verbose information in logs for pipen. 258 | - [`pipen-report`][4]: Generate reports for pipen 259 | - [`pipen-filters`][8]: Add a set of useful filters for pipen templates. 260 | - [`pipen-diagram`][5]: Draw pipeline diagrams for pipen 261 | - [`pipen-args`][6]: Command line argument parser for pipen 262 | - [`pipen-dry`][7]: Dry runner for pipen pipelines 263 | - [`pipen-annotate`][12]: Use docstrings to annotate pipen processes 264 | - [`pipen-board`][13]: Visualize configuration and running of pipen pipelines on the web 265 | - [`pipen-lock`][14]: Process lock for pipen to prevent multiple runs at the same time. 266 | - [`pipen-log2file`][15]: Save running logs to file for pipen 267 | - [`pipen-poplog`][16]: Populate logs from jobs to the running log of the pipeline 268 | - [`pipen-runinfo`][17]: Save running information to file for pipen 269 | - [`pipen-gcs`][9]: A plugin for pipen to handle files in Google Cloud Storage.
270 | 271 | [1]: https://github.com/pwwang/simplug 272 | [2]: https://packaging.python.org/specifications/entry-points/ 273 | [3]: https://github.com/pwwang/pipen-verbose 274 | [4]: https://github.com/pwwang/pipen-report 275 | [5]: https://github.com/pwwang/pipen-diagram 276 | [6]: https://github.com/pwwang/pipen-args 277 | [7]: https://github.com/pwwang/pipen-dry 278 | [8]: https://github.com/pwwang/pipen-filters 279 | [9]: https://github.com/pwwang/pipen-gcs 280 | [11]: ../cli 281 | [12]: https://github.com/pwwang/pipen-annotate 282 | [13]: https://github.com/pwwang/pipen-board 283 | [14]: https://github.com/pwwang/pipen-lock 284 | [15]: https://github.com/pwwang/pipen-log2file 285 | [16]: https://github.com/pwwang/pipen-poplog 286 | [17]: https://github.com/pwwang/pipen-runinfo 287 | -------------------------------------------------------------------------------- /docs/proc-group.md: -------------------------------------------------------------------------------- 1 | A process group is a collection of processes that are related to each other. It is a convenient way to manage a set of processes. 2 | 3 | With `pipen`, not only a process can be reused, but also a group of processes can be reused. We just need to define the relationship between the processes in the group, and then we can reuse the group in other pipelines, or even run it directly as a pipeline. 4 | 5 | ## Define a process group 6 | 7 | To define a process group, we need to define a class that inherits from `pipen.procgroup.ProcGroup`. The class name will be the name of the group, unless we specify a `name` attribute. 8 | 9 | ```python 10 | from pipen.procgroup import ProcGroup 11 | 12 | class MyGroup(ProcGroup): 13 | ... 14 | ``` 15 | 16 | Note that the subclasses of `ProcGroup` are singleton classes. If you need to define multiple groups, you can define a base class and then inherit from it. 17 | 18 | ## Add processes to a group 19 | 20 | There are two ways to add processes to a group, using `pg.add_proc` or `ProcGroup.add_proc`, where `pg` is a process group instance. The first method is used after the group is instantiated and it decorates a process class directly. The second method is used before the group is instantiated and it decorates a property of `ProcGroup` that returns a process. 21 | 22 | 1. Using the `pg.add_proc()` decorator. 23 | 24 | ```python 25 | from pipen import Proc, ProcGroup 26 | 27 | class MyGroup(ProcGroup): 28 | ... 29 | 30 | pg = MyGroup() 31 | 32 | @pg.add_proc 33 | class MyProc(Proc): 34 | ... 35 | ``` 36 | 37 | 2. Using the `ProcGroup.add_proc()` decorator to decorate a property of the group class. 38 | 39 | ```python 40 | from pipen import Proc, ProcGroup 41 | 42 | class MyGroup(ProcGroup): 43 | 44 | @ProcGroup.add_proc 45 | def my_proc(self): 46 | class MyProc(Proc): 47 | ... 48 | return MyProc 49 | ``` 50 | 51 | This method adds a process at runtime, so it is useful when we want to add processes to a group dynamically. 52 | 53 | ## Access processes in a group 54 | 55 | We can access the processes in a group using the `pg.` attribute, where `pg` is a process group instance. Note that when we use the `ProcGroup.add_proc` method to add processes, the process name is the name of the property. 56 | 57 | However, you can always use `pg.procs.` to access the process, where the `` is the real name of the process. 58 | 59 | ```python 60 | from pipen import Proc, ProcGroup 61 | 62 | class MyGroup(ProcGroup): 63 | 64 | @ProcGroup.add_proc 65 | def my_proc(self): 66 | class MyProc(Proc): 67 | ... 
68 | return MyProc 69 | 70 | pg = MyGroup() 71 | assert pg.my_proc.name == 'MyProc' 72 | assert pg.procs.MyProc.name == 'MyProc' 73 | ``` 74 | 75 | We can use `pg.starts` to get the start processes of the group, which are the processes that have no required processes. So when you add processes to a group, remember to specify `.requires` for each process, unless they are start processes. 76 | 77 | ## Run a process group as a pipeline 78 | 79 | To run a process group as a pipeline, we can convert it to a pipeline using the `as_pipen()` method. The method takes the same arguments as the `Pipen` constructor. 80 | 81 | ```python 82 | from pipen import Proc, ProcGroup 83 | 84 | class MyGroup(ProcGroup): 85 | ... 86 | 87 | pg = MyGroup() 88 | 89 | @pg.add_proc 90 | class MyProc(Proc): 91 | ... 92 | 93 | pg.as_pipen().set_data(...).run() 94 | ``` 95 | 96 | ## Integrate a process group into a pipeline 97 | 98 | ```python 99 | from pipen import Proc, ProcGroup 100 | 101 | class MyGroup(ProcGroup): 102 | 103 | @ProcGroup.add_proc 104 | def my_proc(self): 105 | class MyProc(Proc): 106 | ... 107 | return MyProc 108 | 109 | @ProcGroup.add_proc 110 | def my_proc2(self): 111 | class MyProc2(Proc): 112 | requires = self.my_proc 113 | ... 114 | 115 | return MyProc2 116 | 117 | pg = MyGroup() 118 | 119 | class PrepareData(Proc): 120 | ... 121 | 122 | class PostGroup(Proc): 123 | requires = pg.my_proc2 124 | 125 | pg.my_proc.requires = PrepareData 126 | 127 | pipen = Pipen().set_starts(PrepareData).set_data(...).run() 128 | ``` 129 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs 2 | jinja2 3 | mkdocs-material 4 | pymdown-extensions 5 | mkapi-fix 6 | -------------------------------------------------------------------------------- /docs/running.md: -------------------------------------------------------------------------------- 1 | 2 | ## Creating a `Pipen` object 3 | 4 | The arguments for the constrctor are: 5 | 6 | - `name`: The name of the pipeline 7 | - `desc`: The description of the pipeline 8 | - `outdir`: The output directory of the pipeline. If not provided, defaults to `_results`. 9 | - `**kwargs`: Other configurations 10 | 11 | ## Specification of the start processes 12 | 13 | Once the requirements of the processes are specified, we are able to build the entire process dependency network. To start runing a pipeline, we just need to specify the start processes to start: 14 | 15 | ```python 16 | class P1(Proc): 17 | ... 18 | 19 | class P2(Proc): 20 | ... 21 | 22 | class P3(Proc): 23 | requires = [P1, P2] 24 | ... 25 | 26 | Pipen().set_starts(P1, P2) 27 | ``` 28 | 29 | You can specify the start processes individually, like we did above, or send a list of processes: 30 | 31 | ```python 32 | Pipen().set_starts([P1, P2]) 33 | ``` 34 | 35 | ## Setting input data for start processes 36 | 37 | Other than set the input data when defining a process, you can also specify the input data for start processes: 38 | 39 | ```python 40 | Pipen().set_starts(P1, P2).set_data(, ) 41 | ``` 42 | 43 | This is useful when you want to reuse the processes. 44 | 45 | The order of data in `.set_data()` has to be the same as the order of processes to be set in `.set_starts()`. When the `input_data` of a start process has already been set, an error will be raised. To use that `input_data`, use `None` in `.set_data()`. For example: 46 | 47 | ```python 48 | class P1(Proc): 49 | ... 
50 | 51 | class P2(Proc): 52 | input_data = [1] 53 | 54 | Pipen().set_starts(P1, P2).set_data(, None) 55 | ``` 56 | 57 | ## Running with a different profile 58 | 59 | `Pipen.run()` accepts an argument `profile`, which allows you to use different profile from configuration files to run the pipeline: 60 | 61 | ```python 62 | Pipen().run("sge") 63 | ``` 64 | 65 | See [configurations][1] for more details. 66 | 67 | ## Shortcut for running a pipeline 68 | 69 | ```python 70 | import pipen 71 | 72 | class P1(pipen.Proc): 73 | ... 74 | 75 | class P2(pipen.Proc): 76 | ... 77 | 78 | class P3(pipen.Proc): 79 | requires = [P1, P2] 80 | ... 81 | 82 | pipen.run("MyPipeline", starts=[P1, P2], data=[, ]) 83 | ``` 84 | 85 | ```python 86 | >>> help(pipen.run) 87 | 88 | run( 89 | name: 'str', 90 | starts: 'Type[Proc] | List[Type[Proc]]', 91 | data: 'Iterable' = None, 92 | *, 93 | desc: 'str' = None, 94 | outdir: 'str | PathLike' = None, 95 | profile: 'str' = 'default', 96 | **kwargs, 97 | ) -> 'bool' 98 | Shortcut to run a pipeline 99 | 100 | Args: 101 | name: The name of the pipeline 102 | starts: The start processes 103 | data: The input data for the start processes 104 | desc: The description of the pipeline 105 | outdir: The output directory of the results 106 | profile: The profile to use 107 | **kwargs: Other options pass to Pipen to create the pipeline 108 | 109 | Returns: 110 | True if the pipeline ends successfully else False 111 | ``` 112 | 113 | [1]: ../configurations 114 | -------------------------------------------------------------------------------- /docs/scheduler.md: -------------------------------------------------------------------------------- 1 | 2 | `pipen` can send jobs to different scheduler system to run. To specify the scheduler, use `scheduler` and `scheduler_opts` configurations. 3 | 4 | ## Default supported schedulers 5 | 6 | `pipen` uses [`xqute`][1] for scheduler backend support. By default, the `local` and `sge` schedulers are supported by `xqute`. They are also the supported schedulers supported by `pipen`. 7 | 8 | ### `local` 9 | 10 | This is the default scheduler used by `pipen`. The jobs will be run on the local machine. 11 | 12 | No scheduler-specific options are available. 13 | 14 | ### `sge` 15 | 16 | Send the jobs to run on `sge` scheduler. 17 | 18 | The `scheduler_opts` will be the ones supported by `qsub`. 19 | 20 | ### `slurm` 21 | 22 | Send the jobs to run on `slurm` scheduler. 23 | 24 | The `scheduler_opts` will be the ones supported by `sbatch`. 25 | 26 | ### `ssh` 27 | 28 | Send the jobs to run on a remote machine via `ssh`. 29 | 30 | The `scheduler_opts` will be the ones supported by `ssh`. 31 | 32 | See also [xqute][1]. 33 | 34 | ### `gbatch` 35 | 36 | Send the jobs to run using Google Batch Jobs. 37 | 38 | The `scheduler_opts` will be used to construct the job configuration (json) file. 39 | 40 | By default, `taskGroups[0].taskSpec.runnables[0].script.text` is set to run the job script, and `taskGroups[0].taskSpec.volumes[0]` and `taskGroups[0].taskSpec.volumes[1]` will be set to mount the workdir and output directory to the VM. 41 | The `scheduler_opts` will be used to set the other fields in the job configuration file. 42 | 43 | `gbatch` scheduler also supports a `fast_mount` option to speed up the mounting a cloud directory to the VM. For example, `scheduler_opts={"fast_mount": "gs://bucket/path:/mnt/dir"}` will mount `gs://bucket/path` to `/mnt/dir` on the VM. 
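As a minimal sketch of how these options come together (the process name, script and the commented-out Google Batch field below are illustrative assumptions, not options verified against the scheduler implementation; only `scheduler`, `scheduler_opts` and `fast_mount` are documented above):

```python
from pipen import Proc


class MyGBatchProc(Proc):
    """A hypothetical process sent to Google Batch via the gbatch scheduler."""
    input = "infile:file"
    output = "outfile:file:{{in.infile.stem}}.txt"
    script = "cat {{in.infile}} > {{out.outfile}}"
    scheduler = "gbatch"
    scheduler_opts = {
        # Documented option: mount an extra cloud directory into the VM
        "fast_mount": "gs://my-bucket/reference:/mnt/reference",
        # Other keys are merged into the job configuration (json) file;
        # the field below is only an illustrative example of such a key.
        # "allocationPolicy": {"instances": [{"policy": {"machineType": "e2-standard-4"}}]},
    }
```

The same options can also be put under a profile (e.g. a `[gbatch.scheduler_opts]` section) in a configuration file, so the process definition itself stays scheduler-agnostic.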
44 | 45 | ## Writing your own scheduler plugin 46 | 47 | To write a scheduler plugin, you need to subclass both `xqute.schedulers.scheduler.Scheduler` and `pipen.scheduler.SchedulerPostInit`. 48 | 49 | For examples of a scheduler plugin, see [local_scheduler][2], [sge_scheduler][3], [slurm_scheduler][4], [ssh_scheduler][5], and [gbatch_scheduler][6], and also `pipen.scheduler`. 50 | 51 | 52 | A scheduler class can be passed to `scheduler` configuration directly to be used as a scheduler. But you can also register it with entry points: 53 | 54 | For `setup.py`, you will need: 55 | ```python 56 | setup( 57 | # ... 58 | entry_points={"pipen_sched": ["mysched = pipen_mysched"]}, 59 | # ... 60 | ) 61 | ``` 62 | 63 | For `pyproject.toml`: 64 | ```toml 65 | [tool.poetry.plugins.pipen_sched] 66 | mysched = "pipen_mysched" 67 | ``` 68 | 69 | Then you can switch the scheduler to `mysched` by `scheduler="mysched"` 70 | 71 | 72 | [1]: https://github.com/pwwang/xqute 73 | [2]: https://github.com/pwwang/xqute/blob/master/xqute/schedulers/local_scheduler.py 74 | [3]: https://github.com/pwwang/xqute/blob/master/xqute/schedulers/sge_scheduler.py 75 | [4]: https://github.com/pwwang/xqute/blob/master/xqute/schedulers/slurm_scheduler.py 76 | [5]: https://github.com/pwwang/xqute/blob/master/xqute/schedulers/ssh_scheduler/ 77 | [4]: https://github.com/pwwang/xqute/blob/master/xqute/schedulers/gbatch_scheduler.py 78 | -------------------------------------------------------------------------------- /docs/script.md: -------------------------------------------------------------------------------- 1 | 2 | For templating in `script`, see [`templating`][2] 3 | 4 | ## Choosing your language 5 | 6 | You can specify the path of interpreter to `lang`. If the interpreter is in `$PATH`, you can directly give the basename of the interpreter (i.e. `python` instead of `/path/to/python`). 7 | 8 | For example, if you have your own perl installed at `/home/user/bin/perl`, then you need to tell `pipen` where it is: `lang = "/home/user/bin/perl"`. If `/home/user/bin` is in your `$PATH`, you can simply do: `lang = "perl"` 9 | 10 | You can also use [shebang][1] to specify the interperter: 11 | ```perl 12 | #!/home/usr/bin/perl 13 | # You perl code goes here 14 | ``` 15 | 16 | If you have shebang in your script, the `lang` specified in the configuration files and `Pipen` constructor will be ignored (but the one specified in process definition is not). 17 | 18 | ## Use script from a file 19 | 20 | You can also put the script into a file, and use it with a `file://` prefix: `script = "file:///a/b/c.pl"` 21 | 22 | !!! note 23 | 24 | You may also use a script file with a relative path, which is relative to where process is defined. For example: a process with `script = "file://./scripts/script.py"` is defined in `/a/b/pipeline.py`, then the script file refers to `/a/b/scripts/script.py` 25 | 26 | !!! hint 27 | 28 | Indents are important in python, when you write your scripts, you don't have to worry about the indents in your first empty lines. For example, you don't have to do this: 29 | 30 | ```python 31 | class P1(Proc): 32 | lang = "python" 33 | script = """ 34 | import os 35 | import re 36 | def somefunc (): 37 | pass 38 | """ 39 | ``` 40 | 41 | You can do this: 42 | 43 | ```python 44 | class P1(Proc): 45 | lang = "python" 46 | script = """ 47 | import os 48 | import re 49 | def somefunc (): 50 | pass 51 | """ 52 | ``` 53 | 54 | Only the first non-empty line is used to detect the indent for the whole script. 
55 | 56 | ## Debugging your script 57 | 58 | If you need to debug your script, you just need to find the real running script, which is at: `///job.script`. The template is rendered already in the file. You can debug it using the tool according to the language you used for the script. 59 | 60 | ## Caching your results 61 | 62 | Job results get automatically cached previous run is successful and input/output data are not changed, see [caching][3]. 63 | 64 | However, there are cases when you want to cache some results even when the job fails. For example, there is a very time-consuming chunk of code in your script that you don't want to run that part each time if it finishes once. In that case, you can save the intermediate results in a directory under ``, where the directory is not specified in `output`. This keeps that directory untouched each time when the running data get purged if previous run fails. 65 | 66 | [1]: https://en.wikipedia.org/wiki/Shebang_(Unix) 67 | [2]: ../templating 68 | [3]: ../caching 69 | -------------------------------------------------------------------------------- /docs/style.css: -------------------------------------------------------------------------------- 1 | 2 | 3 | .md-typeset .admonition, .md-typeset details { 4 | font-size: .7rem !important; 5 | } 6 | 7 | .md-typeset table:not([class]) td { 8 | padding: .55em 1.25em !important; 9 | } 10 | 11 | .md-typeset table:not([class]) th { 12 | padding: .75em 1.25em !important; 13 | } 14 | 15 | .mkapi-docstring{ 16 | line-height: 1; 17 | } 18 | .mkapi-node { 19 | background-color: #f0f6fa; 20 | border-top: 3px solid #559bc9; 21 | } 22 | .mkapi-node .mkapi-object-container { 23 | background-color: #b4d4e9; 24 | padding: .12em .4em; 25 | } 26 | .mkapi-node .mkapi-object-container .mkapi-object.code { 27 | background: none; 28 | border: none; 29 | } 30 | .mkapi-node .mkapi-object-container .mkapi-object.code * { 31 | font-size: .65rem !important; 32 | } 33 | .mkapi-node pre { 34 | line-height: 1.5; 35 | } 36 | .md-typeset pre>code { 37 | overflow: visible; 38 | line-height: 1.2; 39 | } 40 | .mkapi-docstring .md-typeset pre>code { 41 | font-size: 0.1rem !important; 42 | } 43 | .mkapi-section-name.bases { 44 | margin-top: .2em; 45 | } 46 | .mkapi-section-body.bases { 47 | padding-bottom: .7em; 48 | line-height: 1.3; 49 | } 50 | .mkapi-section.bases { 51 | margin-bottom: .8em; 52 | } 53 | .mkapi-node * { 54 | font-size: .7rem; 55 | } 56 | .mkapi-node a.mkapi-src-link { 57 | word-break: keep-all; 58 | } 59 | .mkapi-docstring { 60 | padding: .4em .15em !important; 61 | } 62 | .mkapi-section-name-body { 63 | font-size: .72rem !important; 64 | } 65 | .mkapi-node ul.mkapi-items li { 66 | line-height: 1.4 !important; 67 | } 68 | .mkapi-node ul.mkapi-items li * { 69 | font-size: .65rem !important; 70 | } 71 | .mkapi-node code.mkapi-object-signature { 72 | padding-right: 2px; 73 | } 74 | .mkapi-node .mkapi-code * { 75 | font-size: .65rem; 76 | } 77 | .mkapi-node a.mkapi-docs-link { 78 | font-size: .6rem; 79 | } 80 | .mkapi-node h1.mkapi-object.mkapi-object-code { 81 | margin: .2em .3em; 82 | } 83 | .mkapi-node h1.mkapi-object.mkapi-object-code .mkapi-object-kind.mkapi-object-kind-code { 84 | font-style: normal; 85 | margin-right: 16px; 86 | } 87 | .mkapi-node .mkapi-item-name { 88 | font-size: .7rem !important; 89 | color: #555; 90 | padding-right: 4px; 91 | } 92 | .md-typeset { 93 | font-size: .75rem !important; 94 | line-height: 1.5 !important; 95 | } 96 | .mkapi-object-kind.package.top { 97 | font-size: .8rem !important; 98 | 
color: #111; 99 | 100 | } 101 | .mkapi-object.package.top > h2 { 102 | font-size: .8rem !important; 103 | } 104 | 105 | .mkapi-object-body.package.top * { 106 | font-size: .75rem !important; 107 | } 108 | .mkapi-object-kind.module.top { 109 | font-size: .75rem !important; 110 | color: #222; 111 | } 112 | 113 | .mkapi-object-body.module.top * { 114 | font-size: .75rem !important; 115 | } 116 | -------------------------------------------------------------------------------- /docs/templating.md: -------------------------------------------------------------------------------- 1 | Templates are used in `output` and `script` in process definition. 2 | 3 | ## Template engines 4 | 5 | By default, `pipen` uses [`liquid`][1] template engine to render the `output` and `script`. You can also switch the template engine to [`jinja2`][2] by specifying: 6 | 7 | ```toml 8 | template = "jinja2" 9 | ``` 10 | 11 | in one of the configuration files, or in the `Pipen` constructor: 12 | 13 | ```python 14 | pipeline = Pipen(..., template="jinja2", ...) 15 | ``` 16 | 17 | or in the process definition 18 | 19 | ```python 20 | class MyProcess(Proc): 21 | ... 22 | template = "jinja2" # overwrite the global template engine 23 | ``` 24 | 25 | Besides specifying the name of a template engine, you can also specify a subclass `pipen.template.Template` as a template engine. This enables us to use our own template engine. You just have to wrap then use a subclass of `pipen.template.Template`. For example, if you want to use [`mako`][3]: 26 | 27 | ```python 28 | from mako.template import Template as MakoTemplate 29 | from pipen.template import Template 30 | 31 | class TemplateMako(Template): 32 | 33 | def __init__(self, source, **kwargs): 34 | super().__init__(source) 35 | self.engine = MakoTemplate(source, **kwargs) 36 | 37 | def _render(self, data): 38 | return self.engine.render(**data) 39 | 40 | # Use it for a process 41 | from pipen import Proc 42 | 43 | class MyProcess(Proc): 44 | template = TemplateMako 45 | ... # other configurations 46 | 47 | ``` 48 | 49 | The `template_opts` configuration is used to pass to `TemplateMako` constructor. The values is passed by to the `MakoTemplate` constructor. 50 | 51 | You can also register the template as a plugin of pipen: 52 | 53 | In `pyproject.toml`: 54 | 55 | ```toml 56 | [tool.poetry.plugins.pipen_tpl] 57 | mako = "pipen_mako:pipen_mako" 58 | ``` 59 | 60 | Or in `setup.py`: 61 | 62 | ```python 63 | setup( 64 | ..., 65 | entry_points={"pipen_tpl": ["pipen_mako:pipen_mako"]}, 66 | ) 67 | ``` 68 | 69 | Then in `pipen_mako.py` of your package: 70 | 71 | ```python 72 | def pipen_mako(): 73 | # TemplateMako is defined as the above 74 | return TemplateMako 75 | ``` 76 | 77 | ## Rendering data 78 | 79 | There are some data shared to render both `output` and `script`. However, there are some different. One of the obvious reasons is that, the `script` template can use the `output` data to render. 80 | 81 | ### `output` 82 | 83 | The data to render the `output`: 84 | 85 | |Name|Description| 86 | |-|-| 87 | |`job.index`|The index of the job, 0-based| 88 | |`job.metadir`1|The directory where job metadata is saved, typically `////`| 89 | |`job.outdir`1|*The output directory of the job: `////output`| 90 | |`job.stdout_file`1|The file that saves the stdout of the job| 91 | |`job.stderr_file`1|The file that saves the stderr of the job| 92 | |`in`|The input data of the job. 
You can use `in.`1 to access the data for each input key| 93 | |`proc`|The process object, used to access their properties, such as `proc.workdir`| 94 | |`envs`|The `envs` of the process| 95 | 96 | `*`: If the process is an end process, it will be a symbolic link to `//`. When the process has only a single job, the `` is also omitted. 97 | 98 | ### `script` 99 | 100 | All the data used to render `output` can also be used to render `script`. Addtionally, the rendered `output` can also be used to render `script`. For example: 101 | 102 | ```python 103 | class MyProcess(Proc): 104 | input = "in" 105 | output = "outfile:file:{{in.in}}.txt" 106 | script = "echo {{in.in}} > {{out.outfile}}" 107 | ... # other configurations 108 | 109 | ``` 110 | 111 | With input data ["a"], the script is rendered as `echo a > /a.txt` 112 | 113 | > 1 The paths are [`MountedPath`][4] objects, which represent paths of jobs and it is useful when a job is running in a remote system (a VM, a container, etc.), where we need to mount the paths into the remote system. It has an attribute `spec` to get the specified path. When there is no mountings, it is the same as the path itself. 114 | 115 | [1]: https://github.com/pwwang/liquidpy 116 | [2]: https://github.com/pallets/jinja 117 | [3]: https://www.makotemplates.org/ 118 | [4]: https://pwwang.github.io/xqute/api/xqute.path/ 119 | -------------------------------------------------------------------------------- /examples/caching.py: -------------------------------------------------------------------------------- 1 | """An example showing how caching works""" 2 | 3 | from pathlib import Path 4 | from pipen import Pipen, Proc 5 | 6 | 7 | class AProcess(Proc): 8 | """A normal process""" 9 | input = "infile:file" 10 | output = "outfile:file:{{in.infile.name}}" 11 | script = "cat {{in.infile}} > {{out.outfile}}" 12 | 13 | 14 | class MyPipeline(Pipen): 15 | starts = AProcess 16 | # Enable debugging information so you will see why jobs are not cached 17 | loglevel = "debug" 18 | 19 | 20 | if __name__ == "__main__": 21 | 22 | infile = "/tmp/pipen_example_caching.txt" 23 | if not Path(infile).exists(): 24 | Path(infile).write_text("123") 25 | 26 | MyPipeline().set_data([infile]).run() 27 | 28 | 29 | # Run this script the repeatedly, you will see the jobs are cached 30 | 31 | # To "de-cache" the jobs, either 32 | # 1. touch the input file 33 | # 2. change any part of input, output, script 34 | # 3. run: 35 | # PIPEN_default_cache=0 python caching.py 36 | # 4. 
Pass cache=False or set it to AProcess and run again 37 | -------------------------------------------------------------------------------- /examples/cloudwdir.py: -------------------------------------------------------------------------------- 1 | """An example using cloud workdir/outdiur""" 2 | 3 | from dotenv import load_dotenv 4 | from pipen import Proc, Pipen 5 | 6 | load_dotenv() 7 | BUCKET = "gs://handy-buffer-287000.appspot.com" 8 | 9 | 10 | class MyProcess(Proc): 11 | """A process""" 12 | 13 | input = "a" 14 | input_data = [1] 15 | output = "outfile:file:{{in.a}}.txt" 16 | script = "cloudsh touch {{out.outfile}}" 17 | 18 | 19 | class MyProcess2(Proc): 20 | """Another process""" 21 | requires = MyProcess 22 | input = "infile:file" 23 | output = "outfile:file:{{in.infile.stem}}2.txt" 24 | script = "cloudsh cat {{in.infile}} | cloudsh sink {{out.outfile}}" 25 | 26 | 27 | class MyCloudDirPipeline(Pipen): 28 | starts = MyProcess 29 | workdir = f"{BUCKET}/pipen-test/clouddir-pipeline/workdir" 30 | outdir = f"{BUCKET}/pipen-test/clouddir-pipeline/outdir" 31 | 32 | 33 | if __name__ == "__main__": 34 | MyCloudDirPipeline().run() 35 | -------------------------------------------------------------------------------- /examples/example.py: -------------------------------------------------------------------------------- 1 | from pipen import Proc, Pipen, run 2 | 3 | class P1(Proc): 4 | """Sort input file""" 5 | input = "infile" 6 | input_data = ["/tmp/data.txt"] 7 | output = "outfile:file:intermediate.txt" 8 | script = "cat {{in.infile}} | sort > {{out.outfile}}" 9 | 10 | class P2(Proc): 11 | """Paste line number""" 12 | requires = P1 13 | input = "infile:file" 14 | output = "outfile:file:result.txt" 15 | script = "paste <(seq 1 3) {{in.infile}} > {{out.outfile}}" 16 | 17 | # class MyPipeline(Pipen): 18 | # starts = P1 19 | 20 | if __name__ == "__main__": 21 | # MyPipeline().run() 22 | # Before running the pipeline, make sure to create the input file 23 | # $ echo -e "3\n2\n1" > /tmp/data.txt 24 | run("MyPipeline", starts=P1, desc="My pipeline") 25 | -------------------------------------------------------------------------------- /examples/gbatch.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dotenv import load_dotenv 4 | from pipen import Proc, Pipen 5 | 6 | load_dotenv() 7 | BUCKET = f"gs://{os.environ['GBATCH_EXAMPLE_BUCKET']}" 8 | 9 | 10 | class MyProcess(Proc): 11 | 12 | input = "a" 13 | input_data = [1] 14 | output = "outfile:file:{{in.a}}.txt" 15 | script = "echo {{in.a}} > {{out.outfile}}" 16 | 17 | 18 | # Works even when metadir/outdir mounted 19 | class MyProcess2(Proc): 20 | requires = MyProcess 21 | input = "infile:file" 22 | output = "outfile:file:{{in.infile.stem}}2.txt" 23 | script = "echo 123 > {{out.outfile}}" 24 | export = True 25 | 26 | 27 | # Works even when metadir/outdir mounted 28 | class MyProcess3(Proc): 29 | requires = MyProcess2 30 | input = "infile:file" 31 | output = "outfile:file:{{in.infile.stem}}3.txt" 32 | script = "echo 456 > {{out.outfile}}" 33 | 34 | 35 | class MyGBatchPipeline(Pipen): 36 | starts = MyProcess 37 | workdir = f"{BUCKET}/pipen-test/workdir" 38 | outdir = f"{BUCKET}/pipen-test/outdir" 39 | loglevel = "DEBUG" 40 | 41 | 42 | if __name__ == "__main__": 43 | MyGBatchPipeline().run(profile="gbatch") 44 | -------------------------------------------------------------------------------- /examples/input_data_callback.py: 
-------------------------------------------------------------------------------- 1 | """An example showing using callback to modify the channel 2 | 3 | It's a more complete example from README.md 4 | """ 5 | from pathlib import Path 6 | import random 7 | from pipen import Proc, Pipen 8 | from pipen.channel import Channel 9 | 10 | 11 | def wc(path): 12 | """Count lines in the file""" 13 | i = 0 14 | with Path(path).open() as f: 15 | for line in f: 16 | i += 1 17 | return i 18 | 19 | 20 | class P1(Proc): 21 | """Sort input file""" 22 | 23 | input = "infile:file" 24 | output = "outfile:file:intermediate.txt" 25 | script = "cat {{in.infile}} | sort > {{out.outfile}}" 26 | 27 | 28 | class P2(Proc): 29 | """Paste line number""" 30 | 31 | requires = P1 32 | input = "infile:file, nlines" 33 | # use the callback to add number of lines for each file 34 | input_data = lambda ch: ch.assign(nlines=ch.outfile.apply(wc)) 35 | output = "outfile:file:result.txt" 36 | script = "paste <(seq 1 {{in.nlines}}) {{in.infile}} > {{out.outfile}}" 37 | 38 | 39 | def prepare_input_data(): 40 | """Prepare input data""" 41 | tmpdir = "/tmp/pipen_example_input_data_callback/" 42 | Path(tmpdir).mkdir(exist_ok=True) 43 | 44 | for i in range(10): 45 | seq = list(range(i + 2)) 46 | random.shuffle(seq) 47 | seq = (f"{i}_{x}" for x in seq) 48 | 49 | Path(tmpdir).joinpath(f"{i}.txt").write_text("\n".join(seq)) 50 | 51 | return Channel.from_glob(f"{tmpdir}/*.txt") 52 | 53 | 54 | class MyPipeline(Pipen): 55 | starts = [P1] 56 | data = [prepare_input_data()] 57 | forks = 3 58 | 59 | 60 | if __name__ == "__main__": 61 | MyPipeline().run() 62 | -------------------------------------------------------------------------------- /examples/mako-templating.py: -------------------------------------------------------------------------------- 1 | """An example showing how to use mako template engine""" 2 | 3 | from mako.template import Template as Mako 4 | from pipen.template import Template 5 | from pipen import Proc, Pipen 6 | 7 | 8 | class TemplateMako(Template): 9 | 10 | name = "mako" 11 | 12 | def __init__(self, source, **kwargs): 13 | super().__init__(source) 14 | self.engine = Mako(source, **kwargs) 15 | 16 | def _render(self, data): 17 | return self.engine.render(**data) 18 | 19 | 20 | class MakoProcess(Proc): 21 | """A process using mako templating""" 22 | template = TemplateMako 23 | input = "a" 24 | input_data = [1] 25 | output = "outfile:file:${in_['a']}.txt" 26 | script = "touch ${out['outfile']}" 27 | 28 | 29 | class MyPipeline(Pipen): 30 | starts = MakoProcess 31 | plugins = ["-filters"] 32 | 33 | 34 | if __name__ == "__main__": 35 | MyPipeline().run() 36 | -------------------------------------------------------------------------------- /examples/multijobs.py: -------------------------------------------------------------------------------- 1 | """An example for a process to have multiple jobs and run jobs in parallel""" 2 | from pipen import Proc, Pipen 3 | 4 | 5 | class MultiJobProc(Proc): 6 | """A process with multiple jobs""" 7 | input = "i" 8 | input_data = range(10) 9 | forks = 3 10 | # Don't cache, we need to see the jobs to run every time 11 | cache = False 12 | output = "outfile:file:{{in.i}}.txt" 13 | # Let the job takes long the see the parallelization from the progressbar 14 | script = "sleep 1; echo {{in.i}} > {{out.outfile}}" 15 | 16 | 17 | if __name__ == "__main__": 18 | Pipen().set_starts(MultiJobProc).run() 19 | -------------------------------------------------------------------------------- 
/examples/plugin-example.py: -------------------------------------------------------------------------------- 1 | """An example showing how to create a plugin""" 2 | 3 | from pipen import Proc, Pipen, plugin 4 | from pipen.utils import get_logger 5 | 6 | logger = get_logger("notify", "info") 7 | 8 | 9 | class NotifyPlugin: 10 | version = "0.0.0" 11 | 12 | @plugin.impl 13 | def on_setup(config): 14 | logger.info("Calling on_setup") 15 | 16 | @plugin.impl 17 | async def on_start(pipen): 18 | logger.info("Calling on_start") 19 | 20 | @plugin.impl 21 | async def on_complete(pipen, succeeded): 22 | logger.info("Calling on_complete, succeeded = %s", succeeded) 23 | 24 | @plugin.impl 25 | async def on_proc_start(proc): 26 | logger.info("Calling on_proc_start") 27 | 28 | @plugin.impl 29 | async def on_proc_done(proc, succeeded): 30 | logger.info("Calling on_proc_done, succeeded = %s", succeeded) 31 | 32 | @plugin.impl 33 | async def on_job_polling(job): 34 | logger.info("Calling on_job_polling") 35 | 36 | 37 | class AProcess(Proc): 38 | input = "a" 39 | script = 'sleep 2' 40 | 41 | 42 | if __name__ == "__main__": 43 | Pipen(plugins=[NotifyPlugin], cache=False).set_starts(AProcess).run() 44 | -------------------------------------------------------------------------------- /examples/python-script.py: -------------------------------------------------------------------------------- 1 | """An example using python as interpreter for the script""" 2 | 3 | from pipen import Pipen, Proc 4 | 5 | 6 | class PythonScriptProc(Proc): 7 | """A process using python interpreter for script""" 8 | input = "a" 9 | input_data = [1] 10 | output = "outfile:file:{{in.a}}.txt" 11 | lang = "python" 12 | script = """ 13 | from pathlib import Path 14 | Path("{{out.outfile}}").write_text("{{in.a}}") 15 | """ 16 | 17 | 18 | if __name__ == "__main__": 19 | Pipen().set_starts(PythonScriptProc).run() 20 | -------------------------------------------------------------------------------- /examples/retry.py: -------------------------------------------------------------------------------- 1 | """An example to retry the jobs when error happends""" 2 | import time 3 | from pipen import Pipen, Proc 4 | 5 | 6 | class RetryProc(Proc): 7 | """Retry the jobs when fail""" 8 | input = "starttime" 9 | input_data = [int(time.time())] 10 | error_strategy = "retry" 11 | # Make sure the job succeeds finally 12 | num_retries = 10 13 | script = """ 14 | timefile="{{job.outdir}}/time.txt" 15 | now=$(date +"%s") 16 | expect={{in.starttime + 10}} 17 | if [[ $now -gt $expect ]]; then 18 | echo $now $expect 0 >> "$timefile" 19 | exit 0 20 | else 21 | echo $now $expect 1 >> "$timefile" 22 | exit 1 23 | fi 24 | """ 25 | 26 | 27 | if __name__ == "__main__": 28 | # Show debug information so we see the retrying message 29 | Pipen(loglevel="debug").set_starts(RetryProc).run() 30 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: pipen 2 | repo_url: https://github.com/pwwang/pipen 3 | repo_name: pwwang/pipen 4 | theme: 5 | name: 'material' 6 | markdown_extensions: 7 | - markdown.extensions.admonition 8 | - pymdownx.superfences: 9 | preserve_tabs: true 10 | - toc: 11 | baselevel: 2 12 | plugins: 13 | - search # necessary for search to work 14 | - mkapi 15 | extra_css: 16 | - style.css 17 | nav: 18 | - 'Introduction': 'index.md' 19 | - 'Basics': 'basics.md' 20 | - 'Defining a process': 'defining-proc.md' 21 | - 'Defining and 
running a pipeline': 'running.md' 22 | - 'Templating': 'templating.md' 23 | - 'Channels': 'channels.md' 24 | - 'Input and output': 'input-output.md' 25 | - 'Script': 'script.md' 26 | - 'Caching': 'caching.md' 27 | - 'Cloud support': 'cloud.md' 28 | - 'Error handling': 'error.md' 29 | - 'Configurations': 'configurations.md' 30 | - 'Plugins': 'plugin.md' 31 | - 'Scheduler': 'scheduler.md' 32 | - 'Process group': 'proc-group.md' 33 | - 'Command line iterface': 'cli.md' 34 | - 'Examples': 'examples.md' 35 | - 'Change log': 'CHANGELOG.md' 36 | - 'API': mkapi/api/pipen 37 | -------------------------------------------------------------------------------- /pipen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pwwang/pipen/0f99f02de29d15bf8426805a74ce9bca99bdcc03/pipen.png -------------------------------------------------------------------------------- /pipen/__init__.py: -------------------------------------------------------------------------------- 1 | """A pipeline framework for python""" 2 | from .pipen import Pipen, run 3 | from .proc import Proc 4 | from .procgroup import ProcGroup 5 | 6 | # Use from pipen.channel import Channel instead of 7 | # from pipen import Channel 8 | # This slows down import 9 | # from .channel import Channel 10 | from .pluginmgr import plugin 11 | from .version import __version__ 12 | -------------------------------------------------------------------------------- /pipen/__main__.py: -------------------------------------------------------------------------------- 1 | from .cli._main import main 2 | 3 | 4 | if __name__ == '__main__': 5 | main() 6 | -------------------------------------------------------------------------------- /pipen/_job_caching.py: -------------------------------------------------------------------------------- 1 | """Provide JobCaching class that implements caching for jobs""" 2 | 3 | from __future__ import annotations 4 | from typing import TYPE_CHECKING 5 | 6 | from diot import Diot 7 | from simpleconf import Config 8 | 9 | from .defaults import ProcInputType, ProcOutputType 10 | from .utils import get_mtime, path_is_symlink 11 | 12 | if TYPE_CHECKING: 13 | from xqute.path import SpecPath 14 | 15 | 16 | class JobCaching: 17 | """Provide caching functionality of jobs""" 18 | 19 | @property 20 | def signature_file(self) -> SpecPath: 21 | """Get the path to the signature file 22 | 23 | Returns: 24 | The path to the signature file 25 | """ 26 | return self.metadir / "job.signature.toml" 27 | 28 | async def cache(self) -> None: 29 | """write signature to signature file""" 30 | dirsig = ( 31 | self.proc.pipeline.config.dirsig 32 | if self.proc.dirsig is None 33 | else self.proc.dirsig 34 | ) 35 | # Check if mtimes of input is greater than those of output 36 | try: 37 | max_mtime = get_mtime(self.script_file, 0) 38 | except Exception: # pragma: no cover 39 | max_mtime = 0 40 | 41 | # Make self.input serializable 42 | input_data = {} 43 | for inkey, intype in self.proc.input.type.items(): 44 | if intype == ProcInputType.VAR: 45 | input_data[inkey] = self.input[inkey] 46 | continue 47 | 48 | if intype in (ProcInputType.FILE, ProcInputType.DIR): 49 | if self.input[inkey] is None: 50 | input_data[inkey] = None 51 | else: 52 | input_data[inkey] = str(self.input[inkey].spec) 53 | max_mtime = max( 54 | max_mtime, 55 | get_mtime(self.input[inkey].spec, dirsig), 56 | ) 57 | 58 | if intype in (ProcInputType.FILES, ProcInputType.DIRS): 59 | if self.input[inkey] is None: # pragma: no cover 60 
| input_data[inkey] = None 61 | else: 62 | input_data[inkey] = [] 63 | for file in self.input[inkey]: 64 | input_data[inkey].append(str(file.spec)) 65 | max_mtime = max(max_mtime, get_mtime(file.spec, dirsig)) 66 | 67 | # Make self.output serializable 68 | output_data = {} 69 | for outkey, outval in self._output_types.items(): 70 | if outval in (ProcOutputType.FILE, ProcInputType.DIR): 71 | output_data[outkey] = str(self.output[outkey].spec) 72 | max_mtime = max(max_mtime, get_mtime(self.output[outkey].spec, dirsig)) 73 | else: 74 | output_data[outkey] = self.output[outkey] 75 | 76 | signature = { 77 | "input": { 78 | "type": self.proc.input.type, 79 | "data": input_data, 80 | }, 81 | "output": {"type": self._output_types, "data": output_data}, 82 | "ctime": float("inf") if max_mtime == 0 else max_mtime, 83 | } 84 | with self.signature_file.open("w") as f: 85 | f.write(Diot(signature).to_toml()) 86 | 87 | async def _clear_output(self) -> None: 88 | """Clear output if not cached""" 89 | self.log("debug", "Clearing previous output files.") 90 | for outkey, outval in self._output_types.items(): 91 | if outval not in (ProcOutputType.FILE, ProcOutputType.DIR): 92 | continue 93 | 94 | path = self.output[outkey].spec 95 | if not path.exists() and path_is_symlink(path): # dead link 96 | path.unlink() 97 | elif path.exists(): 98 | if not path.is_dir(): 99 | path.unlink() 100 | else: 101 | path.rmtree(ignore_errors=True) 102 | path.mkdir() 103 | 104 | async def _check_cached(self) -> bool: 105 | """Check if the job is cached based on signature 106 | 107 | Returns: 108 | True if the job is cached otherwise False 109 | """ 110 | with self.signature_file.open("r") as sf: 111 | signature = Config.load(sf, loader="toml") 112 | 113 | dirsig = ( 114 | self.proc.pipeline.config.dirsig 115 | if self.proc.dirsig is None 116 | else self.proc.dirsig 117 | ) 118 | 119 | try: 120 | # check if inputs/outputs are still the same 121 | if ( 122 | signature.input.type != self.proc.input.type 123 | or signature.output.type != self._output_types 124 | ): 125 | self.log("debug", "Not cached (input or output types are different)") 126 | return False 127 | 128 | # check if any script file is newer 129 | script_mtime = get_mtime(self.script_file, 0) 130 | if script_mtime > signature.ctime + 1e-3: 131 | self.log( 132 | "debug", 133 | "Not cached (script file is newer: %s > %s)", 134 | script_mtime, 135 | signature.ctime, 136 | ) 137 | return False 138 | 139 | # Check if input is different 140 | for inkey, intype in self.proc.input.type.items(): 141 | sig_indata = signature.input.data.get(inkey) 142 | 143 | if intype == ProcInputType.VAR: 144 | if sig_indata != self.input[inkey]: 145 | self.log( 146 | "debug", 147 | "Not cached (input %s:%s is different)", 148 | inkey, 149 | intype, 150 | ) 151 | return False 152 | 153 | elif int(self.input[inkey] is None) + int(sig_indata is None) == 1: 154 | # one is None, the other is not 155 | self.log( 156 | "debug", 157 | "Not cached (input %s:%s is different; " 158 | "it is <%s> in signature, but <%s> in data)", 159 | inkey, 160 | intype, 161 | type(sig_indata).__name__, 162 | type(self.input[inkey]).__name__, 163 | ) 164 | return False 165 | 166 | elif self.input[inkey] is None and sig_indata is None: 167 | continue 168 | 169 | elif intype in (ProcInputType.FILE, ProcInputType.DIR): 170 | if sig_indata != str(self.input[inkey].spec): 171 | self.log( 172 | "debug", 173 | "Not cached (input %s:%s is different)", 174 | inkey, 175 | intype, 176 | ) 177 | return False 178 | 179 | if ( 180 
| get_mtime(self.input[inkey].spec, dirsig) 181 | > signature.ctime + 1e-3 182 | ): 183 | self.log( 184 | "debug", 185 | "Not cached (Input file is newer: %s)", 186 | inkey, 187 | ) 188 | return False 189 | 190 | # FILES/DIRS 191 | 192 | # self.input[inkey] can't be None with intype files/dirs 193 | # elif sig_indata is None: # both None 194 | # continue 195 | 196 | elif not isinstance(sig_indata, list): # pragma: no cover 197 | self.log( 198 | "debug", 199 | "Not cached (input %s:%s is different, " 200 | "%s detected in signature)", 201 | inkey, 202 | intype, 203 | type(sig_indata).__name__, 204 | ) 205 | return False 206 | 207 | else: # both list 208 | if len(sig_indata) != len(self.input[inkey]): # pragma: no cover 209 | self.log( 210 | "debug", 211 | "Not cached (input %s:%s length is different)", 212 | inkey, 213 | intype, 214 | ) 215 | return False 216 | 217 | for i, file in enumerate(self.input[inkey]): 218 | if sig_indata[i] != str(file.spec): # pragma: no cover 219 | self.log( 220 | "debug", 221 | "Not cached (input %s:%s at index %s is different)", 222 | inkey, 223 | intype, 224 | i, 225 | ) 226 | return False 227 | 228 | if get_mtime(file.spec, dirsig) > signature.ctime + 1e-3: 229 | self.log( 230 | "debug", 231 | "Not cached (input %s:%s at index %s is newer)", 232 | inkey, 233 | intype, 234 | i, 235 | ) 236 | return False 237 | 238 | # Check if output is different 239 | for outkey, outtype in self._output_types.items(): 240 | sig_outdata = signature.output.data.get(outkey) 241 | if outtype == ProcOutputType.VAR: 242 | if sig_outdata != self.output[outkey]: # pragma: no cover 243 | self.log( 244 | "debug", 245 | "Not cached (output %s:%s is different)", 246 | outkey, 247 | outtype, 248 | ) 249 | return False 250 | 251 | else: # FILE/DIR 252 | if sig_outdata != str(self.output[outkey].spec): # pragma: no cover 253 | self.log( 254 | "debug", 255 | "Not cached (output %s:%s is different)", 256 | outkey, 257 | outtype, 258 | ) 259 | return False 260 | 261 | if not self.output[outkey].spec.exists(): 262 | self.log( 263 | "debug", 264 | "Not cached (output %s:%s was removed)", 265 | outkey, 266 | outtype, 267 | ) 268 | return False 269 | 270 | except Exception as exc: # pragma: no cover 271 | # meaning signature is incomplete 272 | # or any file is deleted 273 | self.log("debug", "Not cached (%s)", exc) 274 | return False 275 | 276 | return True 277 | 278 | @property 279 | async def cached(self) -> bool: 280 | """Check if a job is cached 281 | 282 | Returns: 283 | True if the job is cached otherwise False 284 | """ 285 | out = True 286 | proc_cache = ( 287 | self.proc.pipeline.config.cache 288 | if self.proc.cache is None 289 | else self.proc.cache 290 | ) 291 | if not proc_cache: 292 | self.log( 293 | "debug", 294 | "Not cached (proc.cache is False)", 295 | ) 296 | out = False 297 | elif self.rc != 0: 298 | self.log( 299 | "debug", 300 | "Not cached (job.rc != 0)", 301 | ) 302 | out = False 303 | elif proc_cache == "force": 304 | try: 305 | await self.cache() 306 | except Exception: # pragma: no cover 307 | # FileNotFoundError, google.api_core.exceptions.NotFound, etc 308 | out = False 309 | else: 310 | out = True 311 | elif not self.signature_file.is_file(): 312 | self.log( 313 | "debug", 314 | "Not cached (signature file not found)", 315 | ) 316 | out = False 317 | else: 318 | out = await self._check_cached() 319 | 320 | if not out: 321 | await self._clear_output() 322 | 323 | return out 324 | -------------------------------------------------------------------------------- 
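(Sketch, not part of the repository listing.) The caching logic above works in two halves: `cache()` serializes the job's input/output plus the newest file mtime into a small TOML signature via `Diot(signature).to_toml()`, and `_check_cached()` later re-reads it and compares types, values and timestamps with a 1e-3 second tolerance. A minimal Python picture of that structure follows — the keys mirror the code, while the input/output names, paths and the timestamp are hypothetical, for illustration only:

# Roughly the structure cache() writes to the job signature file
signature = {
    "input": {
        "type": {"infile": "file", "ncores": "var"},            # proc.input.type
        "data": {"infile": "/data/sample1.txt", "ncores": 4},   # file paths stored as str(spec)
    },
    "output": {
        "type": {"outfile": "file"},                            # self._output_types
        "data": {"outfile": "/outdir/sample1.out.txt"},
    },
    # newest mtime among input/output files, or float("inf") when none was
    # collected (max_mtime == 0)
    "ctime": 1700000000.0,
}

# _check_cached() then treats anything newer than ctime (beyond the 1e-3 s
# tolerance) as a cache miss, e.g.:
#   get_mtime(script_file, 0) > signature["ctime"] + 1e-3   ->   not cached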
/pipen/channel.py: -------------------------------------------------------------------------------- 1 | """Provide some function for creating and modifying channels (dataframes)""" 2 | 3 | from __future__ import annotations 4 | 5 | from glob import glob 6 | from os import path 7 | from itertools import chain 8 | from pathlib import Path 9 | from typing import Any, List 10 | 11 | import pandas 12 | from yunpath import AnyPath, CloudPath 13 | from pandas import DataFrame 14 | from pipda import register_verb 15 | 16 | from .utils import path_is_symlink 17 | 18 | 19 | # ---------------------------------------------------------------- 20 | # Creators 21 | class Channel(DataFrame): 22 | """A DataFrame wrapper with creators""" 23 | 24 | @classmethod 25 | def create(cls, value: DataFrame | List[Any]) -> DataFrame: 26 | """Create a channel from a list. 27 | 28 | The second dimension is identified by tuple. if all elements are tuple, 29 | then a channel is created directly. Otherwise, elements are converted 30 | to tuples first and channels are created then. 31 | 32 | Examples: 33 | >>> Channel.create([1, 2, 3]) # 3 rows, 1 column 34 | >>> Channel.create([(1,2,3)]) # 1 row, 3 columns 35 | 36 | Args: 37 | value: The value to create a channel 38 | 39 | Returns: 40 | A channel (dataframe) 41 | """ 42 | if isinstance(value, DataFrame): 43 | return value 44 | if all(isinstance(elem, tuple) for elem in value): 45 | return cls(value) 46 | return cls((val,) for val in value) 47 | 48 | @classmethod 49 | def from_glob( 50 | cls, 51 | pattern: str, 52 | ftype: str = "any", 53 | sortby: str = "name", 54 | reverse: bool = False, 55 | ) -> DataFrame: 56 | """Create a channel with a glob pattern 57 | 58 | Args: 59 | ftype: The file type, one of any, link, dir and file 60 | sortby: How the files should be sorted. One of name, mtime and size 61 | reverse: Whether sort them in a reversed way. 
62 | 63 | Returns: 64 | The channel 65 | """ 66 | 67 | def sort_key(file: Path | CloudPath) -> Any: 68 | if sortby == "mtime": 69 | return file.stat().st_mtime 70 | if sortby == "size": 71 | return file.stat().st_size 72 | 73 | return str(file) # sort by name 74 | 75 | def file_filter(file: Path | CloudPath) -> bool: 76 | if ftype == "link": 77 | return path_is_symlink(file) 78 | if ftype == "dir": 79 | return file.is_dir() 80 | if ftype == "file": 81 | return file.is_file() 82 | return True 83 | 84 | pattern: Path | CloudPath = AnyPath(pattern) 85 | if isinstance(pattern, CloudPath): 86 | parts = pattern.parts 87 | bucket = CloudPath("".join(parts[:2])) # gs://bucket 88 | # CloudPath.glob() does not support a/b/*.txt 89 | # we have to do it part by part 90 | parts = parts[2:] 91 | files = [bucket] 92 | for i, part in enumerate(parts): 93 | tmp = chain(*[base.glob(part) for base in files]) 94 | tmp = list(tmp) 95 | files = [ 96 | base for base in tmp 97 | if (i < len(parts) - 1 and base.is_dir()) 98 | or (i == len(parts) - 1 and file_filter(base)) 99 | ] 100 | else: # local path 101 | files = ( 102 | Path(file) for file in glob(str(pattern)) if file_filter(Path(file)) 103 | ) 104 | 105 | return cls.create( 106 | [ 107 | str(file) 108 | for file in sorted( 109 | files, 110 | key=sort_key if sortby in ("name", "mtime", "size") else None, 111 | reverse=reverse, 112 | ) # type: ignore 113 | ] 114 | ) 115 | 116 | @classmethod 117 | def from_pairs( 118 | cls, 119 | pattern: str, 120 | ftype: str = "any", 121 | sortby: str = "name", 122 | reverse: bool = False, 123 | ) -> DataFrame: 124 | """Create a width=2 channel with a glob pattern 125 | 126 | Args: 127 | ftype: The file type, one of any, link, dir and file 128 | sortby: How the files should be sorted. One of name, mtime and size 129 | reverse: Whether sort them in a reversed way. 130 | 131 | Returns: 132 | The channel 133 | """ 134 | mates = cls.from_glob(pattern, ftype, sortby, reverse) 135 | return pandas.concat( 136 | ( 137 | mates.iloc[::2].reset_index(drop=True), 138 | mates.iloc[1::2].reset_index(drop=True), 139 | ), 140 | axis=1, 141 | ) 142 | 143 | @classmethod 144 | def from_csv(cls, *args, **kwargs): 145 | """Create a channel from a csv file 146 | 147 | Uses pandas.read_csv() to create a channel 148 | 149 | Args: 150 | *args: and 151 | **kwargs: Arguments passing to pandas.read_csv() 152 | """ 153 | return pandas.read_csv(*args, **kwargs) 154 | 155 | @classmethod 156 | def from_excel(cls, *args, **kwargs): 157 | """Create a channel from an excel file. 158 | 159 | Uses pandas.read_excel() to create a channel 160 | 161 | Args: 162 | *args: and 163 | **kwargs: Arguments passing to pandas.read_excel() 164 | """ 165 | return pandas.read_excel(*args, **kwargs) 166 | 167 | @classmethod 168 | def from_table(cls, *args, **kwargs): 169 | """Create a channel from a table file. 170 | 171 | Uses pandas.read_table() to create a channel 172 | 173 | Args: 174 | *args: and 175 | **kwargs: Arguments passing to pandas.read_table() 176 | """ 177 | return pandas.read_table(*args, **kwargs) 178 | 179 | 180 | # ---------------------------------------------------------------- 181 | # Verbs 182 | @register_verb(DataFrame) 183 | def expand_dir( 184 | data: DataFrame, 185 | col: str | int = 0, 186 | pattern: str = "*", 187 | ftype: str = "any", 188 | sortby: str = "name", 189 | reverse: bool = False, 190 | ) -> DataFrame: 191 | """Expand a Channel according to the files in , 192 | other cols will keep the same. 
193 | 194 | This is only applicable to a 1-row channel. 195 | 196 | Examples: 197 | >>> ch = channel.create([('./', 1)]) 198 | >>> ch >> expand() 199 | >>> [['./a', 1], ['./b', 1], ['./c', 1]] 200 | 201 | Args: 202 | col: the index or name of the column used to expand 203 | pattern: use a pattern to filter the files/dirs, default: `*` 204 | ftype: the type of the files/dirs to include 205 | - 'dir', 'file', 'link' or 'any' (default) 206 | sortby: how the list is sorted 207 | - 'name' (default), 'mtime', 'size' 208 | reverse: reverse sort. 209 | 210 | Returns: 211 | The expanded channel 212 | """ 213 | assert data.shape[0] == 1, "Can only expand a single row DataFrame." 214 | col_loc = col if isinstance(col, int) else data.columns.get_loc(col) 215 | full_pattern = f"{data.iloc[0, col_loc]}/{pattern}" 216 | expanded = Channel.from_glob( 217 | full_pattern, 218 | ftype, 219 | sortby, 220 | reverse, 221 | ).iloc[:, 0] 222 | ret = pandas.concat([data] * expanded.size, axis=0, ignore_index=True) 223 | ret.iloc[:, col_loc] = expanded.values 224 | return ret.reset_index(drop=True) 225 | 226 | 227 | @register_verb(DataFrame) 228 | def collapse_files(data: DataFrame, col: str | int = 0) -> DataFrame: 229 | """Collapse a Channel according to the files in , 230 | other cols will use the values in row 0. 231 | 232 | Note that other values in other rows will be discarded. 233 | 234 | Examples: 235 | >>> ch = channel.create([['./a', 1], ['./b', 1], ['./c', 1]]) 236 | >>> ch >> collapse() 237 | >>> [['.', 1]] 238 | 239 | Args: 240 | data: The original channel 241 | col: the index or name of the column used to collapse on 242 | 243 | Returns: 244 | The collapsed channel 245 | """ 246 | assert data.shape[0] > 0, "Cannot collapse on an empty DataFrame." 247 | col_loc = col if isinstance(col, int) else data.columns.get_loc(col) 248 | paths = list(data.iloc[:, col_loc]) 249 | compx = path.dirname(path.commonprefix(paths)) 250 | ret = data.iloc[[0], :].copy() 251 | ret.iloc[0, col_loc] = compx 252 | return ret 253 | -------------------------------------------------------------------------------- /pipen/cli/__init__.py: -------------------------------------------------------------------------------- 1 | """Provide CLI for pipen""" 2 | 3 | from ._hooks import CLIPlugin 4 | from ._main import main 5 | -------------------------------------------------------------------------------- /pipen/cli/_hooks.py: -------------------------------------------------------------------------------- 1 | """Provide Cli class""" 2 | from __future__ import annotations 3 | 4 | from abc import ABC, abstractmethod 5 | from typing import TYPE_CHECKING 6 | 7 | from simplug import Simplug 8 | 9 | from ..defaults import CLI_ENTRY_GROUP 10 | 11 | if TYPE_CHECKING: 12 | from argx import ArgumentParser 13 | from argparse import Namespace 14 | 15 | cli_plugin = Simplug(CLI_ENTRY_GROUP) 16 | 17 | 18 | class CLIPlugin(ABC): 19 | """The abc for cli plugin""" 20 | 21 | def __init__( 22 | self, 23 | parser: ArgumentParser, 24 | subparser: ArgumentParser, 25 | ) -> None: 26 | self.parser = parser 27 | self.subparser = subparser 28 | 29 | @property 30 | @abstractmethod 31 | def name(self) -> str: 32 | """The name/command of this plugin""" 33 | 34 | def parse_args(self) -> Namespace: 35 | """Define arguments for the command""" 36 | return self.parser.parse_args() 37 | 38 | @abstractmethod 39 | def exec_command(self, args: Namespace) -> None: 40 | """Execute the command""" 41 | 
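# ----------------------------------------------------------------
# Illustrative sketch, not part of this module: a third-party CLI
# plugin subclasses CLIPlugin, sets `name`, adds its arguments on the
# subparser and implements exec_command(); it is then exposed through
# the "pipen_cli" entry-point group (CLI_ENTRY_GROUP in
# pipen/defaults.py) so cli_plugin.load_entrypoints() can discover it.
# The class and argument names below are hypothetical.

class CLIEchoPlugin(CLIPlugin):
    """Echo a message back to the console (toy example)"""

    name = "echo"

    def __init__(self, parser: ArgumentParser, subparser: ArgumentParser) -> None:
        super().__init__(parser, subparser)
        subparser.add_argument("message", help="The message to echo")

    def exec_command(self, args: Namespace) -> None:
        print(args.message)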
-------------------------------------------------------------------------------- /pipen/cli/_main.py: -------------------------------------------------------------------------------- 1 | """CLI main entrance""" 2 | import re 3 | import importlib 4 | from pathlib import Path 5 | 6 | from argx import ArgumentParser 7 | 8 | from ._hooks import cli_plugin 9 | from ..version import __version__ 10 | 11 | parser = ArgumentParser( 12 | prog="pipen", 13 | description=f"CLI Tool for pipen v{__version__}", 14 | ) 15 | 16 | 17 | def load_builtin_clis() -> None: 18 | """Load builtin cli plugins in this directory""" 19 | for clifile in Path(__file__).parent.glob("*.py"): 20 | if clifile.stem.startswith("_"): 21 | continue 22 | cli = importlib.import_module(f".{clifile.stem}", __package__) 23 | plg = getattr(cli, cli.__all__[0]) 24 | cli_plugin.register(plg) 25 | 26 | 27 | def main() -> None: 28 | """Main function of pipen CLI""" 29 | cli_plugin.load_entrypoints() 30 | # builtin plugins have the highest priority 31 | # so they are loaded later to override the entrypoints 32 | load_builtin_clis() 33 | 34 | plugin_names = sorted( 35 | cli_plugin.get_enabled_plugin_names(), 36 | key=lambda cmd: 999 if cmd == "help" else 0, 37 | ) 38 | plugins = {} 39 | for name in plugin_names: 40 | plg = cli_plugin.get_plugin(name, raw=True) 41 | 42 | docstr = plg.__doc__ 43 | if docstr is not None: 44 | docstr = docstr.strip() 45 | 46 | subparser = parser.add_command( 47 | plg.name, 48 | help=( 49 | None 50 | if docstr is None 51 | else re.sub(r"\s+", " ", docstr.splitlines()[0]) 52 | ), 53 | description=docstr, 54 | ) 55 | plugins[plg.name] = plg(parser, subparser) 56 | 57 | known_parsed, _ = parser.parse_known_args() 58 | parsed = plugins[known_parsed.COMMAND].parse_args() 59 | plugins[known_parsed.COMMAND].exec_command(parsed) 60 | -------------------------------------------------------------------------------- /pipen/cli/help.py: -------------------------------------------------------------------------------- 1 | """Print help for commands""" 2 | from __future__ import annotations 3 | from typing import TYPE_CHECKING 4 | 5 | from ._hooks import CLIPlugin 6 | 7 | if TYPE_CHECKING: 8 | from argx import ArgumentParser 9 | from argparse import Namespace 10 | 11 | __all__ = ("CLIHelpPlugin",) 12 | 13 | 14 | class CLIHelpPlugin(CLIPlugin): 15 | """Print help for commands""" 16 | 17 | name = "help" 18 | 19 | def __init__(self, parser: ArgumentParser, subparser: ArgumentParser): 20 | super().__init__(parser, subparser) 21 | subparser.add_argument( 22 | "cmd", 23 | nargs="?", 24 | choices=[ 25 | n 26 | for n in parser._subparsers._group_actions[0].choices 27 | if n != "help" 28 | ], 29 | help="The command to show help for", 30 | ) 31 | 32 | def exec_command(self, args: Namespace) -> None: 33 | """Run the command""" 34 | 35 | if not args.cmd: 36 | self.parser.parse_args(["--help"]) 37 | else: 38 | self.parser.parse_args([args.cmd, "--help"]) 39 | -------------------------------------------------------------------------------- /pipen/cli/plugins.py: -------------------------------------------------------------------------------- 1 | """List plugins""" 2 | from __future__ import annotations 3 | from typing import TYPE_CHECKING, Any, Iterable, List, Tuple 4 | 5 | from rich import print 6 | 7 | from ._hooks import CLIPlugin 8 | from ..defaults import ( 9 | CLI_ENTRY_GROUP, 10 | SCHEDULER_ENTRY_GROUP, 11 | TEMPLATE_ENTRY_GROUP, 12 | ) 13 | from ..utils import load_entrypoints 14 | 15 | if TYPE_CHECKING: 16 | from argx import 
ArgumentParser 17 | from argparse import Namespace 18 | 19 | 20 | COMMAND = "plugins" 21 | GROUPS = [ 22 | "pipen", 23 | SCHEDULER_ENTRY_GROUP, 24 | TEMPLATE_ENTRY_GROUP, 25 | CLI_ENTRY_GROUP, 26 | ] 27 | GROUP_NAMES = { 28 | "pipen": "Pipen", 29 | SCHEDULER_ENTRY_GROUP: "Scheduler", 30 | TEMPLATE_ENTRY_GROUP: "Template", 31 | CLI_ENTRY_GROUP: "CLI", 32 | } 33 | 34 | __all__ = ("CliPluginsPlugin",) 35 | 36 | 37 | def _get_plugins_by_group(group: str) -> Iterable[Tuple[str, Any]]: 38 | """Get plugins from entry points by group name 39 | 40 | Args: 41 | group: The name of the group 42 | 43 | Returns: 44 | A list of tuples with the plugin name and the plugin itself 45 | """ 46 | for name, obj in load_entrypoints(group): 47 | yield name, obj 48 | 49 | 50 | def _list_group_plugins( 51 | group: str, 52 | plugins: List[Tuple[str, Any]], 53 | ) -> None: 54 | """List plugins in a single group 55 | 56 | Args: 57 | group: The group of the plugins 58 | plugins: A list of tuples with name and plugin 59 | """ 60 | print("") 61 | print(f"[bold][u]{GROUP_NAMES[group]} plugins:[/u][/bold]") 62 | namelen = max(len(name) for name, _ in plugins) if plugins else 0 63 | for name, plugin in plugins: 64 | try: 65 | ver = plugin.version 66 | except AttributeError: 67 | try: 68 | ver = plugin.__version__ 69 | except AttributeError: 70 | ver = "unknown" 71 | print(f"- {name.ljust(namelen)}: (version: {ver})") 72 | 73 | 74 | def _list_plugins(plugins: List[Tuple[str, str, Any]]) -> None: 75 | """List plugins 76 | 77 | Args: 78 | plugins: A list of tuples with group, name and plugin 79 | """ 80 | pipen_plugins = [ 81 | (name, plugin) for group, name, plugin in plugins if group == "pipen" 82 | ] 83 | sched_plugins = [ 84 | (name, plugin) 85 | for group, name, plugin in plugins 86 | if group == SCHEDULER_ENTRY_GROUP 87 | ] 88 | tpl_plugins = [ 89 | (name, plugin) 90 | for group, name, plugin in plugins 91 | if group == TEMPLATE_ENTRY_GROUP 92 | ] 93 | cli_plugins = [ 94 | (name, plugin) 95 | for group, name, plugin in plugins 96 | if group == CLI_ENTRY_GROUP 97 | ] 98 | _list_group_plugins("pipen", pipen_plugins) 99 | _list_group_plugins(SCHEDULER_ENTRY_GROUP, sched_plugins) 100 | _list_group_plugins(TEMPLATE_ENTRY_GROUP, tpl_plugins) 101 | _list_group_plugins(CLI_ENTRY_GROUP, cli_plugins) 102 | 103 | 104 | class CliPluginsPlugin(CLIPlugin): 105 | """List installed plugins""" 106 | 107 | name = "plugins" 108 | 109 | def __init__( 110 | self, 111 | parser: ArgumentParser, 112 | subparser: ArgumentParser, 113 | ) -> None: 114 | super().__init__(parser, subparser) 115 | subparser.add_argument( 116 | "-g", 117 | "--group", 118 | choices=GROUPS + ["all"], 119 | default="all", 120 | help="The name of the entry point group. 
Show all if not provided", 121 | ) 122 | 123 | def exec_command(self, args: Namespace) -> None: 124 | """Execute the command""" 125 | from ..version import __version__ 126 | print("Pipen version:", __version__) 127 | 128 | plugins: List[Tuple[str, str, Any]] = [] 129 | 130 | if args.group and args.group != "all": 131 | for name, plugin in _get_plugins_by_group(args.group): 132 | plugins.append((args.group, name, plugin)) 133 | 134 | else: # args.name 135 | for group in GROUPS: 136 | for name, plugin in _get_plugins_by_group(group): 137 | plugins.append((group, name, plugin)) 138 | 139 | _list_plugins(plugins) 140 | -------------------------------------------------------------------------------- /pipen/cli/profile.py: -------------------------------------------------------------------------------- 1 | """List available profiles.""" 2 | from __future__ import annotations 3 | from typing import TYPE_CHECKING 4 | 5 | from rich import print 6 | from rich.panel import Panel 7 | from rich.syntax import Syntax 8 | from simpleconf import ProfileConfig 9 | 10 | from ._hooks import CLIPlugin 11 | from ..defaults import CONFIG, CONFIG_FILES 12 | 13 | if TYPE_CHECKING: 14 | from argx import ArgumentParser 15 | from argparse import Namespace 16 | 17 | __all__ = ("CLIProfilePlugin",) 18 | 19 | 20 | class CLIProfilePlugin(CLIPlugin): 21 | """List available profiles.""" 22 | 23 | name = "profile" 24 | 25 | def __init__( 26 | self, 27 | parser: ArgumentParser, 28 | subparser: ArgumentParser, 29 | ) -> None: 30 | super().__init__(parser, subparser) 31 | subparser.add_argument( 32 | "-n", 33 | "--name", 34 | default="", 35 | help="The name of the profile to show. Show all if not provided.", 36 | ) 37 | subparser.add_argument( 38 | "-l", 39 | "--list", 40 | action="store_true", 41 | default=False, 42 | help="List the names of all available profiles (-n won't work).", 43 | ) 44 | 45 | def exec_command(self, args: Namespace) -> None: 46 | """Run the command""" 47 | 48 | config = ProfileConfig.load( 49 | {"default": CONFIG}, 50 | *CONFIG_FILES, 51 | ignore_nonexist=True, 52 | ) 53 | 54 | if args.list: 55 | print("\n".join(ProfileConfig.profiles(config))) 56 | return 57 | 58 | print("Configurations loaded from:") 59 | print("- pipen.defaults.CONFIG (python dictionary)") 60 | for conffile in reversed(CONFIG_FILES): 61 | print(f"- {conffile}") 62 | print("") 63 | 64 | print("Note:") 65 | print( 66 | "- The same profile from different configuration files " 67 | "are inherited." 68 | ) 69 | print( 70 | "- These configurations can still be overriden by " 71 | "Pipen constructor and process definition." 
72 | ) 73 | print("") 74 | 75 | if not args.name: 76 | for profile in ProfileConfig.profiles(config): 77 | with ProfileConfig.with_profile(config, profile): 78 | conf = ProfileConfig.detach(config) 79 | print( 80 | Panel( 81 | Syntax(conf.to_toml(), "toml"), 82 | title=f"Profile: {profile}", 83 | title_align="left", 84 | ) 85 | ) 86 | 87 | else: 88 | if not ProfileConfig.has_profile(config, args.name): 89 | raise ValueError(f"No such profile: {args.name}") 90 | 91 | ProfileConfig.use_profile(config, args.name) 92 | conf = ProfileConfig.detach(config) 93 | print( 94 | Panel( 95 | Syntax(conf.to_toml(), "toml"), 96 | title=f"Profile: {args.name}", 97 | title_align="left", 98 | ) 99 | ) 100 | -------------------------------------------------------------------------------- /pipen/cli/version.py: -------------------------------------------------------------------------------- 1 | """Print help for commands""" 2 | from __future__ import annotations 3 | from typing import TYPE_CHECKING 4 | 5 | from rich import print 6 | 7 | from ._hooks import CLIPlugin 8 | 9 | if TYPE_CHECKING: 10 | from argparse import Namespace 11 | 12 | __all__ = ("CLIVersionPlugin",) 13 | 14 | 15 | class CLIVersionPlugin(CLIPlugin): 16 | """Print versions of pipen and its dependencies""" 17 | 18 | name = "version" 19 | 20 | def exec_command(self, args: Namespace) -> None: 21 | """Run the command""" 22 | import sys 23 | from importlib.metadata import version 24 | from .. import __version__ 25 | 26 | versions = {"python": sys.version, "pipen": __version__} 27 | 28 | for pkg in ( 29 | "liquidpy", 30 | "pandas", 31 | "enlighten", 32 | "argx", 33 | "xqute", 34 | "python-simpleconf", 35 | "pipda", 36 | "varname", 37 | ): 38 | versions[pkg] = version(pkg) 39 | 40 | keylen = max(map(len, versions)) 41 | for key in versions: 42 | ver = versions[key] 43 | verlines = ver.splitlines() 44 | print(f"{key.ljust(keylen)}: {verlines.pop(0)}") 45 | for verline in verlines: # pragma: no cover 46 | print(f"{' ' * keylen} {verline}") 47 | -------------------------------------------------------------------------------- /pipen/defaults.py: -------------------------------------------------------------------------------- 1 | """Provide some default values/objects""" 2 | from pathlib import Path 3 | from typing import ClassVar 4 | 5 | from diot import Diot 6 | from xqute import JobErrorStrategy 7 | from xqute.utils import logger as xqute_logger 8 | 9 | # Remove the rich handler 10 | _xqute_handlers = xqute_logger.handlers 11 | if _xqute_handlers: 12 | # The very first handler is the rich handler 13 | xqute_logger.removeHandler(_xqute_handlers[0]) 14 | 15 | LOGGER_NAME = "core" 16 | CONFIG_FILES = ( 17 | Path("~/.pipen.toml").expanduser(), 18 | "./.pipen.toml", 19 | "PIPEN.osenv", 20 | ) 21 | CONFIG = Diot( 22 | # pipeline level: The logging level 23 | loglevel="info", 24 | # process level: The cache option, True/False/export 25 | cache=True, 26 | # process level: Whether expand directory to check signature 27 | dirsig=1, 28 | # process level: 29 | # How to deal with the errors 30 | # retry, ignore, halt 31 | # halt to halt the whole pipeline, no submitting new jobs 32 | # terminate to just terminate the job itself 33 | error_strategy=JobErrorStrategy.IGNORE, 34 | # process level: 35 | # How many times to retry to jobs once error occurs 36 | num_retries=3, 37 | # process level: 38 | # The directory to export the output files 39 | forks=1, 40 | # process level: Default shell/language 41 | lang="bash", 42 | # process level: 43 | # How many jobs to be 
submitted in a batch 44 | submission_batch=8, 45 | # pipeline level: 46 | # The working directory for the pipeline 47 | workdir="./.pipen", 48 | # process level: template engine 49 | template="liquid", 50 | # process level: template options 51 | template_opts={}, 52 | # process level: scheduler 53 | scheduler="local", 54 | # process level: scheduler options 55 | scheduler_opts={}, 56 | # pipeline level: plugins 57 | plugins=None, 58 | # pipeline level: plugin opts 59 | plugin_opts={}, 60 | ) 61 | 62 | # Just the total width of the terminal 63 | # when logging with a rich.Panel() 64 | CONSOLE_WIDTH_WITH_PANEL = 100 65 | # The width of the terminal when the width cannot be detected, 66 | # we are probably logging into a file 67 | CONSOLE_DEFAULT_WIDTH = 2048 68 | # [05/16/22 11:46:40] I 69 | # v0.3.4: 70 | # 05-16 11:11:11 I 71 | # The markup code is included 72 | # Don't modify this unless the logger formatter is changed 73 | CONSOLE_WIDTH_SHIFT = 25 74 | # For pipen scheduler plugins 75 | SCHEDULER_ENTRY_GROUP = "pipen_sched" 76 | # For pipen template plugins 77 | TEMPLATE_ENTRY_GROUP = "pipen_tpl" 78 | # For pipen template cli plugins 79 | CLI_ENTRY_GROUP = "pipen_cli" 80 | 81 | 82 | class ProcInputType: 83 | """Types for process inputs""" 84 | 85 | VAR: ClassVar[str] = "var" 86 | FILE: ClassVar[str] = "file" 87 | DIR: ClassVar[str] = "dir" 88 | FILES: ClassVar[str] = "files" 89 | DIRS: ClassVar[str] = "dirs" 90 | 91 | 92 | class ProcOutputType: 93 | """Types for process outputs""" 94 | 95 | VAR: ClassVar[str] = "var" 96 | DIR: ClassVar[str] = "dir" 97 | FILE: ClassVar[str] = "file" 98 | -------------------------------------------------------------------------------- /pipen/exceptions.py: -------------------------------------------------------------------------------- 1 | """Provide exception classes""" 2 | 3 | 4 | class PipenException(Exception): 5 | """Base exception class for pipen""" 6 | 7 | 8 | class PipenSetDataError(PipenException, ValueError): 9 | """When trying to set input data to processes with input_data already set 10 | using Pipen.set_data().""" 11 | 12 | 13 | class ProcInputTypeError(PipenException, TypeError): 14 | """When an unsupported input type is provided""" 15 | 16 | 17 | class ProcInputKeyError(PipenException, KeyError): 18 | """When an unsupported input key is provided""" 19 | 20 | 21 | class ProcInputValueError(PipenException, ValueError): 22 | """When an unsupported input value is provided""" 23 | 24 | 25 | class ProcScriptFileNotFound(PipenException, FileNotFoundError): 26 | """When script file specified as 'file://' cannot be found""" 27 | 28 | 29 | class ProcOutputNameError(PipenException, NameError): 30 | """When no name or malformatted output is provided""" 31 | 32 | 33 | class ProcOutputTypeError(PipenException, TypeError): 34 | """When an unsupported output type is provided""" 35 | 36 | 37 | class ProcOutputValueError(PipenException, ValueError): 38 | """When a malformatted output value is provided""" 39 | 40 | 41 | class ProcDependencyError(PipenException): 42 | """When there is something wrong the process dependencies""" 43 | 44 | 45 | class NoSuchSchedulerError(PipenException): 46 | """When specified scheduler cannot be found""" 47 | 48 | 49 | class WrongSchedulerTypeError(PipenException, TypeError): 50 | """When specified scheduler is not a subclass of Scheduler""" 51 | 52 | 53 | class NoSuchTemplateEngineError(PipenException): 54 | """When specified template engine cannot be found""" 55 | 56 | 57 | class WrongTemplateEnginTypeError(PipenException, 
TypeError): 58 | """When specified tempalte engine is not a subclass of Scheduler""" 59 | 60 | 61 | class TemplateRenderingError(PipenException): 62 | """Failed to render a template""" 63 | 64 | 65 | class ConfigurationError(PipenException): 66 | """When something wrong set as configuration""" 67 | 68 | 69 | class PipenOrProcNameError(PipenException): 70 | """ "When more than one processes are sharing the same workdir""" 71 | -------------------------------------------------------------------------------- /pipen/procgroup.py: -------------------------------------------------------------------------------- 1 | """Process group that contains a set of processes. 2 | 3 | It can be easily used to create a pipeline that runs independently or 4 | integrated into a larger pipeline. 5 | 6 | Runs directly: 7 | >>> proc_group = ProcGroup() 8 | >>> proc_group.as_pipen().set_data().run() 9 | 10 | Integrated into a larger pipeline 11 | >>> proc_group = ProcGroup() 12 | >>> # proc could be a process within the larger pipeline 13 | >>> proc.requires = prog_group. 14 | 15 | To add a process to the proc group, use the `add_proc` method: 16 | >>> class MyProcGroup(ProcGroup): 17 | >>> ... 18 | >>> 19 | >>> proc_group = MyProcGroup(...) 20 | >>> @proc_group.add_proc 21 | >>> class MyProc(Proc): 22 | >>> ... 23 | 24 | Or add a process at runtime: 25 | >>> class MyProcGroup(ProcGroup): 26 | >>> ... 27 | >>> 28 | >>> @ProcGroup.add_proc 29 | >>> def my_proc(self): 30 | >>> class MyProc(Proc): 31 | >>> # You may use self.options here 32 | >>> ... 33 | >>> return MyProc 34 | >>> proc_group = MyProcGroup(...) 35 | """ 36 | from __future__ import annotations 37 | 38 | from os import PathLike 39 | from functools import wraps, cached_property 40 | from typing import Any, Callable, Mapping, Type, List 41 | from abc import ABC, ABCMeta 42 | from diot import Diot 43 | 44 | from .pipen import Pipen 45 | from .proc import Proc 46 | 47 | 48 | class ProcGropuMeta(ABCMeta): 49 | """Meta class for ProcGroup""" 50 | 51 | _INST = None 52 | 53 | def __call__(cls, *args, **kwds): 54 | """Make sure Proc subclasses are singletons 55 | 56 | Args: 57 | *args: and 58 | **kwds: Arguments for the constructor 59 | 60 | Returns: 61 | The Proc instance 62 | """ 63 | if cls._INST is None: 64 | cls._INST = super().__call__(*args, **kwds) 65 | 66 | return cls._INST 67 | 68 | 69 | class ProcGroup(ABC, metaclass=ProcGropuMeta): 70 | """A group of processes that can be run independently or 71 | integrated into a larger pipeline. 
72 | """ 73 | 74 | name: str | None = None 75 | __meta__: Mapping[str, Any] = {} 76 | DEFAULTS = Diot() 77 | PRESERVED = { 78 | "opts", 79 | "name", 80 | "add_proc", 81 | "as_pipen", 82 | "procs", 83 | "starts", 84 | "DEFAULTS", 85 | "PRESERVED", 86 | "_INST", 87 | } 88 | 89 | def __init_subclass__(cls) -> None: 90 | # Clear the meta 91 | cls.__meta__ = {} 92 | 93 | def __init__(self, **opts) -> None: 94 | self.opts = Diot(self.__class__.DEFAULTS or {}) | (opts or {}) 95 | self.name = self.__class__.name or self.__class__.__name__ 96 | self.starts: List[Type[Proc]] = [] 97 | self.procs = Diot() 98 | 99 | self._load_runtime_procs() 100 | 101 | def _load_runtime_procs(self): 102 | """Load all processes that are added at runtime""" 103 | # Load all processes if they are decorated by ProcGroup.add_proc 104 | for name, attr in self.__class__.__dict__.items(): 105 | if isinstance(attr, cached_property): 106 | getattr(self, name) 107 | elif isinstance(attr, type) and issubclass(attr, Proc): 108 | self.add_proc(attr) 109 | 110 | def add_proc( 111 | self_or_method: ProcGroup | Callable[[ProcGroup], Type[Proc]], 112 | proc: Type[Proc] | None = None, 113 | ) -> Type[Proc] | cached_property: 114 | """Add a process to the proc group 115 | 116 | It works either as a decorator to the process directly or as a 117 | decorator to a method that returns the process. 118 | 119 | Args: 120 | self_or_method: The proc group instance or a method that 121 | returns the process 122 | proc: The process class if `self_or_method` is the proc group 123 | 124 | Returns: 125 | The process class if `self_or_method` is the proc group, or 126 | a cached property that returns the process class 127 | """ 128 | if isinstance(self_or_method, ProcGroup): 129 | # Called as self.add_proc or pg.add_proc 130 | if proc is None: 131 | return self_or_method.add_proc # type: ignore 132 | 133 | if proc.name in self_or_method.__class__.PRESERVED: 134 | raise ValueError( 135 | f"Process name `{proc.name}` is reserved for ProcGroup" 136 | ) 137 | 138 | setattr(self_or_method, proc.name, proc) 139 | proc.__meta__["procgroup"] = self_or_method # type: ignore 140 | if not proc.requires and proc not in self_or_method.starts: 141 | self_or_method.starts.append(proc) 142 | self_or_method.procs[proc.name] = proc 143 | return proc 144 | 145 | @wraps(self_or_method) 146 | def wrapper(self): 147 | proc = self_or_method(self) 148 | 149 | if proc is None: 150 | return None 151 | 152 | if (not isinstance(proc, type) or not issubclass(proc, Proc)): 153 | raise ValueError(f"`{proc}` is not a Proc subclass") 154 | 155 | proc.__meta__["procgroup"] = self 156 | if not proc.requires and proc not in self.starts: 157 | self.starts.append(proc) 158 | self.procs[proc.name] = proc 159 | return proc 160 | 161 | return cached_property(wrapper) 162 | 163 | def as_pipen( 164 | self, 165 | name: str | None = None, 166 | desc: str | None = None, 167 | outdir: str | PathLike | None = None, 168 | **kwargs, 169 | ) -> Pipen: 170 | """Convert the pipeline to a Pipen instance 171 | 172 | Args: 173 | name: The name of the pipeline 174 | desc: The description of the pipeline 175 | outdir: The output directory of the pipeline 176 | **kwargs: The keyword arguments to pass to Pipen 177 | 178 | Returns: 179 | The Pipen instance 180 | """ 181 | name = name or self.__class__.__name__ 182 | if self.__doc__: 183 | desc = desc or self.__doc__.lstrip().splitlines()[0] 184 | 185 | pipe = Pipen(name=name, desc=desc, outdir=outdir, **kwargs) 186 | pipe.set_start(self.starts) 187 | return 
pipe 188 | -------------------------------------------------------------------------------- /pipen/progressbar.py: -------------------------------------------------------------------------------- 1 | """Provide the PipelinePBar and ProcPBar classes""" 2 | from __future__ import annotations 3 | 4 | from typing import TYPE_CHECKING 5 | 6 | from .utils import truncate_text 7 | 8 | if TYPE_CHECKING: # pragma: no cover 9 | import enlighten 10 | 11 | # [12/02/20 12:44:06] I core 12 | # pipeline: 100%| 13 | # | desc_len | 14 | PBAR_DESC_LEN = 24 15 | 16 | 17 | class ProcPBar: 18 | """The progress bar for processes""" 19 | 20 | def __init__( 21 | self, manager: enlighten.Manager, proc_size: int, proc_name: str 22 | ) -> None: 23 | self.submitted_counter = manager.counter( 24 | total=proc_size, 25 | color="cyan", 26 | desc=proc_name, 27 | unit="jobs", 28 | leave=False, 29 | ) 30 | self.running_counter = self.submitted_counter.add_subcounter("yellow") 31 | self.success_counter = self.submitted_counter.add_subcounter("green") 32 | self.failure_counter = self.submitted_counter.add_subcounter("red") 33 | 34 | def update_job_submitted(self): 35 | """Update the progress bar when a job is submitted""" 36 | self.submitted_counter.update() 37 | 38 | def update_job_retrying(self): 39 | """Update the progress bar when a job is retrying""" 40 | # self.running_counter.count -= 1 41 | self.failure_counter.update(-1) 42 | 43 | def update_job_running(self): 44 | """Update the progress bar when a job is running""" 45 | try: 46 | self.running_counter.update_from(self.submitted_counter) 47 | except ValueError: # pragma: no cover 48 | pass 49 | 50 | def update_job_succeeded(self): 51 | """Update the progress bar when a job is succeeded""" 52 | try: 53 | self.success_counter.update_from(self.running_counter) 54 | except ValueError: # pragma: no cover 55 | try: 56 | self.success_counter.update_from(self.submitted_counter) 57 | except ValueError: # pragma: no cover 58 | pass 59 | except: # noqa: E722 # pragma: no cover 60 | pass 61 | 62 | def update_job_failed(self): 63 | """Update the progress bar when a job is failed""" 64 | try: 65 | self.failure_counter.update_from(self.running_counter) 66 | except ValueError: # pragma: no cover 67 | try: 68 | self.failure_counter.update_from(self.submitted_counter) 69 | except ValueError: # pragma: no cover 70 | pass 71 | except: # noqa: E722 # pragma: no cover 72 | pass 73 | 74 | def done(self): 75 | """The process is done""" 76 | try: 77 | self.submitted_counter.close() 78 | except: # noqa: E722 # pragma: no cover 79 | pass 80 | 81 | 82 | class PipelinePBar: 83 | """Progress bar for the pipeline""" 84 | 85 | def __init__(self, n_procs: int, ppln_name: str) -> None: 86 | """Initialize progress bar for pipeline""" 87 | import enlighten 88 | 89 | desc_len = PBAR_DESC_LEN 90 | ppln_name = truncate_text(ppln_name, desc_len) 91 | self.manager = enlighten.get_manager() 92 | self.running_counter = self.manager.counter( 93 | total=n_procs, 94 | color="yellow", 95 | desc=f"{ppln_name:>{desc_len}}:", 96 | unit="procs", 97 | ) 98 | self.success_counter = self.running_counter.add_subcounter("green") 99 | self.failure_counter = self.running_counter.add_subcounter("red") 100 | self.desc_len = desc_len 101 | 102 | def proc_bar(self, proc_size: int, proc_name: str) -> ProcPBar: 103 | """Get the progress bar for a process 104 | 105 | Args: 106 | proc_size: The size of the process 107 | proc_name: The name of the process 108 | 109 | Returns: 110 | The progress bar for the given process 111 | """ 112 
| proc_name = truncate_text(proc_name, self.desc_len) 113 | proc_name = f"{proc_name:>{self.desc_len}}:" 114 | return ProcPBar(self.manager, proc_size, proc_name) 115 | 116 | def update_proc_running(self): 117 | """Update the progress bar when a process is running""" 118 | self.running_counter.update() 119 | 120 | def update_proc_done(self): 121 | """Update the progress bar when a process is done""" 122 | self.success_counter.update_from(self.running_counter) 123 | 124 | def update_proc_error(self): 125 | """Update the progress bar when a process is errored""" 126 | self.failure_counter.update_from(self.running_counter) 127 | 128 | def done(self) -> None: 129 | """When the pipeline is done""" 130 | try: 131 | self.running_counter.close() 132 | self.manager.stop() 133 | except: # noqa: E722 # pragma: no cover 134 | pass 135 | -------------------------------------------------------------------------------- /pipen/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pwwang/pipen/0f99f02de29d15bf8426805a74ce9bca99bdcc03/pipen/py.typed -------------------------------------------------------------------------------- /pipen/scheduler.py: -------------------------------------------------------------------------------- 1 | """Provide builting schedulers""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING, Sequence, Type 6 | 7 | from diot import Diot 8 | 9 | # Use cloudpathlib.GSPath instead of yunpath.GSPath, 10 | # the latter is a subclass of the former. 11 | # (_GSPath is cloudpathlib.GSPath) 12 | from yunpath.patch import _GSPath 13 | from xqute import Scheduler 14 | from xqute.schedulers.local_scheduler import LocalScheduler as XquteLocalScheduler 15 | from xqute.schedulers.sge_scheduler import SgeScheduler as XquteSgeScheduler 16 | from xqute.schedulers.slurm_scheduler import SlurmScheduler as XquteSlurmScheduler 17 | from xqute.schedulers.ssh_scheduler import SshScheduler as XquteSshScheduler 18 | from xqute.schedulers.gbatch_scheduler import GbatchScheduler as XquteGbatchScheduler 19 | from xqute.path import SpecPath 20 | 21 | from .defaults import SCHEDULER_ENTRY_GROUP 22 | from .exceptions import NoSuchSchedulerError, WrongSchedulerTypeError 23 | from .job import Job 24 | from .utils import is_subclass, load_entrypoints 25 | 26 | if TYPE_CHECKING: 27 | from .proc import Proc 28 | 29 | 30 | class SchedulerPostInit: 31 | """Provides post init function for all schedulers""" 32 | 33 | job_class = Job 34 | 35 | MOUNTED_METADIR: str 36 | MOUNTED_OUTDIR: str 37 | 38 | def post_init(self, proc: Proc) -> None: ... 
# noqa: E704 39 | 40 | 41 | class LocalScheduler(SchedulerPostInit, XquteLocalScheduler): 42 | """Local scheduler""" 43 | 44 | 45 | class SgeScheduler(SchedulerPostInit, XquteSgeScheduler): 46 | """SGE scheduler""" 47 | 48 | 49 | class SlurmScheduler(SchedulerPostInit, XquteSlurmScheduler): 50 | """Slurm scheduler""" 51 | 52 | 53 | class SshScheduler(SchedulerPostInit, XquteSshScheduler): 54 | """SSH scheduler""" 55 | 56 | 57 | class GbatchScheduler(SchedulerPostInit, XquteGbatchScheduler): 58 | """Google Cloud Batch scheduler""" 59 | 60 | MOUNTED_METADIR: str = "/mnt/pipen-pipeline/workdir" 61 | MOUNTED_OUTDIR: str = "/mnt/pipen-pipeline/outdir" 62 | 63 | # fast mount is used to add a volume taskGroups[0].taskSpec.volumes 64 | # to mount additional cloud directory to the VM 65 | # For example: fast_mount="gs://bucket/path:/mnt/path" 66 | # will add a volume: { 67 | # "gcs": {"remotePath": "bucket/path"}, 68 | # "mountPath": "/mnt/path" 69 | # } 70 | def __init__( 71 | self, 72 | *args, 73 | project, 74 | location, 75 | fast_mount: str | Sequence[str] = None, 76 | **kwargs, 77 | ): 78 | super().__init__(*args, project=project, location=location, **kwargs) 79 | if not fast_mount: 80 | return 81 | 82 | if isinstance(fast_mount, str): 83 | fast_mount = [fast_mount] 84 | 85 | for fm in fast_mount: 86 | if fm.count(":") != 2: 87 | raise ValueError( 88 | "'fast_mount' for gbatch scheduler should be in the format of " 89 | "'gs://bucket/path:/mnt/path'" 90 | ) 91 | 92 | if not fm.startswith("gs://"): 93 | raise ValueError( 94 | "'fast_mount' for gbatch scheduler should be " 95 | "a Google Cloud Storage path (begins with 'gs://')" 96 | ) 97 | 98 | remote_path, mount_path = fm[5:].split(":", 1) 99 | self.config.taskGroups[0].taskSpec.volumes.append( 100 | Diot( 101 | { 102 | "gcs": {"remotePath": remote_path}, 103 | "mountPath": mount_path, 104 | } 105 | ) 106 | ) 107 | 108 | def post_init(self, proc: Proc): 109 | super().post_init(proc) 110 | 111 | # Check if pipeline outdir is a GSPath 112 | if not isinstance(proc.pipeline.outdir, _GSPath): 113 | raise ValueError( 114 | "'gbatch' scheduler requires google cloud storage 'outdir'." 
115 | ) 116 | 117 | mounted_workdir = f"{self.MOUNTED_METADIR}/{proc.name}" 118 | self.workdir: SpecPath = SpecPath(self.workdir, mounted=mounted_workdir) 119 | 120 | # update the mounted metadir 121 | self.config.taskGroups[0].taskSpec.volumes[0].mountPath = mounted_workdir 122 | 123 | # update the config to map the outdir to vm 124 | self.config.taskGroups[0].taskSpec.volumes.append( 125 | Diot( 126 | { 127 | "gcs": {"remotePath": proc.pipeline.outdir._no_prefix}, 128 | "mountPath": self.MOUNTED_OUTDIR, 129 | } 130 | ) 131 | ) 132 | 133 | 134 | def get_scheduler(scheduler: str | Type[Scheduler]) -> Type[Scheduler]: 135 | """Get the scheduler by name of the scheduler class itself 136 | 137 | Args: 138 | scheduler: The scheduler class or name 139 | 140 | Returns: 141 | The scheduler class 142 | """ 143 | if is_subclass(scheduler, Scheduler): 144 | return scheduler # type: ignore 145 | 146 | if scheduler == "local": 147 | return LocalScheduler 148 | 149 | if scheduler == "sge": 150 | return SgeScheduler 151 | 152 | if scheduler == "slurm": 153 | return SlurmScheduler 154 | 155 | if scheduler == "ssh": 156 | return SshScheduler 157 | 158 | if scheduler == "gbatch": 159 | return GbatchScheduler 160 | 161 | for n, obj in load_entrypoints(SCHEDULER_ENTRY_GROUP): # pragma: no cover 162 | if n == scheduler: 163 | if not is_subclass(obj, Scheduler): 164 | raise WrongSchedulerTypeError( 165 | "Scheduler should be a subclass of " "pipen.scheduler.Scheduler." 166 | ) 167 | return obj 168 | 169 | raise NoSuchSchedulerError(str(scheduler)) 170 | -------------------------------------------------------------------------------- /pipen/template.py: -------------------------------------------------------------------------------- 1 | """Template adaptor for pipen""" 2 | from __future__ import annotations 3 | 4 | from abc import ABC, abstractmethod 5 | from typing import Any, Mapping, Type 6 | 7 | from liquid import Liquid 8 | 9 | from .defaults import TEMPLATE_ENTRY_GROUP 10 | from .exceptions import NoSuchTemplateEngineError, WrongTemplateEnginTypeError 11 | from .utils import is_subclass, load_entrypoints 12 | 13 | __all__ = [ 14 | "Template", 15 | "TemplateLiquid", 16 | "TemplateJinja2", 17 | "get_template_engine", 18 | ] 19 | 20 | 21 | class Template(ABC): 22 | """Base class wrapper to wrap template for pipen""" 23 | 24 | def __init__( 25 | self, 26 | source: Any, 27 | **kwargs: Any, 28 | ): 29 | """Template construct""" 30 | self.engine: Any = None 31 | 32 | def render(self, data: Mapping[str, Any] = None) -> str: 33 | """ 34 | Render the template 35 | @parmas: 36 | data (dict): The data used to render 37 | """ 38 | return self._render(data or {}) 39 | 40 | @abstractmethod 41 | def _render(self, data: Mapping[str, Any]) -> str: 42 | """Implement rendering""" 43 | 44 | 45 | class TemplateLiquid(Template): 46 | """Liquidpy template wrapper.""" 47 | 48 | name = "liquid" 49 | 50 | def __init__( 51 | self, 52 | source: Any, 53 | **kwargs: Any, 54 | ): 55 | """Initiate the engine with source and envs 56 | 57 | Args: 58 | source: The souce text 59 | envs: The env data 60 | **kwargs: Other arguments for Liquid 61 | """ 62 | super().__init__(source) 63 | self.engine = Liquid( 64 | source, 65 | from_file=False, 66 | mode="wild", 67 | **kwargs, 68 | ) 69 | 70 | def _render(self, data: Mapping[str, Any]) -> str: 71 | """Render the template 72 | 73 | Args: 74 | data: The data used for rendering 75 | 76 | Returns 77 | The rendered string 78 | """ 79 | return self.engine.render(data) 80 | 81 | 82 | class 
TemplateJinja2(Template): 83 | """Jinja2 template wrapper""" 84 | 85 | name = "jinja2" 86 | 87 | def __init__( 88 | self, 89 | source: Any, 90 | **kwargs: Any, 91 | ): 92 | """Initiate the engine with source and envs 93 | 94 | Args: 95 | source: The souce text 96 | envs: The env data 97 | **kwargs: Other arguments for jinja2.Template 98 | """ 99 | import jinja2 100 | 101 | super().__init__(source) 102 | filters = kwargs.pop("filters", {}) 103 | envs = kwargs.pop("globals", {}) 104 | filters = kwargs.pop("filters", {}) 105 | self.engine = jinja2.Template(source, **kwargs) 106 | self.engine.globals.update(envs) 107 | self.engine.environment.filters.update(filters) 108 | 109 | def _render(self, data: Mapping[str, Any]) -> str: 110 | """Render the template 111 | 112 | Args: 113 | data: The data used for rendering 114 | 115 | Retuens: 116 | The rendered string 117 | """ 118 | return self.engine.render(data) 119 | 120 | 121 | def get_template_engine(template: str | Type[Template]) -> Type[Template]: 122 | """Get the template engine by name or the template engine itself 123 | 124 | Args: 125 | template: The name of the template engine or the template engine itself 126 | 127 | Returns: 128 | The template engine 129 | """ 130 | if is_subclass(template, Template): 131 | return template # type: ignore 132 | 133 | if template == "liquid": 134 | return TemplateLiquid 135 | 136 | if template == "jinja2": 137 | return TemplateJinja2 138 | 139 | for name, obj in load_entrypoints( 140 | TEMPLATE_ENTRY_GROUP 141 | ): # pragma: no cover 142 | if name == template: 143 | if not is_subclass(obj, Template): 144 | raise WrongTemplateEnginTypeError( 145 | "Template engine should be a subclass of " 146 | "pipen.templates.Template." 147 | ) 148 | return obj 149 | 150 | raise NoSuchTemplateEngineError(str(template)) 151 | -------------------------------------------------------------------------------- /pipen/version.py: -------------------------------------------------------------------------------- 1 | """Provide version of pipen""" 2 | 3 | __version__ = "0.17.4" 4 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ "poetry>=1.1.0" ] 3 | build-backend = "poetry.masonry.api" 4 | 5 | [tool.poetry] 6 | name = "pipen" 7 | version = "0.17.4" 8 | description = "A pipeline framework for python" 9 | authors = [ "pwwang ",] 10 | license = "MIT" 11 | readme = "README.md" 12 | homepage = "https://github.com/pwwang/pipen" 13 | repository = "https://github.com/pwwang/pipen" 14 | 15 | [tool.poetry.build] 16 | generate-setup-file = true 17 | 18 | [tool.poetry.dependencies] 19 | python = "^3.9" 20 | liquidpy = "^0.8" 21 | pandas = "^2.0" 22 | enlighten = "^1" 23 | argx = "^0.3" 24 | xqute = "^0.9" 25 | ## included in xqute 26 | # rich = "^13" 27 | # diot = "^0.1" 28 | # simplug = "^0.0" 29 | python-simpleconf = {version = "^0.7", extras = ["toml"]} 30 | pipda = "^0.13" 31 | varname = "^0.14" 32 | 33 | [tool.poetry.group.dev.dependencies] 34 | openpyxl = "^3" 35 | pytest = "^8" 36 | pytest-asyncio = "^0.25" 37 | pytest-cov = "^6" 38 | pytest-xdist = "^3" 39 | pytest-forked = "^1.6" 40 | # This also installs scipy and wcwidth 41 | datar = { version = "^0.15", extras = ["pandas"] } 42 | flake8 = "^7" 43 | python-dotenv = "^1" 44 | cloudsh = {version = "^0.1", extras = ["gs"]} 45 | 46 | # dependencies for pipelines in ./examples 47 | ipykernel = "^6.29.5" 48 | 
[tool.poetry.group.example.dependencies] 49 | mako = "^1.3" 50 | python-dotenv = "^1" 51 | cloudsh = {version = "^0.1", extras = ["gs"]} 52 | 53 | 54 | [tool.poetry.group.docs.dependencies] 55 | mkdocs = "^1.6.1" 56 | jinja2 = "^3.1.5" 57 | mkdocs-material = "^9.6.5" 58 | pymdown-extensions = "^10.14.3" 59 | mkapi-fix = "^0.1.1" 60 | 61 | [tool.poetry.scripts] 62 | pipen = "pipen.cli:main" 63 | 64 | [tool.pytest.ini_options] 65 | addopts = "-vv -n auto --dist loadgroup -p no:benchmark -W error::UserWarning --cov-config=.coveragerc --cov=pipen --cov-report xml:.coverage.xml --cov-report term-missing" 66 | console_output_style = "progress" 67 | junit_family = "xunit1" 68 | asyncio_default_fixture_loop_scope = "function" 69 | filterwarnings = [ 70 | "ignore:.+may lead to deadlocks in the child:DeprecationWarning", 71 | ] 72 | 73 | [tool.mypy] 74 | ignore_missing_imports = true 75 | allow_redefinition = true 76 | disable_error_code = ["attr-defined", "no-redef"] 77 | show_error_codes = true 78 | strict_optional = false 79 | 80 | [tool.black] 81 | line-length = 88 82 | target-version = ['py39', 'py310', 'py311', 'py312'] 83 | include = '\.pyi?$' 84 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This will not be included in the distribution. 4 | The distribution is managed by poetry 5 | This file is kept only for 6 | 1. Github to index the dependents 7 | 2. pip install -e . 8 | 9 | Do NOT use this to install this package, unless you handled the dependencies 10 | by your self: 11 | 12 | pip install git+https://... 13 | """ 14 | from setuptools import setup 15 | 16 | setup(name="pipen") 17 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | from pipen import Pipen 2 | 3 | # disable all plugins 4 | Pipen.SETUP = True 5 | 6 | # Don't delete the following, which is used for relative path script testing 7 | # AbCdEf 8 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import datar 2 | from dotenv import load_dotenv 3 | 4 | datar.options(backends=["numpy", "pandas"]) 5 | load_dotenv() 6 | -------------------------------------------------------------------------------- /tests/helpers.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # import signal 3 | from tempfile import gettempdir 4 | from pathlib import Path 5 | # from shutil import rmtree 6 | # from multiprocessing import Process 7 | 8 | import pytest 9 | from pipen import Proc, Pipen, plugin 10 | from pipen.utils import is_loading_pipeline 11 | 12 | BUCKET = "gs://handy-buffer-287000.appspot.com" 13 | 14 | 15 | class SimpleProc(Proc): 16 | """A very simple process for testing""" 17 | 18 | input = ["input"] 19 | script = "sleep 1.5" # let on_job_polling run 20 | 21 | 22 | class NormalProc(Proc): 23 | """A normal proc""" 24 | 25 | input = "input:var" 26 | output = ["output:{{in.input}}"] 27 | script = "echo {{in.input}}" 28 | 29 | 30 | class In2Out1Proc(Proc): 31 | """Process with 2 input vars and 1 output var""" 32 | 33 | input = "in1:var, in2:var" 34 | output = "out:var:{{in.in1}}_{{in.in2}}" 35 | script = "echo {{in.in1}} {{in.in2}}" 36 | 37 | 38 | class 
RelPathScriptProc(Proc): 39 | """Process uses relative path script""" 40 | 41 | input = "in" 42 | output = "out:var:{{in.in}}" 43 | # use this file itself 44 | script = "file://__init__.py" 45 | 46 | 47 | class ScriptNotExistsProc(Proc): 48 | """Process uses relative path script""" 49 | 50 | input = "in" 51 | output = "out:var:{{in.in}}" 52 | # use this file itself 53 | script = "file:///no/such/file" 54 | 55 | 56 | class ErrorProc(Proc): 57 | """Errant process""" 58 | 59 | input = ["input"] 60 | script = "exit 1" 61 | 62 | 63 | class ScriptRenderErrorProc(Proc): 64 | """When script is failed to render""" 65 | 66 | input = "a" 67 | output = "b:var:1" 68 | script = "{{c(d)}}" 69 | 70 | 71 | class SleepingProc(Proc): 72 | """Process to sleep for a certain time""" 73 | 74 | input = "time" 75 | script = "sleep {{in.time}}" 76 | 77 | 78 | class RetryProc(ErrorProc): 79 | input = "starttime" 80 | error_strategy = "retry" 81 | num_retries = 10 82 | lang = sys.executable # python 83 | script = """ 84 | import sys, time 85 | sys.exit(1 if time.time() < {{in.starttime}} + 3 else 0) 86 | """ 87 | 88 | 89 | class OutputRenderErrorProc(Proc): 90 | """When output is failed to render""" 91 | 92 | input = "a" 93 | output = "b:var:{{c(d)}}" 94 | 95 | 96 | class OutputNoNameErrorProc(Proc): 97 | """When no name/type given in output""" 98 | 99 | input = "a" 100 | output = "b" 101 | 102 | 103 | class OutputWrongTypeProc(Proc): 104 | """When no name/type given in output""" 105 | 106 | input = "a" 107 | output = "b:c:d" 108 | 109 | 110 | class OutputAbsPathProc(Proc): 111 | """When no name/type given in output""" 112 | 113 | input = "a" 114 | output = "b:file:/a/b" 115 | 116 | 117 | class NoInputProc(Proc): 118 | """Process without input""" 119 | 120 | 121 | class InputTypeUnsupportedProc(Proc): 122 | """Input type not supported""" 123 | 124 | input = "input:unsupported:1" 125 | 126 | 127 | class FileInputProc(Proc): 128 | """Process with file input""" 129 | 130 | input = "in:file" 131 | output = "out:file:{{in.in.name}}" 132 | script = "cat {{in.in}} > {{out.out}}" 133 | 134 | 135 | class FileInputProcToDiff(Proc): 136 | """Process with file input to different output""" 137 | 138 | input = "in:file" 139 | output = "out:var:output" 140 | # script does not rely on input or output 141 | script = "echo output" 142 | 143 | 144 | class OutputNotGeneratedProc(Proc): 145 | """Process with output file not generated intentionally""" 146 | 147 | input = "in" 148 | output = "out:file:{{in.in}}" 149 | script = "echo {{in.in}}" 150 | 151 | 152 | class FileInputsProc(Proc): 153 | """Process with files input""" 154 | 155 | input = "in:files" 156 | output = "out:file:{{in.in[0].name}}" 157 | script = "echo {{in.in[0].name}} > {{out.out}}" 158 | 159 | 160 | class MixedInputProc(Proc): 161 | """Process with mixed types of input""" 162 | 163 | input = "invar:var, infile:file" 164 | output = "outfile:file:{{in.invar}}" 165 | script = "echo {{in.invar}} > {{out.outfile}}" 166 | 167 | 168 | class DirOutputProc(Proc): 169 | """Process with directory output""" 170 | 171 | input = "in" 172 | output = "outfile:dir:outdir" 173 | script = "echo {{in.in}} > {{out.outfile}}/outfile; " 174 | 175 | 176 | class SimplePlugin: 177 | @plugin.impl 178 | async def on_init(pipen): 179 | if getattr(pipen.__class__, "loading", False): 180 | assert is_loading_pipeline("--help") 181 | print("SimplePlugin") 182 | 183 | @plugin.impl 184 | async def on_job_polling(job): 185 | print("SimplePlugin on_job_polling") 186 | 187 | 188 | @pytest.fixture 189 | def 
pipen(tmp_path): 190 | """Get a simple Pipen object each time""" 191 | index = Pipen.PIPELINE_COUNT + 1 192 | pipen_simple = Pipen( 193 | name=f"simple_pipeline_{index}", 194 | desc="No description", 195 | loglevel="debug", 196 | cache=True, 197 | workdir=tmp_path / ".pipen", 198 | outdir=tmp_path / f"pipen_simple_{index}", 199 | ) 200 | 201 | return pipen_simple 202 | 203 | 204 | @pytest.fixture 205 | def pipen_with_plugin(tmp_path): 206 | """Get a simple Pipen object each time""" 207 | index = Pipen.PIPELINE_COUNT + 1 208 | pipen_simple = Pipen( 209 | name=f"simple_pipeline_{index}", 210 | desc="No description", 211 | loglevel="debug", 212 | cache=True, 213 | plugins=[SimplePlugin()], 214 | workdir=tmp_path / ".pipen", 215 | outdir=tmp_path / f"pipen_simple_{index}", 216 | ) 217 | 218 | return pipen_simple 219 | 220 | 221 | class PipenIsLoading(Pipen): 222 | name = "PipenIsLoading" 223 | loading = True 224 | plugins = [SimplePlugin()] 225 | starts = SimpleProc 226 | 227 | 228 | @pytest.fixture 229 | def infile(tmp_path): 230 | out = tmp_path / "infile" 231 | out.write_text("in") 232 | return out 233 | 234 | 235 | @pytest.fixture 236 | def infile1(tmp_path): 237 | out = tmp_path / "infile1" 238 | out.write_text("in1") 239 | return out 240 | 241 | 242 | @pytest.fixture 243 | def infile2(tmp_path): 244 | out = tmp_path / "infile2" 245 | out.write_text("in2") 246 | return out 247 | 248 | 249 | def create_dead_link(path): 250 | target = Path(gettempdir()) / "__NoSuchFile__" 251 | target.write_text("") 252 | link = Path(path) 253 | if link.exists() or link.is_symlink(): 254 | link.unlink() 255 | link.symlink_to(target) 256 | target.unlink() 257 | 258 | 259 | # for load_pipeline tests 260 | pipeline = Pipen( 261 | name=f"simple_pipeline_{Pipen.PIPELINE_COUNT + 1}", 262 | desc="No description", 263 | loglevel="debug", 264 | cache=True, 265 | workdir=gettempdir() + "/.pipen", 266 | outdir=gettempdir() + f"/pipen_simple_{Pipen.PIPELINE_COUNT}", 267 | ).set_starts(SimpleProc) 268 | -------------------------------------------------------------------------------- /tests/test_channel.py: -------------------------------------------------------------------------------- 1 | import os 2 | from io import StringIO 3 | from pathlib import Path 4 | from math import ceil 5 | 6 | import pytest # noqa 7 | from pipen.channel import Channel, expand_dir, collapse_files 8 | from datar.tibble import tibble 9 | 10 | from pandas import DataFrame 11 | 12 | from .helpers import BUCKET 13 | 14 | 15 | def test_create(): 16 | assert isinstance(Channel.create(DataFrame([[1]])), DataFrame) 17 | 18 | 19 | def test_from_glob(): 20 | glob = Path(__file__).parent / "test_*.py" 21 | glob_files = list(Path(__file__).parent.glob("test_*.py")) 22 | ch = Channel.from_glob(glob) 23 | assert ch.shape == (len(glob_files), 1) 24 | 25 | 26 | def test_from_glob_sortby_mtime(tmp_path): 27 | file1 = tmp_path / "file1.txt" 28 | file2 = tmp_path / "file2.txt" 29 | file3 = tmp_path / "file3.txt" 30 | file1.touch() 31 | file2.touch() 32 | file3.touch() 33 | os.utime(file1, (1000, 1000)) 34 | os.utime(file2, (3000, 3000)) 35 | os.utime(file3, (2000, 2000)) 36 | ch = Channel.from_glob(tmp_path / "*.txt", sortby="mtime") 37 | assert ch.iloc[0, 0] == str(file1) 38 | assert ch.iloc[1, 0] == str(file3) 39 | assert ch.iloc[2, 0] == str(file2) 40 | 41 | 42 | def test_from_glob_sortby_size(tmp_path): 43 | file1 = tmp_path / "file1.txt" 44 | file2 = tmp_path / "file2.txt" 45 | file3 = tmp_path / "file3.txt" 46 | file1.write_text("1") 47 | 
file2.write_text("222") 48 | file3.write_text("33") 49 | ch = Channel.from_glob(tmp_path / "*.txt", sortby="size") 50 | assert ch.iloc[0, 0] == str(file1) 51 | assert ch.iloc[1, 0] == str(file3) 52 | assert ch.iloc[2, 0] == str(file2) 53 | 54 | 55 | def test_from_glob_filter_link(tmp_path): 56 | file1 = tmp_path / "file1.txt" 57 | file2 = tmp_path / "file2.txt" 58 | file3 = tmp_path / "file3.txt" 59 | file1.touch() 60 | file2.symlink_to(file1) 61 | file3.symlink_to(file1) 62 | ch = Channel.from_glob(tmp_path / "*.txt", ftype="link") 63 | assert ch.shape == (2, 1) 64 | assert ch.iloc[0, 0] == str(file2) 65 | assert ch.iloc[1, 0] == str(file3) 66 | 67 | 68 | def test_from_glob_filter_dir_file(tmp_path): 69 | file1 = tmp_path / "file1.txt" 70 | file2 = tmp_path / "file2.txt" 71 | file3 = tmp_path / "file3.txt" 72 | file1.mkdir() 73 | file2.touch() 74 | file3.mkdir() 75 | 76 | ch = Channel.from_glob(tmp_path / "*.txt", ftype="dir") 77 | assert ch.shape == (2, 1) 78 | assert ch.iloc[0, 0] == str(file1) 79 | assert ch.iloc[1, 0] == str(file3) 80 | 81 | ch = Channel.from_glob(tmp_path / "*.txt", ftype="file") 82 | assert ch.shape == (1, 1) 83 | assert ch.iloc[0, 0] == str(file2) 84 | 85 | 86 | def test_from_glob_cloudpath(): 87 | ch = Channel.from_glob(f"{BUCKET}/pipen-test/channel/test*.txt") 88 | assert ch.shape == (3, 1) 89 | assert ch.iloc[0, 0] == f"{BUCKET}/pipen-test/channel/test1.txt" 90 | assert ch.iloc[1, 0] == f"{BUCKET}/pipen-test/channel/test2.txt" 91 | assert ch.iloc[2, 0] == f"{BUCKET}/pipen-test/channel/test3.txt" 92 | 93 | 94 | def test_from_pairs(): 95 | glob = Path(__file__).parent / "test_*.py" 96 | glob_files = list(Path(__file__).parent.glob("test_*.py")) 97 | ch = Channel.from_pairs(glob) 98 | assert ch.shape == (ceil(len(glob_files) / 2.0), 2) 99 | 100 | 101 | def test_expand_dir_collapse_files(): 102 | ch0 = Channel.create([(Path(__file__).parent.as_posix(), 1)]) 103 | ch1 = ch0 >> expand_dir(pattern="test_*.py") 104 | glob_files = list(Path(__file__).parent.glob("test_*.py")) 105 | assert ch1.shape == (len(glob_files), 2) 106 | 107 | ch2 = ch1 >> collapse_files() 108 | assert ch2.equals(ch0) 109 | 110 | 111 | def test_from_csv(tmp_path): 112 | df = tibble(a=[1, 2], b=[3, 4]) 113 | df.to_csv(tmp_path / "input.csv", index=False) 114 | ch = Channel.from_csv(tmp_path / "input.csv") 115 | assert ch.equals(df) 116 | 117 | 118 | def test_from_excel(tmp_path): 119 | df = tibble(a=[1, 2], b=[3, 4]) 120 | df.to_excel(tmp_path / "input.xls", index=False) 121 | ch = Channel.from_excel(tmp_path / "input.xls") 122 | assert ch.equals(df) 123 | 124 | 125 | def test_from_table(): 126 | df = StringIO("a b\n1 3\n2 4\n") 127 | ch = Channel.from_table(df, sep=" ") 128 | exp = tibble(a=[1, 2], b=[3, 4]) 129 | assert ch.equals(exp) 130 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pytest # noqa: F401 3 | 4 | from subprocess import check_output, CalledProcessError, STDOUT 5 | 6 | 7 | def cmdoutput(cmd): 8 | try: 9 | return check_output( 10 | [sys.executable, "-m"] + cmd, 11 | stderr=STDOUT, 12 | encoding="utf-8", 13 | ) 14 | except CalledProcessError as err: 15 | return err.output 16 | 17 | 18 | @pytest.mark.forked 19 | def test_main(): 20 | out = cmdoutput(["pipen", "--help"]) 21 | assert "CLI Tool for pipen" in out 22 | 23 | 24 | @pytest.mark.forked 25 | def test_nosuch_command(): 26 | out = cmdoutput(["pipen", "x"]) 27 | assert 
"invalid choice" in out 28 | 29 | 30 | @pytest.mark.forked 31 | def test_help(): 32 | out = cmdoutput(["pipen", "help", "x"]) 33 | assert "invalid choice" in out 34 | out = cmdoutput(["pipen", "help", "profile"]) 35 | assert "The name of the profile to show" in out 36 | out = cmdoutput(["pipen", "help"]) 37 | assert "CLI Tool for pipen" in out 38 | 39 | 40 | @pytest.mark.forked 41 | def test_profile_all(): 42 | out = cmdoutput(["pipen", "profile"]) 43 | assert "Note:" in out 44 | out = cmdoutput(["pipen", "profile", "--list"]) 45 | assert "default" in out 46 | 47 | 48 | @pytest.mark.forked 49 | def test_profile_default(): 50 | out = cmdoutput(["pipen", "profile", "--name", "default"]) 51 | assert "Profile: default" in out 52 | 53 | 54 | @pytest.mark.forked 55 | def test_profile_nosuch(): 56 | out = cmdoutput(["pipen", "profile", "-n", "nosuch"]) 57 | assert "Profile: nosuch" not in out 58 | 59 | 60 | @pytest.mark.forked 61 | def test_version(): 62 | out = cmdoutput(["pipen", "version"]) 63 | assert "pipen" in out 64 | assert "python" in out 65 | assert "liquidpy" in out 66 | -------------------------------------------------------------------------------- /tests/test_pipen.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from uuid import uuid4 3 | from yunpath import AnyPath 4 | from pipen import Proc, Pipen, run 5 | from pipen.exceptions import ( 6 | ProcDependencyError, 7 | PipenSetDataError, 8 | ) 9 | from pipen.proc import PipenOrProcNameError 10 | 11 | from .helpers import ( # noqa: F401 12 | ErrorProc, 13 | NormalProc, 14 | SimpleProc, 15 | RelPathScriptProc, 16 | pipen, 17 | SimplePlugin, 18 | pipen_with_plugin, 19 | BUCKET, 20 | ) 21 | 22 | 23 | @pytest.fixture 24 | def uid(): 25 | return uuid4() 26 | 27 | 28 | @pytest.mark.forked 29 | def test_init(pipen): 30 | assert isinstance(pipen, Pipen) 31 | 32 | 33 | @pytest.mark.forked 34 | def test_name(): 35 | p = Pipen() 36 | assert p.name == "p" 37 | [p] = [Pipen()] 38 | assert p.name.startswith("Pipen-") 39 | 40 | 41 | @pytest.mark.forked 42 | def test_run(pipen): 43 | ret = pipen.set_starts(SimpleProc).run() 44 | assert ret 45 | 46 | ret = pipen.set_starts([ErrorProc]).run() 47 | assert not ret 48 | 49 | 50 | @pytest.mark.forked 51 | def test_no_start_procs(pipen): 52 | with pytest.raises(ProcDependencyError): 53 | pipen.run() 54 | 55 | 56 | @pytest.mark.forked 57 | def test_cyclic_dependency(pipen): 58 | """ 59 | proc1(start) --> proc2 --> proc3(start) 60 | <-- 61 | """ 62 | proc1 = Proc.from_proc(NormalProc, input_data=[1]) 63 | proc2 = Proc.from_proc(NormalProc) 64 | proc3 = Proc.from_proc(NormalProc, requires=proc2) 65 | proc2.requires = [proc1, proc3] 66 | 67 | with pytest.raises(ProcDependencyError, match="Cyclic dependency"): 68 | pipen.set_starts(proc1, proc3).run() 69 | 70 | 71 | @pytest.mark.forked 72 | def test_wrong_type_starts(pipen): 73 | with pytest.raises(ProcDependencyError, match="is not a subclass of"): 74 | pipen.set_starts(1) 75 | 76 | with pytest.raises(ProcDependencyError, match="is not a subclass of"): 77 | pipen.set_starts(lambda: 1) 78 | 79 | 80 | @pytest.mark.forked 81 | def test_not_cyclic_for_subclass_of_proc_in_pipeline(pipen): 82 | proc1 = Proc.from_proc(NormalProc, input_data=[1]) 83 | proc2 = Proc.from_proc(NormalProc, requires=proc1) 84 | 85 | class proc3(proc1): 86 | requires = proc2 87 | 88 | pipen.set_starts(proc1).run() 89 | assert pipen.procs == [proc1, proc2, proc3] 90 | 91 | 92 | @pytest.mark.forked 93 | def test_no_next_procs(pipen): 94 
| """ 95 | proc1 --> proc2 --> proc3 96 | <-- 97 | """ 98 | proc1 = Proc.from_proc(NormalProc, input_data=[1]) 99 | proc2 = Proc.from_proc(NormalProc) 100 | proc3 = Proc.from_proc(NormalProc, requires=proc2) 101 | proc2.requires = [proc1, proc3] 102 | # trigger requires/nexts computation 103 | proc2.__init_subclass__() 104 | 105 | with pytest.raises( 106 | ProcDependencyError, 107 | match="No available next processes", 108 | ): 109 | pipen.set_starts(proc1).run() 110 | 111 | 112 | @pytest.mark.forked 113 | def test_plugins_are_pipeline_dependent(pipen, pipen_with_plugin, caplog): 114 | simproc = Proc.from_proc(SimpleProc) 115 | pipen_with_plugin.set_starts(simproc).run() 116 | assert "simpleplugin" in caplog.text 117 | 118 | caplog.clear() 119 | pipen.set_starts(simproc).run() # No simple plugin enabled 120 | assert "simpleplugin" not in caplog.text 121 | 122 | 123 | @pytest.mark.forked 124 | def test_set_starts_error(pipen): 125 | with pytest.raises(ProcDependencyError): 126 | pipen.set_starts(SimpleProc, SimpleProc) 127 | 128 | 129 | @pytest.mark.forked 130 | def test_set_data(pipen): 131 | simproc = Proc.from_proc(SimpleProc, input_data=[1]) 132 | pipen.set_starts(simproc).set_data(None) 133 | assert simproc.input_data == [1] 134 | 135 | with pytest.raises(PipenSetDataError): 136 | pipen.set_data([2]) 137 | 138 | 139 | @pytest.mark.forked 140 | def test_proc_order(pipen): 141 | proc1 = Proc.from_proc(NormalProc, input_data=[1]) 142 | proc2 = Proc.from_proc(NormalProc, requires=proc1) 143 | proc3 = Proc.from_proc(NormalProc, requires=proc1, order=-1) 144 | 145 | pipen.set_starts(proc1).run() 146 | assert pipen.procs == [proc1, proc3, proc2] 147 | 148 | 149 | @pytest.mark.forked 150 | def test_proc_inherited(pipen): 151 | proc1 = Proc.from_proc(RelPathScriptProc) 152 | proc2 = Proc.from_proc(proc1) 153 | pipen.set_starts(proc2).set_data([1]).run() 154 | assert proc2.__doc__ == RelPathScriptProc.__doc__ 155 | 156 | 157 | @pytest.mark.forked 158 | def test_subclass_pipen(tmp_path, caplog): 159 | class Proc1(Proc): 160 | input = "a" 161 | output = "b:var:{{in.a}}" 162 | 163 | class Proc2(Proc): 164 | requires = Proc1 165 | input = "b" 166 | output = "c:file:{{in.b}}" 167 | script = "touch {{out.c}}" 168 | 169 | class MyPipen(Pipen): 170 | name = "MyAwesomePipeline" 171 | starts = Proc1 172 | data = ([1],) 173 | outdir = tmp_path / "outdir" 174 | workdir = tmp_path 175 | loglevel = "DEBUG" 176 | plugin_opts = {"x": 1} 177 | scheduler_opts = {"n": 1} 178 | template_opts = {"a": 1} 179 | 180 | MyPipen(plugin_opts={"y": 2}).run() 181 | 182 | assert (tmp_path / "outdir" / "Proc2" / "1").is_file() 183 | assert "MYAWESOMEPIPELINE" in caplog.text 184 | assert "x=1" in caplog.text 185 | assert "y=2" in caplog.text 186 | 187 | class MyPipe2(Pipen): 188 | ... 189 | 190 | assert MyPipe2().name == "MyPipe2" 191 | 192 | 193 | @pytest.mark.forked 194 | def test_invalid_name(): 195 | class MyPipe3(Pipen): 196 | name = "a+" 197 | 198 | with pytest.raises(PipenOrProcNameError, match="Invalid pipeline name"): 199 | MyPipe3().run() 200 | 201 | 202 | @pytest.mark.forked 203 | def test_duplicate_proc_name(): 204 | class MyProc1(Proc): 205 | ... 
206 | 207 | class MyProc2(Proc): 208 | requires = MyProc1 209 | name = "MyProc1" 210 | 211 | class MyPipe4(Pipen): 212 | starts = MyProc1 213 | 214 | with pytest.raises(PipenOrProcNameError, match="already used by another"): 215 | MyPipe4().run() 216 | 217 | 218 | @pytest.mark.forked 219 | def test_run2(): 220 | class RProc1(Proc): 221 | input = "a" 222 | output = "b:var:{{in.a}}" 223 | 224 | class RProc2(Proc): 225 | requires = RProc1 226 | input = "b" 227 | output = "c:file:{{in.b}}" 228 | script = "touch {{out.c}}" 229 | 230 | assert run("MyPipe", RProc1) 231 | 232 | 233 | @pytest.mark.forked 234 | def test_cloud_workdir_outdir(uid): 235 | class RProc1(Proc): 236 | input = "a" 237 | input_data = [1] 238 | output = "b:file:{{in.a}}.txt" 239 | script = "cloudsh touch {{out.b}}" 240 | 241 | class RProc2(Proc): 242 | requires = RProc1 243 | input = "b:file" 244 | output = "c:file:{{in.b.stem}}2.txt" 245 | script = "echo 123 | cloudsh sink {{out.c}}" 246 | 247 | # make sure multiple tests can run in parallel 248 | # e.g. for python3.9, python3.10, etc. 249 | cloud_dir = AnyPath(f"{BUCKET}/pipen-test/test-pipeline/{uid}") 250 | 251 | assert run( 252 | "MyCloudPipe", 253 | RProc1, 254 | workdir=f"{cloud_dir}/workdir", 255 | outdir=f"{cloud_dir}/outdir", 256 | ) 257 | 258 | cloud_dir.rmtree() 259 | -------------------------------------------------------------------------------- /tests/test_plugin.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pipen import plugin, Pipen, Proc 4 | 5 | from .helpers import OutputNotGeneratedProc, SimpleProc, pipen # noqa: F401 6 | 7 | 8 | class Plugin: 9 | def __init__(self, name): 10 | self.name = name 11 | 12 | @plugin.impl 13 | async def on_complete(self, pipen, succeeded): 14 | print(f"<<<{self.name}>>>") 15 | 16 | 17 | @pytest.mark.forked 18 | def test_job_succeeded(pipen, caplog): 19 | out = pipen.set_starts(OutputNotGeneratedProc).run() 20 | assert not out 21 | 22 | 23 | @pytest.mark.forked 24 | def test_plugin_context_only(tmp_path, capsys): 25 | plugin1 = Plugin("plugin1") 26 | plugin2 = Plugin("plugin2") 27 | plugin3 = Plugin("plugin3") 28 | plugin4 = Plugin("plugin4") 29 | 30 | plugin.register(plugin1, plugin2, plugin3, plugin4) 31 | plugin.get_plugin("plugin4").disable() 32 | 33 | pipeline = Pipen( 34 | name="pipeline_plugin_context_only", 35 | desc="No description", 36 | loglevel="debug", 37 | cache=True, 38 | workdir=tmp_path / ".pipen", 39 | outdir=tmp_path / "pipeline_plugin_context_only", 40 | plugins=["plugin1", "plugin2"], 41 | ) 42 | pipeline.set_starts(SimpleProc).run() 43 | out = capsys.readouterr().out 44 | assert "<<<plugin1>>>" in out 45 | assert "<<<plugin2>>>" in out 46 | assert "<<<plugin3>>>" not in out 47 | assert "<<<plugin4>>>" not in out 48 | 49 | 50 | @pytest.mark.forked 51 | def test_plugin_context_mixed(tmp_path, capsys): 52 | plugin1 = Plugin("plugin1") 53 | plugin2 = Plugin("plugin2") 54 | plugin3 = Plugin("plugin3") 55 | plugin4 = Plugin("plugin4") 56 | 57 | plugin.register(plugin1, plugin2, plugin3, plugin4) 58 | plugin.get_plugin("plugin3").disable() 59 | plugin.get_plugin("plugin4").disable() 60 | 61 | pipeline = Pipen( 62 | name="pipeline_plugin_context_mixed", 63 | desc="No description", 64 | loglevel="debug", 65 | cache=True, 66 | workdir=tmp_path / ".pipen", 67 | outdir=tmp_path / "pipeline_plugin_context_mixed", 68 | plugins=["+plugin3", plugin.get_plugin("plugin4"), "-plugin2"], 69 | ) 70 | pipeline.set_starts(SimpleProc).run() 71 | out = capsys.readouterr().out 72 | assert "<<<plugin1>>>" in
out 73 | assert "<<<plugin2>>>" not in out 74 | assert "<<<plugin3>>>" in out 75 | assert "<<<plugin4>>>" in out 76 | 77 | 78 | @pytest.mark.forked 79 | def test_jobcmd_hooks(pipen): 80 | 81 | @plugin.register 82 | class MyJobCmdPlugin: 83 | @plugin.impl 84 | def on_jobcmd_init(job): 85 | return "# on_jobcmd_init from myjobcmdplugin" 86 | 87 | @plugin.impl 88 | def on_jobcmd_prep(job): 89 | return "# on_jobcmd_prep from myjobcmdplugin" 90 | 91 | @plugin.impl 92 | def on_jobcmd_end(job): 93 | return "# on_jobcmd_end from myjobcmdplugin" 94 | 95 | class MyProc(Proc): 96 | input = "in:var" 97 | input_data = [1] 98 | output = "out:var:{{in.in}}" 99 | script = "echo {{proc.name}}" 100 | 101 | pipen.set_starts(MyProc).run() 102 | assert pipen.run() 103 | 104 | wrapper_script = pipen.workdir / "MyProc" / "0" / "job.wrapped.local" 105 | assert wrapper_script.exists() 106 | content = wrapper_script.read_text() 107 | assert "# on_jobcmd_init from myjobcmdplugin" in content 108 | assert "# on_jobcmd_prep from myjobcmdplugin" in content 109 | assert "# on_jobcmd_end from myjobcmdplugin" in content 110 | -------------------------------------------------------------------------------- /tests/test_proc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | import pandas 5 | from pipen import Proc 6 | from pipen.exceptions import ( 7 | ProcInputKeyError, 8 | ProcInputTypeError, 9 | ProcScriptFileNotFound, 10 | PipenOrProcNameError, 11 | ) 12 | from datar.dplyr import mutate 13 | from .helpers import ( # noqa: F401 14 | In2Out1Proc, 15 | NoInputProc, 16 | NormalProc, 17 | FileInputProc, 18 | RelPathScriptProc, 19 | ScriptNotExistsProc, 20 | SimpleProc, 21 | InputTypeUnsupportedProc, 22 | pipen, 23 | ) 24 | 25 | 26 | @pytest.mark.forked 27 | def test_more_nexts(pipen): 28 | proc1 = Proc.from_proc(NormalProc) 29 | Proc.from_proc(NormalProc, "proc2", requires=proc1) 30 | Proc.from_proc(NormalProc, "proc3", requires=proc1) 31 | ret = pipen.set_starts(proc1).run() 32 | assert ret 33 | 34 | 35 | @pytest.mark.forked 36 | def test_proc_no_input(pipen): 37 | with pytest.raises(ProcInputKeyError): 38 | pipen.set_starts(NoInputProc).run() 39 | 40 | 41 | @pytest.mark.forked 42 | def test_unsupported_input_type(pipen): 43 | with pytest.raises(ProcInputTypeError): 44 | pipen.set_starts(InputTypeUnsupportedProc).run() 45 | 46 | 47 | @pytest.mark.forked 48 | def test_proc_with_input_data(pipen): 49 | proc = Proc.from_proc(NormalProc, input_data=[1]) 50 | pipen.set_starts(proc).run() 51 | assert proc.output_data.equals(pandas.DataFrame({"output": ["1"]})) 52 | 53 | 54 | @pytest.mark.forked 55 | def test_proc_with_symlink_input(pipen, tmp_path): 56 | infile_orig = tmp_path / "a.txt" 57 | infile_orig.write_text("1") 58 | infile_symlink = tmp_path / "b.txt" 59 | infile_symlink.symlink_to(infile_orig) 60 | proc = Proc.from_proc(FileInputProc, input_data=[infile_symlink]) 61 | pipen.set_starts(proc).run() 62 | outfile = proc.output_data["out"].iloc[0] 63 | assert outfile.name == "b.txt" 64 | 65 | 66 | @pytest.mark.forked 67 | def test_proc_with_input_callable(pipen): 68 | proc = Proc.from_proc(NormalProc, input_data=[1]) 69 | proc2 = Proc.from_proc( 70 | NormalProc, requires=proc, input_data=lambda ch: ch >> mutate(output=2) 71 | ) 72 | pipen.set_starts(proc).run() 73 | assert proc2.output_data.equals(pandas.DataFrame({"output": ["2"]})) 74 | 75 | 76 | @pytest.mark.forked 77 | def test_ignore_input_data_of_start_proc(caplog, pipen): 78 | proc = Proc.from_proc(NormalProc,
input_data=[1]) 79 | proc2 = Proc.from_proc(NormalProc, requires=proc, input_data=[2]) 80 | pipen.set_starts(proc).run() 81 | assert "Ignoring input data" in caplog.text 82 | assert proc2.output_data.equals(pandas.DataFrame({"output": ["1"]})) 83 | 84 | 85 | @pytest.mark.forked 86 | def test_proc_wasted_input_columns(caplog, pipen): 87 | proc1 = Proc.from_proc(NormalProc, input_data=[1]) 88 | proc2 = Proc.from_proc(NormalProc, input_data=[1]) 89 | proc3 = Proc.from_proc(NormalProc, requires=[proc1, proc2]) # noqa: F841 90 | pipen.set_starts(proc1, proc2).run() 91 | assert "Wasted 1 column" in caplog.text 92 | 93 | 94 | @pytest.mark.forked 95 | def test_proc_not_enough_input_columns(caplog, pipen): 96 | proc1 = Proc.from_proc(NormalProc, input_data=[1]) 97 | proc2 = Proc.from_proc(In2Out1Proc, requires=proc1) 98 | pipen.set_starts(proc1).run() 99 | assert "No data column for input: ['in2'], using None" in caplog.text 100 | assert proc2.output_data.equals(pandas.DataFrame({"out": ["1_None"]})) 101 | 102 | 103 | @pytest.mark.forked 104 | def test_proc_relative_path_script(pipen): 105 | pipen.set_starts(RelPathScriptProc).run() 106 | script = RelPathScriptProc().script.render() 107 | assert "AbCdEf" in script 108 | 109 | 110 | @pytest.mark.forked 111 | def test_script_file_exists(pipen): 112 | with pytest.raises(ProcScriptFileNotFound): 113 | pipen.set_starts(ScriptNotExistsProc).run() 114 | 115 | 116 | @pytest.mark.forked 117 | def test_cached_run(caplog, pipen): 118 | NormalProc.nexts = [] 119 | # force uncache NormalProc 120 | # shutil.rmtree(pipen.config.workdir) 121 | ret = pipen.set_start(NormalProc).run() 122 | assert ret 123 | 124 | # trigger caching 125 | ret = pipen.set_start(NormalProc).run() 126 | assert ret 127 | 128 | assert caplog.text.count("Cached jobs:") == 1 129 | 130 | 131 | def test_proc_repr(): 132 | assert repr(SimpleProc) == "<Proc:SimpleProc>" 133 | 134 | 135 | def test_from_proc_no_name(): 136 | procs = [None] 137 | with pytest.raises(PipenOrProcNameError): 138 | procs[0] = Proc.from_proc(SimpleProc) 139 | 140 | 141 | def test_from_proc(): 142 | proc = Proc.from_proc( 143 | SimpleProc, 144 | name="new_proc", 145 | desc="new desc", 146 | envs={"a": 1}, 147 | cache=True, 148 | forks=2, 149 | plugin_opts={"p": 1}, 150 | scheduler="sge", 151 | scheduler_opts={"s": 1}, 152 | error_strategy="retry", 153 | num_retries=10, 154 | submission_batch=3, 155 | ) 156 | assert proc.name == "new_proc" 157 | assert proc.desc == "new desc" 158 | assert proc.envs == {"a": 1} 159 | assert proc.cache 160 | assert proc.forks == 2 161 | assert proc.plugin_opts == {"p": 1} 162 | assert proc.scheduler == "sge" 163 | assert proc.scheduler_opts == {"s": 1} 164 | assert proc.error_strategy == "retry" 165 | assert proc.num_retries == 10 166 | assert proc.submission_batch == 3 167 | 168 | 169 | def test_proc_is_singleton(pipen): 170 | pipen.workdir = ".pipen/" 171 | os.makedirs(pipen.workdir, exist_ok=True) 172 | p1 = SimpleProc(pipen) 173 | p2 = SimpleProc(pipen) 174 | assert p1 is p2 175 | 176 | 177 | def test_invalid_name(): 178 | with pytest.raises(PipenOrProcNameError): 179 | Proc.from_proc(SimpleProc, name="a b") 180 | 181 | 182 | def test_inherit_proc_envs(): 183 | class Proc1_1(Proc): 184 | envs_depth = 1 185 | envs = {"a": {"b": 1, "c": 2}} 186 | 187 | class Proc2(Proc1_1): 188 | envs = {"a": {"b": 2}} 189 | 190 | class Proc1_2(Proc): 191 | envs_depth = 2 192 | envs = {"a": {"b": 1, "c": 2}} 193 | 194 | class Proc3(Proc1_2): 195 | envs_depth = 2 196 | envs = {"a": {"b": 3}} 197 | 198 | class Proc1_3(Proc):
199 | envs_depth = 3 200 | envs = {"a": {"b": 1, "c": 2}} 201 | 202 | class Proc4(Proc1_3): 203 | envs = {"a": {"b": 4}} 204 | 205 | Proc5 = Proc.from_proc(Proc1_3, envs={"a": {"b": 5}}) 206 | 207 | assert Proc5.envs == {"a": {"b": 5, "c": 2}} 208 | assert Proc4.envs == {"a": {"b": 4, "c": 2}} 209 | assert Proc3.envs == {"a": {"b": 3, "c": 2}} 210 | assert Proc2.envs == {"a": {"b": 2}} 211 | assert Proc1_1.envs == {"a": {"b": 1, "c": 2}} 212 | assert Proc1_2.envs == {"a": {"b": 1, "c": 2}} 213 | assert Proc1_3.envs == {"a": {"b": 1, "c": 2}} 214 | -------------------------------------------------------------------------------- /tests/test_procgroup.py: -------------------------------------------------------------------------------- 1 | import pytest # noqa: F401 2 | 3 | from pipen import Proc, Pipen 4 | from pipen.procgroup import ProcGroup 5 | 6 | 7 | def test_singleton(): 8 | class PG(ProcGroup): 9 | ... 10 | 11 | assert PG() is PG() 12 | 13 | 14 | def test_option_overrides_defaults(): 15 | class PG(ProcGroup): 16 | DEFAULTS = {"a": 1} 17 | 18 | pg = PG(a=2) 19 | assert pg.opts.a == 2 20 | 21 | 22 | def test_add_proc(): 23 | 24 | class PG(ProcGroup): 25 | ... 26 | 27 | pg = PG() 28 | 29 | @pg.add_proc() 30 | class P1(Proc): 31 | pass 32 | 33 | assert pg.P1 is P1 34 | assert len(pg.procs) == 1 35 | assert pg.procs.P1 is P1 36 | assert pg.starts == [P1] 37 | 38 | 39 | def test_define_proc(): 40 | 41 | class P1(Proc): pass # noqa: E701 42 | class P2(Proc): pass # noqa: E701 43 | class P3(Proc): pass # noqa: E701 44 | 45 | class PG(ProcGroup): 46 | 47 | @ProcGroup.add_proc 48 | def p1(self): 49 | return P1 50 | 51 | @ProcGroup.add_proc 52 | def p2(self): 53 | P2.requires = self.p1 54 | return P2 55 | 56 | @ProcGroup.add_proc 57 | def p3(self): 58 | P3.requires = self.p2 59 | return P3 60 | 61 | pg = PG() 62 | 63 | assert pg.starts == [P1] 64 | 65 | assert pg.p1 is P1 66 | assert pg.p2 is P2 67 | assert pg.p3 is P3 68 | assert pg.procs == {"P1": P1, "P2": P2, "P3": P3} 69 | 70 | 71 | def test_define_proc_wrong_return(): 72 | class PG(ProcGroup): 73 | @ProcGroup.add_proc 74 | def p1(self): 75 | return None 76 | 77 | @ProcGroup.add_proc 78 | def p2(self): 79 | return 1 80 | 81 | with pytest.raises(ValueError): 82 | PG() 83 | 84 | 85 | def test_as_pipen(): 86 | class PG(ProcGroup): 87 | """A pipeline group""" 88 | 89 | pg = PG() 90 | 91 | @pg.add_proc 92 | class P1(Proc): 93 | ... 94 | 95 | p = pg.as_pipen() 96 | assert isinstance(p, Pipen) 97 | assert p.desc == "A pipeline group" 98 | 99 | p = pg.as_pipen(desc="Test desc") 100 | assert p.desc == "Test desc" 101 | 102 | 103 | def test_procgroup_cleared_when_subclassed(): 104 | class PG(ProcGroup): 105 | ... 106 | 107 | pg = PG() 108 | 109 | @pg.add_proc 110 | class P1(Proc): 111 | ... 112 | 113 | assert P1.__meta__["procgroup"] is pg 114 | 115 | class P2(P1): 116 | ... 117 | 118 | assert P2.__meta__["procgroup"] is None 119 | 120 | 121 | def test_name(): 122 | class PG(ProcGroup): 123 | ... 124 | 125 | pg = PG() 126 | assert pg.name == "PG" 127 | 128 | class PG2(ProcGroup): 129 | name = "PG10" 130 | 131 | pg2 = PG2() 132 | assert pg2.name == "PG10" 133 | 134 | 135 | def test_invliad_proc_name(): 136 | class PG(ProcGroup): 137 | ... 138 | 139 | pg = PG() 140 | 141 | with pytest.raises(ValueError, match="Process name `opts` is reserved"): 142 | @pg.add_proc 143 | class opts(Proc): 144 | ... 145 | 146 | 147 | def test_add_proc_directly(): 148 | class P1(Proc): 149 | ... 
150 | 151 | class PG(ProcGroup): 152 | p1 = P1 153 | 154 | pg = PG() 155 | 156 | assert pg.p1 is P1 157 | assert pg.procs == {"P1": P1} 158 | assert pg.starts == [P1] 159 | -------------------------------------------------------------------------------- /tests/test_scheduler.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from unittest.mock import MagicMock 3 | 4 | from yunpath import AnyPath 5 | from pipen.scheduler import ( 6 | get_scheduler, 7 | LocalScheduler, 8 | SgeScheduler, 9 | SshScheduler, 10 | SlurmScheduler, 11 | GbatchScheduler, 12 | NoSuchSchedulerError, 13 | ) 14 | 15 | 16 | def test_get_scheduler(): 17 | 18 | local = get_scheduler("local") 19 | assert local is LocalScheduler 20 | 21 | local = get_scheduler(local) 22 | assert local is LocalScheduler 23 | 24 | sge = get_scheduler("sge") 25 | assert sge is SgeScheduler 26 | 27 | sge = get_scheduler(sge) 28 | assert sge is SgeScheduler 29 | 30 | slurm = get_scheduler("slurm") 31 | assert slurm is SlurmScheduler 32 | 33 | slurm = get_scheduler(slurm) 34 | assert slurm is SlurmScheduler 35 | 36 | ssh = get_scheduler("ssh") 37 | assert ssh is SshScheduler 38 | 39 | ssh = get_scheduler(ssh) 40 | assert ssh is SshScheduler 41 | 42 | gbatch = get_scheduler("gbatch") 43 | assert gbatch is GbatchScheduler 44 | 45 | gbatch = get_scheduler(gbatch) 46 | assert gbatch is GbatchScheduler 47 | 48 | with pytest.raises(NoSuchSchedulerError): 49 | get_scheduler("nosuchscheduler") 50 | 51 | 52 | def test_gbatch_scheduler_init(): 53 | gbatch_sched = get_scheduler("gbatch") 54 | 55 | with pytest.raises( 56 | ValueError, match="'fast_mount' for gbatch scheduler should be in the format" 57 | ): 58 | gbatch_sched( 59 | project="test_project", 60 | location="test_location", 61 | workdir="gs://test-bucket/workdir", 62 | fast_mount="test", 63 | ) 64 | 65 | with pytest.raises( 66 | ValueError, 67 | match="'fast_mount' for gbatch scheduler should be a Google Cloud Storage", 68 | ): 69 | gbatch_sched( 70 | project="test_project", 71 | location="test_location", 72 | workdir="gs://test-bucket/workdir", 73 | fast_mount="file:///tmp/test:/mnt/test", 74 | ) 75 | 76 | gbatch = gbatch_sched( 77 | project="test_project", 78 | location="test_location", 79 | workdir="gs://test-bucket/workdir", 80 | fast_mount="gs://test-bucket/path:/mnt/path", 81 | ) 82 | assert gbatch.project == "test_project" 83 | assert gbatch.location == "test_location" 84 | assert gbatch.config.taskGroups[0].taskSpec.volumes[-1].mountPath == "/mnt/path" 85 | assert ( 86 | gbatch.config.taskGroups[0].taskSpec.volumes[-1].gcs.remotePath 87 | == "test-bucket/path" 88 | ) 89 | 90 | 91 | def test_gbatch_scheduler_post_init_non_gs_outdir(): 92 | gbatch = get_scheduler("gbatch")( 93 | project="test_project", 94 | location="test_location", 95 | workdir="gs://test-bucket/workdir", 96 | ) 97 | pipeline = MagicMock(outdir="/local/outdir") 98 | proc = MagicMock(pipeline=pipeline) 99 | proc.name = "test_proc" 100 | with pytest.raises(ValueError): 101 | gbatch.post_init(proc) 102 | 103 | 104 | def test_gbatch_scheduler_post_init(): 105 | gbatch = get_scheduler("gbatch")( 106 | project="test_project", 107 | location="test_location", 108 | workdir="gs://test-bucket/workdir", 109 | ) 110 | pipeline_outdir = AnyPath("gs://test-bucket/outdir") 111 | pipeline = MagicMock(outdir=pipeline_outdir) 112 | proc = MagicMock(pipeline=pipeline) 113 | proc.name = "test_proc" 114 | gbatch.post_init(proc) 115 | 116 | assert str(gbatch.workdir) == 
"gs://test-bucket/workdir" 117 | assert ( 118 | str(gbatch.workdir.mounted) == f"{GbatchScheduler.MOUNTED_METADIR}/{proc.name}" 119 | ) 120 | assert ( 121 | gbatch.config.taskGroups[0].taskSpec.volumes[-1].mountPath 122 | == f"{GbatchScheduler.MOUNTED_OUTDIR}" 123 | ) 124 | assert ( 125 | gbatch.config.taskGroups[0].taskSpec.volumes[-1].gcs.remotePath 126 | == "test-bucket/outdir" 127 | ) 128 | assert ( 129 | gbatch.config.taskGroups[0].taskSpec.volumes[-2].mountPath 130 | == f"{GbatchScheduler.MOUNTED_METADIR}/{proc.name}" 131 | ) 132 | assert ( 133 | gbatch.config.taskGroups[0].taskSpec.volumes[-2].gcs.remotePath 134 | == "test-bucket/workdir" 135 | ) 136 | -------------------------------------------------------------------------------- /tests/test_template.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pipen.template import get_template_engine 4 | from pipen.template import NoSuchTemplateEngineError 5 | 6 | 7 | def test_update_envs(): 8 | jinja = get_template_engine("jinja2") 9 | jinja2 = get_template_engine(jinja) 10 | assert jinja is jinja2 11 | jinja_tpl = jinja("abc") 12 | assert jinja_tpl.render() == "abc" 13 | 14 | with pytest.raises(NoSuchTemplateEngineError): 15 | get_template_engine("nosuchtemplate") 16 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | import pipen 5 | from yunpath import CloudPath 6 | from pipen.utils import ( 7 | brief_list, 8 | desc_from_docstring, 9 | get_logger, 10 | get_mtime, 11 | get_shebang, 12 | ignore_firstline_dedent, 13 | strsplit, 14 | truncate_text, 15 | update_dict, 16 | mark, 17 | get_marked, 18 | _get_obj_from_spec, 19 | load_pipeline, 20 | path_is_symlink, 21 | path_symlink_to, 22 | ) 23 | from pipen.proc import Proc 24 | from pipen.procgroup import ProcGroup 25 | from pipen.exceptions import ConfigurationError 26 | 27 | from .helpers import BUCKET 28 | 29 | HERE = Path(__file__).parent.resolve() 30 | 31 | 32 | @pytest.mark.forked 33 | def test_get_logger(caplog): 34 | logger = get_logger("test", "info") 35 | logger.debug("debug message") 36 | assert "debug message" not in caplog.text 37 | 38 | 39 | @pytest.mark.forked 40 | def test_brief_list(): 41 | assert brief_list([1]) == "1" 42 | assert brief_list([1, 2, 3]) == "1-3" 43 | 44 | 45 | @pytest.mark.forked 46 | def test_get_mtime_dir(): 47 | package_dir = Path(pipen.__file__).parent 48 | mtime = get_mtime(package_dir, 2) 49 | assert mtime > 0 50 | 51 | 52 | @pytest.mark.forked 53 | def test_get_mtime_symlink_dir(tmp_path): 54 | dir = tmp_path / "dir" 55 | dir.mkdir() 56 | file = dir / "file" 57 | file.touch() 58 | link = tmp_path / "link" 59 | link.symlink_to(dir) 60 | mtime = get_mtime(link, 2) 61 | assert mtime > 0 62 | 63 | 64 | @pytest.mark.forked 65 | def test_get_mtime_cloud_file(): 66 | file = CloudPath(f"{BUCKET}/pipen-test/channel/test1.txt") 67 | mtime = get_mtime(file) 68 | assert mtime > 0 69 | 70 | 71 | @pytest.mark.forked 72 | def test_get_mtime_symlink_to_cloud_dir(tmp_path): 73 | link = tmp_path / "link" 74 | path_symlink_to(link, CloudPath(f"{BUCKET}/pipen-test/channel")) 75 | lmtime = get_mtime(link, 0) 76 | mtime = get_mtime(link) 77 | assert mtime < lmtime 78 | 79 | 80 | @pytest.mark.forked 81 | def test_desc_from_docstring(): 82 | class Base: 83 | ... 
84 | 85 | class Obj1(Base): 86 | """ 87 | 88 | abc 89 | def 90 | 91 | """ 92 | 93 | desc = desc_from_docstring(Obj1, Base) 94 | assert desc == "abc def" 95 | 96 | 97 | @pytest.mark.forked 98 | def test_update_dict(): 99 | assert update_dict(None, None) is None 100 | assert update_dict({}, None) == {} 101 | assert update_dict(None, {}) == {} 102 | assert update_dict({"a": 1}, {"b": 2}) == {"a": 1, "b": 2} 103 | assert update_dict({"a": 1}, {"a": 2}) == {"a": 2} 104 | assert update_dict({"a": {"b": 1}}, {"a": {"c": 2}}) == { 105 | "a": {"b": 1, "c": 2} 106 | } 107 | assert update_dict({"a": {"b": 1}}, {"a": {"c": 2}}, depth=1) == { 108 | "a": {"c": 2} 109 | } 110 | assert update_dict( 111 | {"a": {"b1": {"c": 1, "d": 2}, "b2": {"c": 1, "d": 2}}}, 112 | {"a": {"b1": {"c": 2}}}, 113 | ) == {"a": {"b1": {"c": 2, "d": 2}, "b2": {"c": 1, "d": 2}}} 114 | 115 | assert update_dict( 116 | {"a": {"b1": {"c": 1, "d": 2}, "b2": {"c": 1, "d": 2}}}, 117 | {"a": {"b1": {"c": 2}}}, 118 | depth=2, 119 | ) == {"a": {"b1": {"c": 2}, "b2": {"c": 1, "d": 2}}} 120 | 121 | assert update_dict( 122 | {"a": {"b1": {"c": 1, "d": 2}, "b2": {"c": 1, "d": 2}}}, 123 | {"a": {"b1": {"c": 2}}}, 124 | depth=1, 125 | ) == {"a": {"b1": {"c": 2}}} 126 | 127 | 128 | @pytest.mark.forked 129 | def test_strsplit(): 130 | assert strsplit("a ,b ", ",", trim=None) == ["a ", "b "] 131 | assert strsplit("a , b", ",", trim="left") == ["a ", "b"] 132 | assert strsplit("a , b", ",", trim="right") == ["a", " b"] 133 | 134 | 135 | @pytest.mark.forked 136 | def test_get_shebang(): 137 | assert get_shebang("") is None 138 | assert get_shebang("#!bash") == "bash" 139 | assert get_shebang("#!bash \n") == "bash" 140 | 141 | 142 | @pytest.mark.forked 143 | def test_ignore_firstline_dedent(): 144 | text = """ 145 | 146 | a 147 | """ 148 | assert ignore_firstline_dedent(text) == "a\n" 149 | 150 | 151 | @pytest.mark.forked 152 | def test_truncate_text(): 153 | assert truncate_text("abcd", 2) == "a…" 154 | 155 | 156 | @pytest.mark.forked 157 | def test_mark(): 158 | @mark(a=1) 159 | class P1(pipen.Proc): 160 | ... 161 | 162 | assert get_marked(P1, "a") == 1 163 | 164 | class P2(P1): 165 | ... 166 | 167 | assert get_marked(P2, "a", None) is None 168 | 169 | P3 = pipen.Proc.from_proc(P1) 170 | assert get_marked(P3, "a") is None 171 | 172 | class X: 173 | ... 174 | 175 | assert get_marked(X, "a", None) is None 176 | 177 | @mark(a=1) 178 | class Y: 179 | ... 180 | 181 | assert get_marked(Y, "a") == 1 182 | 183 | class Z(Y): 184 | ... 185 | 186 | # Marks inherited, as Y/Z are not Proc nor ProcGroup 187 | assert get_marked(Z, "a", None) == 1 188 | 189 | 190 | @pytest.mark.forked 191 | def test_get_obj_from_spec(): 192 | with pytest.raises(ValueError): 193 | _get_obj_from_spec("a.b.c") 194 | 195 | obj = _get_obj_from_spec(f"{HERE}/helpers.py:SimpleProc") 196 | assert obj.name == "SimpleProc" 197 | 198 | obj = _get_obj_from_spec("pipen:Pipen") 199 | assert obj is pipen.Pipen 200 | 201 | 202 | @pytest.mark.forked 203 | @pytest.mark.asyncio 204 | async def test_load_pipeline(tmp_path): 205 | with pytest.raises(TypeError): 206 | await load_pipeline(f"{HERE}/helpers.py:create_dead_link") 207 | with pytest.raises(TypeError): 208 | await load_pipeline(ConfigurationError) 209 | 210 | # Proc 211 | pipeline = await load_pipeline(f"{HERE}/helpers.py:SimpleProc") 212 | assert pipeline.name == "SimpleProcPipeline" 213 | 214 | # ProcGroup 215 | class PG(ProcGroup): 216 | ... 
217 | 218 | pg = PG() 219 | 220 | @pg.add_proc() 221 | class P1(Proc): 222 | pass 223 | 224 | pipeline = await load_pipeline(PG) 225 | assert pipeline.name == "PG" 226 | 227 | pipeline = await load_pipeline(f"{HERE}/helpers.py:PipenIsLoading") 228 | assert pipeline.name == "PipenIsLoading" 229 | assert pipeline.starts[0].name == "SimpleProc" 230 | assert len(pipeline.procs) == 1 231 | 232 | 233 | @pytest.mark.forked 234 | @pytest.mark.asyncio 235 | async def test_load_pipeline_pipen_object(tmp_path): 236 | p = await load_pipeline(f"{HERE}/helpers.py:pipeline", a=1) 237 | assert p._kwargs["a"] == 1 238 | 239 | 240 | @pytest.mark.forked 241 | # To avoid: Another plugin named simpleplugin has already been registered. 242 | @pytest.mark.asyncio 243 | async def test_is_load_pipeline_with_help(tmp_path): 244 | pipeline = await load_pipeline( 245 | f"{HERE}/helpers.py:PipenIsLoading", 246 | "_", # not @pipen 247 | ["--help"], 248 | ) 249 | assert pipeline.name == "PipenIsLoading" 250 | assert pipeline.starts[0].name == "SimpleProc" 251 | assert len(pipeline.procs) == 1 252 | 253 | 254 | def test_path_is_symlink(tmp_path): 255 | link = tmp_path / "link" 256 | path_symlink_to(link, tmp_path / "target") 257 | assert path_is_symlink(link) 258 | 259 | fake_symlink = tmp_path / "fake_symlink" 260 | path_symlink_to(fake_symlink, CloudPath(f"{BUCKET}/target")) 261 | assert path_is_symlink(fake_symlink) 262 | 263 | nonexist_file = tmp_path / "nonexist" 264 | assert not path_is_symlink(nonexist_file) 265 | 266 | dir = tmp_path / "dir" 267 | dir.mkdir() 268 | assert not path_is_symlink(dir) 269 | -------------------------------------------------------------------------------- /tests/test_xqute_pars.py: -------------------------------------------------------------------------------- 1 | """Test parameters for xqute""" 2 | from pipen.proc import Proc 3 | import pytest 4 | 5 | import time 6 | from .helpers import RetryProc, pipen # noqa: F401 7 | 8 | 9 | @pytest.mark.forked 10 | def test_retry(caplog, pipen): # noqa: F811 11 | proc = Proc.from_proc(RetryProc) 12 | rc = pipen.set_starts(proc).set_data([time.time()]).run() 13 | assert "Retrying" in caplog.text 14 | assert rc 15 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, W503 3 | max-line-length = 88 4 | per-file-ignores = 5 | # imported but unused 6 | __init__.py: F401 7 | pipen/utils.py: F401 8 | tests/*: F811 9 | --------------------------------------------------------------------------------