├── .codesandbox
│   └── Dockerfile
├── .coveragerc
├── .github
│   └── workflows
│       ├── build.yml
│       └── docs.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── docs
│   ├── CHANGELOG.md
│   ├── basics.md
│   ├── caching.md
│   ├── channel-collapse_files.png
│   ├── channel-expand_dir.png
│   ├── channels.md
│   ├── cli.md
│   ├── cloud.md
│   ├── configurations.md
│   ├── defining-proc.md
│   ├── error.md
│   ├── examples.md
│   ├── input-output.md
│   ├── layers.png
│   ├── pipen-cloud1.png
│   ├── pipen-cloud2.png
│   ├── plugin.md
│   ├── proc-group.md
│   ├── requirements.txt
│   ├── running.md
│   ├── scheduler.md
│   ├── script.md
│   ├── style.css
│   └── templating.md
├── examples
│   ├── caching.py
│   ├── cloudwdir.py
│   ├── example.py
│   ├── gbatch.py
│   ├── input_data_callback.py
│   ├── mako-templating.py
│   ├── multijobs.py
│   ├── plugin-example.py
│   ├── python-script.py
│   └── retry.py
├── mkdocs.yml
├── pipen.png
├── pipen
│   ├── __init__.py
│   ├── __main__.py
│   ├── _job_caching.py
│   ├── channel.py
│   ├── cli
│   │   ├── __init__.py
│   │   ├── _hooks.py
│   │   ├── _main.py
│   │   ├── help.py
│   │   ├── plugins.py
│   │   ├── profile.py
│   │   └── version.py
│   ├── defaults.py
│   ├── exceptions.py
│   ├── job.py
│   ├── pipen.py
│   ├── pluginmgr.py
│   ├── proc.py
│   ├── procgroup.py
│   ├── progressbar.py
│   ├── py.typed
│   ├── scheduler.py
│   ├── template.py
│   ├── utils.py
│   └── version.py
├── poetry.lock
├── pyproject.toml
├── setup.py
├── tests
│   ├── __init__.py
│   ├── conftest.py
│   ├── helpers.py
│   ├── test_channel.py
│   ├── test_cli.py
│   ├── test_job.py
│   ├── test_pipen.py
│   ├── test_plugin.py
│   ├── test_proc.py
│   ├── test_procgroup.py
│   ├── test_scheduler.py
│   ├── test_template.py
│   ├── test_utils.py
│   └── test_xqute_pars.py
└── tox.ini
/.codesandbox/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.10.12
2 |
3 | RUN apt-get update && apt-get install -y fish && \
4 | pip install -U pip && \
5 | pip install poetry && \
6 | poetry config virtualenvs.create false && \
7 | chsh -s /usr/bin/fish
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit =
3 | # need plugins to be installed to test
4 | pipen/cli/plugins.py
5 | tests/*
6 | setup.py
7 |
8 | [report]
9 | exclude_lines =
10 | if TYPE_CHECKING:
11 | pragma: no cover
12 |
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: Build and Deploy
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 |
7 | build:
8 | runs-on: ubuntu-24.04
9 | if: "! contains(github.event.head_commit.message, 'wip') && ! startsWith(github.ref, 'refs/tags')"
10 | strategy:
11 | matrix:
12 | # python-version: [3.8, 3.9, "3.10"]
13 | python-version: [3.9, "3.10", "3.11", "3.12"]
14 |
15 | steps:
16 | - uses: actions/checkout@v4
17 | - name: Setup Python # Set Python version
18 | uses: actions/setup-python@v5
19 | with:
20 | python-version: ${{ matrix.python-version }}
21 | - name: Install dependencies
22 | run: |
23 | python -m pip install --upgrade pip
24 | python -m pip install poetry
25 | poetry config virtualenvs.create false
26 | poetry install -v --with dev
27 | - name: Run flake8
28 | run: flake8 pipen
29 | - uses: 'google-github-actions/auth@v2'
30 | with:
31 | credentials_json: ${{ secrets.GCP_SA_KEY }}
32 | - name: Test with pytest
33 | run: pytest tests/ --junitxml=junit/test-results-${{ matrix.python-version }}.xml
34 | - name: Upload pytest test results
35 | uses: actions/upload-artifact@v4
36 | with:
37 | name: pytest-results-${{ matrix.python-version }}
38 | path: junit/test-results-${{ matrix.python-version }}.xml
39 | # Use always() to always run this step to publish test results when there are test failures
40 | if: ${{ always() }}
41 | - name: Run codacy-coverage-reporter
42 | uses: codacy/codacy-coverage-reporter-action@master
43 | if: matrix.python-version == 3.10
44 | with:
45 | project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
46 | coverage-reports: .coverage.xml
47 |
48 | deploy:
49 | # needs: build
50 | runs-on: ubuntu-24.04
51 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
52 | strategy:
53 | matrix:
54 | python-version: ["3.10"]
55 | steps:
56 | - uses: actions/checkout@v4
57 | - name: Setup Python # Set Python version
58 | uses: actions/setup-python@v5
59 | - name: Install dependencies
60 | run: |
61 | python -m pip install --upgrade pip
62 | python -m pip install poetry
63 | - name: Publish to PyPI
64 | run: poetry publish --build -u ${{ secrets.PYPI_USER }} -p ${{ secrets.PYPI_PASSWORD }}
65 | if: success()
66 |
--------------------------------------------------------------------------------
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
1 | name: Build Docs
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | docs:
7 | runs-on: ubuntu-24.04
8 | # if: github.ref == 'refs/heads/master'
9 | strategy:
10 | matrix:
11 | python-version: ["3.10"]
12 | steps:
13 | - uses: actions/checkout@v4
14 | - name: Setup Python # Set Python version
15 | uses: actions/setup-python@v5
16 | with:
17 | python-version: ${{ matrix.python-version }}
18 | - name: Install dependencies
19 | run: |
20 | python -m pip install --upgrade pip
21 | python -m pip install poetry
22 | poetry config virtualenvs.create false
23 | poetry install -v
24 | - name: Update docs
25 | run: |
26 | python -m pip install mkdocs
27 | python -m pip install -r docs/requirements.txt
28 | cd docs
29 | cp ../README.md index.md
30 | cp ../pipen.png pipen.png
31 | cd ..
32 | mkdocs gh-deploy --clean --force
33 | if: success()
34 |
35 | # fix-index:
36 | # needs: docs
37 | # runs-on: ubuntu-latest
38 | # strategy:
39 | # matrix:
40 | # python-version: ["3.10"]
41 | # steps:
42 | # - uses: actions/checkout@v3
43 | # with:
44 | # ref: gh-pages
45 | # - name: Fix index.html
46 | # run: |
47 | # echo ':: head of index.html - before ::'
48 | # head index.html
49 | # sed -i '1,5{/^$/d}' index.html
50 | # echo ':: head of index.html - after ::'
51 | # head index.html
52 | # if: success()
53 | # - name: Commit changes
54 | # run: |
55 | # git config --local user.email "action@github.com"
56 | # git config --local user.name "GitHub Action"
57 | # git commit -m "Add changes" -a
58 | # if: success()
59 | # - name: Push changes
60 | # uses: ad-m/github-push-action@master
61 | # with:
62 | # github_token: ${{ secrets.GITHUB_TOKEN }}
63 | # branch: gh-pages
64 | # if: success()
65 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | .coverage.xml
46 | cov.xml
47 | *,cover
48 | .hypothesis/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 |
58 | # Flask stuff:
59 | instance/
60 | .webassets-cache
61 |
62 | # Scrapy stuff:
63 | .scrapy
64 |
65 | # Sphinx documentation
66 | docs/_build/
67 |
68 | # PyBuilder
69 | target/
70 |
71 | # IPython Notebook
72 | .ipynb_checkpoints
73 |
74 | # pyenv
75 | .python-version
76 |
77 | # celery beat schedule file
78 | celerybeat-schedule
79 |
80 | # dotenv
81 | .env
82 |
83 | # virtualenv
84 | venv/
85 | ENV/
86 |
87 | # Spyder project settings
88 | .spyderproject
89 |
90 | # Rope project settings
91 | .ropeproject
92 |
93 | workdir/
94 | node_modules/
95 | _book/
96 | .vscode
97 | export/
98 | *.svg
99 | *.dot
100 | *.queue.txt
101 | site/
102 |
103 | # poetry
104 | # poetry.lock
105 |
106 | # backup files
107 | *.bak
108 |
109 | .history/
110 | .xqute/
111 | .pipen/
112 | t-*.ipynb
113 | *-output/
114 | *_results/
115 | t.py
116 |
117 | nohup.out
118 | test.py
119 | test.ipynb
120 | gac_key.json
121 | examples/.pipen.toml
122 | docs/api/
123 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | fail_fast: false
4 | exclude: '^README.rst$|^tests/|^setup.py$|^examples/|^docs/'
5 | repos:
6 | - repo: https://github.com/pre-commit/pre-commit-hooks
7 | rev: 5df1a4bf6f04a1ed3a643167b38d502575e29aef
8 | hooks:
9 | - id: trailing-whitespace
10 | - id: end-of-file-fixer
11 | - id: check-yaml
12 | - id: check-added-large-files
13 | - repo: local
14 | hooks:
15 | - id: versionchecker
16 | name: Check version agreement in pyproject and __version__
17 | entry: bash -c
18 | language: system
19 | args:
20 | - get_ver() { echo $(egrep "^__version|^version" $1 | cut -d= -f2 | sed 's/\"\| //g'); };
21 | v1=`get_ver pyproject.toml`;
22 | v2=`get_ver pipen/version.py`;
23 | if [[ $v1 == $v2 ]]; then exit 0; else exit 1; fi
24 | pass_filenames: false
25 | files: ^pyproject\.toml|pipen/version\.py$
26 | - id: mypy
27 | name: Run mypy type check
28 | entry: mypy
29 | language: system
30 | args: ["-p", "pipen"]
31 | pass_filenames: false
32 | always_run: true
33 | files: ^pipen/.+$
34 | - id: pytest
35 | name: Run pytest
36 | entry: pytest
37 | language: system
38 | args: [tests/]
39 | pass_filenames: false
40 | files: ^tests/.+$|^pipen/.+$
41 | - id: flake8
42 | name: Run flake8
43 | entry: flake8
44 | language: system
45 | args: [pipen]
46 | pass_filenames: false
47 | files: ^pipen/.+$
48 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
7 |
8 | ______________________________________________________________________
9 |
10 | [![Pypi][6]][7] [![Github][8]][9] ![Building][10] [![Docs and API][11]][1] [![Codacy][12]][13] [![Codacy coverage][14]][13] [![Deps][5]][23]
11 |
12 | [Documentation][1] | [ChangeLog][2] | [Examples][3] | [API][4]
13 |
14 | ## Features
15 |
16 | - Easy to use
17 | - Nearly zero-configuration
18 | - Nice logging
19 | - Highly extendable
20 | - Native cloud support
21 |
22 | ## Installation
23 |
24 | ```bash
25 | pip install -U pipen
26 | ```
27 |
28 | ## Quickstart
29 |
30 | `example.py`
31 |
32 | ```python
33 | from pipen import Proc, Pipen, run
34 |
35 | class P1(Proc):
36 | """Sort input file"""
37 | input = "infile"
38 | input_data = ["/tmp/data.txt"]
39 | output = "outfile:file:intermediate.txt"
40 | script = "cat {{in.infile}} | sort > {{out.outfile}}"
41 |
42 | class P2(Proc):
43 | """Paste line number"""
44 | requires = P1
45 | input = "infile:file"
46 | output = "outfile:file:result.txt"
47 | script = "paste <(seq 1 3) {{in.infile}} > {{out.outfile}}"
48 |
49 | # class MyPipeline(Pipen):
50 | # starts = P1
51 |
52 | if __name__ == "__main__":
53 | # MyPipeline().run()
54 | run("MyPipeline", starts=P1)
55 | ```
56 |
57 | ```shell
58 | > echo -e "3\n2\n1" > /tmp/data.txt
59 | > python example.py
60 | ```
61 |
62 | ```log
63 | 04-17 16:19:35 I core _____________________________________ __
64 | 04-17 16:19:35 I core ___ __ \___ _/__ __ \__ ____/__ | / /
65 | 04-17 16:19:35 I core __ /_/ /__ / __ /_/ /_ __/ __ |/ /
66 | 04-17 16:19:35 I core _ ____/__/ / _ ____/_ /___ _ /| /
67 | 04-17 16:19:35 I core /_/ /___/ /_/ /_____/ /_/ |_/
68 | 04-17 16:19:35 I core
69 | 04-17 16:19:35 I core version: 0.17.3
70 | 04-17 16:19:35 I core
71 | 04-17 16:19:35 I core ╔═══════════════════════════ MYPIPELINE ════════════════════════════╗
72 | 04-17 16:19:35 I core ║ My pipeline ║
73 | 04-17 16:19:35 I core ╚═══════════════════════════════════════════════════════════════════╝
74 | 04-17 16:19:35 I core plugins : verbose v0.14.1
75 | 04-17 16:19:35 I core # procs : 2
76 | 04-17 16:19:35 I core profile : default
77 | 04-17 16:19:35 I core outdir :
78 | /home/pwwang/github/pipen/examples/MyPipeline-output
79 | 04-17 16:19:35 I core cache : True
80 | 04-17 16:19:35 I core dirsig : 1
81 | 04-17 16:19:35 I core error_strategy : ignore
82 | 04-17 16:19:35 I core forks : 1
83 | 04-17 16:19:35 I core lang : bash
84 | 04-17 16:19:35 I core loglevel : info
85 | 04-17 16:19:35 I core num_retries : 3
86 | 04-17 16:19:35 I core scheduler : local
87 | 04-17 16:19:35 I core submission_batch: 8
88 | 04-17 16:19:35 I core template : liquid
89 | 04-17 16:19:35 I core workdir :
90 | /home/pwwang/github/pipen/examples/.pipen/MyPipeline
91 | 04-17 16:19:35 I core plugin_opts :
92 | 04-17 16:19:35 I core template_opts : filters={'realpath': ...}
104 | 04-17 16:19:36 I core P1: >>> ['P2']
105 | 04-17 16:19:36 I verbose P1: in.infile: /tmp/data.txt
106 | 04-17 16:19:36 I verbose P1: out.outfile:
107 | /home/pwwang/github/pipen/examples/.pipen/MyPipeline/P1/0/output/intermediate
108 | .txt
109 | 04-17 16:19:38 I verbose P1: Time elapsed: 00:00:02.051s
110 | 04-17 16:19:38 I core
111 | 04-17 16:19:38 I core ╭═══════════════════════════════ P2 ════════════════════════════════╮
112 | 04-17 16:19:38 I core ║ Paste line number ║
113 | 04-17 16:19:38 I core ╰═══════════════════════════════════════════════════════════════════╯
114 | 04-17 16:19:38 I core P2: Workdir:
115 | '/home/pwwang/github/pipen/examples/.pipen/MyPipeline/P2'
116 | 04-17 16:19:38 I core P2: <<< ['P1']
117 | 04-17 16:19:38 I core P2: >>> [END]
118 | 04-17 16:19:38 I verbose P2: in.infile:
119 | /home/pwwang/github/pipen/examples/.pipen/MyPipeline/P1/0/output/intermediate
120 | .txt
121 | 04-17 16:19:38 I verbose P2: out.outfile:
122 | /home/pwwang/github/pipen/examples/MyPipeline-output/P2/result.txt
123 | 04-17 16:19:41 I verbose P2: Time elapsed: 00:00:02.051s
124 | 04-17 16:19:41 I core
125 |
126 |
127 | MYPIPELINE: 100%|██████████████████████████████| 2/2 [00:06<00:00, 0.35 procs/s]
128 | ```
129 |
130 | ```shell
131 | > cat ./MyPipeline-output/P2/result.txt
132 | 1 1
133 | 2 2
134 | 3 3
135 | ```
136 |
137 | ## Examples
138 |
139 | See more examples in `examples/` and a more real-world example at:
140 |
141 |
142 |
143 | ## Plugin gallery
144 |
145 | Plugins make `pipen` even better.
146 |
147 | - [`pipen-annotate`][26]: Use docstring to annotate pipen processes
148 | - [`pipen-args`][19]: Command line argument parser for pipen
149 | - [`pipen-board`][27]: Visualize configuration and running of pipen pipelines on the web
150 | - [`pipen-diagram`][18]: Draw pipeline diagrams for pipen
151 | - [`pipen-dry`][20]: Dry runner for pipen pipelines
152 | - [`pipen-filters`][17]: Add a set of useful filters for pipen templates.
153 | - [`pipen-lock`][25]: Process lock for pipen to prevent multiple runs at the same time.
154 | - [`pipen-log2file`][28]: Save running logs to file for pipen
155 | - [`pipen-poplog`][30]: Populate logs from jobs to running log of the pipeline
156 | - [`pipen-report`][16]: Generate report for pipen
157 | - [`pipen-runinfo`][29]: Save running information to file for pipen
158 | - [`pipen-verbose`][15]: Add verbose information to logs for pipen.
159 | - [`pipen-gcs`][32]: A plugin for pipen to handle files in Google Cloud Storage.
160 | - [`pipen-cli-init`][21]: A pipen CLI plugin to create a pipen project (pipeline)
161 | - [`pipen-cli-ref`][31]: Make reference documentation for processes
162 | - [`pipen-cli-require`][24]: A pipen CLI plugin to check the requirements of a pipeline
163 | - [`pipen-cli-run`][22]: A pipen cli plugin to run a process or a pipeline
164 |
165 | [1]: https://pwwang.github.io/pipen
166 | [2]: https://pwwang.github.io/pipen/CHANGELOG
167 | [3]: https://pwwang.github.io/pipen/examples
168 | [4]: https://pwwang.github.io/pipen/api/pipen
169 | [5]: https://img.shields.io/librariesio/release/pypi/pipen?style=flat-square
170 | [6]: https://img.shields.io/pypi/v/pipen?style=flat-square
171 | [7]: https://pypi.org/project/pipen/
172 | [8]: https://img.shields.io/github/v/tag/pwwang/pipen?style=flat-square
173 | [9]: https://github.com/pwwang/pipen
174 | [10]: https://img.shields.io/github/actions/workflow/status/pwwang/pipen/build.yml?style=flat-square
175 | [11]: https://img.shields.io/github/actions/workflow/status/pwwang/pipen/docs.yml?label=docs&style=flat-square
176 | [12]: https://img.shields.io/codacy/grade/cf1c6c97e5c4480386a05b42dec10c6e?style=flat-square
177 | [13]: https://app.codacy.com/gh/pwwang/pipen
178 | [14]: https://img.shields.io/codacy/coverage/cf1c6c97e5c4480386a05b42dec10c6e?style=flat-square
179 | [15]: https://github.com/pwwang/pipen-verbose
180 | [16]: https://github.com/pwwang/pipen-report
181 | [17]: https://github.com/pwwang/pipen-filters
182 | [18]: https://github.com/pwwang/pipen-diagram
183 | [19]: https://github.com/pwwang/pipen-args
184 | [20]: https://github.com/pwwang/pipen-dry
185 | [21]: https://github.com/pwwang/pipen-cli-init
186 | [22]: https://github.com/pwwang/pipen-cli-run
187 | [23]: https://libraries.io/github/pwwang/pipen#repository_dependencies
188 | [24]: https://github.com/pwwang/pipen-cli-require
189 | [25]: https://github.com/pwwang/pipen-lock
190 | [26]: https://github.com/pwwang/pipen-annotate
191 | [27]: https://github.com/pwwang/pipen-board
192 | [28]: https://github.com/pwwang/pipen-log2file
193 | [29]: https://github.com/pwwang/pipen-runinfo
194 | [30]: https://github.com/pwwang/pipen-poplog
195 | [31]: https://github.com/pwwang/pipen-cli-ref
196 | [32]: https://github.com/pwwang/pipen-gcs
197 |
--------------------------------------------------------------------------------
/docs/basics.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Layers of a pipeline
4 |
5 | ![Layers of a pipeline](layers.png)
6 |
7 | A pipeline consists of channels and processes. A process may have many jobs. Each job uses the corresponding row of the process's input channel (a dataframe) and generates values for the output channel.
8 | All you need to do is specify the first input channel and tell `pipen` the dependencies between the processes. Later processes will use the output channels of the processes they depend on. You can also modify an output channel with a function so that it matches the input of the next process.
9 |
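   | For example, a minimal two-process sketch (the process names and data are made up):
   |
   | ```python
   | from pipen import Pipen, Proc
   |
   | class A(Proc):
   |     input = "x"
   |     input_data = [1, 2, 3]   # the first input channel: one job per row
   |     output = "y:{{in.x}}"
   |
   | class B(Proc):
   |     requires = A             # B's input channel comes from A's output channel
   |     input = "y"
   |     output = "z:{{in.y}}"
   |
   | Pipen().run(A)
   | ```
   |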
10 | ## Folder structure
11 | ```
12 | ./
13 | |- pipeline.py
14 | `- /
15 | `- /
16 | |- proc.name
17 | `- /
18 | |- input/
19 | |- output/
20 | |- job.signature.toml
21 | |- job.script
22 | |- job.rc
23 | |- job.stdout
24 | |- job.stderr
25 | |- job.status
26 | `- job.wrapped.
27 | ```
28 |
29 | | Path | Content | Memo |
30 | |------|---------|------|
31 | |`<workdir>`|Where the pipeline directories of all processes of the current pipeline are located.|Can be set by `workdir`|
32 | |`<pipeline-name>`|The slugified name of the pipeline.||
33 | |`<job.index>/`|The job directory|Starts with `0`|
34 | |`<job.index>/output/`|Where you can find all the output files|If this is an end process, it is a link to the output directory of this process of the pipeline|
35 | |`<job.index>/job.signature.toml`|The signature file of the job, used to check whether the job is cached||
36 | |`<job.index>/job.script`|The rendered script file||
37 | |`<job.index>/job.rc`|The file containing the return code||
38 | |`<job.index>/job.stdout`|The STDOUT of the script||
39 | |`<job.index>/job.stderr`|The STDERR of the script||
40 | |`<job.index>/job.status`|The status of the job||
41 | |`<job.index>/job.wrapped.<scheduler>`|The wrapper for the scheduler to wrap the script||
42 |
43 |
--------------------------------------------------------------------------------
/docs/caching.md:
--------------------------------------------------------------------------------
1 |
2 | ## Job caching
3 |
4 | If `cache` is set to `False` (resolved in the order of configuration files, the `Pipen` constructor, and the process definition), the job runs regardless of previous runs.
5 |
6 | If a previous run of a job failed, the job will run again anyway.
7 |
8 | If a job finishes successfully, a signature file is generated for it. When we try to run the job again, the signature is used to check whether we can skip running it and reuse the results generated by the previous run.
9 |
10 | We can also force-cache a job by setting `cache` to `"force"`. This reuses the results of the previous successful run regardless of input or script changes. It is useful when, for example, you have made some changes to the input/script but don't want them to take effect immediately, especially when the job takes a long time to run.
11 |
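   | A minimal sketch (the process and its script are hypothetical):
   |
   | ```python
   | from pipen import Proc
   |
   | class HeavyStep(Proc):
   |     """A long-running step whose previous results we want to keep"""
   |     cache = "force"  # reuse the last successful results even if input/script changed
   |     input = "infile:file"
   |     output = "outfile:file:out.txt"
   |     script = "long_running_tool {{in.infile}} > {{out.outfile}}"
   | ```
   |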
12 | ## Job signature
13 |
14 | The signature of a job consists of the input types and data, the output types and data, and the latest time (`latest_time`) at which any of the files/directories from the script, input or output were generated/modified. So these situations will make the job-cache check fail (the job will start over):
15 |
16 | 1. Any changes in `input` or `output` types
17 | 2. Any changes in `input` or `output` data
18 | 3. Any changes to `script`
19 | 4. Any touches to input files (since they will make the last modified time > `latest_time`)
20 | 5. Any touches to input directories
21 | - Use `dirsig` as the depth to check the files under the directories
22 | - If it is `0`, only the directories themselves are checked. Note that modifying a file inside a directory may not change the last modified time of the directory itself.
23 | 6. Any deletions to the output files/directories
24 | Note that only the files/directories specified by `output` are checked. Files or subdirectories in the output directories will NOT be checked.
25 |
--------------------------------------------------------------------------------
/docs/channel-collapse_files.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pwwang/pipen/0f99f02de29d15bf8426805a74ce9bca99bdcc03/docs/channel-collapse_files.png
--------------------------------------------------------------------------------
/docs/channel-expand_dir.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pwwang/pipen/0f99f02de29d15bf8426805a74ce9bca99bdcc03/docs/channel-expand_dir.png
--------------------------------------------------------------------------------
/docs/channels.md:
--------------------------------------------------------------------------------
1 |
2 | Channels are used to pass data from one process to another. It is actually a `pandas.DataFrame` object, where each column corresponds to an input key and each row corresponds to a job.
3 |
4 |
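   | For example, assume we have a data frame like the following (a sketch; any pandas data frame works):
   |
   | ```python
   | from pandas import DataFrame
   |
   | df = DataFrame({
   |     "v1": ["a1", "a2"],
   |     "v2": ["b1", "b2"],
   |     "v3": ["c1", "c2"],
   | })
   | ```
   |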
5 | The values for different variables in different jobs will be:
6 |
7 | | Job Index | v1 | v2 | v3 |
8 | |-----------|----|-----|----|
9 | | 0 | a1 | b1 | c1 |
10 | | 1 | a2 | b2 | c2 |
11 | | ... |... | ... |... |
12 |
13 | With a process definition:
14 |
15 | ```python
16 | class MyProcess(Proc):
17 | input = "v1, v2, v3"
18 | input_data = df # The above data frame
19 | ```
20 |
21 | Then:
22 |
23 | |Job index|Template|Rendered to|
24 | |-|-|-|
25 | |0|`{{in.v1}}`|`a1`|
26 | |0|`{{in.v2}}`|`b1`|
27 | |0|`{{in.v3}}`|`c1`|
28 | |1|`{{in.v1}}`|`a2`|
29 | |1|`{{in.v2}}`|`b2`|
30 | |1|`{{in.v3}}`|`c2`|
31 | |...|...|...|
32 |
33 | The column names don't have to match the input keys exactly. If `pipen` finds any of the input keys in the data, it just uses those columns. However, for input keys that cannot be found in the data frame, the first columns of the data are used, in order.
34 |
35 | For example:
36 | ```python
37 | class MyProcess2(Proc):
38 | input = "v4, v3"
39 | input_data = df # The above data frame
40 | ```
41 |
42 | Then for job #0, `{{in.v4}}` will be rendered as `a1` (using column `v1` of the data), and `{{in.v3}}` as `c1` (using column `v3`).
43 |
44 |
45 | ## Creating channels
46 |
47 | Since channels are just data frames, whatever creates a pandas data frame can be used to create a channel. Besides, a couple of class methods are available to create channels:
48 |
49 | - `Channel.create(...)`
50 |
51 | This takes a list of values to create a channel. If a data frame is passed, that data frame is returned.
52 |
53 | If each element in the list is a tuple, the list is used to create a data frame directly, just like:
54 |
55 | ```python
56 | from pandas import DataFrame
57 | ch = Channel.create([(1,2), (3,4)])
58 | # ch = DataFrame([(1,2), (3,4)])
59 | #
60 | #    0  1
61 | # 0  1  2
62 | # 1  3  4
63 | ```
64 |
65 | If an element is not a tuple (even if it is a list), it is converted into a tuple:
66 | ```python
67 | ch = Channel.create([1, 2])
68 | # equivalent to:
69 | # ch = Channel.create([(1, ), (2, )])
70 | ```
71 |
72 | The `input_data` is passed to this class method to create the input channel.
73 |
74 | - `Channel.from_glob(...)`
75 |
76 | This takes a glob pattern to match files and creates a single-column channel (see the sketch after this list).
77 |
78 | You can also filter the types of files by `ftype`:
79 | - `any`: to match any files (default)
80 | - `link`: to match any links
81 | - `dir`: to match any directories
82 | - `file`: to match any files
83 |
84 | You may also sort the files using `sortby`:
85 | - `name`: sort the files by their basename (default)
86 | - `mtime`: sort the files by their last modified time
87 | - `size`: sort by file size
88 |
89 | When `reverse` is True, the above sortings are reversed.
90 |
91 | - `Channel.from_pairs(...)`
92 |
93 | Like `Channel.from_glob()` but creates a double-column channel.
94 |
95 | - `Channel.from_csv(...)`
96 |
97 | Uses `pandas.read_csv()` to create a channel
98 |
99 | - `Channel.from_excel(...)`
100 |
101 | Uses `pandas.read_excel()` to create a channel
102 |
103 | - `Channel.from_table(...)`
104 |
105 | Uses `pandas.read_table()` to create a channel
106 |
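   | For instance, a minimal `Channel.from_glob()` sketch (the `data/*.txt` pattern is made up):
   |
   | ```python
   | from pipen.channel import Channel
   |
   | # a single-column channel of regular *.txt files, newest first
   | ch = Channel.from_glob("data/*.txt", ftype="file", sortby="mtime", reverse=True)
   | ```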
107 |
108 | ## Builtin verbs/functions to transform channels
109 |
110 | `pipen` uses [`pipda`][1] to create some verbs/functions to transform channels, so that you can use them with piping syntax:
111 |
112 | ```python
113 | channel >> verb(...)
114 | ```
115 |
116 | ### Expanding a channel by directory: `expand_dir()`
117 |
118 | Sometimes we prepare files in one process (for example, split a big file into small ones in a directory), then handle these files by different jobs in another process, so that they can be processed simultaneously.
119 |
120 | ![channel-expand_dir](channel-expand_dir.png)
121 |
122 | For example:
123 | ```python
124 |
125 | class P1(Proc):
126 | # the original file: a.txt
127 | input = "infile:file"
128 | input_data = ["a.txt"]
129 | output = "outdir:dir:outdir"
130 | script = "# the script to split a.txt to 1.txt, 2.txt, 3.txt ... to {{out.outdir}}"
131 |
132 | class P2(Proc):
133 | requires = P1
134 | # expand channel [("outdir/a/",)] to channel:
135 | # [("outdir/a/1.txt",), ("outdir/a/2.txt",), ("outdir/a/3.txt",), ...]
136 | input = "infile:file"
137 | input_data = lambda ch: ch >> expand_dir(pattern="*.txt")
138 | # outfile: 1.result, 2.result, ...
139 | output = "outfile:file:{{in.infile.split('/')[-1].split('.')[0]}}.result"
140 | script = """
141 | # work on {{in.infile}} (1.txt, 2.txt, 3.txt, ...)
142 | # to result file {{out.outfile}} (1.result, 2.result, 3.result, ...)
143 | """
144 |
145 | # Run 3 jobs in a batch simultaneously
146 | Pipen(forks=3).run(P1)
147 | ```
148 |
149 | If the channel is a multi-column channel, you can also specify `col` to expand only on that column; values of the other columns will be copied to the expanded rows/jobs.
150 |
151 | You can also filter and sort the expanded files using arguments `ftype`, `sortby` and `reverse`, just like when we use `Channel.from_glob(...)`
152 |
153 | !!! caution
154 |
155 | - `expand_dir(...)` only works on single-row channels, which will be expanded to `N` rows (the number of files matched). If the original channel has more than one row, only the first row is used and the other rows are ignored.
156 | - Only the value of the column to be expanded will be changed, values of other columns remain the same.
157 |
158 | ### Collapsing a channel by files in a common ancestor directory: `collapse_files(...)`
159 |
160 | It's basically the reverse of `expand_dir()`. It applies when you deal with separate files and the next process needs them all together (i.e. to combine the results):
161 |
162 | ![channel-collapse_files](channel-collapse_files.png)
163 |
164 | For example:
165 | ```python
166 |
167 | class P1(Proc):
168 | input = "infile:file"
169 | input_data = ["/a/b/1.txt", "/a/b/2.txt", "/a/b/3.txt"]
170 | output = "outfile:file:{{in.infile.split('/')[-1].split('.')[0] | append: '.txt2'}}"
171 | script = """
172 | # the script to deal with each input file:
173 | # {{in.infile}} -> {{out.outfile}}
174 | """
175 |
176 | class P2(Proc):
177 | requires = P1
178 | # collapse channel [("/1.txt2",), ("/2.txt2",), ("/3.txt2",)]
179 | # to channel: [("/", )]
180 | input = "indir:file"
181 | input_data = lambda ch: ch >> collapse_files()
182 | output = "outfile:file:{{in.indir.split('/')[-1]}}.result"
183 | script = """
184 | # combine 1.txt2, 2.txt2, 3.txt2 in {{in.indir}} to {{out.outfile}}
185 | """
186 |
187 | Pipen().run(P1)
188 | ```
189 |
190 | Similarly, if we have multiple columns, you may specify the column by index or name to collapse by:
191 | `ch >> collapse_files(col=...)`
192 |
193 | !!! caution
194 |
195 | * `os.path.dirname(os.path.commonprefix(...))` is used to detect the common ancestor directory, so the files don't have to share the same immediate parent. For `['/a/1/1.file', '/a/2/1.file']`, `/a/` will be returned.
196 | * Values in other columns should be the same across the collapsed rows. They will NOT be checked! The values from the first row are used.
197 |
198 | [1]: https://github.com/pwwang/pipda
199 |
--------------------------------------------------------------------------------
/docs/cli.md:
--------------------------------------------------------------------------------
1 | `pipen` has a CLI tool that you can run from command line.
2 |
3 | To run it:
4 |
5 | ```shell
6 | ❯ pipen --help
7 | Usage: pipen [-h] {version,profile,plugins,help} ...
8 |
9 | CLI Tool for pipen v0.4.2
10 |
11 | Optional Arguments:
12 | -h, --help show help message and exit
13 |
14 | Subcommands:
15 | version Print versions of pipen and its dependencies
16 | profile List available profiles.
17 | plugins List installed plugins
18 | help Print help for commands
19 | ```
20 |
21 | ## Writing a plugin to extend the cli
22 |
23 | ### CLI plugin abstract class
24 |
25 | A CLI plugin has to be a subclass of `pipen.cli.CLIPlugin`.
26 |
27 | A CLI plugin has to define a `name` property, which also is the sub-command of the plugin.
28 |
29 | There are a couple of methods of `pipen.cli.CLIPlugin` that a plugin can extend:
30 |
31 | - `__init__(self, parser, subparser)`: initialize the plugin
32 | It takes the main parser and the subparser of the sub-command as arguments. You can add arguments to the parser or subparser here.
33 | Check [argx][1] for more information about how to define arguments.
34 |
35 | - `parse_args(self)`: parse the arguments
36 | It takes no arguments. It should parse the arguments and return the parsed arguments (Namespace), which are used to execute the command.
37 | By default, `self.parser.parse_args()` is called to parse the arguments.
38 |
39 | - `exec_command(self, args)`: execute the command
40 | It takes the parsed arguments as argument. It should execute the command as you wish.
41 |
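   | Putting these together, a minimal sketch of a CLI plugin (the sub-command and its argument are invented):
   |
   | ```python
   | from pipen.cli import CLIPlugin
   |
   | class CLIHelloPlugin(CLIPlugin):
   |     """Say hello from the pipen CLI"""
   |     name = "hello"  # the sub-command: `pipen hello`
   |
   |     def __init__(self, parser, subparser):
   |         super().__init__(parser, subparser)
   |         subparser.add_argument("--who", default="world")
   |
   |     def exec_command(self, args):
   |         print(f"Hello, {args.who}!")
   | ```
   |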
42 | ### Loading CLI plugins
43 |
44 | Like pipen [plugins][2], [templates][3], and [schedulers][4], there are two ways to load the CLI plugins:
45 |
46 | 1. Use the plugin directly:
47 |
48 | ```python
49 | from pipen.cli import cli_plugin
50 |
51 | cli_plugin.register()
52 | ```
53 |
54 | 2. Use the entry points with group name `pipen_cli`
55 |
56 |
57 | ## The `profile` subcommand
58 |
59 | It is used to list the configurations/profiles in the current directory. Run `pipen profile` or `pipen help profile` to get more information.
60 |
61 | ## The `plugins` subcommand
62 |
63 | This subcommand is used to list the plugins for `pipen` itself, templates, schedulers and the CLI. Run `pipen plugins` or `pipen help plugins` to get more information.
64 |
65 | ## The `version` subcommand
66 |
67 | This command prints the versions of `pipen` and its dependencies.
68 |
69 | ## CLI plugin gallery
70 |
71 | - [`pipen-cli-init`][5]: A pipen CLI plugin to create a pipen project (pipeline)
72 | - [`pipen-cli-ref`][6]: Make reference documentation for processes
73 | - [`pipen-cli-require`][7]: A pipen CLI plugin to check the requirements of a pipeline
74 | - [`pipen-cli-run`][8]: A pipen cli plugin to run a process or a pipeline
75 |
76 | [1]: https://github.com/pwwang/argx
77 | [2]: ../plugin
78 | [3]: ../templating
79 | [4]: ../scheduler
80 | [5]: https://github.com/pwwang/pipen-cli-init
81 | [6]: https://github.com/pwwang/pipen-cli-ref
82 | [7]: https://github.com/pwwang/pipen-cli-require
83 | [8]: https://github.com/pwwang/pipen-cli-run
84 |
--------------------------------------------------------------------------------
/docs/cloud.md:
--------------------------------------------------------------------------------
1 | Since `v0.16.0`, `pipen` supports the cloud natively. There are two kinds of cloud support:
2 |
3 | - Run the pipeline locally (or schedulers like `sge`, `slurm`, etc.) and save the files to the cloud.
4 | - Run the pipeline on the cloud.
5 |
6 | ## Run the pipeline locally and save the files to the cloud
7 |
8 | To run the pipeline locally and save the files to the cloud, you need to install `pipen` with cloud support:
9 |
10 | ```bash
11 | pip install xqute[cloudsh]
12 | # To support a specific cloud service provider
13 | pip install cloudpathlib[s3]
14 | pip install cloudpathlib[gs]
15 | pip install cloudpathlib[azure]
16 | ```
17 |
18 | Then you can directly assign a cloud path as the pipeline working directory:
19 |
20 | ```python
21 | from pipen import Pipen, Proc, run
22 |
23 |
24 | class P1(Proc):
25 | """Sort input file"""
26 | input = "in:var"
27 | input_data = ["Hello World"]
28 | output = "outfile:file:out.txt"
29 | # Note that out.outfile is on the cloud but the script is executed locally
30 | # we can use cloudsh to save the output to the cloud
31 | script = "echo {{in.in}} | cloudsh sink {{out.outfile}}"
32 |
33 |
34 | class MyPipeline(Pipen):
35 | starts = P1
36 | workdir = "gs://mybucket/mypipeline/workdir"
37 | output = "gs://mybucket/mypipeline/output"
38 |
39 |
40 | if __name__ == "__main__":
41 | MyPipeline().run()
42 | ```
43 |
44 | As the following figure shows, the pipeline runs locally, but the meta information is read from and saved to the cloud (the workdir).
45 | No local files are generated.
46 |
47 | For the output files, if a process is a non-export process, the output files are saved to the workdir.
48 | If a process is an export process, the output files are saved to the output directory (export dir).
49 |
50 | ![pipen-cloud1](pipen-cloud1.png)
51 |
52 | ## Run the pipeline on the cloud
53 |
54 | Currently, `pipen` only supports running the pipeline on the cloud with google batch jobs.
55 |
56 | To run the pipeline on the cloud, you need to install `pipen` with cloud support:
57 |
58 | ```bash
59 | pip install xqute[gs]
60 | ```
61 |
62 | It is used to communicate with google cloud storage files. No `cloudsh` is needed, since operations on cloud files happen on the cloud (with the cloud paths mounted to the VM). You also need the [google cloud sdk][1] installed and configured; it is used to communicate with google batch jobs (submit jobs, get job status, etc.).
63 |
64 | ```python
65 | from pipen import Pipen, Proc, run
66 |
67 |
68 | class P1(Proc):
69 | """Sort input file"""
70 | input = "in:var"
71 | input_data = ["Hello World"]
72 | output = "outfile:file:out.txt"
73 | # Note that out.outfile is on the cloud, but the workdir and output
74 | # dir are mounted to the VM, so the script can write to it directly
75 | script = "echo {{in.in}} > {{out.outfile}}"
76 |
77 |
78 | class MyPipeline(Pipen):
79 | starts = P1
80 | workdir = "gs://mybucket/mypipeline/workdir"
81 | output = "gs://mybucket/mypipeline/output"
82 | scheduler = "gbatch"
83 |
84 |
85 | if __name__ == "__main__":
86 | MyPipeline().run()
87 | ```
88 |
89 | The only difference is that we need to set `scheduler` to `gbatch` (google batch jobs).
90 |
91 | As shown in the following figure, the pipeline is run on the cloud platform, and the workdir and export dir will be mounted to the VM. So the process script can directly access the cloud files, no `cloudsh` or `gcloud` tools are needed.
92 |
93 | ![pipen-cloud2](pipen-cloud2.png)
94 |
95 | [1]: https://cloud.google.com/sdk?hl=en
96 |
--------------------------------------------------------------------------------
/docs/configurations.md:
--------------------------------------------------------------------------------
1 |
2 | ## Configuration items
3 |
4 | There are two levels of configuration items in `pipen`: pipeline level and process level.
5 |
6 | There are only 3 configuration items at pipeline level:
7 |
8 | - `loglevel`: The logging level for the logger (Default: `"info"`)
9 | - `workdir`: Where the metadata and intermediate files are saved for the pipeline (Default: `./.pipen`)
10 | - `plugins`: The plugins to be enabled or disabled for the pipeline
11 |
12 | These items cannot be set or changed at process level.
13 |
14 | The following items are at process level. They can be set or changed per process so that they are process-specific (see the sketch after this list). You may also see some of the configuration items introduced [here][1]
15 |
16 | - `cache`: Should we detect whether the jobs are cached? See also [here][2]
17 | - `dirsig`: When checking the signature for caching, whether we should walk through the content of the directory. This is sometimes time-consuming if the directory is big.
18 | - `error_strategy`: How to deal with the errors: retry, ignore or halt. See also [here][3]
19 | - `num_retries`: How many times to retry a job once an error occurs.
20 | - `template`: Define the template engine to use. See also [here][4]
21 | - `template_opts`: Options to initialize the template engine (will inherit from pipeline level)
22 | - `forks`: How many jobs to run simultaneously?
23 | - `lang`: The language for the script to run. See also [here][5]
24 | - `plugin_opts`: Options for process-level plugins, will inherit from pipeline level
25 | - `scheduler`: The scheduler to run the jobs
26 | - `scheduler_opts`: The options for the scheduler, will inherit from pipeline level
27 | - `submission_batch`: How many jobs to be submitted simultaneously
28 |
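   | A minimal sketch of setting items at both levels (the process and values are hypothetical):
   |
   | ```python
   | from pipen import Pipen, Proc
   |
   | class Align(Proc):
   |     forks = 4                 # process level: run 4 jobs simultaneously
   |     error_strategy = "retry"  # retry failed jobs ...
   |     num_retries = 2           # ... at most twice
   |     ...
   |
   | # pipeline level: applies to all processes unless overridden
   | pipeline = Pipen(loglevel="debug", scheduler="local")
   | ```
   |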
29 | ## Configuration priorities
30 |
31 | There are different places to set values for the configuration items (priorities from low to high):
32 |
33 | - The configuration files (priorities from low to high):
34 |
35 | - `~/.pipen.toml`
36 | - `./.pipen.toml`
37 | - `PIPEN.osenv`
38 |
39 | See [here][6] for how the configuration files are loaded.
40 | `pipen` uses `TOML` as configuration language, see [here][7] for more information about `toml` format.
41 |
42 | - The arguments of `Pipen` constructor
43 | - The process definition
44 |
45 | !!! note
46 |
47 | The configurations in the configuration files are organized as profiles. If the same profile name appears in multiple configuration files, the items will be inherited from the lower-priority files.
48 |
49 | !!! note
50 |
51 | Special note for `lang`.
52 |
53 | If it is not set at process level and there is a shebang in the script, whatever you specified at pipeline level (including in the configuration files) will be ignored, and the interpreter in the shebang will be used.
54 |
55 | See also [script][5]
56 |
57 | !!! tip
58 |
59 | If you have nothing set in the `Pipen` constructor or the process definition for a configuration item, `PIPEN.osenv` is useful for using a different value than the one set in the other configuration files. For example, to disable cache for all processes:
60 |
61 | ```
62 | PIPEN_DEFAULT_cache=0 python ./pipeline.py ...
63 | ```
64 |
65 | ## Profiles
66 |
67 | You can have different profiles in configuration files:
68 |
69 | `~/.pipen.toml`
70 | ```toml
71 | [default]
72 | scheduler = "local"
73 |
74 | [sge]
75 | scheduler = "sge"
76 |
77 | [sge.scheduler_opts]
78 | sge_q = "1-day"
79 | ```
80 |
81 |
82 | To use the `sge` profile:
83 |
84 | ```python
85 | Pipen().run(P1, profile="sge")
86 | ```
87 |
88 | You can also have a configuration in current directory:
89 |
90 | `./.pipen.toml`
91 | ```toml
92 | [sge.scheduler_opts]
93 | sge_q = "7-days"
94 | ```
95 |
96 | Then the queue to run the jobs will be `7-days`. Note that we didn't specify the `scheduler` in `./.pipen.toml`, which is inherited from `~/.pipen.toml`.
97 |
98 | [1]: ../defining-proc
99 | [2]: ../caching
100 | [3]: ../error
101 | [4]: ../templating
102 | [5]: ../script
103 | [6]: https://github.com/pwwang/python-simpleconf#loading-configurations
104 | [7]: https://github.com/toml-lang/toml
105 |
--------------------------------------------------------------------------------
/docs/defining-proc.md:
--------------------------------------------------------------------------------
1 | A pipeline consists of many processes, which could own multiple jobs that run in parallel.
2 |
3 | ## Defining/Creating processes
4 |
5 | `pipen` has two (preferred) ways to define processes:
6 |
7 | ### Subclassing `pipen.Proc`
8 |
9 | ```python
10 | from pipen import Proc
11 |
12 | class MyProcess(Proc):
13 | ... # process configurations
14 | ```
15 |
16 | The configurations are specified as class variables of the class.
17 |
18 |
19 |
20 | ### Using class method `Proc.from_proc()`
21 |
22 | If you want to reuse a defined process, you can either subclass it:
23 |
24 | ```python
25 | class MyOtherProcess(MyProcess):
26 | ... # configurations inherited from MyProcess
27 | ```
28 |
29 | Or use `Proc.from_proc()`:
30 |
31 | ```python
32 | # You can also pass the configurations you want to override
33 | MyOtherProcess = Proc.from_proc(MyProcess, ...)
34 | ```
35 |
36 | Note that `Proc.from_proc()` cannot override all configurations/class variables, because we assume that there are some shared configurations if you want to "copy" from another process.
37 |
38 | These shared configurations are:
39 |
40 | 1. Template engine and its options (`template` and `template_opts`)
41 | 2. Script template (`script`)
42 | 3. Input keys (`input`)
43 | 4. Language/Interpreter of the script (`lang`)
44 | 5. Output keys (`output`)
45 |
46 |
47 | All other configurations can be passed to `Proc.from_proc()` to override the old ones.
48 |
49 | For all configurations/class variables for a process, see next section.
50 |
51 | You don't need to specify the name of the new process; the variable name on the left-hand side will be used if the `name` argument is not provided to `Proc.from_proc()`. For example:
52 |
53 | ```python
54 | NewProc = Proc.from_proc(OldProc)
55 | # NewProc.name == "NewProc"
56 | ```
57 |
58 | But you are able to assign a different name to a new process if you want. For example:
59 |
60 | ```python
61 | NewProc = Proc.from_proc(OldProc, name="NewProc2")
62 | # NewProc.name = "NewProc2"
63 | ```
64 |
65 | ### How about instantiation of `Proc` directly?
66 |
67 | You are not allowed to do that. `Proc` is an abstract class, which is designed to be subclassed.
68 |
69 | ### How about instantiation of a `Proc` subclass?
70 |
71 | Nope, in `pipen`, a process is a `Proc` subclass itself. The instances of the subclasses are used internally, and they are singletons. In most cases, you don't need to use the instances, unless you want to access the computed properties of the instances, including:
72 |
73 | - `pipeline`: The pipeline, which is a `Pipen` object
74 | - `pbar`: The progress bar for the process, indicating the job status of this process
75 | - `jobs`: The jobs of this process
76 | - `xqute`: The `Xqute` object to manage the job running.
77 | - `template`: The template engine (a `pipen.template.Template` object)
78 | - `template_opts`: The template options (overwritten from config by the `template_opts` class variable)
79 | - `input`: The sanitized input keys and types
80 | - `output`: The compiled output template, ready for the jobs to render with their own data
81 | - `scheduler`: The scheduler object (inferred from the name or scheduler object given by the `scheduler` class variable)
82 | - `script`: The compiled script template, ready for the jobs to render with their own data
83 |
84 | ### How about copy/deep-copy of a `Proc` subclass?
85 |
86 | Nope. Copying or deep-copying a `Proc` subclass won't trigger `__init_subclass__()`, which consolidates the process name from the class name (if not specified) and connects the required processes with the current one. A copy or deep-copy keeps all properties but disconnects the relationships between the current process and its dependency processes, even with a separate assignment such as `MyProcess.requires = ...`.
87 |
88 | ## Process configurations and `Proc` class variables
89 |
90 | The configurations of a process are specified as class variables of subclasses of `Proc`.
91 |
92 | |Name|Meaning|Can be overwritten by `Proc.from_proc()`|
93 | |-|-|-|
94 | |`name`|The name of the process. Will use the class name by default.|Yes|
95 | |`desc`|The description of the process. Will use the summary from the docstring by default.|Yes|
96 | |`envs`|The env variables that are job-independent, useful for common options across jobs.|Yes, and old ones will be inherited|
97 | |`cache`|Should we detect whether the jobs are cached?|Yes|
98 | |`dirsig`|When checking the signature for caching, the depth we should walk into the content of the directory. This can be time-consuming if the directory and the depth are big.|Yes|
99 | |`export`|When `True`, the results will be exported to `<outdir>`. Defaults to `None`, meaning only end processes will export. You can set it to `True`/`False` to enable or disable exporting for a process|Yes|
100 | |`error_strategy`|How to deal with the errors: retry, ignore, halt|Yes|
101 | |`num_retries`|How many times to retry to jobs once error occurs|Yes|
102 | |`template`|Define the template engine to use.|No|
103 | |`template_opts`|Options to initialize the template engine.|No|
104 | |`forks`|How many jobs to run simultaneously?|Yes|
105 | |`input`|The keys and types for the input channel|No|
106 | |`input_data`|The input data (will be computed for dependent processes)|Yes|
107 | |`lang`|The language for the script to run.|No|
108 | |`order`|The execution order for the same dependency-level processes|Yes|
109 | |`output`|The output keys for the output channel|No|
110 | |`plugin_opts`|Options for process-level plugins|Yes|
111 | |`requires`|The dependency processes|Yes|
112 | |`scheduler`|The scheduler to run the jobs|Yes|
113 | |`scheduler_opts`|The options for the scheduler|Yes|
114 | |`script`|The script template for the process|No|
115 | |`submission_batch`|How many jobs to be submitted simultaneously|Yes|
116 |
--------------------------------------------------------------------------------
/docs/error.md:
--------------------------------------------------------------------------------
1 | You can tell `pipen` how to handle when a job fails to run.
2 |
3 | You can assign one of the following to `error_strategy` (see the sketch after this list):
4 |
5 | - `halt`: Any failure will just halt the whole pipeline
6 | - `ignore`: Ignore the error and keep running (assuming the job runs successfully anyway)
7 | - `retry`: Retry the job when it fails
8 | - After `num_retries` times of retrying, if the job is still failing, then halt the pipeline.
9 |
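   | A minimal sketch (the process is hypothetical):
   |
   | ```python
   | from pipen import Proc
   |
   | class FlakyStep(Proc):
   |     error_strategy = "retry"  # retry failed jobs ...
   |     num_retries = 3           # ... up to 3 times, then halt the pipeline
   |     ...
   | ```
   |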
10 | `pipen` uses `xqute` to handle the errors. See also [here][1].
11 |
12 | [1]: https://pwwang.github.io/xqute/api/xqute.defaults/#xqute.defaults.JobErrorStrategy
13 |
--------------------------------------------------------------------------------
/docs/input-output.md:
--------------------------------------------------------------------------------
1 |
2 | ## Specify input of a process
3 |
4 | The input of a process is specified with `input` (the keys of the input data) and `input_data` (the actual input data).
5 |
6 | !!! tip
7 |
8 | Why separate the keys and data?
9 |
10 | Because the keys and data are not always available together. For example, we need the keys to infer the `output` and `script` (they are used in the templates), but the data may be deferred and obtained from the output of the dependency processes.
11 |
12 |
13 | The complete form of an input key (`input`) is `<key>:<type>`. The `<type>` could be `var`, `file`, `dir`, `files` or `dirs`. **A type of `var` can be omitted.** So `ph1, ph2` is the same as `ph1:var, ph2:var`
14 |
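   | For example (a hypothetical process mixing input types):
   |
   | ```python
   | from pipen import Proc
   |
   | class MyProcess(Proc):
   |     # "sample" defaults to type var; "reads" is a file input
   |     input = "sample, reads:file"
   |     input_data = [("s1", "/path/to/s1.fq"), ("s2", "/path/to/s2.fq")]
   |     output = "outfile:file:{{in.sample}}.txt"
   |     script = "wc -l {{in.reads}} > {{out.outfile}}"
   | ```
   |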
15 | If a process requires other processes, the specified `input_data` will be ignored, and the output data of the required processes will be used:
16 |
17 | ```python
18 | class P1(Proc):
19 | input = "v1"
20 | output = "o1:{{in.v1}}" # pass by v1 as output variable
21 | input_data = ["a"]
22 |
23 | class P2(Proc):
24 | input = "v2"
25 | output = "o2:{{in.v2}}"
26 | input_data = ["b"]
27 |
28 | class P3(Proc):
29 | requires = [P1, P2]
30 | input = "i1, i2"
31 | output = "o3:{{in.i1}}_{{in.i2}}" # will be "a_b"
32 | # input_data = [] # ignored with a warning
33 |
34 | Pipen().run(P1, P2)
35 | ```
36 |
37 | !!! Tip
38 |
39 | The direct `input_data` is ignored, but you can use a callback to modify the input channel.
40 | For example:
41 |
42 | ```python
43 | class P4(Proc):
44 | requires = [P1, P2]
45 | input = "i1, i2"
46 | input_data = lambda ch: ch.applymap(str.upper)
47 | output = "o3:{{in.i1}}_{{in.i2}}" # will be "A_B"
48 | ```
49 |
50 | !!! Note
51 |
52 | When the input data does not have enough columns, `None` will be used, with warnings. And when the input data has more columns than the input keys, the extra columns are dropped and ignored, also with warnings.
53 |
54 | ## Specify output of a process
55 |
56 | Different from input, you don't provide a channel directly; instead, you tell `pipen` how to compute the output channel. The output can be a `list` or a `str`. If it's a `str`, a comma (`,`) is used to separate different keys:
57 |
58 | To use templating in `output`, see [`templating`][1].
59 |
60 | ```python
61 | class P1(Proc):
62 | input = "invar, infile"
63 | input_data = [(1, "/a/b/c.txt")]
64 | output = (
65 | "outvar:{{in.invar}}2, "
66 | "outfile:file:{{in.infile.split('/')[-1]}}2, "
67 | "outdir:dir:{{in.infile.split('/')[-1].split('.')[0]}}-dir"
68 | )
69 |
70 | # The type 'var' is omitted in the first element.
71 | # The output channel will be:
72 | #
73 | # outvar outfile outdir
74 | #