├── .codesandbox
│   └── Dockerfile
├── .coveragerc
├── .github
│   └── workflows
│       ├── build.yml
│       └── docs.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── docs
│   ├── CHANGELOG.md
│   ├── basics.md
│   ├── caching.md
│   ├── channel-collapse_files.png
│   ├── channel-expand_dir.png
│   ├── channels.md
│   ├── cli.md
│   ├── cloud.md
│   ├── configurations.md
│   ├── defining-proc.md
│   ├── error.md
│   ├── examples.md
│   ├── input-output.md
│   ├── layers.png
│   ├── pipen-cloud1.png
│   ├── pipen-cloud2.png
│   ├── plugin.md
│   ├── proc-group.md
│   ├── requirements.txt
│   ├── running.md
│   ├── scheduler.md
│   ├── script.md
│   ├── style.css
│   └── templating.md
├── examples
│   ├── caching.py
│   ├── cloudwdir.py
│   ├── example.py
│   ├── gbatch.py
│   ├── input_data_callback.py
│   ├── mako-templating.py
│   ├── multijobs.py
│   ├── plugin-example.py
│   ├── python-script.py
│   └── retry.py
├── mkdocs.yml
├── pipen.png
├── pipen
│   ├── __init__.py
│   ├── __main__.py
│   ├── _job_caching.py
│   ├── channel.py
│   ├── cli
│   │   ├── __init__.py
│   │   ├── _hooks.py
│   │   ├── _main.py
│   │   ├── help.py
│   │   ├── plugins.py
│   │   ├── profile.py
│   │   └── version.py
│   ├── defaults.py
│   ├── exceptions.py
│   ├── job.py
│   ├── pipen.py
│   ├── pluginmgr.py
│   ├── proc.py
│   ├── procgroup.py
│   ├── progressbar.py
│   ├── py.typed
│   ├── scheduler.py
│   ├── template.py
│   ├── utils.py
│   └── version.py
├── poetry.lock
├── pyproject.toml
├── setup.py
├── tests
│   ├── __init__.py
│   ├── conftest.py
│   ├── helpers.py
│   ├── test_channel.py
│   ├── test_cli.py
│   ├── test_job.py
│   ├── test_pipen.py
│   ├── test_plugin.py
│   ├── test_proc.py
│   ├── test_procgroup.py
│   ├── test_scheduler.py
│   ├── test_template.py
│   ├── test_utils.py
│   └── test_xqute_pars.py
└── tox.ini
/.codesandbox/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.10.12
2 |
3 | RUN apt-get update && apt-get install -y fish && \
4 | pip install -U pip && \
5 | pip install poetry && \
6 | poetry config virtualenvs.create false && \
7 | chsh -s /usr/bin/fish
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit =
3 | # need plugins to be installed to test
4 | pipen/cli/plugins.py
5 | tests/*
6 | setup.py
7 |
8 | [report]
9 | exclude_lines =
10 | if TYPE_CHECKING:
11 | pragma: no cover
12 |
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: Build and Deploy
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 |
7 | build:
8 | runs-on: ubuntu-24.04
9 | if: "! contains(github.event.head_commit.message, 'wip') && ! startsWith(github.ref, 'refs/tags')"
10 | strategy:
11 | matrix:
12 | # python-version: [3.8, 3.9, "3.10"]
13 | python-version: [3.9, "3.10", "3.11", "3.12"]
14 |
15 | steps:
16 | - uses: actions/checkout@v4
17 | - name: Setup Python # Set Python version
18 | uses: actions/setup-python@v5
19 | with:
20 | python-version: ${{ matrix.python-version }}
21 | - name: Install dependencies
22 | run: |
23 | python -m pip install --upgrade pip
24 | python -m pip install poetry
25 | poetry config virtualenvs.create false
26 | poetry install -v --with dev
27 | - name: Run flake8
28 | run: flake8 pipen
29 | - uses: 'google-github-actions/auth@v2'
30 | with:
31 | credentials_json: ${{ secrets.GCP_SA_KEY }}
32 | - name: Test with pytest
33 | run: pytest tests/ --junitxml=junit/test-results-${{ matrix.python-version }}.xml
34 | - name: Upload pytest test results
35 | uses: actions/upload-artifact@v4
36 | with:
37 | name: pytest-results-${{ matrix.python-version }}
38 | path: junit/test-results-${{ matrix.python-version }}.xml
39 | # Use always() to always run this step to publish test results when there are test failures
40 | if: ${{ always() }}
41 | - name: Run codacy-coverage-reporter
42 | uses: codacy/codacy-coverage-reporter-action@master
43 | if: matrix.python-version == 3.10
44 | with:
45 | project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
46 | coverage-reports: .coverage.xml
47 |
48 | deploy:
49 | # needs: build
50 | runs-on: ubuntu-24.04
51 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
52 | strategy:
53 | matrix:
54 | python-version: ["3.10"]
55 | steps:
56 | - uses: actions/checkout@v4
57 | - name: Setup Python # Set Python version
58 | uses: actions/setup-python@v5
59 | - name: Install dependencies
60 | run: |
61 | python -m pip install --upgrade pip
62 | python -m pip install poetry
63 | - name: Publish to PyPI
64 | run: poetry publish --build -u ${{ secrets.PYPI_USER }} -p ${{ secrets.PYPI_PASSWORD }}
65 | if: success()
66 |
--------------------------------------------------------------------------------
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
1 | name: Build Docs
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | docs:
7 | runs-on: ubuntu-24.04
8 | # if: github.ref == 'refs/heads/master'
9 | strategy:
10 | matrix:
11 | python-version: ["3.10"]
12 | steps:
13 | - uses: actions/checkout@v4
14 | - name: Setup Python # Set Python version
15 | uses: actions/setup-python@v5
16 | with:
17 | python-version: ${{ matrix.python-version }}
18 | - name: Install dependencies
19 | run: |
20 | python -m pip install --upgrade pip
21 | python -m pip install poetry
22 | poetry config virtualenvs.create false
23 | poetry install -v
24 | - name: Update docs
25 | run: |
26 | python -m pip install mkdocs
27 | python -m pip install -r docs/requirements.txt
28 | cd docs
29 | cp ../README.md index.md
30 | cp ../pipen.png pipen.png
31 | cd ..
32 | mkdocs gh-deploy --clean --force
33 | if: success()
34 |
35 | # fix-index:
36 | # needs: docs
37 | # runs-on: ubuntu-latest
38 | # strategy:
39 | # matrix:
40 | # python-version: ["3.10"]
41 | # steps:
42 | # - uses: actions/checkout@v3
43 | # with:
44 | # ref: gh-pages
45 | # - name: Fix index.html
46 | # run: |
47 | # echo ':: head of index.html - before ::'
48 | # head index.html
49 | # sed -i '1,5{/^$/d}' index.html
50 | # echo ':: head of index.html - after ::'
51 | # head index.html
52 | # if: success()
53 | # - name: Commit changes
54 | # run: |
55 | # git config --local user.email "action@github.com"
56 | # git config --local user.name "GitHub Action"
57 | # git commit -m "Add changes" -a
58 | # if: success()
59 | # - name: Push changes
60 | # uses: ad-m/github-push-action@master
61 | # with:
62 | # github_token: ${{ secrets.GITHUB_TOKEN }}
63 | # branch: gh-pages
64 | # if: success()
65 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | .coverage.xml
46 | cov.xml
47 | *,cover
48 | .hypothesis/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 |
58 | # Flask stuff:
59 | instance/
60 | .webassets-cache
61 |
62 | # Scrapy stuff:
63 | .scrapy
64 |
65 | # Sphinx documentation
66 | docs/_build/
67 |
68 | # PyBuilder
69 | target/
70 |
71 | # IPython Notebook
72 | .ipynb_checkpoints
73 |
74 | # pyenv
75 | .python-version
76 |
77 | # celery beat schedule file
78 | celerybeat-schedule
79 |
80 | # dotenv
81 | .env
82 |
83 | # virtualenv
84 | venv/
85 | ENV/
86 |
87 | # Spyder project settings
88 | .spyderproject
89 |
90 | # Rope project settings
91 | .ropeproject
92 |
93 | workdir/
94 | node_modules/
95 | _book/
96 | .vscode
97 | export/
98 | *.svg
99 | *.dot
100 | *.queue.txt
101 | site/
102 |
103 | # poetry
104 | # poetry.lock
105 |
106 | # backup files
107 | *.bak
108 |
109 | .history/
110 | .xqute/
111 | .pipen/
112 | t-*.ipynb
113 | *-output/
114 | *_results/
115 | t.py
116 |
117 | nohup.out
118 | test.py
119 | test.ipynb
120 | gac_key.json
121 | examples/.pipen.toml
122 | docs/api/
123 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | fail_fast: false
4 | exclude: '^README.rst$|^tests/|^setup.py$|^examples/|^docs/'
5 | repos:
6 | - repo: https://github.com/pre-commit/pre-commit-hooks
7 | rev: 5df1a4bf6f04a1ed3a643167b38d502575e29aef
8 | hooks:
9 | - id: trailing-whitespace
10 | - id: end-of-file-fixer
11 | - id: check-yaml
12 | - id: check-added-large-files
13 | - repo: local
14 | hooks:
15 | - id: versionchecker
16 | name: Check version agreement in pyproject and __version__
17 | entry: bash -c
18 | language: system
19 | args:
20 | - get_ver() { echo $(egrep "^__version|^version" $1 | cut -d= -f2 | sed 's/\"\| //g'); };
21 | v1=`get_ver pyproject.toml`;
22 | v2=`get_ver pipen/version.py`;
23 | if [[ $v1 == $v2 ]]; then exit 0; else exit 1; fi
24 | pass_filenames: false
25 | files: ^pyproject\.toml|pipen/version\.py$
26 | - id: mypy
27 | name: Run mypy type check
28 | entry: mypy
29 | language: system
30 | args: ["-p", "pipen"]
31 | pass_filenames: false
32 | always_run: true
33 | files: ^pipen/.+$
34 | - id: pytest
35 | name: Run pytest
36 | entry: pytest
37 | language: system
38 | args: [tests/]
39 | pass_filenames: false
40 | files: ^tests/.+$|^pipen/.+$
41 | - id: flake8
42 | name: Run flake8
43 | entry: flake8
44 | language: system
45 | args: [pipen]
46 | pass_filenames: false
47 | files: ^pipen/.+$
48 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
7 |
8 | ______________________________________________________________________
9 |
10 | [![Pypi][6]][7] [![Github][8]][9] ![Building][10] [![Docs and API][11]][1] [![Codacy][12]][13] [![Codacy coverage][14]][13] [![Deps][5]][23]
11 |
12 | [Documentation][1] | [ChangeLog][2] | [Examples][3] | [API][4]
13 |
14 | ## Features
15 |
16 | - Easy to use
17 | - Nearly zero-configuration
18 | - Nice logging
19 | - Highly extendable
20 | - Native cloud support
21 |
22 | ## Installation
23 |
24 | ```bash
25 | pip install -U pipen
26 | ```
27 |
28 | ## Quickstart
29 |
30 | `example.py`
31 |
32 | ```python
33 | from pipen import Proc, Pipen, run
34 |
35 | class P1(Proc):
36 | """Sort input file"""
37 | input = "infile"
38 | input_data = ["/tmp/data.txt"]
39 | output = "outfile:file:intermediate.txt"
40 | script = "cat {{in.infile}} | sort > {{out.outfile}}"
41 |
42 | class P2(Proc):
43 | """Paste line number"""
44 | requires = P1
45 | input = "infile:file"
46 | output = "outfile:file:result.txt"
47 | script = "paste <(seq 1 3) {{in.infile}} > {{out.outfile}}"
48 |
49 | # class MyPipeline(Pipen):
50 | # starts = P1
51 |
52 | if __name__ == "__main__":
53 | # MyPipeline().run()
54 | run("MyPipeline", starts=P1)
55 | ```
56 |
57 | ```shell
58 | > echo -e "3\n2\n1" > /tmp/data.txt
59 | > python example.py
60 | ```
61 |
62 | ```log
63 | 04-17 16:19:35 I core _____________________________________ __
64 | 04-17 16:19:35 I core ___ __ \___ _/__ __ \__ ____/__ | / /
65 | 04-17 16:19:35 I core __ /_/ /__ / __ /_/ /_ __/ __ |/ /
66 | 04-17 16:19:35 I core _ ____/__/ / _ ____/_ /___ _ /| /
67 | 04-17 16:19:35 I core /_/ /___/ /_/ /_____/ /_/ |_/
68 | 04-17 16:19:35 I core
69 | 04-17 16:19:35 I core version: 0.17.3
70 | 04-17 16:19:35 I core
71 | 04-17 16:19:35 I core ╔═══════════════════════════ MYPIPELINE ════════════════════════════╗
72 | 04-17 16:19:35 I core ║ My pipeline ║
73 | 04-17 16:19:35 I core ╚═══════════════════════════════════════════════════════════════════╝
74 | 04-17 16:19:35 I core plugins : verbose v0.14.1
75 | 04-17 16:19:35 I core # procs : 2
76 | 04-17 16:19:35 I core profile : default
77 | 04-17 16:19:35 I core outdir :
78 | /home/pwwang/github/pipen/examples/MyPipeline-output
79 | 04-17 16:19:35 I core cache : True
80 | 04-17 16:19:35 I core dirsig : 1
81 | 04-17 16:19:35 I core error_strategy : ignore
82 | 04-17 16:19:35 I core forks : 1
83 | 04-17 16:19:35 I core lang : bash
84 | 04-17 16:19:35 I core loglevel : info
85 | 04-17 16:19:35 I core num_retries : 3
86 | 04-17 16:19:35 I core scheduler : local
87 | 04-17 16:19:35 I core submission_batch: 8
88 | 04-17 16:19:35 I core template : liquid
89 | 04-17 16:19:35 I core workdir :
90 | /home/pwwang/github/pipen/examples/.pipen/MyPipeline
91 | 04-17 16:19:35 I core plugin_opts :
92 | 04-17 16:19:35 I core template_opts : filters={'realpath': ...}
104 | 04-17 16:19:36 I core P1: >>> ['P2']
105 | 04-17 16:19:36 I verbose P1: in.infile: /tmp/data.txt
106 | 04-17 16:19:36 I verbose P1: out.outfile:
107 | /home/pwwang/github/pipen/examples/.pipen/MyPipeline/P1/0/output/intermediate
108 | .txt
109 | 04-17 16:19:38 I verbose P1: Time elapsed: 00:00:02.051s
110 | 04-17 16:19:38 I core
111 | 04-17 16:19:38 I core ╭═══════════════════════════════ P2 ════════════════════════════════╮
112 | 04-17 16:19:38 I core ║ Paste line number ║
113 | 04-17 16:19:38 I core ╰═══════════════════════════════════════════════════════════════════╯
114 | 04-17 16:19:38 I core P2: Workdir:
115 | '/home/pwwang/github/pipen/examples/.pipen/MyPipeline/P2'
116 | 04-17 16:19:38 I core P2: <<< ['P1']
117 | 04-17 16:19:38 I core P2: >>> [END]
118 | 04-17 16:19:38 I verbose P2: in.infile:
119 | /home/pwwang/github/pipen/examples/.pipen/MyPipeline/P1/0/output/intermediate
120 | .txt
121 | 04-17 16:19:38 I verbose P2: out.outfile:
122 | /home/pwwang/github/pipen/examples/MyPipeline-output/P2/result.txt
123 | 04-17 16:19:41 I verbose P2: Time elapsed: 00:00:02.051s
124 | 04-17 16:19:41 I core
125 |
126 |
127 | MYPIPELINE: 100%|██████████████████████████████| 2/2 [00:06<00:00, 0.35 procs/s]
128 | ```
129 |
130 | ```shell
131 | > cat ./MyPipeline-output/P2/result.txt
132 | 1 1
133 | 2 2
134 | 3 3
135 | ```
136 |
137 | ## Examples
138 |
139 | See more examples in `examples/` and a more real-world example at:
140 |
141 |
142 |
143 | ## Plugin gallery
144 |
145 | Plugins make `pipen` even better.
146 |
147 | - [`pipen-annotate`][26]: Use docstring to annotate pipen processes
148 | - [`pipen-args`][19]: Command line argument parser for pipen
149 | - [`pipen-board`][27]: Visualize configuration and running of pipen pipelines on the web
150 | - [`pipen-diagram`][18]: Draw pipeline diagrams for pipen
151 | - [`pipen-dry`][20]: Dry runner for pipen pipelines
152 | - [`pipen-filters`][17]: Add a set of useful filters for pipen templates.
153 | - [`pipen-lock`][25]: Process lock for pipen to prevent multiple runs at the same time.
154 | - [`pipen-log2file`][28]: Save running logs to file for pipen
155 | - [`pipen-poplog`][30]: Populate logs from jobs to running log of the pipeline
156 | - [`pipen-report`][16]: Generate report for pipen
157 | - [`pipen-runinfo`][29]: Save running information to file for pipen
158 | - [`pipen-verbose`][15]: Add verbose information to logs for pipen.
159 | - [`pipen-gcs`][32]: A plugin for pipen to handle files in Google Cloud Storage.
160 | - [`pipen-cli-init`][21]: A pipen CLI plugin to create a pipen project (pipeline)
161 | - [`pipen-cli-ref`][31]: Make reference documentation for processes
162 | - [`pipen-cli-require`][24]: A pipen CLI plugin to check the requirements of a pipeline
163 | - [`pipen-cli-run`][22]: A pipen cli plugin to run a process or a pipeline
164 |
165 | [1]: https://pwwang.github.io/pipen
166 | [2]: https://pwwang.github.io/pipen/CHANGELOG
167 | [3]: https://pwwang.github.io/pipen/examples
168 | [4]: https://pwwang.github.io/pipen/api/pipen
169 | [5]: https://img.shields.io/librariesio/release/pypi/pipen?style=flat-square
170 | [6]: https://img.shields.io/pypi/v/pipen?style=flat-square
171 | [7]: https://pypi.org/project/pipen/
172 | [8]: https://img.shields.io/github/v/tag/pwwang/pipen?style=flat-square
173 | [9]: https://github.com/pwwang/pipen
174 | [10]: https://img.shields.io/github/actions/workflow/status/pwwang/pipen/build.yml?style=flat-square
175 | [11]: https://img.shields.io/github/actions/workflow/status/pwwang/pipen/docs.yml?label=docs&style=flat-square
176 | [12]: https://img.shields.io/codacy/grade/cf1c6c97e5c4480386a05b42dec10c6e?style=flat-square
177 | [13]: https://app.codacy.com/gh/pwwang/pipen
178 | [14]: https://img.shields.io/codacy/coverage/cf1c6c97e5c4480386a05b42dec10c6e?style=flat-square
179 | [15]: https://github.com/pwwang/pipen-verbose
180 | [16]: https://github.com/pwwang/pipen-report
181 | [17]: https://github.com/pwwang/pipen-filters
182 | [18]: https://github.com/pwwang/pipen-diagram
183 | [19]: https://github.com/pwwang/pipen-args
184 | [20]: https://github.com/pwwang/pipen-dry
185 | [21]: https://github.com/pwwang/pipen-cli-init
186 | [22]: https://github.com/pwwang/pipen-cli-run
187 | [23]: https://libraries.io/github/pwwang/pipen#repository_dependencies
188 | [24]: https://github.com/pwwang/pipen-cli-require
189 | [25]: https://github.com/pwwang/pipen-lock
190 | [26]: https://github.com/pwwang/pipen-annotate
191 | [27]: https://github.com/pwwang/pipen-board
192 | [28]: https://github.com/pwwang/pipen-log2file
193 | [29]: https://github.com/pwwang/pipen-runinfo
194 | [30]: https://github.com/pwwang/pipen-poplog
195 | [31]: https://github.com/pwwang/pipen-cli-ref
196 | [32]: https://github.com/pwwang/pipen-gcs
197 |
--------------------------------------------------------------------------------
/docs/basics.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Layers of a pipeline
4 |
5 | ![Layers of a pipeline](layers.png)
6 |
7 | A pipeline consists of channels and processes. A process may have many jobs. Each job uses the corresponding row of the process's input channel (a dataframe) and generates values for the output channel.
8 | All you need to do is specify the first input channel and tell `pipen` the dependencies between the processes. Later processes will use the output channels of the processes they depend on. You can also modify an output channel with a function so that it matches the input of the next process.
9 |
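   | For example, a minimal two-process sketch (the process names and data are made up):
   |
   | ```python
   | from pipen import Pipen, Proc
   |
   | class A(Proc):
   |     input = "x"
   |     input_data = [1, 2, 3]   # the first input channel: one job per row
   |     output = "y:{{in.x}}"
   |
   | class B(Proc):
   |     requires = A             # B's input channel comes from A's output channel
   |     input = "y"
   |     output = "z:{{in.y}}"
   |
   | Pipen().run(A)
   | ```
   |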
10 | ## Folder structure
11 | ```
12 | ./
13 | |- pipeline.py
14 | `- /
15 | `- /
16 | |- proc.name
17 | `- /
18 | |- input/
19 | |- output/
20 | |- job.signature.toml
21 | |- job.script
22 | |- job.rc
23 | |- job.stdout
24 | |- job.stderr
25 | |- job.status
26 | `- job.wrapped.
27 | ```
28 |
29 | | Path | Content | Memo |
30 | |------|---------|------|
31 | |`<workdir>`|Where the pipeline directories of all processes of the current pipeline are located.|Can be set by `workdir`|
32 | |`<pipeline-name>`|The slugified name of the pipeline.||
33 | |`<job.index>/`|The job directory|Starts with `0`|
34 | |`<job.index>/output/`|Where you can find all the output files|If this is an end process, it is a link to the output directory of this process of the pipeline|
35 | |`<job.index>/job.signature.toml`|The signature file of the job, used to check whether the job is cached||
36 | |`<job.index>/job.script`|The rendered script file||
37 | |`<job.index>/job.rc`|The file containing the return code||
38 | |`<job.index>/job.stdout`|The STDOUT of the script||
39 | |`<job.index>/job.stderr`|The STDERR of the script||
40 | |`<job.index>/job.status`|The status of the job||
41 | |`<job.index>/job.wrapped.<scheduler>`|The wrapper for the scheduler to wrap the script||
42 |
43 |
--------------------------------------------------------------------------------
/docs/caching.md:
--------------------------------------------------------------------------------
1 |
2 | ## Job caching
3 |
4 | If `cache` is set to `False` (resolved in the order of configuration files, the `Pipen` constructor, and the process definition), the job runs regardless of previous runs.
5 |
6 | If a previous run of a job failed, the job will run again anyway.
7 |
8 | If a job finishes successfully, a signature file is generated for it. When we try to run the job again, the signature is used to check whether we can skip running it and reuse the results generated by the previous run.
9 |
10 | We can also force-cache a job by setting `cache` to `"force"`. This reuses the results of the previous successful run regardless of input or script changes. It is useful when, for example, you have made some changes to the input/script but don't want them to take effect immediately, especially when the job takes a long time to run.
11 |
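   | A minimal sketch (the process and its script are hypothetical):
   |
   | ```python
   | from pipen import Proc
   |
   | class HeavyStep(Proc):
   |     """A long-running step whose previous results we want to keep"""
   |     cache = "force"  # reuse the last successful results even if input/script changed
   |     input = "infile:file"
   |     output = "outfile:file:out.txt"
   |     script = "long_running_tool {{in.infile}} > {{out.outfile}}"
   | ```
   |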
12 | ## Job signature
13 |
14 | The signature of a job consists of the input types and data, the output types and data, and the latest time (`latest_time`) at which any of the files/directories from the script, input or output were generated/modified. So these situations will make the job-cache check fail (the job will start over):
15 |
16 | 1. Any changes in `input` or `output` types
17 | 2. Any changes in `input` or `output` data
18 | 3. Any changes to `script`
19 | 4. Any touches to input files (since they will make the last modified time > `latest_time`)
20 | 5. Any touches to input directories
21 | - Use `dirsig` as the depth to check the files under the directories
22 | - If it is `0`, only the directories themselves are checked. Note that modifying a file inside a directory may not change the last modified time of the directory itself.
23 | 6. Any deletions to the output files/directories
24 | Note that only the files/directories specified by `output` are checked. Files or subdirectories in the output directories will NOT be checked.
25 |
--------------------------------------------------------------------------------
/docs/channel-collapse_files.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pwwang/pipen/0f99f02de29d15bf8426805a74ce9bca99bdcc03/docs/channel-collapse_files.png
--------------------------------------------------------------------------------
/docs/channel-expand_dir.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pwwang/pipen/0f99f02de29d15bf8426805a74ce9bca99bdcc03/docs/channel-expand_dir.png
--------------------------------------------------------------------------------
/docs/channels.md:
--------------------------------------------------------------------------------
1 |
2 | Channels are used to pass data from one process to another. It is actually a `pandas.DataFrame` object, where each column corresponds to an input key and each row corresponds to a job.
3 |
4 |
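   | For example, assume we have a data frame like the following (a sketch; any pandas data frame works):
   |
   | ```python
   | from pandas import DataFrame
   |
   | df = DataFrame({
   |     "v1": ["a1", "a2"],
   |     "v2": ["b1", "b2"],
   |     "v3": ["c1", "c2"],
   | })
   | ```
   |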
5 | The values for different variables in different jobs will be:
6 |
7 | | Job Index | v1 | v2 | v3 |
8 | |-----------|----|-----|----|
9 | | 0 | a1 | b1 | c1 |
10 | | 1 | a2 | b2 | c2 |
11 | | ... |... | ... |... |
12 |
13 | With a process definition:
14 |
15 | ```python
16 | class MyProcess(Proc):
17 | input = "v1, v2, v3"
18 | input_data = df # The above data frame
19 | ```
20 |
21 | Then:
22 |
23 | |Job index|Template|Rendered to|
24 | |-|-|-|
25 | |0|`{{in.v1}}`|`a1`|
26 | |0|`{{in.v2}}`|`b1`|
27 | |0|`{{in.v3}}`|`c1`|
28 | |1|`{{in.v1}}`|`a2`|
29 | |1|`{{in.v2}}`|`b2`|
30 | |1|`{{in.v3}}`|`c2`|
31 | |...|...|...|
32 |
33 | The column names don't have to match the input keys exactly. If `pipen` finds any of the input keys in the data, it just uses those columns. However, for input keys that cannot be found in the data frame, the first columns of the data are used, in order.
34 |
35 | For example:
36 | ```python
37 | class MyProcess2(Proc):
38 | input = "v4, v3"
39 | input_data = df # The above data frame
40 | ```
41 |
42 | Then for job #0, `{{in.v4}}` will be rendered as `a1` (using column `v1` of the data), and `{{in.v3}}` as `c1` (using column `v3`).
43 |
44 |
45 | ## Creating channels
46 |
47 | Since channels are just data frames, whatever creates a pandas data frame can be used to create a channel. Besides, a couple of class methods are available to create channels:
48 |
49 | - `Channel.create(...)`
50 |
51 | This takes a list of values to create a channel. If a data frame is passed, that data frame is returned.
52 |
53 | If each element in the list is a tuple, the list is used to create a data frame directly, just like:
54 |
55 | ```python
56 | from pandas import DataFrame
57 | ch = Channel.create([(1,2), (3,4)])
58 | # ch = DataFrame([(1,2), (3,4)])
59 | #
60 | #    0  1
61 | # 0  1  2
62 | # 1  3  4
63 | ```
64 |
65 | If an element is not a tuple (even if it is a list), it is converted into a tuple:
66 | ```python
67 | ch = Channel.create([1, 2])
68 | # equivalent to:
69 | # ch = Channel.create([(1, ), (2, )])
70 | ```
71 |
72 | The `input_data` is passed to this class method to create the input channel.
73 |
74 | - `Channel.from_glob(...)`
75 |
76 | This takes a glob pattern to match files and creates a single-column channel (see the sketch after this list).
77 |
78 | You can also filter the types of files by `ftype`:
79 | - `any`: to match any files (default)
80 | - `link`: to match any links
81 | - `dir`: to match any directories
82 | - `file`: to match any files
83 |
84 | You may also sort the files using `sortby`:
85 | - `name`: sort the files by their basename (default)
86 | - `mtime`: sort the files by their last modified time
87 | - `size`: sort by file size
88 |
89 | When `reverse` is True, the above sortings are reversed.
90 |
91 | - `Channel.from_pairs(...)`
92 |
93 | Like `Channel.from_glob()` but creates a double-column channel.
94 |
95 | - `Channel.from_csv(...)`
96 |
97 | Uses `pandas.read_csv()` to create a channel
98 |
99 | - `Channel.from_excel(...)`
100 |
101 | Uses `pandas.read_excel()` to create a channel
102 |
103 | - `Channel.from_table(...)`
104 |
105 | Uses `pandas.read_table()` to create a channel
106 |
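   | For instance, a minimal `Channel.from_glob()` sketch (the `data/*.txt` pattern is made up):
   |
   | ```python
   | from pipen.channel import Channel
   |
   | # a single-column channel of regular *.txt files, newest first
   | ch = Channel.from_glob("data/*.txt", ftype="file", sortby="mtime", reverse=True)
   | ```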
107 |
108 | ## Builtin verbs/functions to transform channels
109 |
110 | `pipen` uses [`pipda`][1] to create some verbs/functions to transform channels, so that you can use them with piping syntax:
111 |
112 | ```python
113 | channel >> verb(...)
114 | ```
115 |
116 | ### Expanding a channel by directory: `expand_dir()`
117 |
118 | Sometimes we prepare files in one process (for example, split a big file into small ones in a directory), then handle these files by different jobs in another process, so that they can be processed simultaneously.
119 |
120 | ![channel-expand_dir](channel-expand_dir.png)
121 |
122 | For example:
123 | ```python
124 |
125 | class P1(Proc):
126 | # the original file: a.txt
127 | input = "infile:file"
128 | input_data = ["a.txt"]
129 | output = "outdir:dir:outdir"
130 | script = "# the script to split a.txt to 1.txt, 2.txt, 3.txt ... to {{out.outdir}}"
131 |
132 | class P2(Proc):
133 | requires = P1
134 | # expand channel [("outdir/a/",)] to channel:
135 | # [("outdir/a/1.txt",), ("outdir/a/2.txt",), ("outdir/a/3.txt",), ...]
136 | input = "infile:file"
137 | input_data = lambda ch: ch >> expand_dir(pattern="*.txt")
138 | # outfile: 1.result, 2.result, ...
139 | output = "outfile:file:{{in.infile.split('/')[-1].split('.')[0]}}.result"
140 | script = """
141 | # work on {{in.infile}} (1.txt, 2.txt, 3.txt, ...)
142 | # to result file {{out.outfile}} (1.result, 2.result, 3.result, ...)
143 | """
144 |
145 | # Run 3 jobs in a batch simultaneously
146 | Pipen(forks=3).run(P1)
147 | ```
148 |
149 | If the channel is a multi-column channel, you can also specify `col` to expand only on that column; values of the other columns will be copied to the expanded rows/jobs.
150 |
151 | You can also filter and sort the expanded files using arguments `ftype`, `sortby` and `reverse`, just like when we use `Channel.from_glob(...)`
152 |
153 | !!! caution
154 |
155 | - `expand_dir(...)` only works on single-row channels, which will be expanded to `N` rows (the number of files matched). If the original channel has more than one row, only the first row is used and the other rows are ignored.
156 | - Only the value of the column to be expanded will be changed, values of other columns remain the same.
157 |
158 | ### Collapsing a channel by files in a common ancestor directory: `collapse_files(...)`
159 |
160 | It's basically the reverse of `expand_dir()`. It applies when you deal with separate files and the next process needs them all together (i.e. to combine the results):
161 |
162 | ![channel-collapse_files](channel-collapse_files.png)
163 |
164 | For example:
165 | ```python
166 |
167 | class P1(Proc):
168 | input = "infile:file"
169 | input_data = ["/a/b/1.txt", "/a/b/2.txt", "/a/b/3.txt"]
170 | output = "outfile:file:{{in.infile.split('/')[-1].split('.')[0] | append: '.txt2'}}"
171 | script = """
172 | # the script to deal with each input file:
173 | # {{in.infile}} -> {{out.outfile}}
174 | """
175 |
176 | class P2(Proc):
177 | requires = P1
178 | # collapse channel [("/1.txt2",), ("/2.txt2",), ("/3.txt2",)]
179 | # to channel: [("/", )]
180 | input = "indir:file"
181 | input_data = lambda ch: ch >> collapse_files()
182 | output = "outfile:file:{{in.indir.split('/')[-1]}}.result"
183 | script = """
184 | # combine 1.txt2, 2.txt2, 3.txt2 in {{in.indir}} to {{out.outfile}}
185 | """
186 |
187 | Pipen().run(P1)
188 | ```
189 |
190 | Similarly, if we have multiple columns, you may specify the column by index or name to collapse by:
191 | `ch >> collapse_files(col=...)`
192 |
193 | !!! caution
194 |
195 | * `os.path.dirname(os.path.commonprefix(...))` is used to detect the common ancestor directory, so the files don't have to share the same immediate parent. For `['/a/1/1.file', '/a/2/1.file']`, `/a/` will be returned.
196 | * Values in other columns should be the same across the collapsed rows. They will NOT be checked! The values from the first row are used.
197 |
198 | [1]: https://github.com/pwwang/pipda
199 |
--------------------------------------------------------------------------------
/docs/cli.md:
--------------------------------------------------------------------------------
1 | `pipen` has a CLI tool that you can run from command line.
2 |
3 | To run it:
4 |
5 | ```shell
6 | ❯ pipen --help
7 | Usage: pipen [-h] {version,profile,plugins,help} ...
8 |
9 | CLI Tool for pipen v0.4.2
10 |
11 | Optional Arguments:
12 | -h, --help show help message and exit
13 |
14 | Subcommands:
15 | version Print versions of pipen and its dependencies
16 | profile List available profiles.
17 | plugins List installed plugins
18 | help Print help for commands
19 | ```
20 |
21 | ## Writing a plugin to extend the cli
22 |
23 | ### CLI plugin abstract class
24 |
25 | A CLI plugin has to be a subclass of `pipen.cli.CLIPlugin`.
26 |
27 | A CLI plugin has to define a `name` property, which also is the sub-command of the plugin.
28 |
29 | There are a couple of methods of `pipen.cli.CLIPlugin` that a plugin can extend:
30 |
31 | - `__init__(self, parser, subparser)`: initialize the plugin
32 | It takes the main parser and the subparser of the sub-command as arguments. You can add arguments to the parser or subparser here.
33 | Check [argx][1] for more information about how to define arguments.
34 |
35 | - `parse_args(self)`: parse the arguments
36 | It takes no arguments. It should parse the arguments and return the parsed arguments (Namespace), which are used to execute the command.
37 | By default, `self.parser.parse_args()` is called to parse the arguments.
38 |
39 | - `exec_command(self, args)`: execute the command
40 | It takes the parsed arguments as argument. It should execute the command as you wish.
41 |
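   | Putting these together, a minimal sketch of a CLI plugin (the sub-command and its argument are invented):
   |
   | ```python
   | from pipen.cli import CLIPlugin
   |
   | class CLIHelloPlugin(CLIPlugin):
   |     """Say hello from the pipen CLI"""
   |     name = "hello"  # the sub-command: `pipen hello`
   |
   |     def __init__(self, parser, subparser):
   |         super().__init__(parser, subparser)
   |         subparser.add_argument("--who", default="world")
   |
   |     def exec_command(self, args):
   |         print(f"Hello, {args.who}!")
   | ```
   |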
42 | ### Loading CLI plugins
43 |
44 | Like pipen [plugins][2], [templates][3], and [schedulers][4], there are two ways to load the CLI plugins:
45 |
46 | 1. Use the plugin directly:
47 |
48 | ```python
49 | from pipen.cli import cli_plugin
50 |
51 | cli_plugin.register()
52 | ```
53 |
54 | 2. Use the entry points with group name `pipen_cli`
55 |
56 |
57 | ## The `profile` subcommand
58 |
59 | It is used to list the configurations/profiles in the current directory. Run `pipen profile` or `pipen help profile` to get more information.
60 |
61 | ## The `plugins` subcommand
62 |
63 | This subcommand is used to list the plugins for `pipen` itself, templates, schedulers and the CLI. Run `pipen plugins` or `pipen help plugins` to get more information.
64 |
65 | ## The `version` subcommand
66 |
67 | This command prints the versions of `pipen` and its dependencies.
68 |
69 | ## CLI plugin gallery
70 |
71 | - [`pipen-cli-init`][5]: A pipen CLI plugin to create a pipen project (pipeline)
72 | - [`pipen-cli-ref`][6]: Make reference documentation for processes
73 | - [`pipen-cli-require`][7]: A pipen CLI plugin to check the requirements of a pipeline
74 | - [`pipen-cli-run`][8]: A pipen cli plugin to run a process or a pipeline
75 |
76 | [1]: https://github.com/pwwang/argx
77 | [2]: ../plugin
78 | [3]: ../templating
79 | [4]: ../scheduler
80 | [5]: https://github.com/pwwang/pipen-cli-init
81 | [6]: https://github.com/pwwang/pipen-cli-ref
82 | [7]: https://github.com/pwwang/pipen-cli-require
83 | [8]: https://github.com/pwwang/pipen-cli-run
84 |
--------------------------------------------------------------------------------
/docs/cloud.md:
--------------------------------------------------------------------------------
1 | Since `v0.16.0`, `pipen` supports the cloud natively. There are two kinds of cloud support:
2 |
3 | - Run the pipeline locally (or schedulers like `sge`, `slurm`, etc.) and save the files to the cloud.
4 | - Run the pipeline on the cloud.
5 |
6 | ## Run the pipeline locally and save the files to the cloud
7 |
8 | To run the pipeline locally and save the files to the cloud, you need to install `pipen` with cloud support:
9 |
10 | ```bash
11 | pip install xqute[cloudsh]
12 | # To support a specific cloud service provider
13 | pip install cloudpathlib[s3]
14 | pip install cloudpathlib[gs]
15 | pip install cloudpathlib[azure]
16 | ```
17 |
18 | Then you can directly assign a cloud path as the pipeline working directory:
19 |
20 | ```python
21 | from pipen import Pipen, Proc, run
22 |
23 |
24 | class P1(Proc):
25 | """Sort input file"""
26 | input = "in:var"
27 | input_data = ["Hello World"]
28 | output = "outfile:file:out.txt"
29 | # Note that out.outfile is on the cloud but the script is executed locally
30 | # we can use cloudsh to save the output to the cloud
31 | script = "echo {{in.in}} | cloudsh sink {{out.outfile}}"
32 |
33 |
34 | class MyPipeline(Pipen):
35 | starts = P1
36 | workdir = "gs://mybucket/mypipeline/workdir"
37 | output = "gs://mybucket/mypipeline/output"
38 |
39 |
40 | if __name__ == "__main__":
41 | MyPipeline().run()
42 | ```
43 |
44 | As the following figure shows, the pipeline runs locally, but the meta information is read from and saved to the cloud (the workdir).
45 | No local files are generated.
46 |
47 | For the output files, if a process is a non-export process, the output files are saved to the workdir.
48 | If a process is an export process, the output files are saved to the output directory (export dir).
49 |
50 | ![pipen-cloud1](pipen-cloud1.png)
51 |
52 | ## Run the pipeline on the cloud
53 |
54 | Currently, `pipen` only supports running the pipeline on the cloud with google batch jobs.
55 |
56 | To run the pipeline on the cloud, you need to install `pipen` with cloud support:
57 |
58 | ```bash
59 | pip install xqute[gs]
60 | ```
61 |
62 | It is used to communicate with google cloud storage files. No `cloudsh` is needed, since operations on cloud files happen on the cloud (with the cloud paths mounted to the VM). You also need the [google cloud sdk][1] installed and configured; it is used to communicate with google batch jobs (submit jobs, get job status, etc.).
63 |
64 | ```python
65 | from pipen import Pipen, Proc, run
66 |
67 |
68 | class P1(Proc):
69 | """Sort input file"""
70 | input = "in:var"
71 | input_data = ["Hello World"]
72 | output = "outfile:file:out.txt"
73 | # Note that out.outfile is on the cloud, but the workdir and output
74 | # dir are mounted to the VM, so the script can write to it directly
75 | script = "echo {{in.in}} > {{out.outfile}}"
76 |
77 |
78 | class MyPipeline(Pipen):
79 | starts = P1
80 | workdir = "gs://mybucket/mypipeline/workdir"
81 | output = "gs://mybucket/mypipeline/output"
82 | scheduler = "gbatch"
83 |
84 |
85 | if __name__ == "__main__":
86 | MyPipeline().run()
87 | ```
88 |
89 | The only difference is that we need to set `scheduler` to `gbatch` (google batch jobs).
90 |
91 | As shown in the following figure, the pipeline is run on the cloud platform, and the workdir and export dir will be mounted to the VM. So the process script can directly access the cloud files, no `cloudsh` or `gcloud` tools are needed.
92 |
93 | ![pipen-cloud2](pipen-cloud2.png)
94 |
95 | [1]: https://cloud.google.com/sdk?hl=en
96 |
--------------------------------------------------------------------------------
/docs/configurations.md:
--------------------------------------------------------------------------------
1 |
2 | ## Configuration items
3 |
4 | There are two levels of configuration items in `pipen`: pipeline level and process level.
5 |
6 | There are only 3 configuration items at pipeline level:
7 |
8 | - `loglevel`: The logging level for the logger (Default: `"info"`)
9 | - `workdir`: Where the metadata and intermediate files are saved for the pipeline (Default: `./.pipen`)
10 | - `plugins`: The plugins to be enabled or disabled for the pipeline
11 |
12 | These items cannot be set or changed at process level.
13 |
14 | The following items are at process level. They can be set or changed per process so that they are process-specific (see the sketch after this list). You may also see some of the configuration items introduced [here][1]
15 |
16 | - `cache`: Should we detect whether the jobs are cached? See also [here][2]
17 | - `dirsig`: When checking the signature for caching, whether we should walk through the content of the directory. This is sometimes time-consuming if the directory is big.
18 | - `error_strategy`: How to deal with the errors: retry, ignore or halt. See also [here][3]
19 | - `num_retries`: How many times to retry a job once an error occurs.
20 | - `template`: Define the template engine to use. See also [here][4]
21 | - `template_opts`: Options to initialize the template engine (will inherit from pipeline level)
22 | - `forks`: How many jobs to run simultaneously?
23 | - `lang`: The language for the script to run. See also [here][5]
24 | - `plugin_opts`: Options for process-level plugins, will inherit from pipeline level
25 | - `scheduler`: The scheduler to run the jobs
26 | - `scheduler_opts`: The options for the scheduler, will inherit from pipeline level
27 | - `submission_batch`: How many jobs to be submitted simultaneously
28 |
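   | A minimal sketch of setting items at both levels (the process and values are hypothetical):
   |
   | ```python
   | from pipen import Pipen, Proc
   |
   | class Align(Proc):
   |     forks = 4                 # process level: run 4 jobs simultaneously
   |     error_strategy = "retry"  # retry failed jobs ...
   |     num_retries = 2           # ... at most twice
   |     ...
   |
   | # pipeline level: applies to all processes unless overridden
   | pipeline = Pipen(loglevel="debug", scheduler="local")
   | ```
   |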
29 | ## Configuration priorities
30 |
31 | There are different places to set values for the configuration items (priorities from low to high):
32 |
33 | - The configuration files (priorities from low to high):
34 |
35 | - `~/.pipen.toml`
36 | - `./.pipen.toml`
37 | - `PIPEN.osenv`
38 |
39 | See [here][6] for how the configuration files are loaded.
40 | `pipen` uses `TOML` as configuration language, see [here][7] for more information about `toml` format.
41 |
42 | - The arguments of `Pipen` constructor
43 | - The process definition
44 |
45 | !!! note
46 |
47 | The configurations in the configuration files are organized as profiles. If the same profile name appears in multiple configuration files, the items will be inherited from the lower-priority files.
48 |
49 | !!! note
50 |
51 | Special note for `lang`.
52 |
53 | If it is not set at process level and there is a shebang in the script, whatever you specified at pipeline level (including in the configuration files) will be ignored, and the interpreter in the shebang will be used.
54 |
55 | See also [script][5]
56 |
57 | !!! tip
58 |
59 | If you have nothing set in the `Pipen` constructor or the process definition for a configuration item, `PIPEN.osenv` is useful for using a different value than the one set in the other configuration files. For example, to disable cache for all processes:
60 |
61 | ```
62 | PIPEN_DEFAULT_cache=0 python ./pipeline.py ...
63 | ```
64 |
65 | ## Profiles
66 |
67 | You can have different profiles in configuration files:
68 |
69 | `~/.pipen.toml`
70 | ```toml
71 | [default]
72 | scheduler = "local"
73 |
74 | [sge]
75 | scheduler = "sge"
76 |
77 | [sge.scheduler_opts]
78 | sge_q = "1-day"
79 | ```
80 |
81 |
82 | To use the `sge` profile:
83 |
84 | ```python
85 | Pipen().run(P1, profile="sge")
86 | ```
87 |
88 | You can also have a configuration in current directory:
89 |
90 | `./.pipen.toml`
91 | ```toml
92 | [sge.scheduler_opts]
93 | sge_q = "7-days"
94 | ```
95 |
96 | Then the queue to run the jobs will be `7-days`. Note that we didn't specify the `scheduler` in `./.pipen.toml`, which is inherited from `~/.pipen.toml`.
97 |
98 | [1]: ../defining-proc
99 | [2]: ../caching
100 | [3]: ../error
101 | [4]: ../templating
102 | [5]: ../script
103 | [6]: https://github.com/pwwang/python-simpleconf#loading-configurations
104 | [7]: https://github.com/toml-lang/toml
105 |
--------------------------------------------------------------------------------
/docs/defining-proc.md:
--------------------------------------------------------------------------------
1 | A pipeline consists of many processes, which could own multiple jobs that run in parallel.
2 |
3 | ## Defining/Creating processes
4 |
5 | `pipen` has two (preferred) ways to define processes:
6 |
7 | ### Subclassing `pipen.Proc`
8 |
9 | ```python
10 | from pipen import Proc
11 |
12 | class MyProcess(Proc):
13 | ... # process configurations
14 | ```
15 |
16 | The configurations are specified as class variables of the class.
17 |
18 |
19 |
20 | ### Using class method `Proc.from_proc()`
21 |
22 | If you want to reuse a defined process, you can either subclass it:
23 |
24 | ```python
25 | class MyOtherProcess(MyProcess):
26 | ... # configurations inherited from MyProcess
27 | ```
28 |
29 | Or use `Proc.from_proc()`:
30 |
31 | ```python
32 | # You can also pass the configurations you want to override
33 | MyOtherProcess = Proc.from_proc(MyProcess, ...)
34 | ```
35 |
36 | Note that `Proc.from_proc()` cannot override all configurations/class variables, because we assume that there are some shared configurations if you want to "copy" from another process.
37 |
38 | These shared configurations are:
39 |
40 | 1. Template engine and its options (`template` and `template_opts`)
41 | 2. Script template (`script`)
42 | 3. Input keys (`input`)
43 | 4. Language/Interpreter of the script (`lang`)
44 | 5. Output keys (`output`)
45 |
46 |
47 | All other configurations can be passed to `Proc.from_proc()` to override the old ones.
48 |
49 | For all configurations/class variables for a process, see next section.
50 |
51 | You don't need to specify the name of the new process; the variable name on the left-hand side will be used if the `name` argument is not provided to `Proc.from_proc()`. For example:
52 |
53 | ```python
54 | NewProc = Proc.from_proc(OldProc)
55 | # NewProc.name == "NewProc"
56 | ```
57 |
58 | But you are able to assign a different name to a new process if you want. For example:
59 |
60 | ```python
61 | NewProc = Proc.from_proc(OldProc, name="NewProc2")
62 | # NewProc.name = "NewProc2"
63 | ```
64 |
65 | ### How about instantiation of `Proc` directly?
66 |
67 | You are not allowed to do that. `Proc` is an abstract class, which is designed to be subclassed.
68 |
69 | ### How about instantiation of a `Proc` subclass?
70 |
71 | Nope, in `pipen`, a process is a `Proc` subclass itself. The instances of the subclasses are used internally, and they are singletons. In most cases, you don't need to use the instances, unless you want to access the computed properties of the instances, including:
72 |
73 | - `pipeline`: The pipeline, which is a `Pipen` object
74 | - `pbar`: The progress bar for the process, indicating the job status of this process
75 | - `jobs`: The jobs of this process
76 | - `xqute`: The `Xqute` object to manage the job running.
77 | - `template`: The template engine (a `pipen.template.Template` object)
78 | - `template_opts`: The template options (overwritten from config by the `template_opts` class variable)
79 | - `input`: The sanitized input keys and types
80 | - `output`: The compiled output template, ready for the jobs to render with their own data
81 | - `scheduler`: The scheduler object (inferred from the name or scheduler object given by the `scheduler` class variable)
82 | - `script`: The compiled script template, ready for the jobs to render with their own data
83 |
84 | ### How about copy/deep-copy of a `Proc` subclass?
85 |
86 | Nope. Copying or deep-copying a `Proc` subclass won't trigger `__init_subclass__()`, which consolidates the process name from the class name (if not specified) and connects the required processes with the current one. A copy or deep-copy keeps all properties but disconnects the relationships between the current process and its dependency processes, even with a separate assignment such as `MyProcess.requires = ...`.
87 |
88 | ## Process configurations and `Proc` class variables
89 |
90 | The configurations of a process are specified as class variables of subclasses of `Proc`.
91 |
92 | |Name|Meaning|Can be overwritten by `Proc.from_proc()`|
93 | |-|-|-|
94 | |`name`|The name of the process. Will use the class name by default.|Yes|
95 | |`desc`|The description of the process. Will use the summary from the docstring by default.|Yes|
96 | |`envs`|The env variables that are job-independent, useful for common options across jobs.|Yes, and old ones will be inherited|
97 | |`cache`|Should we detect whether the jobs are cached?|Yes|
98 | |`dirsig`|When checking the signature for caching, the depth we should walk into the content of the directory. This can be time-consuming if the directory and the depth are big.|Yes|
99 | |`export`|When `True`, the results will be exported to `<outdir>`. Defaults to `None`, meaning only end processes will export. You can set it to `True`/`False` to enable or disable exporting for a process|Yes|
100 | |`error_strategy`|How to deal with the errors: retry, ignore, halt|Yes|
101 | |`num_retries`|How many times to retry to jobs once error occurs|Yes|
102 | |`template`|Define the template engine to use.|No|
103 | |`template_opts`|Options to initialize the template engine.|No|
104 | |`forks`|How many jobs to run simultaneously?|Yes|
105 | |`input`|The keys and types for the input channel|No|
106 | |`input_data`|The input data (will be computed for dependent processes)|Yes|
107 | |`lang`|The language for the script to run.|No|
108 | |`order`|The execution order for the same dependency-level processes|Yes|
109 | |`output`|The output keys for the output channel|No|
110 | |`plugin_opts`|Options for process-level plugins|Yes|
111 | |`requires`|The dependency processes|Yes|
112 | |`scheduler`|The scheduler to run the jobs|Yes|
113 | |`scheduler_opts`|The options for the scheduler|Yes|
114 | |`script`|The script template for the process|No|
115 | |`submission_batch`|How many jobs to be submitted simultaneously|Yes|
116 |
--------------------------------------------------------------------------------
/docs/error.md:
--------------------------------------------------------------------------------
1 | You can tell `pipen` how to handle when a job fails to run.
2 |
3 | You can assign one of the following to `error_strategy` (see the sketch after this list):
4 |
5 | - `halt`: Any failure will just halt the whole pipeline
6 | - `ignore`: Ignore the error and keep running (assuming the job runs successfully anyway)
7 | - `retry`: Retry the job when it fails
8 | - After `num_retries` times of retrying, if the job is still failing, then halt the pipeline.
9 |
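   | A minimal sketch (the process is hypothetical):
   |
   | ```python
   | from pipen import Proc
   |
   | class FlakyStep(Proc):
   |     error_strategy = "retry"  # retry failed jobs ...
   |     num_retries = 3           # ... up to 3 times, then halt the pipeline
   |     ...
   | ```
   |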
10 | `pipen` uses `xqute` to handle the errors. See also [here][1].
11 |
12 | [1]: https://pwwang.github.io/xqute/api/xqute.defaults/#xqute.defaults.JobErrorStrategy
13 |
--------------------------------------------------------------------------------
/docs/input-output.md:
--------------------------------------------------------------------------------
1 |
2 | ## Specify input of a process
3 |
4 | The input of a process is specified with `input` (the keys of the input data) and `input_data` (the actual input data).
5 |
6 | !!! tip
7 |
8 | Why separate the keys and data?
9 |
10 | Because the keys and data are not always available together. For example, we need the keys to infer the `output` and `script` (they are used in the templates), but the data may be deferred and obtained from the output of the dependency processes.
11 |
12 |
13 | The complete form of an input key (`input`) is `<key>:<type>`. The `<type>` could be `var`, `file`, `dir`, `files` or `dirs`. **A type of `var` can be omitted.** So `ph1, ph2` is the same as `ph1:var, ph2:var`
14 |
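   | For example (a hypothetical process mixing input types):
   |
   | ```python
   | from pipen import Proc
   |
   | class MyProcess(Proc):
   |     # "sample" defaults to type var; "reads" is a file input
   |     input = "sample, reads:file"
   |     input_data = [("s1", "/path/to/s1.fq"), ("s2", "/path/to/s2.fq")]
   |     output = "outfile:file:{{in.sample}}.txt"
   |     script = "wc -l {{in.reads}} > {{out.outfile}}"
   | ```
   |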
15 | If a process requires other processes, the specified `input_data` will be ignored, and the output data of the required processes will be used:
16 |
17 | ```python
18 | class P1(Proc):
19 | input = "v1"
20 | output = "o1:{{in.v1}}" # pass by v1 as output variable
21 | input_data = ["a"]
22 |
23 | class P2(Proc):
24 | input = "v2"
25 | output = "o2:{{in.v2}}"
26 | input_data = ["b"]
27 |
28 | class P3(Proc):
29 | requires = [P1, P2]
30 | input = "i1, i2"
31 | output = "o3:{{in.i1}}_{{in.i2}}" # will be "a_b"
32 | # input_data = [] # ignored with a warning
33 |
34 | Pipen().run(P1, P2)
35 | ```
36 |
37 | !!! Tip
38 |
39 | The direct `input_data` is ignored, but you can use a callback to modify the input channel.
40 | For example:
41 |
42 | ```python
43 | class P4(Proc):
44 | requires = [P1, P2]
45 | input = "i1, i2"
46 | input_data = lambda ch: ch.applymap(str.upper)
47 | output = "o3:{{in.i1}}_{{in.i2}}" # will be "A_B"
48 | ```
49 |
50 | !!! Note
51 |
52 | When the input data does not have enough columns, `None` will be used, with warnings. And when the input data has more columns than the input keys, the extra columns are dropped and ignored, also with warnings.
53 |
54 | ## Specify output of a process
55 |
56 | Different from input, you don't provide a channel directly; instead, you tell `pipen` how to compute the output channel. The output can be a `list` or a `str`. If it's a `str`, a comma (`,`) is used to separate different keys:
57 |
58 | To use templating in `output`, see [`templating`][1].
59 |
60 | ```python
61 | class P1(Proc):
62 | input = "invar, infile"
63 | input_data = [(1, "/a/b/c.txt")]
64 | output = (
65 | "outvar:{{in.invar}}2, "
66 | "outfile:file:{{in.infile.split('/')[-1]}}2, "
67 | "outdir:dir:{{in.infile.split('/')[-1].split('.')[0]}}-dir"
68 | )
69 |
70 | # The type 'var' is omitted in the first element.
71 | # The output channel will be:
72 | #
73 | # outvar outfile outdir
74 | #