├── .github └── workflows │ ├── black.yml │ ├── python-publish.yml │ └── run-pytest.yml ├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── design-notes.md ├── docs ├── README.md ├── advanced-run-method.md ├── best-practices.md ├── changelog.md ├── clean.md ├── cli.md ├── conf.py ├── configuration.md ├── contributing.md ├── faq.md ├── features.md ├── img │ ├── error.svg │ ├── job_status.svg │ ├── logging.svg │ ├── memory.svg │ ├── protection.svg │ ├── pypiper.svg │ ├── pypiper_bug.svg │ ├── pypiper_logo.svg │ ├── pypiper_logo_dark.svg │ ├── recovery.svg │ ├── reports.svg │ ├── restartability.svg │ └── simplicity.svg ├── ngstk_intro.md ├── outputs.md ├── philosophy.md ├── pipestat.md ├── report.md └── support.md ├── docs_jupyter ├── basic-pipeline.ipynb ├── build │ └── .gitignore └── hello-world.ipynb ├── example_pipelines ├── basic.py ├── count_reads.py ├── hello_pypiper.py └── logmuse_example.py ├── init_interactive.py ├── logo_pypiper.svg ├── mkdocs.yml ├── pypiper ├── __init__.py ├── _version.py ├── const.py ├── exceptions.py ├── flags.py ├── folder_context.py ├── manager.py ├── ngstk.py ├── pipeline.py ├── stage.py └── utils.py ├── requirements ├── requirements-dev-extra.txt ├── requirements-docs.txt ├── requirements-ngstk.txt ├── requirements-plot.txt ├── requirements-pypiper.txt └── requirements-test.txt ├── setup.cfg ├── setup.py └── tests ├── Data ├── default_pipestat_output_schema.yaml └── sample_output_schema.yaml ├── __init__.py ├── conftest.py ├── helpers.py ├── pipeline ├── __init__.py ├── conftest.py ├── test_multi_pipeline_sample.py ├── test_pipeline.py ├── test_pipeline_checkpoint.py └── test_pipeline_constructor.py ├── pipeline_manager ├── test_halt.py ├── test_manager_constructor.py ├── test_manager_state.py ├── test_pipeline_manager.py ├── test_pipeline_manager_timestamp.py ├── test_pipeline_manager_timestamp_checkpoint_filepath.py └── test_set_status_flag.py ├── test_packaging.py ├── test_pipeline_filepath.py └── 
utils_tests ├── test_check_command_callability.py └── test_head_util.py /.github/workflows/black.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - uses: actions/setup-python@v5 11 | - uses: psf/black@stable -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | name: upload release to PyPI 13 | runs-on: ubuntu-latest 14 | permissions: 15 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Set up Python 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: '3.x' 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install setuptools wheel twine 26 | - name: Build and publish 27 | run: | 28 | python setup.py sdist bdist_wheel 29 | - name: Publish package distributions to PyPI 30 | uses: pypa/gh-action-pypi-publish@release/v1 31 | 32 | -------------------------------------------------------------------------------- /.github/workflows/run-pytest.yml: -------------------------------------------------------------------------------- 1 | name: Run pytests 2 | 3 | on: 4 | pull_request: 5 | branches: [master, dev] 6 | workflow_dispatch: 7 | inputs: null 8 | 9 | jobs: 10 | pytest: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | 
python-version: ["3.8", "3.13"] 15 | os: [ubuntu-latest] 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | 25 | - name: Install dev dependencies 26 | run: if [ -f requirements/requirements-dev.txt ]; then pip install -r requirements/requirements-dev.txt; fi 27 | 28 | - name: Install test dependencies 29 | run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi 30 | 31 | - name: Install package 32 | run: python -m pip install . 33 | 34 | - name: Run pytest tests 35 | run: pytest tests -x -vv --remote-data 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ignore test results 2 | tests/test/* 3 | 4 | # toy/experimental files 5 | *.csv 6 | *.tsv 7 | *.pkl 8 | 9 | # ignore eggs 10 | .eggs/ 11 | *.egg 12 | 13 | # ignore built docs 14 | build/* 15 | doc/build/* 16 | docs/autodoc_build/* 17 | 18 | # ignore test results 19 | example_pipelines/hello_pypiper_results/* 20 | 21 | # generic ignore list: 22 | *.lst 23 | 24 | # Compiled source 25 | *.com 26 | *.class 27 | *.dll 28 | *.exe 29 | *.o 30 | *.so 31 | *.pyc 32 | 33 | # Packages 34 | # it's better to unpack these files and commit the raw source 35 | # git has its own built in compression methods 36 | *.7z 37 | *.dmg 38 | *.gz 39 | *.iso 40 | *.jar 41 | *.rar 42 | *.tar 43 | *.zip 44 | 45 | # Logs and databases 46 | *.log 47 | *.sql 48 | *.sqlite 49 | 50 | # OS generated files 51 | .DS_Store 52 | .DS_Store? 
53 | ._* 54 | .Spotlight-V100 55 | .Trashes 56 | ehthumbs.db 57 | Thumbs.db 58 | 59 | # Gedit temporary files 60 | *~ 61 | 62 | # libreoffice lock files: 63 | .~lock* 64 | 65 | # Default-named test output 66 | microtest/ 67 | open_pipelines/ 68 | 69 | # IDE-specific items 70 | .idea/ 71 | 72 | # pytest-related 73 | .cache/ 74 | .coverage 75 | .pytest_cache 76 | .hypothesis 77 | 78 | # Reserved files for comparison 79 | *RESERVE* 80 | 81 | # Build-related stuff 82 | dist/ 83 | pypiper.egg-info/ 84 | piper.egg-info/ 85 | 86 | 87 | *ipynb_checkpoints* 88 | *.egg-info* 89 | 90 | 91 | example_pipelines/pipeline_output 92 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.5" 5 | - "3.6" 6 | os: 7 | - linux 8 | install: 9 | - pip install --upgrade six 10 | - pip install . 11 | - pip install -r requirements/reqs-ngstk.txt 12 | - pip install -r requirements/reqs-test.txt 13 | script: pytest -v --cov=pypiper 14 | after_success: 15 | - coveralls 16 | branches: 17 | only: 18 | - dev 19 | - master 20 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2018 Nathan Sheffield 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
8 | 9 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 10 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements/* 2 | include README.md 3 | include logo_pypiper.svg -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Pypiper logo 2 | 3 | # Pypiper 4 | 5 | [![Documentation Status](https://readthedocs.org/projects/pypiper/badge/?version=latest)](http://pypiper.readthedocs.org/en/latest/?badge=latest) 6 | [![Build Status](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml/badge.svg)](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml) 7 | [![PEP compatible](https://pepkit.github.io/img/PEP-compatible-green.svg)](http://pepkit.github.io) 8 | [![pypi-badge](https://img.shields.io/pypi/v/piper)](https://pypi.org/project/piper) 9 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 10 | 11 | A lightweight python toolkit for gluing together restartable, robust shell pipelines. 
Learn more in the [documentation](http://pypiper.databio.org). 12 | -------------------------------------------------------------------------------- /design-notes.md: -------------------------------------------------------------------------------- 1 | # Design decision notes 2 | 3 | ## Terms 4 | - **Stage** or **phase**: an arbitrarily defined logical processing *step* or 5 | *unit* of operation(s) within a pipeline (e.g., read trimming or peak calling) 6 | - **Checkpoint**: closely tied to the notion of a *stage* or *phase*, a 7 | checkpoint represents a point in a pipeline that the author has deemed 8 | as sufficiently significant to warrant designation. 9 | 10 | 11 | ## Classes 12 | 13 | ### `Pipeline` 14 | 15 | Since a pipeline author determines how to compose logical units, steps, or 16 | phases to define the pipeline, this class is inherently abstract. We 17 | prefer to be able to impose and enforce the requirement for stage definitions up 18 | front. This precludes the definition or creation of a `Pipeline` without stages 19 | as we declare `stages` as an `abc.abstractproperty` in the definition of 20 | `Pipeline`. This also permits us to validate the stage definitions up front, at 21 | time of pipeline creation rather than waiting until invocation of something like 22 | `run`. A further benefit of this design is the ability to store the parsed, 23 | validated form of the stage definitions obtained during instance construction. 24 | This eliminates a potential need to pass the stage definitions among methods for 25 | which they're needed, thereby simplifying our function signatures. 26 | 27 | Not only do we want to provide a simple framework in which processing 28 | stage/phases may be enumerated and defined in sequence, but we also want to 29 | facilitate non-sequential stages to be defined by the pipeline author.
In the 30 | context of say, testing multiple alternative ways to do the same conceptual task 31 | (e.g., read trimming or peak calling) within the same pipeline, in early 32 | pipeline development, it's particularly likely that the desire to define 33 | unordered stages may arise. 34 | 35 | Additionally, it would be nice to support varying degrees of expressive power 36 | and simplicity. To some extent, this is likely to present a trade-off, with 37 | greater expressive power coming at the expense of implementation simplicity 38 | for a developer who wishes to implement/extend `Pipeline`. Possibilities 39 | for some of the "levels" of simplicity and power include but are not limited to: 40 | 41 | ### `Stage` 42 | 43 | 44 | ## Checkpointing complexity 45 | 46 | ### Direct pipeline file writes 47 | - In the most basic case, the pipeline may d 48 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # a developer's pipeline framework 2 | 3 | [![PEP compatible](http://pepkit.github.io/img/PEP-compatible-green.svg)](http://pepkit.github.io) 4 | [![pypi-badge](https://img.shields.io/pypi/v/piper)](https://pypi.org/project/piper) 5 | [![Documentation Status](https://readthedocs.org/projects/pypiper/badge/?version=latest)](http://pypiper.readthedocs.org/en/latest/?badge=latest) 6 | [![Build Status](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml/badge.svg)](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml) 7 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 8 | 9 | ## What is pypiper? 10 | 11 | `Pypiper` is a **development-oriented** pipeline framework. 
It is a python package that helps you write robust pipelines directly in python, handling mundane tasks like restartability, monitoring for time and memory use, monitoring job status, copious log output, robust error handling, easy debugging tools, and guaranteed file output integrity. 12 | 13 | 14 | 15 | ## What makes pypiper better? 16 | With Pypiper, **simplicity is paramount**. Prerequisites are few: base python and 2 common packages (`pyyaml` and `psutil`). It should take fewer than 15 minutes to build your first pipeline and only an hour or two to learn the advanced features. Pypiper pipelines are: 17 | 18 | 1. written in pure python, so they do not require learning a new language; 19 | 2. easy to modify, so they are simple to update and maintain; 20 | 3. simple to understand for an outsider, so they can be approached by others. 21 | 22 | These traits make pypiper ideally suited for **pipelines under active development**. Read more about the [pypiper philosophy](philosophy). 23 | 24 | ## Installing 25 | 26 | Releases are posted as [GitHub releases](https://github.com/databio/pypiper/releases), or you can install from PyPI using `pip`: 27 | 28 | Global scope for single user: 29 | ```{console} 30 | pip install --user --upgrade piper 31 | ``` 32 | 33 | Within an active virtual environment: 34 | ```{console} 35 | pip install --upgrade piper 36 | ``` 37 | 38 | ## Quick start 39 | 40 | To employ pypiper, you build something like a shell script, but pass the commands through the `run` method on a `PipelineManager` object. 
Build your pipeline in **pure python**: 41 | 42 | ```{python} 43 | #!/usr/bin/env python 44 | 45 | import pypiper 46 | outfolder = "hello_pypiper_results" # Choose a folder for your results 47 | 48 | # Create a PipelineManager, the workhorse of pypiper 49 | pm = pypiper.PipelineManager(name="hello_pypiper", outfolder=outfolder) 50 | 51 | # Timestamps to delineate pipeline sections are easy: 52 | pm.timestamp("Hello!") 53 | 54 | # Now build a command and pass it to pm.run() 55 | target_file = "hello_pypiper_results/output.txt" 56 | command = "echo 'Hello, Pypiper!' > " + target_file 57 | pm.run(command, target_file) 58 | 59 | pm.stop_pipeline() 60 | ``` 61 | 62 | Then invoke your pipeline via the command-line: 63 | 64 | ```{console} 65 | python my_pipeline.py --help 66 | ``` 67 | 68 | ## Pypiper strengths 69 | 70 | Pypiper differs from existing frameworks in its focus on **simplicity**. Pypiper requires learning no new language, as **pipelines are written in pure python**. Pypiper is geared toward **developing pipelines** that are contained in a single file, easy to update, and easy to understand. 71 | -------------------------------------------------------------------------------- /docs/advanced-run-method.md: -------------------------------------------------------------------------------- 1 | # Run method options 2 | 3 | The `PipelineManager.run()` function is the core of `pypiper`. In its simplest case, all you need to provide is a command to run, but it can be much more powerful with additional arguments. 4 | 5 | ## The `cmd` argument 6 | 7 | Normally you just pass a string, but you can also pass a list of commands to `run`, like this: 8 | 9 | ``` 10 | pm.run([cmd1, cmd2, cmd3]) 11 | ``` 12 | 13 | Pypiper will treat these commands as a group, running each one in turn (and monitoring them individually for time and memory use). 
The difference in doing it this way, rather than 3 separate calls to `run()` is that if the series does not complete, the entire series will be re-run. This is therefore useful to piece together commands that must all be run together. 14 | 15 | ## The `target` and `lock_name` arguments 16 | 17 | If you provide a `target` file, then `pypiper` will first check to see if that target exists, and only run the `command` if the `target` does not exist. To prevent two pipelines from running commands on the same target, `pypiper` will automatically derive a lock file name from your target file. You can use the `lock_name` argument to override this default. If you do not provide a `target`, then you will need to provide a `lock_name` argument because `pypiper` will not be able to derive one automatically. 18 | 19 | ## The `nofail` argument 20 | 21 | By default, a command that fails will cause the entire pipeline to halt. If you want to provide a command that *should not* halt the pipeline upon failure, set `nofail=True`. `nofail` can be used to implement non-essential parts of the pipeline. 22 | 23 | ## The `follow` argument 24 | 25 | The `PipelineManager.run` function has an optional argument named `follow` that is useful for checking or reporting results from a command. To the `follow` argument you must pass a python function (which may be either a defined function or a `lambda` function). These *follow functions* are then coupled to the command that is run; the follow function will be called by python **if and only if** the command is run. 26 | 27 | Why is this useful? The major use cases are QC checks and reporting results. We use a follow function to run a QC check to make sure processes did what we expect, and then to report that result to the `stats` file. We only need to check the result and report the statistic once, so it's best to put these kinds of checks in a `follow` function.
Often, you'd like to run a function to examine the result of a command, but you only want to run that once, *right after the command that produced the result*. For example, counting the number of lines in a file after producing it, or counting the number of reads that aligned right after an alignment step. You want the counting process coupled to the alignment process, and don't need to re-run the counting every time you restart the pipeline. Because pypiper is smart, it will not re-run the alignment once it has been run; so there is no need to re-count the result on every pipeline run! 28 | 29 | *Follow functions* let you avoid running unnecessary processes repeatedly in the event that you restart your pipeline multiple times (for instance, while debugging later steps in the pipeline). 30 | 31 | ## The `container` argument 32 | 33 | If you specify a string here, `pypiper` will wrap the command in a `docker run` call using the given `container` image name. 34 | 35 | ## The `shell` argument: Python subprocess types 36 | 37 | Since Pypiper runs all your commands from within python (using the `subprocess` python module), it's nice to be aware of the two types of processes that `subprocess` allows: **direct processes** and **shell processes**. 38 | 39 | **Direct process**: A direct process is executed and managed by Python, so Python retains control over the process completely. This enables Python to monitor the memory use of the subprocess and keep track of it more efficiently. The disadvantage is that you may not use shell-specific operators; for instance, a shell like `Bash` is what understands an asterisk (`*`) for wildcard expansion, or a bracket (`>`) for output redirection, or a pipe (`|`) to string commands together; these therefore cannot be used in direct subprocesses in Python. 40 | 41 | **Shell process**: In a shell process, Python first spawns a shell, and then runs the command in that shell. 
The spawned shell is the process controlled by Python, but processes in the shell are not. This allows you to use shell operators (*e.g.* `*`, `>`), but at the cost of the ability to monitor each command independently, because Python does not have direct control over subprocesses run inside a subshell. 42 | 43 | Because we'd like to run *direct* subprocesses whenever possible, `pypiper` includes 2 nice provisions that help us deal with shell processes. First, pypiper automatically divides commands with pipes (`|`) and executes them as *direct* processes. This enables you to pass a piped shell command, but still get the benefit of a direct process. Each process in the pipe is monitored for return value and for memory use individually, and this information is reported in the pipeline log. Nice! Second, pypiper uses the `psutil` module to monitor memory of *all child processes*. That means when you use a shell process, we *do* monitor the memory use of that process (and any other processes it spawns), which gives us more accurate memory monitoring -- but not from each task individually. 44 | 45 | You can force Pypiper by specifying `shell=True` or `shell=False` to the `run` function, but really, you shouldn't have to. By default Pypiper will try to guess: if your command contains `*` or `>`, it will be run in a shell. If it contains a pipe (`|`), it will be split and run as direct, piped subprocesses. Anything else will be run as a direct subprocess. 46 | -------------------------------------------------------------------------------- /docs/best-practices.md: -------------------------------------------------------------------------------- 1 | 2 | # Best practices 3 | 4 | Here are some guidelines for how you can design the most effective pipelines. 5 | 6 | 7 | * **Compartmentalize output into folders**. 8 | In your output, keep pipeline steps separate by organizing output into subfolders. 9 | 10 | * **Use git for versioning**. 
11 | If you develop your pipeline in a git repository, Pypiper will automatically record the commit hash when you run a pipeline, making it easy to figure out **exactly** what code version you ran. 12 | 13 | * **Record stats as you go**. 14 | In other words, don't do all your stats (`report_result()`) and QC at the end; do it along the way. This facilitates monitoring and maximizes availability of statistics even when a pipeline fails. 15 | 16 | * **Use looper args**. 17 | Even if you're not using looper at first, use `looper_args` and your pipeline will be looper-ready when it comes time to run 500 samples. 18 | 19 | * **Use NGSTk early on**. 20 | `NGSTk` has lots of useful functions that you will probably need. We've worked hard to make these robust and universal. For example, using NGSTk, you can easily make your pipeline take flexible input formats (FASTQ or BAM). Right now you may always have the same input type (FASTQ, for example), but later you may want your pipeline to be able to work from `bam` files. We've already written simple functions to handle single or multiple BAM or FASTQ inputs; just use this infrastructure (in `NGSTk`) instead of writing your own, and you'll save yourself future headaches. 21 | 22 | * **Make some important parameters in the pipeline config, instead of hardcoding them** 23 | Pypiper makes it painfully easy to use a config file to make your pipeline configurable. Typically you'll start by hard-coding in those parameters in your pipeline steps. But you can select a few important parameters and make them customizable in the pipeline config. Start from the very beginning by making a `yaml` pipeline config file. See an example of a [pipeline config file](configuration.md). 
24 | -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [0.14.4] -- 2025-02-25 4 | ### Changed 5 | - Fixed warnings for Python >3.12 6 | - Updated version of Python to 3.13 in pytests 7 | 8 | 9 | ## [0.14.3] -- 2024-10-02 10 | ### Changed 11 | - bump requirements to require pipestat>=0.11.0 12 | 13 | ## [0.14.2] -- 2024-05-07 14 | ### Changed 15 | - Addresses [#218](https://github.com/databio/pypiper/issues/218) 16 | 17 | ## [0.14.1] -- 2024-04-19 18 | ### Changed 19 | - remove pipestat_project_name from PipelineManager parameters 20 | - refactor pipestat_sample_name to pipestat_record_identifier in PipelineManager parameters 21 | - update requirements for pipestat 0.9.0, ubiquerg 0.8.0, and yacman 0.9.3 22 | - set `force_overwrite` to default to true, Issue #209 23 | 24 | 25 | ## [0.14.0] -- 2023-12-22 26 | ### Changed 27 | - refactor for pipestat v0.6.0 release 28 | - drop python 2.7 29 | - updated requirements 30 | - changed message_raw to be a value_dict when reporting to conform to pipestat 31 | - ### Fixed 32 | - fixed #196 and #197 33 | - ### Added 34 | - added `force_overwrite` to `report_result` and `report_object` 35 | - added pipestat_pipeline_type, defaulting to sample-level 36 | 37 | ## [0.13.2] -- 2023-08-02 38 | ### Fixed 39 | - fixed self.new_start overriding checkpoints. 
40 | 41 | ## [0.13.1] -- 2023-07-14 42 | ### Fixed 43 | - added _safe_write_to_file back into pypiper for Pepatac backwards compatibility 44 | 45 | ## [0.13.0] -- 2023-06-29 46 | ### Added 47 | 48 | - [pipestat](http://pipestat.databio.org/en/latest/) support 49 | 50 | ## [0.12.3] -- 2022-01-25 51 | 52 | ### Fixed 53 | - A few bugs with compatibility with Python version 3.9 54 | 55 | ## [0.12.2] -- 2021-12-20 56 | 57 | ### Fixed 58 | - Removed use2to3 for compatibility with setuptools 58 59 | 60 | ## [0.12.1] -- 2019-08-29 61 | 62 | ### Fixed 63 | - Increased requirement for logmuse 64 | 65 | ### Changed 66 | - Sort argument outputs in logs 67 | - Fail messages can now be a string (previously required an Exception). 68 | 69 | ## [0.12.0] -- 2019-08-14 70 | 71 | ### Added 72 | - Use profile to determine total elapsed time 73 | - `logging` functions directly on `PipelineManager` 74 | - Re-export `add_logging_options` from `logmuse`, for direct use by a pipeline author. 75 | - `logger_via_cli` that defaults to the `strict=False` behavior of the same-named function from `logmuse` 76 | - Use logging for pypiper-generated output. 77 | 78 | ### Fixed 79 | - Fix childless processes memory monitoring issue 80 | - Fix problems with runtime reading from pipeline profile TSV formatted according to two styles 81 | - Fix problems running containerized executables that would sometimes hang 82 | - Fix inaccurate elapsed time accumulation 83 | - Fixed a bug that caused hanging when running in singularity containerized executables 84 | - Fixed bugs with merging bamfiles using samtools 85 | 86 | ### Changed 87 | - The hashes in the pipeline profile are produced from the entire original command, even if it is a pipe 88 | - Changed output to simplify and improve log readability 89 | 90 | ## [0.11.3] -- 2019-06-17 91 | ### Fixed 92 | - Fixed a bug that caused an OSError removing lock files for some filesystems. 
93 | 94 | ## [0.11.2] -- 2019-06-06 95 | ### Fixed 96 | - Elevate `attmap` dependency bound to require inclusion of improved path expansion behavior. 97 | 98 | ## [0.11.1] -- 2019-05-30 99 | ### Fixed 100 | - Elevate `attmap` dependency bound to require inclusion of a bugfix there. 101 | 102 | ## [0.11.0] -- 2019-05-13 103 | - Improve python3 handling of integers and strings 104 | - Fixed a bug with cleanup scripts in `dirty` mode 105 | - Restructured profile output with hash and processID, and made lock paths relative 106 | - Streamlined some logging outputs 107 | - Allows nested parentheses and braces for piped commands 108 | - Fixed a bug that would have split a pipe within a braced command 109 | - Some performance improvements for ngstk functions 110 | - Allow `ngstk.input_to_fastq` to yield gzipped fastq files 111 | 112 | ## [0.10.0] -- 2019-03-22 113 | - Fixed a bug that raised exception with empty commands 114 | - Fixed the pipeline profiling issues 115 | - Major updates to internal systems: Switch to `attmap` 116 | - Revamped way of handling child subprocesses which should lead to more 117 | efficient memory monitoring of piped subprocesses, and more consistent 118 | handling of rogue subprocesses during pipeline failure. 119 | - Added force mode to ngstk `gzip` and `pigz` use. 120 | - Changed documentation from sphinx to mkdocs. 121 | - Fixed a bug with python3 output buffering 122 | - Implement multi-target commands 123 | - Fixed a bug that had prevented new start mode from working in certain cases. 124 | - Allow user to change units of memory passed in with default pypiper cli. 125 | 126 | ## [0.9.4] -- 2019-01-31 127 | 128 | - Point release to PyPI for README rendering. 129 | 130 | ## [0.9.3] -- 2019-01-31 131 | 132 | - Simple point release update to fix PyPI landing page. 133 | 134 | ## [0.9.2] -- 2019-01-30 135 | 136 | - Never echo protected-looking attribute request.
137 | 138 | ## [0.9.1] -- 2019-01-29 139 | 140 | - Fixed a bug in NGSTk that caused errors for read counting functions on 141 | MACOS. MACOS `wc` returns leading whitespace, which caused these functions 142 | to fail. 143 | 144 | ## [0.9.0] -- 2018-11-19 145 | 146 | - Use `psutil` to track aggregate memory usage for processes that spawn 147 | children. This results in accurate memory records for these processes. 148 | - Individual commands in a string of commands connected by shell pipes are 149 | now treated as individual commands, and monitored individually for 150 | time and memory, and if a single component fails, the entire string will 151 | fail. Previously, only the final return command was recorded, as in `bash`. 152 | - Various other small improvements (like checking for dynamic recover 153 | flags) 154 | 155 | 156 | ## [0.8.1] -- 2018-09-20 157 | 158 | - Fixed a bug that caused a problem for some pipelines adding groups of pypiper args. 159 | - Improved the `run` waiting method to immediately stop upon job 160 | completion, rather than minute-increment polling. This should improve 161 | performance particularly in pipelines with many, medium-runtime steps, and 162 | improve accuracy of timing profiles. 163 | 164 | 165 | ## [0.8.0] -- 2018-06-15 166 | 167 | - Implemented 'new start' mode. 168 | - Improved error messages and exception handling for missing child software. 169 | - Clarified the built-in required vs. optional args by allowing pipeline authors to specify which of the pypiper args are required. The command-line help UI now displays these correctly as 'required arguments' instead of incorrectly as 'optional arguments'. 170 | - Corrected the sort order of added arguments, so they are listed in the help menu more naturally. 171 | - Fixed a bug that caused an erroneous error message indicating missing pypiper args.
172 | - Clarified the license is BSD2 173 | - Fixed a bug that neglected to list pyyaml as a dependency 174 | 175 | ## [0.7.2] -- 2018-06-05 176 | 177 | - Implemented the 'report object' function. 178 | - Cleanup files are now relative, so a moved folder could still be cleaned. 179 | - Fixed a bug that prevented install if pypandoc was not installed 180 | - Fixed a bug that caused an error in containers where /proc wasn't accessible 181 | 182 | 183 | ## [0.7.1] -- 2018-02-27 184 | 185 | - Package cleanup for Pypi. 186 | 187 | ## [0.7.0] -- 2017-12-12 188 | 189 | - Standardize `NGSTk` function naming. 190 | - Introduce `Stage` as a model for a logically related set of pipeline processing steps. 191 | - Introduce `Pipeline` framework for automated processing phase execution and checkpointing. 192 | - Add ability to start and/or stop a pipeline at arbitrary checkpoints. 193 | - Introduce new state for a paused/halted pipeline. 194 | - Improve spawned process shutdown to avoid zombie processes. 195 | 196 | ## [0.6.0] -- 2017-08-24 197 | 198 | - Adds 'dynamic recovery' capability. For jobs that are terminated by an interrupt, such as a SIGINT or SIGTERM (as opposed to a failed command), pypiper will now set a dynamic recovery flag. These jobs, when restarted, will automatically pick up where they left off, without requiring any user intervention. Previously, the user would have to specify recover mode (`-R`). Now, recover mode forces a recover regardless of failure type, but interrupted pipelines will auto-recover. 199 | - Pypiper now appropriately adds intermediate files for failed runs. It adds them to the cleanup script. 200 | - Improves error messages so only a single exception is raised with a more direct relevance to the user. 201 | - Pypiper will automatically remove existing flags when the run starts, eliminating the earlier issue of confusion due to multiple flags present on runs that were restarted.
202 | - Fixes a bug that caused a pipeline to continue if a SIGTERM is given during a process that was marked `nofail`. 203 | - Pypiper now can handle multiple SIGTERMs without one canceling the shutdown procedure begun by the other. 204 | - Major improvements to documentation and tutorials. 205 | - Adds `report_figure` function. 206 | 207 | ## [0.5.0] -- 2017-07-21 208 | 209 | - Adds preliminary support for handling docker containers 210 | - Updates docs, adds Hello World example 211 | - Adds 'waiting' flag 212 | - Eliminates extra spaces in reported results 213 | - Pypiper module is version aware 214 | - Updates Success time format to eliminate space 215 | - Improves efficiency in some ngstk merging functions 216 | 217 | ## [0.4.0] -- 2017-01-23 218 | 219 | - First major public release! 220 | - Revamps pypiper args 221 | - Adds parallel compression/decompression with pigz 222 | - Various small bug fixes and speed improvements 223 | -------------------------------------------------------------------------------- /docs/clean.md: -------------------------------------------------------------------------------- 1 | # Cleaning up intermediate files 2 | 3 | Many pipelines produce intermediate files along the way. Should you retain these files or delete them? 4 | 5 | On the one hand, you may not necessarily want to delete them *immediately* after creating them, because what if a later pipeline step fails and you need to inspect an intermediate file? On the other hand, you may not want those intermediate files sticking around forever because they waste valuable disk space. 6 | 7 | Pypiper solves this problem with the concept of a *clean list*. The clean list is simply a list of files that are flagged for eventual cleanup. A pipeline developer adds to this list using `pm.clean_add(filename)`. 
Files on the clean list are *not* cleaned immediately; instead, they are **removed as soon as the pipeline is completed successfully** (in other words, after `pm.complete_pipeline()` is called). The advantage is that intermediate files will always be available as long as a pipeline has not completed successfully. 8 | 9 | In case a user of a pipeline instead wants to retain these files indefinitely, he or she may simply add `--dirty` when invoking the pipeline script. This instructs pypiper to *not* clean the intermediate files, even after a successful pipeline run. In this case, `pypiper` will produce a shell script (`clean.sh`), which can be run to remove all flagged files at a later point. 10 | -------------------------------------------------------------------------------- /docs/cli.md: -------------------------------------------------------------------------------- 1 | # Command-line arguments 2 | 3 | Your final pypiper pipeline will be a python script that a pipeline user will invoke on the command-line. You will likely need to allow the user to change some parameters on the command line, and to take full advantage of Pypiper (make your pipeline recoverable, etc.), you wil need to add command-line options to your pipeline that change pypiper's settings as well. Pypiper uses the typical Python [argparse module](https://docs.python.org/2/library/argparse.html) to define command-line arguments to your pipeline, and offers a series of built-in functions to help you populate your pipeline's `ArgumentParser` with pypiper-specific options. 4 | 5 | You can use an ArgumentParser as usual, adding whatever arguments you like. Then, you add Pypiper args to your parser with the function `add_pypiper_args()`, and pass command-line options and arguments to your `PipelineManager`, like this: 6 | 7 | ```{python} 8 | import pypiper, os, argparse 9 | parser = ArgumentParser(description='Write a short description here') 10 | 11 | # add any custom args here 12 | # e.g. 
parser.add_argument('--foo', help='foo help') 13 | 14 | # once you've established all your custom arguments, we can add the default 15 | # pypiper arguments to your parser like this: 16 | 17 | parser = pypiper.add_pypiper_args(parser) 18 | 19 | # Then, pass the args parsed along to the PipelineManger 20 | 21 | args = parser.parse_args() 22 | 23 | pipeline = pypiper.PipelineManager(name="my_pipeline", outfolder="out", \ 24 | args=args) 25 | ``` 26 | 27 | Once you've added pypiper arguments, your pipeline will then enable a few built-in arguments: `--recover`, `--follow`, and `--dirty`, for example. As a side bonus, all arguments (including any of your custom arguments) will be recorded in the log outputs. 28 | 29 | That's the basics. But you can customize things for more efficiency using a simple set of pre-built args and groups of args in pypiper: 30 | 31 | 32 | # Universal pypiper options 33 | 34 | With that said, there are a few universal (Pypiper-added) options that are frequently (but not necessarily always) honored by pypiper pipelines. These default pypiper arguments are detailed below: 35 | 36 | - `-R, --recover` 37 | Recover mode, overwrite locks. This argument will tell pypiper to recover from a failed previous run. Pypiper will execute commands until it encounters a locked file, at which point it will re-execute the failed command and continue from there. 38 | 39 | - `-F, --follow` 40 | Force run follow-functions. By default, follow-functions are only run if their corresponding `run` command was run; with this option you can force all follow functions to run. This is useful for regenerating QC data on existing output. For more details, see :ref:`the follow argument `. 41 | 42 | - `-D, --dirty` 43 | Make all cleanups manual. By default, pypiper pipelines will delete any intermediate files. For debugging, you may want to turn this option off -- you can do that by specifying **dirty mode**. 44 | 45 | - `-N, --new-start` 46 | New start mode. 
This flag will tell pypiper to start over, and run every command, even if its target output already exists. 47 | 48 | 49 | ## Customizing `add_pypiper_args()` 50 | 51 | 52 | There are two ways to modulate the arguments added by `add_pypiper_args()` function: the `groups` argument, which lets you add argument groups; or the `args` argument, which lets you add arguments indvidually. By default, `add_pypiper_args()` add all arguments listed in the `pypiper` group. You may instead pass a list of one or more of these groups of arguments (to `groups`) or individual arguments (to `args`) to customize exactly the set of built-in options your pipeline implements. 53 | 54 | For example, `parser.add_pypiper_args(parser, groups=['pypiper', 'common'])` will add all arguments listed under `pypiper` and `common` below: 55 | 56 | 57 | ## Built-in arguments accessed with `add_pypiper_args()` 58 | 59 | Individual arguments that are understood and used by pypiper: 60 | 61 | - `-R, --recover`: for a failed pipeline run, start off at the last successful step. 62 | - `-N, --new-start`: Just recreate everything, even if it exists. 63 | - `-D, --dirty`: Disables automatic cleaning of temporary files, so all intermediate files will still exist after a pipeline run (either sucessful or failed). Useful for debugging a pipeline even if it succeeds. 64 | - `-F, --follow`: Runs all `follow-functions`, regardless of whether the accompanying command is run. 65 | - `-C, --config`: Pypiper pipeline config yaml file. 66 | 67 | Individual arguments just provided for convenience and standardization: 68 | - `-S, --sample-name`: name of the sample 69 | - `-I, --input`: primary input file (e.g. read1) 70 | - `-I2, --input2`: secondary input file (e.g. 
read2) 71 | - `-O, --output-parent`: parent folder for pipeline results (the pipeline will use this as the parent directory for a folder named `sample-name`) 72 | - `-P, --cores`: Number of cores to use 73 | - `-M, --mem`: Amount of memory in megabytes 74 | - `-G, --genome`: Reference genome assembly (e.g. `hg38`) 75 | - `-Q, --simple-or-paired`: For sequencing data, is input single-end or paired-end? 76 | 77 | ## Pre-built collections of arguments added via `groups`: 78 | 79 | - pypiper: `recover`, `new-start`, `dirty`, `follow` 80 | - common: `input`, `sample-name` 81 | - config: `config` 82 | - resource: `mem`, `cores` 83 | - looper: `config`, `output-parent`, `mem`, `cores` 84 | - ngs: `input`, `sample-name`, `input2`, `genome`, `single-or-paired` 85 | 86 | 87 | ## Specifying required built-in arguments 88 | 89 | If you're using the built-in arguments, you may want to module which are required and which are not. That way, you can piggyback on how `ArgumentParser` handles required arguments very nicely -- if the user does not specify a required argument, the pipeline will automatically prompt with usage instructions. 90 | 91 | By default, built-in arguments are not flagged as required, but you can pass a list of required built-ins to the `required` parameter, like `add_pypiper_args(parser, args=["sample-name"], required=["sample-name"])`. 
92 | 93 | 94 | ## Examples 95 | 96 | import pypiper, os, argparse 97 | parser = ArgumentParser(description='Write a short description here') 98 | 99 | # add just arguments from group `pypiper` 100 | parser = pypiper.add_pypiper_args(parser, groups=["pypiper"]) 101 | 102 | # add just arguments from group `common` 103 | parser = pypiper.add_pypiper_args(parser, groups=["common"]) 104 | 105 | # add arguments from two groups 106 | parser = pypiper.add_pypiper_args(parser, groups=["common", "resources"], 107 | required=["sample-name", "output-parent"]) 108 | 109 | # add individual argument 110 | parser = pypiper.add_pypiper_args(parser, args=["genome"]) 111 | 112 | # add some groups and some individual arguments 113 | parser = pypiper.add_pypiper_args(parser, args=["genome"], groups=["looper", "ngs"]) 114 | -------------------------------------------------------------------------------- /docs/configuration.md: -------------------------------------------------------------------------------- 1 | # Pipeline configuration files 2 | 3 | If you write a pipeline config file in `yaml` format and name it the same thing as the pipeline (but replacing `.py` with `.yaml`), pypiper will automatically load and provide access to these configuration options, and make it possible to pass customized config files on the command line. This is very useful for tweaking a pipeline for a similar project with slightly different parameters, without having to re-write the pipeline. 4 | 5 | It's easy: just load the `PipelineManager` with `args` (as described in [command-line arguments](cli.md)), and you have access to the config file automatically in in `pipeline.config`. 
6 | 7 | For example, in `myscript.py` you write: 8 | 9 | ```{python} 10 | parser = pypiper.add_pipeline_args(parser, args=["config"]) 11 | pipeline = pypiper.PipelineManager(name="my_pipeline", outfolder=outfolder, \ 12 | args = parser) 13 | ``` 14 | 15 | And in the same folder, you include `myscript.yaml`: 16 | 17 | 18 | 19 | my_section: 20 | setting1: True 21 | setting2: 15 22 | 23 | Then you can access these settings automatically in your script using: 24 | 25 | 26 | 27 | pipeline.config.my_section.setting1 28 | pipeline.config.my_section.setting2 29 | 30 | 31 | This `yaml` file is useful for any parameters *not related to the input Sample* (which should be passed on the command-line). By convention, for consistency across pipelines, we use sections called `tools`, `resources`, and `parameters`, but the developer has the freedom to add other sections/variables as needed. 32 | 33 | Here's a more realist example pipeline configuration file: 34 | 35 | 36 | ```{yaml} 37 | # paths to required tools 38 | tools: 39 | java: "/home/user/.local/tools/java" 40 | trimmomatic: "/home/user/.local/tools/trimmomatic.jar" 41 | fastqc: "fastqc" 42 | samtools: "samtools" 43 | bsmap: "/home/user/.local/tools/bsmap" 44 | split_reads: "/home/user/.local/tools/split_reads.py" 45 | 46 | # paths to reference genomes, adapter files, and other required shared data 47 | resources: 48 | resources: "/data/groups/lab_bock/shared/resources" 49 | genomes: "/data/groups/lab_bock/shared/resources/genomes/" 50 | adapters: "/data/groups/lab_bock/shared/resources/adapters/" 51 | 52 | # parameters passed to bioinformatic tools, subclassed by tool 53 | parameters: 54 | trimmomatic: 55 | quality_encoding: "phred33" 56 | threads: 30 57 | illuminaclip: 58 | adapter_fasta: "/home/user/.local/tools/resources/cpgseq_adapter.fa" 59 | seed_mismatches: 2 60 | palindrome_clip_threshold: 40 61 | simple_clip_threshold: 7 62 | slidingwindow: 63 | window_size: 4 64 | required_quality: 15 65 | maxinfo: 66 | 
target_length: 17 67 | strictness: 0.5 68 | minlen: 69 | min_length: 17 70 | bsmap: 71 | seed_size: 12 72 | mismatches_allowed_for_background: 0.10 73 | mismatches_allowed_for_left_splitreads: 0.06 74 | mismatches_allowed_for_right_splitreads: 0.00 75 | equal_best_hits: 100 76 | quality_threshold: 15 77 | quality_encoding: 33 78 | max_number_of_Ns: 3 79 | processors: 8 80 | random_number_seed: 0 81 | map_to_strands: 0 82 | ``` 83 | 84 | 85 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | 2 | # Contributing 3 | 4 | We welcome contributions in the form of pull requests. 5 | 6 | If proposing changes to package source code, please run the test suite in `python2` and `python3` by running `pytest` or `python setup.py test` from within the repository root. 7 | 8 | If using `pytest` directly, we suggest first activating the appropriate Python version's virtual environment and running `pip install --ugprade ./`. 9 | Otherwise, simply specify the appropriate Python version, i.e. `python2 setup.py test` or `python3 setup.py test`. -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## How can I run my pipeline on more than 1 sample? 4 | 5 | Pypiper only handles individual-sample pipelines. To run it on multiple samples, write a loop, or use [looper](http://looper.readthedocs.io/). Dividing multi-sample handling from individual sample handling is a conceptual advantage that allows us to write a nice, universal, generic sample-handler that you only have to learn once. 6 | 7 | ## What cluster resources can pypiper use? 8 | 9 | Pypiper is compute-agnostic. You run it wherever you want. 
If you want a nice way to submit pipelines for samples any cluster manager, check out [looper](http://looper.readthedocs.io/), which can run your pipeline on any compute infrastructure using the [divvy python package](http://code.databio.org/divvy). 10 | 11 | ## What does it mean for a sample to be in the "waiting" state? 12 | 13 | Waiting means `pypiper` encountered a file lock, but no recovery flag. So the pipeline thinks a process (from another run or another process) is currently writing that file. It periodically checks for the lock file to disappear, and assumes that the other process will unlock the file when finished. If you are sure there's not another process writing to that file, you can get `pypiper` to continue by deleting the corresponding `lock` file. In the future, you can use `pypiper's` recover mode (`-R`) to automatically restart a process when a `lock` file is found, instead of waiting. 14 | 15 | ## What is the 'elapsed time' in output? 16 | 17 | The "elapsed" time is referring to the amount of time since the preceding timestamp, not since the start of the pipeline. Timestamps are all displayed with a flag: `_TIME_`. The total cumulative time for the pipeline is displayed only at the end. 18 | 19 | ## How should I run a QC step to check results of one of my commands? 20 | 21 | Usually, you only want to run a QC step if the result was created in the same pipeline run. There's no need to re-run that step if you have to restart the pipeline due to an error later on. If you use `run()` for these steps, then they'll need to run each time the pipeline runs. Instead, this is exactly why we created [the follow argument](../advanced-run-method/#the-follow-argument) This option lets you couple a QC step to a `run()` call, so it only gets excecuted when it is required. 22 | 23 | ## How do I solve installation errors involving `psutil` and/or a compiler like `gcc` or `clang`? 
24 | 25 | If you have trouble with installation and it looks like one of these pieces of software is involved, please check the [`psutil` installation guide](https://github.com/giampaolo/psutil/blob/master/INSTALL.rst). 26 | 27 | -------------------------------------------------------------------------------- /docs/features.md: -------------------------------------------------------------------------------- 1 | # Pypiper features at-a-glance 2 | 3 | ![](img/simplicity.svg) **Simplicity** 4 | 5 | Pipelines are simple both to use and to develop. A pypiper pipeline is nothing more than a python script. You run it on the command line like you would any other python script. The basic documentation is just a few pages. It should only take you 15 minutes to write your first pipeline. 6 | 7 | ![](img/restartability.svg) **Restartability** 8 | 9 | Commands check for their targets and only run if the target needs to be created. This provides computational advantages, and also means the pipeline will pick up where it left off in case it needs to be restarted or extended. 10 | 11 | ![](img/protection.svg) **File integrity protection** 12 | 13 | Pypiper uses automatic file locks. This ensures that tasks complete, and pipelines never continue with half-finished analysis. It also ensures that multiple pipeline runs will not interfere with one another -even if the steps are identical and produce the same files. 14 | 15 | ![](img/logging.svg) **Copious logging** 16 | 17 | Pypiper automatically prints output to screen and also stores it in a log file, so all subprocess output is captured permanently. It also provides copious information on versions, compute host, and easy timestamping. 18 | 19 | ![](img/memory.svg) **Memory use monitoring** 20 | 21 | Processes are polled for memory use, allowing you to more accurately gauge your future memory requirements. 
22 | 23 | ![](img/job_status.svg) **Job status monitoring** 24 | 25 | Pypiper automatically creates status flag files, so you can summarize the current state (`running`, `failed`, or `completed`) of hundreds of jobs simultaneously. 26 | 27 | ![](img/reports.svg) **Easy result reports** 28 | 29 | Pypiper provides functions to put key-value pairs into an easy-to-parse stats file, making it easy to summarize your pipeline results. 30 | 31 | ![](img/error.svg) **Robust error handling** 32 | 33 | Pypiper closes pipelines gracefully on interrupt or termination signals, converting the status to `failed`. By default, a process that returns a nonzero value halts the pipeline, unlike in bash, where by default the pipeline would continue using an incomplete or failed result. This behavior can be overridden as desired with a single parameter. 34 | 35 | ![](img/recovery.svg) **Dynamic recovery** 36 | 37 | If a job is interrupted (with SIGINT or SIGTERM), either from a user or by a cluster resource manager, pypiper will set a `dynamic recovery` flag. The next time the run is started, it will automatically pick up where it left off. This makes pypiper pipelines `automatically pre-emption ready`, so they can be immediately deployed on servers where jobs may be pre-empted. 
38 | -------------------------------------------------------------------------------- /docs/img/error.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 20 | 22 | 29 | 33 | 37 | 38 | 46 | 51 | 56 | 57 | 65 | 69 | 74 | 75 | 82 | 86 | 90 | 91 | 99 | 101 | 104 | 109 | 110 | 118 | 120 | 123 | 127 | 128 | 135 | 137 | 141 | 146 | 147 | 154 | 157 | 158 | 166 | 174 | 183 | 187 | 191 | 192 | 201 | 205 | 209 | 210 | 211 | 230 | 232 | 233 | 235 | image/svg+xml 236 | 238 | 239 | 240 | 241 | 242 | 247 | 251 | 256 | 259 | 266 | 272 | 278 | 284 | 289 | 290 | 291 | 292 | 293 | 294 | -------------------------------------------------------------------------------- /docs/img/protection.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 20 | 22 | 29 | 33 | 37 | 38 | 46 | 51 | 56 | 57 | 65 | 69 | 74 | 75 | 84 | 91 | 95 | 99 | 100 | 110 | 120 | 121 | 140 | 142 | 143 | 145 | image/svg+xml 146 | 148 | 149 | 150 | 151 | 152 | 157 | 161 | 164 | 173 | 178 | 187 | 196 | 197 | 200 | 205 | 210 | 215 | 216 | 217 | 218 | 219 | -------------------------------------------------------------------------------- /docs/img/pypiper_bug.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 20 | 22 | 25 | 29 | 33 | 34 | 43 | 45 | 49 | 53 | 54 | 63 | 65 | 69 | 73 | 74 | 84 | 85 | 102 | 104 | 105 | 107 | image/svg+xml 108 | 110 | 111 | 112 | 113 | 114 | 119 | 125 | 131 | 135 | 140 | 145 | 146 | 152 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /docs/img/recovery.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 20 | 22 | 29 | 33 | 37 | 38 | 46 | 51 | 56 | 57 | 65 | 69 | 74 | 75 | 82 | 86 | 90 | 91 | 100 | 103 | 107 | 111 | 112 | 121 | 130 | 139 | 148 | 157 | 158 | 177 | 179 | 180 | 182 | image/svg+xml 183 | 185 | 186 | 187 | 188 | 189 | 194 | 198 | 204 | 
210 | 211 | 217 | 223 | 224 | 230 | 236 | 237 | 238 | 239 | 240 | -------------------------------------------------------------------------------- /docs/img/simplicity.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 20 | 22 | 29 | 33 | 37 | 38 | 46 | 51 | 56 | 57 | 65 | 69 | 74 | 75 | 82 | 86 | 90 | 91 | 93 | 97 | 101 | 105 | 109 | 110 | 118 | 120 | 123 | 128 | 129 | 137 | 139 | 142 | 146 | 147 | 154 | 156 | 160 | 165 | 166 | 173 | 176 | 177 | 185 | 193 | 200 | 204 | 208 | 209 | 210 | 229 | 231 | 232 | 234 | image/svg+xml 235 | 237 | 238 | 239 | 240 | 241 | 246 | 250 | 253 | 260 | 265 | 272 | 277 | 278 | 279 | 280 | 281 | -------------------------------------------------------------------------------- /docs/ngstk_intro.md: -------------------------------------------------------------------------------- 1 | 2 | # NGSTk - Next Gen Sequencing Toolkit 3 | 4 | Pypiper functions are generic; they simply accept command-line commands and run them. You could use this to produce a pipeline in any domain. To add to this, it's helpful to build convenience functions specific to your scientific domain. It's really easy to create your own library of python functions by creating a python package. Then, you just need to import your package in your pipeline script and make use of the common functions. We refer to this type of package as a "toolkit". 5 | 6 | Pypiper includes a built-in toolkit called NGSTk (next-generation sequencing toolkit). NGSTk simply provides some convenient helper functions to create common shell commands, like converting from file formats (_e.g._ `bam_to_fastq()`), merging files (_e.g._ `merge_bams()`), counting reads, etc. These make it faster to design bioinformatics pipelines in Pypiper, but are entirely optional. 
7 | 8 | Here's how to use `NGSTk`: 9 | 10 | ```{python} 11 | import pypiper 12 | pm = pypiper.PipelineManager(..., args = args) 13 | 14 | # Create a ngstk object (pass the PipelineManager as an argument) 15 | ngstk = pypiper.NGSTk(pm = pm) 16 | 17 | # Now you use use ngstk functions 18 | cmd = ngstk.index_bam("sample.bam") 19 | pm.run(cmd, target="sample.bam") 20 | ``` 21 | 22 | A complete list of functions is in the [API](../autodoc_build/pypiper) or in the [source code for NGSTk](https://github.com/databio/pypiper/blob/master/pypiper/ngstk.py). 23 | -------------------------------------------------------------------------------- /docs/outputs.md: -------------------------------------------------------------------------------- 1 | 2 | # Outputs explained 3 | 4 | Assume you are using a pypiper pipeline named `PIPE` ( it passes `name="PIPE"` to the PipelineManager constructor). By default, your `PipelineManager` will produce the following outputs automatically (in addition to any output created by the actual pipeline commands you run): 5 | 6 | * **PIPE_log.md** 7 | The log starts with a bunch of useful information about your run: a starting timestamp, version numbers of the pipeline and pypiper, a declaration of all arguments passed to the pipeline, the compute host, etc. Then, all output sent to screen is automatically logged to this file, providing a complete record of your run. 8 | 9 | * **PIPE_status.flag** 10 | As the pipeline runs, it produces a flag in the output directory, which can be either `PIPE_running.flag`, `PIPE_failed.flag`, or `PIPE_completed.flag`. These flags make it easy to assess the current state of running pipelines for individual samples, and for many samples in a project simultaneously. 11 | 12 | * **stats.yaml** 13 | Any results reported by the pipeline are saved as key-value pairs in this file, for easy parsing. 
14 | 15 | * **PIPE_profile.md** 16 | A profile log file that provides, for every process run by the pipeline, 3 items: 1) the process name; 2) the clock time taken by the process; and 3) the memory high water mark used by the process. This file makes it easy to profile pipelines for memory and time resources. 17 | 18 | * **PIPE_commands.md** 19 | Pypiper produces a log file containing all the commands run by the pipeline, verbatim. These are also included in the main log. 20 | 21 | Multiple pipelines can easily be run on the same sample, using the same output folder (and possibly sharing intermediate files), as the result outputs will be identifiable by the `PIPE_` identifier. 22 | 23 | These files are [markdown](https://daringfireball.net/projects/markdown/) making it easy to read either in text format, or to quickly convert to a pretty format like HTML. 24 | -------------------------------------------------------------------------------- /docs/philosophy.md: -------------------------------------------------------------------------------- 1 | # Pypiper's development philosophy 2 | 3 | ## Who should use Pypiper? 4 | 5 | The target audience for pypiper is an individual who wants to build a basic 6 | pipeline, but **wants to do better job than just writing a shell script, without 7 | learning a new language or system**. Many bioinformatics pipelines are simple 8 | shell scripts that piece together commands, because that seems the most 9 | accessible. Although there has been an explosion of more feature-rich pipeline 10 | development frameworks, these often require substantial training and investment 11 | to write a pipeline that could be more quickly written as a shell script. 12 | Pipelines built using a framework are also harder to understand for users 13 | unfamiliar with the framework, and require more experience to develop and 14 | modify. 
Pypiper tries to give 80% of the benefits of a professional-scale 15 | pipelining system while requiring very little additional effort. 16 | 17 | If you have a shell script that would benefit from a layer of "handling code", 18 | Pypiper helps you convert that set of shell commands into a production-scale 19 | workflow, automatically handling the annoying details (restartablilty, file 20 | integrity, logging) to make your pipeline robust and restartable. 21 | 22 | Pypiper's strength is its simplicity. If all you want is a 23 | shell-like script, but now with the power of python, some built-in benefits, and 24 | syntactic sugar, then Pypiper is for you. 25 | 26 | ## What Pypiper does NOT do 27 | 28 | Pypiper tries to exploit the [Pareto principle](https://en.wikipedia.org/wiki/Pareto_principle) -- you'll get 80% of the 29 | features with only 20% of the work of other pipeline management systems. So, 30 | there are a few things Pypiper deliberately doesn't do: 31 | 32 | 33 | - Task dependencies. Pypiper runs sequential pipelines. We view this as an 34 | advantage because it makes the pipeline easier to write, easier to understand, 35 | easier to modify, and easier to debug -- critical things for pipelines that 36 | are still under active development (which is most pipelines in bioinformatics). For 37 | developmental pipelines, the complexity introduced by task dependencies is not 38 | worth the minimal benefit -- read this [post on parallelism in 39 | bioinformatics](http://databio.org/posts/paralellism_in_bioinformatics.html) 40 | for an explanation. 41 | 42 | - Cluster submission. Pypiper pipelines are scripts. You can run them on 43 | whatever computing resources you have. We have divided cluster resource 44 | management into a separate project called 45 | [looper](http://looper.readthedocs.io/). Pypiper builds individual, 46 | single-sample pipelines that can be run one sample at a time. 
47 | [Looper](http://looper.readthedocs.io/) then processes groups of samples, 48 | submitting appropriate pipelines to a cluster or server. The two projects are 49 | independent and can be used separately, keeping things simple and modular. 50 | 51 | 52 | ## Yet another pipeline system? 53 | 54 | As I began to put together production-scale pipelines, I found a lot of relevant 55 | pipelining systems, but was universally disappointed. For my needs, they were 56 | all overly complex. I wanted something **simple enough to quickly write and 57 | maintain** a pipeline without having to learn a lot of new functions and 58 | conventions, but robust enough to handle requirements like restartability and 59 | memory usage monitoring. Everything related was either a pre-packaged pipeline 60 | for a defined purpose, or a heavy-duty development environment that was overkill 61 | for a simple pipeline. Both of these seemed to be targeted toward ultra- 62 | efficient uses, and neither fit my needs: I had a set of commands already in 63 | mind -- I just needed a wrapper that could take that code and make it 64 | automatically restartable, logged, robust to crashing, easy to debug, and so 65 | forth. 66 | 67 | Pypiper has evolved over the years and gained lots of cool new features. But its 68 | core principal has remained the same: simplicity. A pypiper pipeline can be 69 | nothing more than a familiar python script that strings together a few shell 70 | commands. -------------------------------------------------------------------------------- /docs/pipestat.md: -------------------------------------------------------------------------------- 1 | # Pipestat 2 | 3 | Starting with pypiper v0.13.0 [pipestat](http://pipestat.databio.org) is the recommended way of reporting pipeline statistics. 4 | You can browse the pipestat documentation to learn more about it, but briefly pipestat is a tool that standardizes reporting of pipeline results. 
It provides 1) a standard specification for how pipeline outputs should be stored; and 2) an implementation to easily write results to that format from within Python or from the command line. 5 | 6 | ## Advancements 7 | 8 | There are a multiple advantages of using pipestat instead of the current pipeline results reporting system: 9 | 10 | 1. **Database results storage:** the results can be stored either in a database or a YAML-formatted results file. This way a pypiper pipeline running in an emphemeral compute environment can report the results to the database and exit. No need to sync the results with a central results storage. 11 | 2. **Strict and clear results definition:** all the results that can be reported by a pipeline run *must* be pre-defined in a [pipestat results schema](http://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format) that in a simplest case just indicates the result's type. This presents pipestat clients with the possibility to *reliably* gather all the possible results and related metadata. 12 | 3. **On-the-fly results validation:** the schema is used to validate and/or convert the reported result to a strictly determined type, which makes the connection of pypiper with downstream pipeline results processing software seamless. 13 | 4. **Unified, pipeline-agnostic results interface:** other pipelines, possibly created with different pipeline frameworks, can read and write results via Python API or command line interface. This feature significantly incerases your pipeline interoperability. 14 | 15 | ## Setup 16 | 17 | In order to start reporting results with pipestat in your pipeline all you need to do is define a [pipestat resuts schema](http://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format): 18 | 19 | ```yaml 20 | my_int_result: 21 | type: integer 22 | description: "This is my first result" 23 | my_str_result: 24 | type: string 25 | ``` 26 | 27 | And in the simplest case... 
that's it! Now you can use the `pipestat` property of the `PipelineManager` object to report/retrieve results. 28 | 29 | Pypiper *by default* will use a YAML-formatted file to store the reported results in the selected `outfolder` and will look for `pipestat_results_schema.yaml` file in the pipeline Python script directory. 30 | 31 | ### Advanced features 32 | 33 | Pypiper-pipestat integration really shines when more advanced features are used. Here's how to set them up. 34 | 35 | #### Configure custom pipestat options 36 | 37 | You can configure pipestat by passing arguments with custom values to `pypiper.PipelineManager` constructor: 38 | 39 | ```python 40 | pm = pypiper.PipelineManager( 41 | ..., 42 | pipestat_schema="custom_results_schema.yaml", 43 | pipestat_results_file="custom_results_file.yaml", 44 | pipestat_sample_name="my_record", 45 | pipestat_project_name="my_namespace", 46 | pipestat_config="custom_pipestat_config.yaml", 47 | ) 48 | ``` 49 | 50 | #### Use a database to store reported results 51 | 52 | In order to establish a database connection pipestat requires a few pieces of information, which *must* be provided in a [pipestat configuration file](http://pipestat.databio.org/en/latest/config/) passed to the `PipelineManager` constructor. 
53 | 54 | This is an example of such a file: 55 | 56 | ```yaml 57 | database: 58 | name: pypiper # database name 59 | user: pypiper # database user name 60 | password: pypiper # database password 61 | host: localhost # database host address 62 | port: 5433 # port the database is running on 63 | dialect: postgresql # type of the database 64 | driver: psycopg2 # driver to use to communicate 65 | ``` 66 | 67 | For reference, here is a Docker command that would run a PostgreSQL instance that could be used to store the pipeline results when configured with the configuration file above: 68 | 69 | ```console 70 | docker volume create postgres-data 71 | 72 | docker run -d --name pypiper-postgres \ 73 | -p 5433:5432 -e POSTGRES_PASSWORD=pypiper \ 74 | -e POSTGRES_USER=pypiper -e POSTGRES_DB=pypiper \ 75 | -v postgres-data:/var/lib/postgresql/data postgres 76 | ``` 77 | 78 | #### Highlight results 79 | 80 | The pipestat results schema can include any number of additional attributes for results. An example of that is *results highlighting*. 81 | 82 | When a `highlight: true` attribute is included under a result identifier in the schema file, the highlighted results can be later retrieved by pipestat clients via the `PipelineManager.pipestat.highlighted_results` property, which simply returns a list of result identifiers to be presented in a special way. 83 | 84 | ### Usage 85 | 86 | Since a pipeline run-specific `PipestatManager` instance is attached to the `PipelineManager` object, all the public pipestat API can be used. Please refer to the [pipestat API documentation](http://pipestat.databio.org/en/latest/autodoc_build/pipestat/) to read about all the currently available features. 
87 | 88 | Here we present the most commonly used features: 89 | 90 | - results reporting 91 | 92 | *report a result, convert to schema-defined type and overwrite previously reported result* 93 | 94 | ```python 95 | results = { 96 | "my_int_result": 10, 97 | "my_str_result": "test" 98 | } 99 | pm.pipestat.report( 100 | values=results, 101 | strict_type=True, 102 | force_overwrite=True 103 | ) 104 | ``` 105 | 106 | - results retrieval 107 | 108 | ```python 109 | pm.pipestat.retrieve(result_identifier="my_int_result") 110 | ``` 111 | 112 | - results schema exploration 113 | 114 | ```python 115 | pm.pipestat.schema 116 | ``` 117 | 118 | - exploration of canonical [jsonschema](https://json-schema.org/) representation of result schemas 119 | 120 | ```python 121 | pm.pipestat.result_schemas 122 | ``` 123 | -------------------------------------------------------------------------------- /docs/report.md: -------------------------------------------------------------------------------- 1 | # Reporting statistics 2 | 3 | One of the most useful features of pypiper is the `report_result` function. This function provides a way to record small-scale results, like summary statistics. It standardizes the output so that universal tools can be built to process all the pipeline results from any pipeline, because the results are all reported in the same way. 4 | 5 | When you call `pm.report_result(key, value)`, pypiper simply writes the key-value pair to a `tsv` file (`stats.tsv`) in the pipeline output folder. These `stats.tsv` files can then later be read and aggregated systematically by other tools, such as `looper summarize`. 6 | 7 | ## Reporting objects 8 | 9 | **Note**: Reporting objects will be deprecated in a future release. It is recommended to use `report_result`. 10 | 11 | Starting in version 0.8, pypiper now implements a second reporting function, `report_object`. 
This is analogous to the `report_result` function, but instead of reporting simple key-value pairs, it lets you record any produced file as an output. Most commonly, this is used to record figures (PDFs, PNGs, etc.) produced by the pipeline. It can also be used to report other files, like HTML files. 12 | 13 | Pypiper writes results to `objects.tsv`, which can then be aggregated for project-level summaries of plots and other pipeline result files. 14 | 15 | 16 | ## Re-using previously reported results 17 | 18 | We frequently want to use the `report_result` capability in `follow` functions. It's a convenient place to do something like count or assess the result of a long-running command, and then report some summary statistic on it. One potential hangup with this strategy is dealing with secondary results after a pipeline is interrupted and restarted. By secondary result, I mean one that requires knowing the value of an earlier result. For example, if you want to compute the **percentage of reads that aligned**, you need to first know the **total reads** -- but what if your pipeline got interrupted and calculation of **total reads** happened in an earlier pipeline run? 19 | 20 | To solve this issue, Pypiper has a neat function called `get_stat` that lets you retrieve any value you've reported with `report_result` so you could use it to calculate statistics elsewhere in the pipeline. It will retrieve this either from memory, if the calculation of that result happened during the current pipeline run, or from the `stats.tsv` file, if the result was reported by an earlier run (or even another pipeline). So you could, in theory, calculate statistics based on results across pipelines. 
21 | 22 | An example of how to use this is how we handle calculating the alignment rate in an NGS pipeline: 23 | 24 | ```{python} 25 | x = myngstk.count_mapped_reads(bamfile, args.paired_end) 26 | pm.report_result("Aligned_reads", x) 27 | rr = float(pm.get_stat("Raw_reads")) 28 | pm.report_result("Alignment_rate", round(float(x) * 100 / rr, 3)) 29 | ``` 30 | 31 | Here, we use `get_stat` to grab a result that we reported previously (with `report_result`), when we counted the number of `Raw_reads` (earlier in the pipeline). We need this after the alignment to calculate the alignment rate. Later, now that we've reported `Alignment_rate`, you could harvest this stat again for use with `pm.get_stat("Alignment_rate")`. This is useful because you could put this block of code in a `follow` statement so it may not be executed, but you can still grab a reported result like this even if the execution happened outside of the current pipeline run; you'd only have to do the calculation once. 32 | -------------------------------------------------------------------------------- /docs/support.md: -------------------------------------------------------------------------------- 1 | 2 | # Support 3 | 4 | If you find a bug or want to request a feature, open an issue at https://github.com/databio/pypiper/issues. 5 | -------------------------------------------------------------------------------- /docs_jupyter/build/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /docs_jupyter/hello-world.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Hello world\n", 8 | "\n", 9 | "This brief tutorial will run your first basic pypiper pipeline to ensure you have everything set up correctly. 
\n", 10 | "\n", 11 | "Just run these 3 lines of code and you're running your first pypiper pipeline!\n", 12 | "\n", 13 | "### Install the latest version of pypiper\n", 14 | "\n", 15 | "```{console}\n", 16 | "pip install --user piper\n", 17 | "```\n", 18 | "\n", 19 | "\n", 20 | "### Download hello_pypiper.py\n", 21 | "```{console}\n", 22 | "wget https://raw.githubusercontent.com/databio/pypiper/master/example_pipelines/hello_pypiper.py\n", 23 | "```\n", 24 | "\n", 25 | "This is a basic pipeline. Here are the contents:" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "#!/usr/bin/env python\n", 38 | "\n", 39 | "import pypiper\n", 40 | "outfolder = \"hello_pypiper_results\" # Choose a folder for your results\n", 41 | "\n", 42 | "# Create a PipelineManager, the workhorse of pypiper\n", 43 | "pm = pypiper.PipelineManager(name=\"hello_pypiper\", outfolder=outfolder)\n", 44 | "\n", 45 | "# Timestamps to delineate pipeline sections are easy:\n", 46 | "pm.timestamp(\"Hello!\")\n", 47 | "\n", 48 | "# Now build a command-line command however you like, and pass it to pm.run()\n", 49 | "target_file = \"hello_pypiper_results/output.txt\"\n", 50 | "cmd = \"echo 'Hello, Pypiper!' > \" + target_file\n", 51 | "pm.run(cmd, target_file)\n", 52 | "\n", 53 | "pm.stop_pipeline()\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "cat ../example_pipelines/hello_pypiper.py" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "### Run it!" 
66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 2, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "### [Pipeline run code and environment:]\n", 78 | "\n", 79 | "* Command: `../example_pipelines/hello_pypiper.py`\n", 80 | "* Compute host: nox\n", 81 | "* Working dir: /home/sheffien/code/pypiper/docs_jupyter\n", 82 | "* Outfolder: hello_pypiper_results/\n", 83 | "* Pipeline started at: (03-16 23:47:37) elapsed: 0.0 _TIME_\n", 84 | "\n", 85 | "### [Version log:]\n", 86 | "\n", 87 | "* Python version: 3.6.7\n", 88 | "* Pypiper dir: `/home/sheffien/.local/lib/python3.6/site-packages/pypiper`\n", 89 | "* Pypiper version: 0.9.5dev\n", 90 | "* Pipeline dir: `/home/sheffien/code/pypiper/example_pipelines`\n", 91 | "* Pipeline version: None\n", 92 | "* Pipeline hash: b'134e8c8f723da66697ab4f5b204315979b4e1042\\n'\n", 93 | "* Pipeline branch: b'* dev\\n'\n", 94 | "* Pipeline date: b'2019-03-16 11:41:56 -0400\\n'\n", 95 | "* Pipeline diff: b' 1 file changed, 16 insertions(+), 1 deletion(-)\\n'\n", 96 | "\n", 97 | "### [Arguments passed to pipeline:]\n", 98 | "\n", 99 | "\n", 100 | "----------------------------------------\n", 101 | "\n", 102 | "\n", 103 | "Changed status from initializing to running.\n", 104 | "No config file\n", 105 | "Hello! (03-16 23:47:37) elapsed: 0.0 _TIME_\n", 106 | "\n", 107 | "Target to produce: `hello_pypiper_results/output.txt`\n", 108 | "\n", 109 | "\n", 110 | "> `echo 'Hello, Pypiper!' > hello_pypiper_results/output.txt`\n", 111 | "\n", 112 | "
\n",
113 |       "
\n", 114 | "Process 128 returned: (0). Elapsed: 0:00:00. Peak memory: (Process: None; Pipeline: 0GB)\n", 115 | "\n", 116 | "Changed status from running to completed.\n", 117 | "\n", 118 | "> `Time`\t0:00:00\thello_pypiper\t_RES_\n", 119 | "\n", 120 | "> `Success`\t03-16-23:47:37\thello_pypiper\t_RES_\n", 121 | "\n", 122 | "##### [Epilogue:]\n", 123 | "* Total elapsed time: 0:00:00\n", 124 | "* Peak memory used: 0 GB\n", 125 | "* Pipeline completed at: (03-16 23:47:37) elapsed: 0.0 _TIME_\n", 126 | "\n", 127 | "Pypiper terminating spawned child process 114...(tee)\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "python3 ../example_pipelines/hello_pypiper.py" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "This output is printed to your screen and also recorded in a log file (called ``hello_pypiper_log.md``). There are a few other outputs from the pipeline as well. All results are placed in a folder called ``hello_pypiper_results``. Navigate to that folder to observe the output of the pipeline, which will include these files:\n", 140 | "\n", 141 | " * hello_pypiper_commands.sh\n", 142 | " * hello_pypiper_completed.flag\n", 143 | " * hello_pypiper_log.md\n", 144 | " * hello_pypiper_profile.tsv\n", 145 | " * output.txt\n", 146 | " * stats.tsv\n", 147 | "\n", 148 | "These files are explained in more detail in the reference section [outputs explained](outputs). \n", 149 | "\n", 150 | "What's next? That depends on if you're interested in just *running* pypiper pipelines, or if you want to *develop* pypiper pipelines. 
The next sections are a series of HOW-TO articles that address each of these scenarios.\n" 151 | ] 152 | } 153 | ], 154 | "metadata": { 155 | "kernelspec": { 156 | "display_name": "Bash", 157 | "language": "bash", 158 | "name": "bash" 159 | }, 160 | "language_info": { 161 | "codemirror_mode": "shell", 162 | "file_extension": ".sh", 163 | "mimetype": "text/x-sh", 164 | "name": "bash" 165 | } 166 | }, 167 | "nbformat": 4, 168 | "nbformat_minor": 2 169 | } 170 | -------------------------------------------------------------------------------- /example_pipelines/basic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Getting Started: A simple sample pipeline built using pypiper.""" 4 | 5 | # This is a runnable example. You can run it to see what the output 6 | # looks like. 7 | 8 | # First, make sure you can import the pypiper package 9 | 10 | import os 11 | 12 | import pypiper 13 | 14 | # Create a PipelineManager instance (don't forget to name it!) 15 | # This starts the pipeline. 16 | 17 | pm = pypiper.PipelineManager(name="BASIC", outfolder="pipeline_output/") 18 | 19 | # Now just build shell command strings, and use the run function 20 | # to execute them in order. run needs 2 things: a command, and the 21 | # target file you are creating. 22 | 23 | # First, generate some random data 24 | 25 | # specify target file: 26 | tgt = "pipeline_output/test.out" 27 | 28 | # build the command 29 | cmd = f"shuf -i 1-500000000 -n 10000000 > {tgt}" 30 | 31 | # and run with run(). 32 | pm.run(cmd, target=tgt) 33 | 34 | # Now copy the data into a new file. 35 | # first specify target file and build command: 36 | tgt = "pipeline_output/copied.out" 37 | cmd = f"cp pipeline_output/test.out {tgt}" 38 | pm.run(cmd, target=tgt) 39 | 40 | # You can also string multiple commands together, which will execute 41 | # in order as a group to create the final target. 
42 | cmd1 = "sleep 5" 43 | cmd2 = "touch pipeline_output/touched.out" 44 | pm.run([cmd1, cmd2], target="pipeline_output/touched.out") 45 | 46 | # A command without a target will run every time. 47 | # Find the biggest line 48 | cmd = "awk 'n < $0 {n=$0} END{print n}' pipeline_output/test.out" 49 | pm.run(cmd, "lock.max") 50 | 51 | # Use checkprint() to get the results of a command, and then use 52 | # report_result() to print and log key-value pairs in the stats file: 53 | last_entry = pm.checkprint("tail -n 1 pipeline_output/copied.out") 54 | pm.report_result("last_entry", last_entry) 55 | 56 | 57 | # Now, stop the pipeline to complete gracefully. 58 | pm.stop_pipeline() 59 | 60 | # Observe your outputs in the pipeline_output folder 61 | # to see what you've created. 62 | -------------------------------------------------------------------------------- /example_pipelines/count_reads.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Counts reads. 5 | """ 6 | 7 | __author__ = "Nathan Sheffield" 8 | __email__ = "nathan@code.databio.org" 9 | __license__ = "GPL3" 10 | __version__ = "0.1" 11 | 12 | import os 13 | import re 14 | import subprocess 15 | import sys 16 | from argparse import ArgumentParser 17 | 18 | import yaml 19 | 20 | import pypiper 21 | 22 | parser = ArgumentParser( 23 | description="A pipeline to count the number of reads and file size. Accepts" 24 | " BAM, fastq, or fastq.gz files." 25 | ) 26 | 27 | # First, add standard arguments from Pypiper. 28 | # groups="pypiper" will add all the arguments that pypiper uses, 29 | # and adding "common" adds arguments for --input and --sample--name 30 | # and "output_parent". 
You can read more about your options for standard 31 | # arguments in the pypiper docs (section "command-line arguments") 32 | parser = pypiper.add_pypiper_args( 33 | parser, 34 | groups=["pypiper", "common", "ngs"], 35 | args=["output-parent", "config"], 36 | required=["sample-name", "output-parent"], 37 | ) 38 | 39 | # Add any pipeline-specific arguments if you like here. 40 | 41 | args = parser.parse_args() 42 | 43 | if not args.input or not args.output_parent: 44 | parser.print_help() 45 | raise SystemExit 46 | 47 | if args.single_or_paired == "paired": 48 | args.paired_end = True 49 | else: 50 | args.paired_end = False 51 | 52 | # args for `output_parent` and `sample_name` were added by the standard 53 | # `add_pypiper_args` function. 54 | # A good practice is to make an output folder for each sample, housed under 55 | # the parent output folder, like this: 56 | outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name)) 57 | 58 | # Create a PipelineManager object and start the pipeline 59 | pm = pypiper.PipelineManager(name="count", outfolder=outfolder, args=args) 60 | 61 | # NGSTk is a "toolkit" that comes with pypiper, providing some functions 62 | # for dealing with genome sequence data. You can read more about toolkits in the 63 | # documentation 64 | 65 | # Create a ngstk object 66 | ngstk = pypiper.NGSTk(pm=pm) 67 | 68 | raw_folder = os.path.join(outfolder, "raw/") 69 | fastq_folder = os.path.join(outfolder, "fastq/") 70 | 71 | # Merge/Link sample input and Fastq conversion 72 | # These commands merge (if multiple) or link (if single) input files, 73 | # then convert (if necessary, for bam, fastq, or gz format) files to fastq. 74 | 75 | # We'll start with a timestamp that will provide a division for this section 76 | # in the log file 77 | pm.timestamp("### Merge/link and fastq conversion: ") 78 | 79 | # Now we'll rely on 2 NGSTk functions that can handle inputs of various types 80 | # and convert these to fastq files. 
81 | 82 | local_input_files = ngstk.merge_or_link( 83 | [args.input, args.input2], raw_folder, args.sample_name 84 | ) 85 | 86 | cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq( 87 | local_input_files, args.sample_name, args.paired_end, fastq_folder 88 | ) 89 | 90 | 91 | # Now we'll use another NGSTk function to grab the file size from the input files 92 | # 93 | pm.report_result("File_mb", ngstk.get_file_size(local_input_files)) 94 | 95 | 96 | # And then count the number of reads in the file 97 | 98 | n_input_files = len(list(filter(bool, local_input_files))) 99 | 100 | raw_reads = ( 101 | sum( 102 | [ 103 | int(ngstk.count_reads(input_file, args.paired_end)) 104 | for input_file in local_input_files 105 | ] 106 | ) 107 | / n_input_files 108 | ) 109 | 110 | # Finally, we use the report_result() function to print the output and 111 | # log the key-value pair in the standard stats.tsv file 112 | pm.report_result("Raw_reads", str(raw_reads)) 113 | 114 | # Cleanup 115 | pm.stop_pipeline() 116 | -------------------------------------------------------------------------------- /example_pipelines/hello_pypiper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import pypiper 4 | 5 | outfolder = "hello_pypiper_results" # Choose a folder for your results 6 | 7 | # Create a PipelineManager, the workhorse of pypiper 8 | pm = pypiper.PipelineManager(name="hello_pypiper", outfolder=outfolder) 9 | 10 | # Timestamps to delineate pipeline sections are easy: 11 | pm.timestamp("Hello!") 12 | 13 | # Now build a command-line command however you like, and pass it to pm.run() 14 | target_file = "hello_pypiper_results/output.txt" 15 | cmd = f"echo 'Hello, Pypiper!' 
> {target_file}" 16 | pm.run(cmd, target_file) 17 | 18 | pm.stop_pipeline() 19 | -------------------------------------------------------------------------------- /example_pipelines/logmuse_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Counts reads. 5 | """ 6 | 7 | __author__ = "Nathan Sheffield" 8 | __email__ = "nathan@code.databio.org" 9 | __license__ = "GPL3" 10 | __version__ = "0.1" 11 | 12 | import os 13 | import re 14 | import subprocess 15 | import sys 16 | from argparse import ArgumentParser 17 | 18 | import yaml 19 | 20 | import pypiper 21 | 22 | 23 | def build_argparser(): 24 | parser = ArgumentParser( 25 | description="A pipeline to count the number of reads and file size. Accepts" 26 | " BAM, fastq, or fastq.gz files." 27 | ) 28 | 29 | # First, add standard arguments from Pypiper. 30 | # groups="pypiper" will add all the arguments that pypiper uses, 31 | # and adding "common" adds arguments for --input and --sample--name 32 | # and "output_parent". You can read more about your options for standard 33 | # arguments in the pypiper docs (section "command-line arguments") 34 | parser = pypiper.add_pypiper_args( 35 | parser, 36 | groups=["pypiper", "common", "ngs", "logmuse"], 37 | args=["output-parent", "config"], 38 | required=["sample-name", "output-parent"], 39 | ) 40 | 41 | # Add any pipeline-specific arguments if you like here. 42 | 43 | # args for `output_parent` and `sample_name` were added by the standard 44 | # `add_pypiper_args` function. 
45 | 46 | return parser 47 | 48 | 49 | def run_pipeline(): 50 | # A good practice is to make an output folder for each sample, housed under 51 | # the parent output folder, like this: 52 | outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name)) 53 | 54 | # Create a PipelineManager object and start the pipeline 55 | pm = pypiper.PipelineManager(name="logmuse-test", outfolder=outfolder, args=args) 56 | pm.info("Getting started!") 57 | # NGSTk is a "toolkit" that comes with pypiper, providing some functions 58 | # for dealing with genome sequence data. You can read more about toolkits in the 59 | # documentation 60 | 61 | files = [str(x) + ".tmp" for x in range(1, 20)] 62 | 63 | pm.run("touch " + " ".join(files), target=files, clean=True) 64 | 65 | # Create a ngstk object 66 | ngstk = pypiper.NGSTk(pm=pm) 67 | 68 | raw_folder = os.path.join(outfolder, "raw/") 69 | fastq_folder = os.path.join(outfolder, "fastq/") 70 | 71 | # Merge/Link sample input and Fastq conversion 72 | # These commands merge (if multiple) or link (if single) input files, 73 | # then convert (if necessary, for bam, fastq, or gz format) files to fastq. 74 | 75 | # We'll start with a timestamp that will provide a division for this section 76 | # in the log file 77 | pm.timestamp("### Merge/link and fastq conversion: ") 78 | 79 | # Now we'll rely on 2 NGSTk functions that can handle inputs of various types 80 | # and convert these to fastq files. 
81 | 82 | local_input_files = ngstk.merge_or_link( 83 | [args.input, args.input2], raw_folder, args.sample_name 84 | ) 85 | 86 | cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq( 87 | local_input_files, args.sample_name, args.paired_end, fastq_folder 88 | ) 89 | 90 | # Now we'll use another NGSTk function to grab the file size from the input files 91 | # 92 | pm.report_result("File_mb", ngstk.get_file_size(local_input_files)) 93 | 94 | # And then count the number of reads in the file 95 | 96 | n_input_files = len(list(filter(bool, local_input_files))) 97 | 98 | raw_reads = ( 99 | sum( 100 | [ 101 | int(ngstk.count_reads(input_file, args.paired_end)) 102 | for input_file in local_input_files 103 | ] 104 | ) 105 | / n_input_files 106 | ) 107 | 108 | # Finally, we use the report_result() function to print the output and 109 | # log the key-value pair in the standard stats.tsv file 110 | pm.report_result("Raw_reads", str(raw_reads)) 111 | 112 | # Cleanup 113 | pm.stop_pipeline() 114 | 115 | 116 | if __name__ == "__main__": 117 | try: 118 | parser = build_argparser() 119 | args = parser.parse_args() 120 | 121 | if not args.input or not args.output_parent: 122 | parser.print_help() 123 | raise SystemExit 124 | 125 | if args.single_or_paired == "paired": 126 | args.paired_end = True 127 | else: 128 | args.paired_end = False 129 | 130 | sys.exit(run_pipeline()) 131 | except KeyboardInterrupt: 132 | sys.exit(1) 133 | -------------------------------------------------------------------------------- /init_interactive.py: -------------------------------------------------------------------------------- 1 | """ Create dummy PipelineManager and NGSTk instance for interactive session. 
""" 2 | 3 | import os 4 | 5 | from pypiper import NGSTk, PipelineManager 6 | 7 | __author__ = "Vince Reuter" 8 | __email__ = "vreuter@virginia.edu" 9 | 10 | 11 | pm = PipelineManager(name="interactive", outfolder=os.path.expanduser("~")) 12 | tk = NGSTk(pm=pm) 13 | -------------------------------------------------------------------------------- /logo_pypiper.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 20 | 22 | 25 | 29 | 33 | 34 | 43 | 45 | 49 | 53 | 54 | 63 | 65 | 69 | 73 | 74 | 84 | 85 | 102 | 104 | 105 | 107 | image/svg+xml 108 | 110 | 111 | 112 | 113 | 114 | 119 | 125 | 131 | 135 | 140 | 145 | 146 | 152 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Pypiper 2 | site_logo: img/pypiper_logo_dark.svg 3 | site_url: http://code.databio.org/pypiper/ 4 | repo_url: http://github.com/databio/pypiper 5 | pypi_name: piper 6 | 7 | nav: 8 | - Getting Started: 9 | - Introduction: README.md 10 | - Philosophy: philosophy.md 11 | - Features at-a-glance: features.md 12 | - Hello world: hello-world.md 13 | - Developer guides: 14 | - Building a basic pipeline: basic-pipeline.md 15 | - Using the run method: advanced-run-method.md 16 | - Automatic command-line arguments: cli.md 17 | - Configuring pipelines: configuration.md 18 | - Reporting statistics: report.md 19 | - Reporting statistics with pipestat: pipestat.md 20 | - Cleaning up intermediate files: clean.md 21 | - Best practices: best-practices.md 22 | - Toolkits: 23 | - "NGSTk: the NGS toolkit": ngstk_intro.md 24 | - Reference: 25 | - Catalog of pipeline outputs: outputs.md 26 | - Pypiper API: autodoc_build/pypiper.md 27 | - NGSTk API: autodoc_build/ngstk.md 28 | - FAQ: faq.md 29 | - Support: support.md 30 | - Contributing: contributing.md 31 | - Changelog: changelog.md 32 | 33 | theme: databio 34 | 35 | 
plugins: 36 | - databio: 37 | autodoc_build: "docs/autodoc_build" 38 | autodoc_package: "pypiper" 39 | no_top_level: true 40 | build_list: 41 | pypiper: [PipelineManager] 42 | ngstk: [NGSTk] 43 | - search 44 | 45 | -------------------------------------------------------------------------------- /pypiper/__init__.py: -------------------------------------------------------------------------------- 1 | # Implicitly re-export so logmuse usage by pipeline author routes through here. 2 | from logmuse import add_logging_options 3 | 4 | from ._version import __version__ 5 | from .exceptions import * 6 | from .manager import * 7 | from .ngstk import * 8 | from .pipeline import * 9 | from .stage import * 10 | from .utils import * 11 | -------------------------------------------------------------------------------- /pypiper/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.14.4" 2 | -------------------------------------------------------------------------------- /pypiper/const.py: -------------------------------------------------------------------------------- 1 | """ Pypiper constants. 
""" Custom pypiper exceptions """

__author__ = "Vince Reuter"
__email__ = "vreuter@virginia.edu"


__all__ = [
    "PipelineError",
    "PipelineHalt",
    "IllegalPipelineDefinitionError",
    "IllegalPipelineExecutionError",
    "MissingCheckpointError",
    "UnknownPipelineStageError",
    "UnsupportedFiletypeException",
    "SubprocessError",
]


class PipelineError(Exception):
    """General pipeline error."""


class SubprocessError(Exception):
    """Error arising from a subprocess run on a pipeline's behalf."""


class IllegalPipelineDefinitionError(PipelineError):
    """Error in the way a pipeline is defined."""


class IllegalPipelineExecutionError(PipelineError):
    """Represent cases of illogical start/stop run() declarations."""


class MissingCheckpointError(Exception):
    """Represent case of expected but absent checkpoint file."""

    def __init__(self, checkpoint, filepath):
        # Pair the checkpoint identifier with the filepath where it was
        # expected, e.g. "align: '/path/to/align.checkpoint'".
        super(MissingCheckpointError, self).__init__(
            "{}: '{}'".format(checkpoint, filepath)
        )


class UnknownPipelineStageError(Exception):
    """
    Triggered by use of unknown/undefined name for a pipeline stage.

    :param str stage_name: Name of the stage triggering the exception.
    :param pypiper.Pipeline pipeline: Pipeline for which the stage is
        unknown/undefined.
    """

    def __init__(self, stage_name, pipeline=None):
        message = stage_name
        if pipeline is not None:
            # Sentinel distinguishes "stages() unavailable" from a stages()
            # call that legitimately returned None.
            _no_stages = object()
            try:
                known_stages = pipeline.stages()
            except AttributeError:
                # Just don't contextualize the error with known stages.
                known_stages = _no_stages
            if known_stages is not _no_stages:
                message = "{}; defined stages: {}".format(
                    message, ", ".join(map(str, known_stages))
                )
        super(UnknownPipelineStageError, self).__init__(message)


class PipelineHalt(Exception):
    """
    Execution-stopping exception for halting a pipeline.

    This is useful for stopping execution of a truly script-like pipeline.
    That is, a pipeline that doesn't bundle/define stages or wrap run() calls
    in functions. In this case, we want to be able to stop the Python process
    as it chugs through a pipeline script, and we can do that by having a
    PipelineManager's halt method raise this exception.

    """

    def __init__(self, checkpoint=None, finished=None):
        # Resolve the name of the last stage completed, if any was given:
        # accept a plain string, or an object carrying 'name' or '__name__'.
        if checkpoint is None:
            last_stage_done = None
        elif isinstance(checkpoint, str):
            last_stage_done = checkpoint
        else:
            last_stage_done = getattr(checkpoint, "name", None) or getattr(
                checkpoint, "__name__", None
            )
        if not last_stage_done:
            # No usable stage name; raise with an empty message.
            super(PipelineHalt, self).__init__()
        else:
            if finished is None:
                msg = last_stage_done
            elif finished:
                msg = "Finished '{}'".format(last_stage_done)
            else:
                msg = "Stopped at '{}'".format(last_stage_done)
            super(PipelineHalt, self).__init__(msg)


class UnsupportedFiletypeException(Exception):
    """Restrict filetype domain."""

    # Use superclass ctor to allow file name/path or extension to pass
    # through as the message for why this error is occurring.
class Stage(object):
    """
    Single stage/phase of a pipeline; a logical processing "unit". A stage is a
    collection of commands that is checkpointed.
    """

    def __init__(
        self,
        func,
        f_args=None,
        f_kwargs=None,
        name=None,
        checkpoint=True,
        *,
        nofail=False
    ):
        """
        A function, perhaps with arguments, defines the stage.

        :param callable func: The processing logic that defines the stage.
        :param tuple f_args: Positional arguments for func.
        :param dict f_kwargs: Keyword arguments for func.
        :param str name: Name for the phase/stage; defaults to func's
            __name__ if not provided.
        :param bool checkpoint: Whether this stage is checkpointed, i.e.
            whether completion should be marked by a checkpoint file.
        :param bool nofail: Allow a failure of this stage to not fail the
            pipeline in which it's running.
        :raise TypeError: If func is itself already a Stage.
        """
        # Wrapping a Stage in a Stage is almost certainly a caller error.
        if isinstance(func, Stage):
            raise TypeError("Cannot create Stage from Stage")
        super(Stage, self).__init__()
        self.f = func
        # Default to fresh empty containers so instances never share state.
        self.f_args = f_args or tuple()
        self.f_kwargs = f_kwargs or dict()
        self.name = name or func.__name__
        self.checkpoint = checkpoint
        self.nofail = nofail

    @property
    def checkpoint_name(self):
        """
        Determine the checkpoint name for this Stage.

        :return str | NoneType: Checkpoint name for this stage; null if this
            Stage is designated as a non-checkpoint.
        """
        return translate_stage_name(self.name) if self.checkpoint else None

    def run(self, *args, **kwargs):
        """Alternate form for direct call; execute stage."""
        self(*args, **kwargs)

    def __call__(self, *args, **update_kwargs):
        """
        Execute the stage, allowing updates to args/kwargs.

        Stored kwargs are deep-copied before the per-call updates are applied
        so that a call never mutates this Stage's configured defaults.
        Positional args, if given, replace the stored ones wholesale.
        """
        kwargs = copy.deepcopy(self.f_kwargs)
        kwargs.update(update_kwargs)
        args = args or self.f_args
        self.f(*args, **kwargs)

    def __eq__(self, other):
        # Compare by function name plus all other attributes; the function
        # object itself is excluded so equivalent defs compare equal.
        return (
            isinstance(other, Stage)
            and self.f.__name__ == other.f.__name__
            and (
                {k: v for k, v in self.__dict__.items() if k != "f"}
                == {k: v for k, v in other.__dict__.items() if k != "f"}
            )
        )

    def __ne__(self, other):
        return not (self == other)

    def __repr__(self):
        return (
            "{klass} '{n}': f={f}, args={pos}, kwargs={kwd}, "
            "checkpoint={check}".format(
                klass=self.__class__.__name__,
                f=self.f,
                n=self.name,
                pos=self.f_args,
                kwd=self.f_kwargs,
                check=self.checkpoint,
            )
        )

    def __str__(self):
        return "{}: '{}'".format(self.__class__.__name__, self.name)
#! /usr/bin/env python
# Packaging script for the "piper" distribution (the pypiper package).

import os
import sys

# Extra keyword arguments forwarded to setup(); reserved for
# version-specific items.
extra = {}

# Prefer setuptools; fall back to distutils for very old environments.
try:
    from setuptools import setup
except ImportError:
    from distutils.core import setup


def read_reqs_file(reqs_name):
    """
    Read requirements file for given requirements group.

    :param str reqs_name: Name of the requirements group (e.g. "test"),
        resolved to requirements/requirements-<name>.txt.
    :return list[str]: Non-comment lines (package specifiers) from the file.
    """
    path_reqs_file = os.path.join(
        "requirements", "requirements-{}.txt".format(reqs_name)
    )
    with open(path_reqs_file, "r") as reqs_file:
        return [
            pkg.rstrip() for pkg in reqs_file.readlines() if not pkg.startswith("#")
        ]


# Parse the version from the first line of pypiper/_version.py,
# stripping quotes and newline from the assigned value.
with open(os.path.join("pypiper", "_version.py"), "r") as versionfile:
    version = versionfile.readline().split()[-1].strip("\"'\n")


# Core runtime requirements.
basic_reqs = read_reqs_file("pypiper")

# Requirements for tests
test_reqs = read_reqs_file("test")

# Allow specification of desired features, which implies dependencies.
addl_reqs = {
    bundle_name: read_reqs_file(bundle_name) for bundle_name in ["ngstk", "plot"]
}

# Complete collection of user requirements.
addl_reqs["all"] = list({pkg for bundle in addl_reqs.values() for pkg in bundle})

# Dev installation is full user + test.
addl_reqs["dev"] = list(set(test_reqs + addl_reqs["all"]))

# Long description for PyPI comes straight from the README.
with open("README.md") as f:
    long_description = f.read()

setup(
    name="piper",
    packages=["pypiper"],
    install_requires=basic_reqs,
    version=version,
    description="A lightweight python toolkit for gluing together restartable, robust command line pipelines",
    long_description=long_description,
    long_description_content_type="text/markdown",
    classifiers=[
        "Development Status :: 4 - Beta",
        "License :: OSI Approved :: BSD License",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Topic :: Scientific/Engineering :: Bio-Informatics",
    ],
    author="Nathan Sheffield, Johanna Klughammer, Andre Rendeiro",
    author_email="nathan@code.databio.org, jklughammer@cemm.oeaw.ac.at, arendeiro@cemm.oeaw.ac.at",
    url="https://github.com/databio/pypiper/",
    license="BSD2",
    test_suite="tests",  # python setup.py test
    tests_require=test_reqs,  # Test-specific package dependencies
    # Extra package if doing `python setup.py test`
    setup_requires=(
        ["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else []
    ),
    extras_require=addl_reqs,
    # Version-specific items
    **extra
)
3 | #See here for more details: 4 | # https://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format 5 | pipeline_name: default_pipeline_name 6 | samples: 7 | number_of_things: 8 | type: integer 9 | description: "Number of things" -------------------------------------------------------------------------------- /tests/Data/sample_output_schema.yaml: -------------------------------------------------------------------------------- 1 | pipeline_name: test_pipe 2 | samples: 3 | number_of_things: 4 | type: integer 5 | description: "Number of things" 6 | percentage_of_things: 7 | type: number 8 | description: "Percentage of things" 9 | name_of_something: 10 | type: string 11 | description: "Name of something" 12 | switch_value: 13 | type: boolean 14 | description: "Is the switch on or off" 15 | output_file: 16 | type: file 17 | description: "This a path to the output file" 18 | output_image: 19 | type: image 20 | description: "This a path to the output image" 21 | md5sum: 22 | type: string 23 | description: "MD5SUM of an object" 24 | highlight: true 25 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/pypiper/7c0e129440509610fb1d476a4076357105aebf8c/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """ Fixtures and configuration visible to all tests """ 2 | 3 | import copy 4 | import os 5 | from functools import partial 6 | 7 | import pytest 8 | 9 | from pypiper import Pipeline, PipelineManager, Stage 10 | 11 | __author__ = "Vince Reuter" 12 | __email__ = "vreuter@virginia.edu" 13 | 14 | 15 | # Use a weird suffix for glob specificity. 
OUTPUT_SUFFIX = ".testout"

TEST_PIPE_NAME = "test-pipe"

# Fixed contents written by the three stage functions below; paired with
# the filenames so tests can verify outputs file-by-file.
FILE1_TEXT = "hello there"
FILE2_TEXT = "hello2"
FILE3_TEXT = "third"
CONTENTS = [FILE1_TEXT, FILE2_TEXT, FILE3_TEXT]

FILE1_NAME = "file1{}".format(OUTPUT_SUFFIX)
FILE2_NAME = "file2{}".format(OUTPUT_SUFFIX)
FILE3_NAME = "file3{}".format(OUTPUT_SUFFIX)
FILENAMES = [FILE1_NAME, FILE2_NAME, FILE3_NAME]

# (filename, content) pairs, index-aligned with the write_fileN functions.
FILE_TEXT_PAIRS = list(zip(FILENAMES, CONTENTS))


@pytest.fixture
def get_pipe_manager(tmpdir):
    """Provide safe creation of pipeline manager, with multi=True."""

    def get_mgr(**kwargs):
        # Respect an explicit outfolder; otherwise default to the test's
        # tmpdir (copy kwargs first so the caller's dict isn't mutated).
        if "outfolder" in kwargs:
            kwd_args = kwargs
        else:
            kwd_args = copy.deepcopy(kwargs)
            kwd_args["outfolder"] = tmpdir.strpath
        return PipelineManager(multi=True, **kwd_args)

    return get_mgr


@pytest.fixture
def pl_mgr(request, get_pipe_manager):
    """Provide a PipelineManager and ensure that it's stopped."""
    pm = get_pipe_manager(name=TEST_PIPE_NAME)

    def _ensure_stopped():
        # Finalizer: stop the pipeline even if the test failed.
        pm.stop_pipeline()

    request.addfinalizer(_ensure_stopped)
    return pm


@pytest.fixture
def dummy_pipe(pl_mgr):
    """Provide a basic Pipeline instance for a test case."""
    return DummyPipeline(pl_mgr)


def write_file1(folder):
    # Stage function: write the first (name, content) pair into folder.
    _write(*FILE_TEXT_PAIRS[0], folder=folder)


def write_file2(folder):
    # Stage function: write the second (name, content) pair into folder.
    _write(*FILE_TEXT_PAIRS[1], folder=folder)


def write_file3(folder):
    # Stage function: write the third (name, content) pair into folder.
    _write(*FILE_TEXT_PAIRS[2], folder=folder)


def _write(filename, content, folder=None):
    """
    Write content to a file within a folder.

    :param str filename: Name (not path) of the file to create.
    :param str content: Text to write.
    :param str folder: Folder in which to create the file.
    """
    path = os.path.join(folder, filename)
    with open(path, "w") as f:
        f.write(content)


class DummyPipeline(Pipeline):
    """Basic pipeline implementation for tests"""

    def __init__(self, manager):
        super(DummyPipeline, self).__init__(TEST_PIPE_NAME, manager=manager)

    def stages(self):
        """
        Establish the stages/phases for this test pipeline.

        :return list[pypiper.Stage]: Sequence of stages for this pipeline.
        """
        # File content writers parameterized with output folder.
        fixed_folder_funcs = []
        for f in [write_file1, write_file2, write_file3]:
            f_fixed = partial(f, folder=self.outfolder)
            # partial objects lack __name__, which Stage uses for naming.
            f_fixed.__name__ = f.__name__
            fixed_folder_funcs.append(f_fixed)
        return [Stage(f) for f in fixed_folder_funcs]
class SafeTestPipeline(Pipeline):
    """Pipeline for tests that protects against bad file descriptor."""

    def __init__(self, *args, **kwargs):
        """
        Forward all arguments to Pipeline, forcing multi=True unless the
        caller overrides it.
        """
        # NOTE(review): multi=True appears to mimic interactive mode and
        # avoid stream-redirection issues in tests — confirm against
        # PipelineManager's handling of the 'multi' flag.
        kwd_args = {"multi": True}  # Like interactive mode.
        kwd_args.update(kwargs)
        super(SafeTestPipeline, self).__init__(*args, **kwd_args)
class FunctionNameWriterPipeline(SafeTestPipeline):
    """Basic pipeline that writes to file the names of its functions."""

    def __init__(self, name, outfolder, filename, functions):
        """
        Name and outfolder go to generic pipeline ctor; filename and functions
        collection are used specifically by instances of this class.

        :param str name: Name for this pipeline.
        :param str outfolder: Path to pipeline's output folder.
        :param str filename: Name for file in which to write function names.
        :param Sequence[callable] functions: Functions on which this pipeline
            is to operate (i.e., the functions for which name should be
            written to output file).
        """
        # Set instance-specific variables.
        # These must be set before the superclass ctor, which calls stages().
        self.name_output_file = filename
        self.functions = functions
        # Get the stages() benefit of superclass extension.
        super(FunctionNameWriterPipeline, self).__init__(name=name, outfolder=outfolder)

    def write_name(self, func):
        """
        Write the name of a function to this pipeline's output file.

        :param callable func: Name of function to write to the output file.
        """
        outpath = os.path.join(self.outfolder, self.name_output_file)
        # Append mode: each stage adds one line to the shared output file.
        with open(outpath, "a") as f:
            f.write(func.__name__ + os.linesep)

    def run(self, **kwargs):
        """Start with clean output file, then use superclass method."""
        # Ensure that we start with a clean file since the nature of the
        # operations performed (sequential file writes) creates desire to
        # open output file in append mode rather than write mode.
        output_file = os.path.join(self.outfolder, self.name_output_file)
        if os.path.exists(output_file):
            os.unlink(output_file)
        super(FunctionNameWriterPipeline, self).run(**kwargs)

    def stages(self):
        """Sequence of operations to perform: one Stage per function."""
        return [Stage(self.write_name, (f,), name=f.__name__) for f in self.functions]
def test_checkpoints_are_pipeline_unique(tmpdir):
    """Names of checkpoint files depend on both stage and pipeline."""

    # Note: conceptually, this tests an underlying mechanistic aspect of the
    # checkpointing system.

    # Create two different pipelines (sharing one stage name and the same
    # output folder, so name collisions are possible in principle).
    align_reads = get_read_aligner(tmpdir.strpath)
    call_peaks = get_peak_caller(tmpdir.strpath)

    # Get the stage names associated with each pipeline.
    alignment_stage_names = set(map(lambda s: s.name, align_reads.stages()))
    peak_call_stage_names = set(map(lambda s: s.name, call_peaks.stages()))

    # Check that we have one specific stage name shared between the pipelines.
    assert {"align_reads"} == alignment_stage_names & peak_call_stage_names
    assert align_reads.outfolder == call_peaks.outfolder

    # We begin with no checkpoint files.
    assert [] == list(fetch_checkpoint_files(align_reads.manager))
    assert [] == list(fetch_checkpoint_files(call_peaks.manager))

    # Run each pipeline.
    align_reads.run()
    call_peaks.run()

    # We expect a different checkpoint file for each stage of each pipeline.
    align_reads_expected = {
        checkpoint_filepath(s.name, align_reads) for s in align_reads.stages()
    }
    call_peaks_expected = {
        checkpoint_filepath(s.name, call_peaks) for s in call_peaks.stages()
    }

    # Pipeline names are unique here, and each checkpoint name includes
    # pipeline name for disambiguation, so even a pair of pipelines with a
    # nonempty stage name intersection has an empty checkpoint filenames
    # intersection, so long as the pipeline names are unique.
    assert set() == (align_reads_expected & call_peaks_expected)

    # When not setting start/stop parameters and beginning with no checkpoint
    # files in place, each pipeline generates its full set of checkpoint files.
    # NOTE(review): here the pipelines themselves are passed to
    # fetch_checkpoint_files, while above the .manager attribute was passed;
    # presumably both are accepted downstream — confirm in checkpoint_filepath.
    expected_checkpoints = align_reads_expected | call_peaks_expected
    observed_checkpoints = set(fetch_checkpoint_files(align_reads)) | set(
        fetch_checkpoint_files(call_peaks)
    )

    # Verify satisfaction of expectation.
    try:
        assert expected_checkpoints == observed_checkpoints
    except AssertionError:
        # On failure, print a three-way set diff to ease debugging.
        only_exp = expected_checkpoints - observed_checkpoints
        exp_and_obs = expected_checkpoints & observed_checkpoints
        only_obs = observed_checkpoints - expected_checkpoints
        print("Only in expected:\n{}".format("\n".join(only_exp)))
        print("Expected and observed:\n{}".format("\n".join(exp_and_obs)))
        print("Only in observed:\n{}".format("\n".join(only_obs)))
        raise
def test_pipeline_checkpoint_respect_sensitivity_checkpoint_perspective(
    pl_name, tmpdir
):
    """Pipeline can skip past its stage(s) for which checkpoint exists."""

    # Create the pipeline.
    pipeline = get_pipeline(pl_name, tmpdir.strpath)

    # Negative control to start test, that we have no checkpoint files.
    assert [] == fetch_checkpoint_files(pipeline.manager)

    # Generate some checkpoints.
    pipeline.run()

    # Verify that we created each of the checkpoints.
    expected = [
        checkpoint_filepath(f.__name__, pipeline.manager) for f in pipeline.functions
    ]
    observed = fetch_checkpoint_files(pipeline.manager)
    assert set(expected) == set(observed)

    # Collect checkpoint file timestamps for comparison after second run.
    timestamps = {f: os.path.getmtime(f) for f in observed}

    # Remove the checkpoint for the final stage.
    # NOTE(review): the function object (not its __name__, as above) is passed
    # to checkpoint_filepath here — presumably it accepts both; confirm.
    last_aligner_stage = pipeline.functions[-1]
    last_aligner_checkfile = checkpoint_filepath(last_aligner_stage, pipeline.manager)
    os.unlink(last_aligner_checkfile)

    # Verify removal of final stage checkpoint file.
    assert all([os.path.isfile(f) for f in expected[:-1]])
    assert not os.path.exists(last_aligner_checkfile)
    assert set(expected) != set(fetch_checkpoint_files(pipeline.manager))

    # Delay briefly so that we can more reliably compare checkpoint file
    # timestamps after a second pipeline run.
    time.sleep(0.05)

    # Repeat the pipeline's execution, but now with checkpoint file(s) for a
    # subset of its stages in place.
    pipeline.run()

    # Verify that we've restored the full collection of the pipeline's
    # checkpoint files to existence.
    observed = fetch_checkpoint_files(pipeline.manager)
    exp = set(expected)
    obs = set(observed)
    assert set(expected) == set(
        observed
    ), "Expected only:\n{}\nExpected and observed:\n{}\nObserved only:\n{}".format(
        exp - obs, exp & obs, obs - exp
    )

    # Verify the we didn't recreate the checkpoint file for each skipped stage.
    # (Unchanged mtime means the stage was skipped, not re-run.)
    for f in expected[:-1]:
        expected_timestamp = timestamps[f]
        observed_timestamp = os.path.getmtime(f)
        assert expected_timestamp == observed_timestamp

    # Verify the we did in fact recreate the checkpoint file for the stage
    # that was rerun.
    assert (
        os.path.getmtime(last_aligner_checkfile) > timestamps[last_aligner_checkfile]
    ), "Recreated checkpoint file ('{}') should be newer than original".format(
        last_aligner_checkfile
    )
@named_param("overwrite", [False, True])
def test_pipeline_reruns_downstream_stages_according_to_parameterization(
    overwrite, pl_name, tmpdir
):
    """Pipeline overwrites downstream stages unless configured otherwise."""

    pl = get_pipeline(pl_name, tmpdir.strpath)

    # Create checkpoint file for each stage.
    # (Touching the files simulates a prior complete run without executing.)
    stage_names = [s.name for s in pl.stages()]
    assert 1 < len(
        stage_names
    ), "Need pipeline with at least two stages to run this test."
    for s_name in stage_names:
        open(checkpoint_filepath(s_name, pl.manager), "w").close()

    # Remove the checkpoint file for the penultimate stage.
    penultimate_stage = stage_names[-2]
    os.unlink(checkpoint_filepath(penultimate_stage, pl.manager))

    # Configure the pipeline based on parameterization and run it starting
    # from the penultimate stage.
    pl.manager.overwrite_checkpoints = overwrite
    pl.run(start_point=penultimate_stage)

    # If we're overwriting downstream checkpoints, the last two stages are
    # run while otherwise only the penultimate stage is run.
    # NOTE(review): exp_stages is built here but never used; the assertion
    # below always expects the final two stages regardless of `overwrite`,
    # which appears to contradict the comment above — confirm intent.
    exp_stages = [stage_names[-2]]
    if overwrite:
        exp_stages.append(stage_names[-1])
    exp_lines = [func + os.linesep for func in stage_names[-2:]]
    outpath = os.path.join(pl.outfolder, pl.name_output_file)
    with open(outpath, "r") as f:
        obs_lines = f.readlines()
    assert exp_lines == obs_lines
24 | _MinimalPipeline(name=name, outfolder=tmpdir.strpath) 25 | 26 | 27 | class JustManagerArgument: 28 | """A pipeline can be created with just a manager argument.""" 29 | 30 | NAME_HOOK = "pl_mgr_name" 31 | 32 | @pytest.fixture 33 | def pl_mgr(self, request, get_pipe_manager): 34 | """Provide each of this class's test cases with pipeline manager.""" 35 | if self.NAME_HOOK in request.fixturenames: 36 | name = request.getfixturevalue(self.NAME_HOOK) 37 | else: 38 | name = "test-pipe" 39 | return get_pipe_manager(name=name) 40 | 41 | @named_param(argnames=NAME_HOOK, argvalues=["arbitrary-pipeline", "DummyPipe"]) 42 | def test_pipeline_adopts_manager_name(self, pl_mgr_name, pl_mgr): 43 | """If given just a manager, a pipeline uses the manager name.""" 44 | pl = Pipeline(manager=pl_mgr) 45 | assert pl_mgr_name == pl_mgr.name 46 | assert pl_mgr_name == pl.name 47 | 48 | def test_pipeline_adopts_manager_output_folder(self, pl_mgr): 49 | """Pipeline uses manager output folder if given just manager.""" 50 | pl = Pipeline(manager=pl_mgr) 51 | assert pl_mgr.outfolder == pl.outfolder 52 | 53 | 54 | class MinimalArgumentsWithoutManagerTests: 55 | """Tests for pipeline constructor argument provision without manager.""" 56 | 57 | def test_pipeline_creates_manager(self, tmpdir): 58 | """If not passed a pipeline manager, a pipeline creates one.""" 59 | empty = _MinimalPipeline(name="minimal", outfolder=tmpdir.strpath) 60 | assert isinstance(empty.manager, PipelineManager) 61 | 62 | @named_param("pipe_name", ["test-pipe", "DummyPipeline"]) 63 | def test_manager_adopts_pipeline_name(self, pipe_name, tmpdir): 64 | """Autogenerated pipeline manager uses pipeline's name.""" 65 | pl = _MinimalPipeline(name=pipe_name, outfolder=tmpdir.strpath) 66 | assert pipe_name == pl.name 67 | assert pl.name == pl.manager.name 68 | 69 | def test_manager_adopts_pipeline_output_folder(self, tmpdir): 70 | """Autogenerated pipeline manager uses pipeline's output folder.""" 71 | pl = 
class ConceptuallyOverlappingArgumentsTests:
    """
    Test cases in which pipeline's argument space is overspecified.

    There are two minimal ways to specify a pipeline: pass a
    PipelineManager directly, or pass a name plus an output folder path.
    A manager implies both a name and an output folder, while name and
    output folder together suffice to autogenerate a manager. These cases
    exercise argument combinations that overspecify the space defined by
    pipeline name, output folder path, and pipeline manager.
    """

    def test_same_name_for_manager_and_pipeline(self, tmpdir, get_pipe_manager):
        """Pipeline name and manager with matching name is unproblematic."""
        shared_name = "test-pipe"
        manager = get_pipe_manager(name=shared_name, outfolder=tmpdir.strpath)
        pipeline = _MinimalPipeline(name=shared_name, manager=manager)
        assert shared_name == pipeline.manager.name

    def test_different_name_for_manager_and_pipeline(self, tmpdir, get_pipe_manager):
        """If given, pipeline favors its own name over manager's."""
        mgr_name, pipe_name = "manager", "pipeline"
        manager = get_pipe_manager(name=mgr_name, outfolder=tmpdir.strpath)
        pipeline = _MinimalPipeline(name=pipe_name, manager=manager)
        assert pipe_name == pipeline.name
        assert mgr_name == pipeline.manager.name

    @named_param("output_folder", argvalues=["test-output", "testing-output-folder"])
    def test_pipeline_ignores_outfolder_if_manager_is_passed(
        self, output_folder, tmpdir, get_pipe_manager
    ):
        """Manager's output folder trumps explicit output folder."""
        manager = get_pipe_manager(name="test-pipe", outfolder=tmpdir.strpath)
        pipeline = _MinimalPipeline(manager=manager, outfolder=output_folder)
        assert_equal_dirpath(tmpdir.strpath, pipeline.outfolder)

    def test_name_outfolder_and_manager(self, tmpdir, get_pipe_manager):
        """Tests provision of all three primary pipeline arguments."""
        pipe_name = "test-pipe"
        manager = get_pipe_manager(name=pipe_name, outfolder=tmpdir.strpath)
        pipeline = _MinimalPipeline(
            name=pipe_name, manager=manager, outfolder=tmpdir.strpath
        )
        assert pipe_name == pipeline.name
        assert_equal_dirpath(tmpdir.strpath, pipeline.outfolder)
        assert manager == pipeline.manager
None] 161 | 162 | 163 | class _AnonymousStageWithNamePipeline(SafeTestPipeline): 164 | """Anonymous function as Stage is allowed if wrapped with a name.""" 165 | 166 | def stages(self): 167 | return [("NullStage", lambda: None)] 168 | 169 | 170 | @pytest.fixture 171 | def empty_pipeline(request): 172 | """Provide test case with minimal pipeline instance.""" 173 | if "pipe_name" in request.fixturenames: 174 | name = request.getfixturevalue("pipe_name") 175 | else: 176 | name = "minimal" 177 | return _MinimalPipeline(name) 178 | 179 | 180 | class _MinimalPipeline(SafeTestPipeline): 181 | """Minimal pipeline declaration.""" 182 | 183 | def stages(self): 184 | """Sham stages definition.""" 185 | return [_do_nothing] 186 | 187 | 188 | def _do_nothing(): 189 | return 190 | -------------------------------------------------------------------------------- /tests/pipeline_manager/test_halt.py: -------------------------------------------------------------------------------- 1 | """ Tests for effects of pipeline manager's halt() function. 
""" 2 | 3 | import os 4 | 5 | import pytest 6 | 7 | from pypiper.exceptions import PipelineHalt 8 | from pypiper.flags import COMPLETE_FLAG, PAUSE_FLAG 9 | from tests.helpers import named_param 10 | 11 | __author__ = "Vince Reuter" 12 | __email__ = "vreuter@virginia.edu" 13 | 14 | 15 | def test_halt_state(get_pipe_manager): 16 | """Requesting a halt alters manager state.""" 17 | pm = get_pipe_manager(name="test-pipe") 18 | assert pm._active 19 | pm.halt(raise_error=False) 20 | assert pm.halted 21 | assert not pm._active 22 | 23 | 24 | def test_halt_file(get_pipe_manager): 25 | """Requesting a halt produces a particular flag file.""" 26 | pm = get_pipe_manager(name="TestPM") 27 | path_halt_file = pm._flag_file_path(PAUSE_FLAG) 28 | assert not os.path.isfile(path_halt_file) 29 | pm.halt(raise_error=False) 30 | assert os.path.isfile(path_halt_file) 31 | 32 | 33 | @named_param("raise_error", [False, True, None]) 34 | def test_halt_exceptionality(get_pipe_manager, raise_error): 35 | """Halting is conditionally exceptional""" 36 | pm = get_pipe_manager(name="halt-error") 37 | if raise_error is None: 38 | # Default is exceptional. 39 | with pytest.raises(PipelineHalt): 40 | pm.halt() 41 | elif raise_error: 42 | with pytest.raises(PipelineHalt): 43 | pm.halt(raise_error=True) 44 | else: 45 | pm.halt(raise_error=False) 46 | 47 | 48 | @named_param("raise_error", [False, True]) 49 | @named_param("test_type", argvalues=["halt_flag", "complete_flag"]) 50 | def test_halt_status_supersedes_completed(get_pipe_manager, raise_error, test_type): 51 | """Halting pipeline replaces completed flag with halt flag.""" 52 | 53 | # Create manager and completion flag. 54 | pm = get_pipe_manager(name="halt-status-flag") 55 | pm._set_status_flag(COMPLETE_FLAG) 56 | path_complete_flag = pm._flag_file_path(COMPLETE_FLAG) 57 | assert os.path.isfile(path_complete_flag) 58 | 59 | # Perform the halt. 
60 | try: 61 | pm.halt(raise_error=raise_error) 62 | except PipelineHalt: 63 | # We don't care about exceptionality here, just that the flag files 64 | # are adjusted regardless of the halt type. 65 | pass 66 | 67 | # Check either the presence of the halt flag or the absence of the 68 | # completion flag, depending on test parameterization. 69 | if test_type == "halt_flag": 70 | path_halt_flag = pm._flag_file_path(PAUSE_FLAG) 71 | assert os.path.isfile(path_halt_flag) 72 | elif test_type == "complete_flag": 73 | assert not os.path.isfile(path_complete_flag) 74 | else: 75 | raise ValueError("Unknown test type: '{}'".format(test_type)) 76 | -------------------------------------------------------------------------------- /tests/pipeline_manager/test_manager_constructor.py: -------------------------------------------------------------------------------- 1 | """ Test effects of construction of a pipeline manager. """ 2 | 3 | import argparse 4 | import os 5 | 6 | import pytest 7 | 8 | from pypiper.manager import CHECKPOINT_SPECIFICATIONS, LOGFILE_SUFFIX 9 | from tests.helpers import named_param 10 | 11 | __author__ = "Vince Reuter" 12 | __email__ = "vreuter@virginia.edu" 13 | 14 | 15 | def pytest_generate_tests(metafunc): 16 | """Dynamic test case generation for this module's test cases.""" 17 | if "spec_type" in metafunc.fixturenames: 18 | metafunc.parametrize(argnames="spec_type", argvalues=["cmdl", "ctor"]) 19 | 20 | 21 | @named_param("checkpoint_type", argvalues=["curr_checkpoint", "prev_checkpoint"]) 22 | def test_manager_starts_in_null_checkpoint_state(get_pipe_manager, checkpoint_type): 23 | """A pipeline manager begins with null checkpoint states.""" 24 | pm = get_pipe_manager(name="ctor-checkpoint-state") 25 | assert getattr(pm, checkpoint_type) is None 26 | 27 | 28 | def test_logger_logfile_collision_with_manager_logfile_is_expected_error__issue_212( 29 | get_pipe_manager, tmpdir 30 | ): 31 | pipe_name = "test_issue212" 32 | with pytest.raises(ValueError) as 
err_ctx: 33 | get_pipe_manager( 34 | name=pipe_name, 35 | logger_kwargs={ 36 | "logfile": os.path.join(tmpdir.strpath, pipe_name + LOGFILE_SUFFIX) 37 | }, 38 | ) 39 | assert str(err_ctx.value).startswith( 40 | f"The logfile given for the pipeline manager's logger matches that which will be used by the manager itself" 41 | ) 42 | 43 | 44 | class ManagerConstructorCheckpointSpecificationTests: 45 | """Tests for manager's constructor's ability to parse and set 46 | checkpoint specifications, which can determine aspects of control flow.""" 47 | 48 | def test_no_checkpoint_specifications(self, get_pipe_manager): 49 | """A manager may be constructed without any checkpoint provision.""" 50 | get_pipe_manager(name="test-pipe") 51 | 52 | @named_param("start_point", ["filter_reads", "align_reads"]) 53 | def test_just_start(self, get_pipe_manager, spec_type, start_point): 54 | """Starting point may be set from command-line or ctor keyword.""" 55 | spec_data = {"start_point": start_point} 56 | if spec_type == "cmdl": 57 | kwargs = {"args": argparse.Namespace(**spec_data)} 58 | else: 59 | kwargs = spec_data 60 | pm = get_pipe_manager(name="start-test", **kwargs) 61 | assert start_point == pm.start_point 62 | 63 | @named_param("stop_type", ["stop_before", "stop_after"]) 64 | @named_param("stop_point", ["align_reads", "call_peaks"]) 65 | def test_just_stop(self, get_pipe_manager, spec_type, stop_type, stop_point): 66 | """Particular stopping type is set correctly.""" 67 | spec_data = {stop_type: stop_point} 68 | if spec_type == "cmdl": 69 | kwargs = {"args": argparse.Namespace(**spec_data)} 70 | else: 71 | kwargs = spec_data 72 | pm = get_pipe_manager(name="stop-test", **kwargs) 73 | assert stop_point == getattr(pm, stop_type) 74 | 75 | @named_param("start_point", ["merge_input", "filter_reads"]) 76 | @named_param("stop_point", ["align_reads", "calc_stats"]) 77 | @named_param("stop_type", ["stop_before", "stop_after"]) 78 | def test_start_and_stop( 79 | self, get_pipe_manager, 
spec_type, stop_type, start_point, stop_point 80 | ): 81 | """Specifying both start and stop works just fine.""" 82 | spec_data = {"start_point": start_point, stop_type: stop_point} 83 | if spec_type == "cmdl": 84 | kwargs = {"args": argparse.Namespace(**spec_data)} 85 | else: 86 | kwargs = spec_data 87 | pm = get_pipe_manager(name="start-and-stop-test", **kwargs) 88 | assert start_point == pm.start_point 89 | assert stop_point == getattr(pm, stop_type) 90 | 91 | @named_param("stop_before", ["align_reads", "call_peaks"]) 92 | @named_param("stop_after", ["fastqc", "align_reads"]) 93 | @named_param("stop_before_type", ["cmdl", "ctor"]) 94 | @named_param("stop_after_type", ["cmdl", "ctor"]) 95 | def test_both_stop_modes_is_prohibited( 96 | self, 97 | get_pipe_manager, 98 | stop_before_type, 99 | stop_after_type, 100 | stop_before, 101 | stop_after, 102 | ): 103 | """Provision of both prospective and retrospective stop is bad.""" 104 | raw_kwargs = {"stop_before": stop_before, "stop_after": stop_after} 105 | cmdl_kwargs = {} 106 | if stop_before_type == "cmdl": 107 | cmdl_kwargs["stop_before"] = raw_kwargs.pop("stop_before") 108 | if stop_after_type == "cmdl": 109 | cmdl_kwargs["stop_after"] = raw_kwargs.pop("stop_after") 110 | args = argparse.Namespace(**cmdl_kwargs) 111 | with pytest.raises(TypeError): 112 | get_pipe_manager(name="test-double-stop", args=args, **raw_kwargs) 113 | 114 | @pytest.mark.parametrize( 115 | argnames=["start_point", "stop_point"], 116 | argvalues=[("fastqc", "align_reads"), ("align_reads", "call_peaks")], 117 | ) 118 | @pytest.mark.parametrize( 119 | argnames=["start_spec_type", "stop_spec_type"], 120 | argvalues=[("cmdl", "ctor"), ("ctor", "cmdl")], 121 | ) 122 | @named_param("stop_type", ["stop_before", "stop_after"]) 123 | def test_complementary_specification_modes( 124 | self, 125 | get_pipe_manager, 126 | start_spec_type, 127 | stop_spec_type, 128 | stop_type, 129 | start_point, 130 | stop_point, 131 | ): 132 | """Command-line and 
keyword specifications can harmonize.""" 133 | raw_kwargs = {"start_point": start_point, stop_type: stop_point} 134 | cmdl_kwargs = {} 135 | if start_spec_type == "cmdl": 136 | cmdl_kwargs["start_point"] = raw_kwargs.pop("start_point") 137 | if stop_spec_type == "cmdl": 138 | cmdl_kwargs[stop_type] = raw_kwargs.pop(stop_type) 139 | args = argparse.Namespace(**cmdl_kwargs) 140 | pm = get_pipe_manager(name="complementary-test", args=args, **raw_kwargs) 141 | assert start_point == pm.start_point 142 | assert stop_point == getattr(pm, stop_type) 143 | 144 | @named_param( 145 | "check_specs", 146 | [ 147 | ["start_point"], 148 | ["stop_before"], 149 | ["stop_after"], 150 | ["start_point", "stop_before"], 151 | ["start_point", "stop_after"], 152 | ], 153 | ) 154 | def test_command_line_beats_constructor_keyword( 155 | self, get_pipe_manager, check_specs 156 | ): 157 | """Command-line specification is favored over constructor keyword.""" 158 | 159 | # Declare values to use for respective specification modes. 160 | cmdl_values = { 161 | "start_point": "merge_input", 162 | "stop_before": "call_peaks", 163 | "stop_after": "align_reads", 164 | } 165 | ctor_values = { 166 | "start_point": "fastqc", 167 | "stop_before": "align_reads", 168 | "stop_after": "filter_reads", 169 | } 170 | 171 | # Create specifications based on current test case parameterization. 172 | cmdl_kwargs = {cp_spec: cmdl_values[cp_spec] for cp_spec in check_specs} 173 | ctor_kwargs = {cp_spec: ctor_values[cp_spec] for cp_spec in check_specs} 174 | args = argparse.Namespace(**cmdl_kwargs) 175 | 176 | # Build the pipeline manager. 177 | pm = get_pipe_manager(name="cmdl-preference", args=args, **ctor_kwargs) 178 | 179 | # Verify the preference for command-line value over variable keyword 180 | # argument value. 181 | for cp_spec in check_specs: 182 | assert cmdl_kwargs[cp_spec] == getattr(pm, cp_spec) 183 | 184 | # Verify that the non-specified values were set to null. 
185 | for cp_spec in set(CHECKPOINT_SPECIFICATIONS) - set(check_specs): 186 | assert getattr(pm, cp_spec) is None 187 | -------------------------------------------------------------------------------- /tests/pipeline_manager/test_manager_state.py: -------------------------------------------------------------------------------- 1 | """ Tests related to pipeline manager state. """ 2 | 3 | import os 4 | 5 | import pytest 6 | 7 | from pypiper.utils import checkpoint_filepath, pipeline_filepath 8 | from tests.helpers import named_param 9 | 10 | __author__ = "Vince Reuter" 11 | __email__ = "vreuter@virginia.edu" 12 | 13 | 14 | def test_starts_running(get_pipe_manager): 15 | """A PipelineManager begins running during its construction.""" 16 | pm = get_pipe_manager(name="TestPM") 17 | assert pm._active 18 | 19 | 20 | # Parameters governing execution: 21 | # 1 -- checkpoint existence 22 | # 3 -- halt state (.halted) 23 | 24 | 25 | class ExecutionSkippingTests: 26 | """Tests for cases in which command execution should be skipped.""" 27 | 28 | @named_param("start_point", ["align_reads", "make_call"]) 29 | def test_skips_to_start(self, get_pipe_manager, start_point): 30 | """The pipeline manager can skip to a starting point.""" 31 | 32 | # Initialize the manager. 33 | pm = get_pipe_manager(name="StartTestPM", start_point=start_point) 34 | 35 | # Make a call that should be skipped on the basis of not yet 36 | # reaching the start point. 37 | pm.timestamp(checkpoint="merge_reads") 38 | path_merge_file = pipeline_filepath(pm, filename="merge.txt") 39 | assert not os.path.isfile(path_merge_file) 40 | cmd = "touch {}".format(path_merge_file) 41 | pm.run(cmd, target=path_merge_file) 42 | assert not os.path.isfile(path_merge_file) 43 | 44 | # Make a call that should also be skipped on the basis of not yet 45 | # reaching the designated starting/activation point. 
46 | pm.timestamp(checkpoint="fastqc") 47 | fastqc_folder = os.path.join(pm.outfolder, "fastqc") 48 | os.makedirs(fastqc_folder) 49 | fastqc_zipfile = os.path.join(fastqc_folder, "qc.zip") 50 | fastqc_rawfile = os.path.join(fastqc_folder, "qc.txt") 51 | cmds = [ 52 | "fastqc", 53 | "touch {}".format(fastqc_rawfile), 54 | "touch {}".format(fastqc_zipfile), 55 | ] 56 | pm.run(cmds, target=fastqc_zipfile) 57 | assert not os.path.isfile(fastqc_zipfile) 58 | assert not os.path.isfile(fastqc_rawfile) 59 | 60 | # Make a all that should be the first executed, on the basis of 61 | # being associated with the designated. 62 | pm.timestamp(checkpoint=start_point) 63 | path_first_file = pipeline_filepath(pm, filename="outfile.bam") 64 | cmd = "touch {}".format(path_first_file) 65 | pm.run(cmd, target=path_first_file) 66 | assert os.path.isfile(path_first_file) 67 | 68 | @named_param("num_skips", argvalues=[1, 2, 3]) 69 | def test_skips_execution_if_in_unstarted_state(self, get_pipe_manager, num_skips): 70 | """Pipeline manager skips command execution if not in active state.""" 71 | 72 | pm = get_pipe_manager(name="skip-execs") 73 | pm._active = False 74 | 75 | testfile = pipeline_filepath(pm, filename="output.txt") 76 | assert not os.path.isfile(testfile) 77 | 78 | cmd = "touch {}".format(testfile) 79 | num_calls = 0 80 | 81 | # Remain inactive for a parameterized number of call-skipping iterations, 82 | # then adopt active mode. 83 | while True: 84 | pm.run(cmd, target=testfile) 85 | num_calls += 1 86 | if num_calls == num_skips: 87 | pm._active = True 88 | elif num_calls > num_skips: 89 | break 90 | # If we're still looping, we've not yet made a call in active mode. 91 | assert not os.path.isfile(testfile) 92 | 93 | # We break the loop once we've made a call in active state. 
94 | assert os.path.isfile(testfile) 95 | 96 | @named_param("num_skips", argvalues=[1, 2, 3]) 97 | def test_respects_checkpoints(self, get_pipe_manager, num_skips): 98 | """Manager can skip pipeline to where it's not yet checkpointed.""" 99 | 100 | pm = get_pipe_manager(name="respect-checkpoints") 101 | 102 | # Control for possibility that skips are due to being in inactive mode. 103 | assert pm._active 104 | 105 | stages = ["merge", "qc", "filter", "align", "call"] 106 | 107 | # Create checkpoints. 108 | for s in stages[:num_skips]: 109 | pm.timestamp(checkpoint=s) 110 | 111 | # Go through the stages and see that we're skipping checkpoints 112 | # that exist, then proceeding to execute each subsequent stage. 113 | for i, s in enumerate(stages): 114 | outfile = pipeline_filepath(pm, s + ".txt") 115 | cmd = "touch {}".format(outfile) 116 | pm.timestamp(checkpoint=s) 117 | pm.run(cmd, target=outfile) 118 | 119 | if i < num_skips: 120 | # We should not have created the output file. 121 | try: 122 | assert not os.path.isfile(outfile) 123 | except AssertionError: 124 | print("Have run {} stage(s) of {} skip(s)".format(i + 1, num_skips)) 125 | print("Current manager checkpoint: {}".format(pm.curr_checkpoint)) 126 | raise 127 | else: 128 | # We should have created the output file. 129 | try: 130 | assert os.path.isfile(outfile) 131 | except AssertionError: 132 | print("Have run {} stage(s) of {} skip(s)".format(i + 1, num_skips)) 133 | print("Current manager checkpoint: {}".format(pm.curr_checkpoint)) 134 | print("Active? 
{}".format(pm._active)) 135 | raise 136 | 137 | @named_param("halt_index", [1, 2, 3]) 138 | def test_respects_halt(self, get_pipe_manager, halt_index): 139 | """The pipeline manager skips execution if it's in halted state.""" 140 | pm = get_pipe_manager(name="respects-halt") 141 | targets = ["file{}.txt".format(i) for i in range(1, 5)] 142 | for i, t in enumerate(targets): 143 | if i == halt_index: 144 | pm.halt(raise_error=False) 145 | target = pipeline_filepath(pm, filename=t) 146 | cmd = "touch {}".format(target) 147 | pm.run(cmd, target=target) 148 | for i, t in enumerate(targets): 149 | target = pipeline_filepath(pm, filename=t) 150 | if i < halt_index: 151 | assert os.path.isfile(target) 152 | else: 153 | assert not os.path.isfile(target) 154 | -------------------------------------------------------------------------------- /tests/pipeline_manager/test_pipeline_manager_timestamp_checkpoint_filepath.py: -------------------------------------------------------------------------------- 1 | """ Tests for construction of checkpoint filepath """ 2 | 3 | import glob 4 | import os 5 | import time 6 | 7 | from pypiper import PipelineManager 8 | from pypiper.const import CHECKPOINT_EXTENSION 9 | from pypiper.stage import Stage 10 | from tests.helpers import named_param 11 | 12 | __author__ = "Vince Reuter" 13 | __email__ = "vreuter@virginia.edu" 14 | 15 | 16 | class DummyPM(PipelineManager): 17 | """Simple override of true PipelineManager, for __init__ simplicity""" 18 | 19 | def __init__(self, name, outfolder): 20 | self.name = name 21 | self.outfolder = outfolder 22 | self.start_point = None 23 | self.stop_before = None 24 | self.stop_after = None 25 | self.halt_on_next = False 26 | self.last_timestamp = time.time() 27 | self.prev_checkpoint = None 28 | self.curr_checkpoint = None 29 | 30 | 31 | class PipelineMangerTimestampCheckpointFilePathTests: 32 | """Tests for determination of checkpoint filepath.""" 33 | 34 | @named_param( 35 | argnames=["name1", "name2"], 36 | 
argvalues=[("chipseq", "ATACseq"), ("rnaKallisto", "wgbs")], 37 | ) 38 | @named_param(argnames="spec_type", argvalues=["stage_name", "stage", "function"]) 39 | def test_distinguishes_pipelines_within_outfolder( 40 | self, name1, name2, spec_type, tmpdir 41 | ): 42 | """ 43 | Checkpoint files within sample folder include pipeline name. 44 | 45 | More specifically, we often have the case that a single sample's 46 | name is the name of a subfolder, within the broader results 47 | directory for an entire project, in which to store output files 48 | associated with that particular sample. The sample in that case may 49 | be associated with a protocol that maps to multiple pipelines, and 50 | thus the sample may be processed by multiple pipelines. If each 51 | pipeline had a unique set of stage names, we'd be fine with no 52 | additional measures, but to avoid a checkpoint filename collision, 53 | in which we would be unable to know which pipeline had generated 54 | a given checkpoint file, we add the pipeline name to the checkpoint 55 | file and assume that we're not processing the sample with multiple 56 | identically named pipelines. 57 | 58 | """ 59 | 60 | # Define a dummy function to use as the callable for a Stage. 61 | def trim_reads(): 62 | pass 63 | 64 | def stage_spec(): 65 | if spec_type == "function": 66 | return trim_reads 67 | elif spec_type not in ["stage", "stage_name"]: 68 | raise ValueError( 69 | "Unrecognized stage specification type: {}".format(spec_type) 70 | ) 71 | else: 72 | s = Stage(trim_reads) 73 | return s.name if spec_type == "stage_name" else s 74 | 75 | outfolder = tmpdir.strpath 76 | 77 | # At start, we should have no checkpoints. 
78 | all_checkpoints_pattern = os.path.join(outfolder, "*" + CHECKPOINT_EXTENSION) 79 | assert [] == glob.glob(all_checkpoints_pattern) 80 | 81 | plm1 = DummyPM(name1, outfolder) 82 | plm2 = DummyPM(name2, outfolder) 83 | 84 | checkpoint_name = "trim_reads" 85 | plm1.timestamp(checkpoint=stage_spec(), finished=True) 86 | 87 | # Find the checkpoints; there should only be one. 88 | checkpoint_pattern = os.path.join( 89 | outfolder, "{}_*{}".format(name1, CHECKPOINT_EXTENSION) 90 | ) 91 | checkpoints = glob.glob(checkpoint_pattern) 92 | assert 1 == len(checkpoints) 93 | assert 1 == len(glob.glob(all_checkpoints_pattern)) 94 | # Check that we have the expected checkpoint. 95 | exp_chkpt_fpath = os.path.join( 96 | outfolder, "{}_{}".format(name1, checkpoint_name + CHECKPOINT_EXTENSION) 97 | ) 98 | assert exp_chkpt_fpath == checkpoints[0] 99 | 100 | # Create a second checkpoint with the same stage, but with a manager 101 | # of a different name. 102 | plm2.timestamp(checkpoint=stage_spec(), finished=True) 103 | checkpoint_pattern = os.path.join( 104 | outfolder, "{}_*{}".format(name2, CHECKPOINT_EXTENSION) 105 | ) 106 | checkpoints = glob.glob(checkpoint_pattern) 107 | assert 1 == len(checkpoints) 108 | all_checkpoints = glob.glob(all_checkpoints_pattern) 109 | assert 2 == len(all_checkpoints) 110 | exp_chkpt_fpath_2 = os.path.join( 111 | outfolder, "{}_{}".format(name2, checkpoint_name + CHECKPOINT_EXTENSION) 112 | ) 113 | 114 | assert {exp_chkpt_fpath, exp_chkpt_fpath_2} == set(all_checkpoints) 115 | -------------------------------------------------------------------------------- /tests/pipeline_manager/test_set_status_flag.py: -------------------------------------------------------------------------------- 1 | """ Tests for changes to pipepline manager's status flag. 
""" 2 | 3 | import pytest 4 | 5 | from pypiper.flags import * 6 | from pypiper.flags import __all__ as ALL_FLAGS 7 | from tests.helpers import named_param 8 | 9 | __author__ = "Vince Reuter" 10 | __email__ = "vreuter@virginia.edu" 11 | 12 | 13 | @named_param( 14 | argnames="status", 15 | argvalues=[ 16 | RUN_FLAG, 17 | COMPLETE_FLAG, 18 | FAIL_FLAG, 19 | PAUSE_FLAG, 20 | WAIT_FLAG, 21 | ], 22 | ) 23 | def test_set_status_flag_is_idempotent(get_pipe_manager, status): 24 | """Calls to manager's status flag setter are idempotent.""" 25 | pm = get_pipe_manager(name="TestPM") 26 | pm._set_status_flag(status) 27 | assert status == pm.status 28 | pm._set_status_flag(status) 29 | assert status == pm.status 30 | 31 | 32 | @pytest.mark.parametrize( 33 | argnames=["init_state", "new_state"], 34 | argvalues=[ 35 | (WAIT_FLAG, RUN_FLAG), 36 | (WAIT_FLAG, COMPLETE_FLAG), 37 | (WAIT_FLAG, FAIL_FLAG), 38 | (RUN_FLAG, COMPLETE_FLAG), 39 | (RUN_FLAG, PAUSE_FLAG), 40 | (RUN_FLAG, FAIL_FLAG), 41 | (FAIL_FLAG, RUN_FLAG), 42 | ], 43 | ) 44 | def test_changes_status_state(get_pipe_manager, init_state, new_state): 45 | """Manager setting status flag changes is internal status/state.""" 46 | pm = get_pipe_manager(name="test-pipe") 47 | assert pm.status == RUN_FLAG 48 | pm._set_status_flag(init_state) 49 | assert init_state == pm.status 50 | pm._set_status_flag(new_state) 51 | assert new_state == pm.status 52 | -------------------------------------------------------------------------------- /tests/test_packaging.py: -------------------------------------------------------------------------------- 1 | """ Validate what's available directly on the top-level import. 
""" 2 | 3 | from inspect import isfunction 4 | 5 | import pytest 6 | 7 | __author__ = "Vince Reuter" 8 | __email__ = "vreuter@virginia.edu" 9 | 10 | 11 | @pytest.mark.parametrize( 12 | ["obj_name", "typecheck"], 13 | [ 14 | ("add_logging_options", isfunction), 15 | ("check_all_commands", isfunction), 16 | ("determine_uncallable", isfunction), 17 | ("logger_via_cli", isfunction), 18 | ], 19 | ) 20 | def test_top_level_exports(obj_name, typecheck): 21 | """At package level, validate object availability and type.""" 22 | import pypiper 23 | 24 | try: 25 | obj = getattr(pypiper, obj_name) 26 | except AttributeError: 27 | pytest.fail("Unavailable on {}: {}".format(pypiper.__name__, obj_name)) 28 | else: 29 | assert typecheck(obj) 30 | -------------------------------------------------------------------------------- /tests/test_pipeline_filepath.py: -------------------------------------------------------------------------------- 1 | """ Tests for utility functions """ 2 | 3 | import os 4 | 5 | import mock 6 | import pytest 7 | 8 | from pypiper.utils import pipeline_filepath 9 | 10 | __author__ = "Vince Reuter" 11 | __email__ = "vreuter@virginia.edu" 12 | 13 | 14 | PIPELINE_NAMES = ["chiapet", "chipseq", "atacseq", "kallisto", "wgbs"] 15 | SUFFICES = [".txt", "_results.csv", ".stats.tsv", "-data.json"] 16 | 17 | 18 | @pytest.fixture 19 | def pl_mgr(request, tmpdir): 20 | """ 21 | Provide test case with a mocked PipelineManager instance. 22 | 23 | :param pytest.fixtures.FixtureRequet request: test case requesting the 24 | setup fixture / parameterization 25 | :param py.path.local.LocalPath tmpdir: Test case temporary path object. 26 | :return mock.MagicMock: Mocked PipelineManager, sufficient for test. 27 | """ 28 | 29 | # Select the pipeline name. 30 | if "pipe_name" in request.fixturenames: 31 | pipe_name = request.getfixturevalue("pipe_name") 32 | else: 33 | pipe_name = "test-pipe" 34 | 35 | # Set output folder and name attributes for mocked PipelineManager. 
# ---------------------------------------------------------------------------
# tail of tests/test_pipeline_filepath.py
# ---------------------------------------------------------------------------

# NOTE(review): this chunk opens inside a fixture whose `def` line precedes
# the excerpt; the header and name-selection logic below are reconstructed
# from the visible body and from how the tests consume the fixture -- confirm
# against the complete file.
@pytest.fixture
def pl_mgr(request, tmpdir):
    """Provide a mocked PipelineManager rooted at the test's tmpdir.

    If the requesting test parametrizes ``pipe_name``, use that value as the
    mocked manager's name; otherwise fall back to a placeholder.
    """
    if "pipe_name" in request.fixturenames:
        pipe_name = request.getfixturevalue("pipe_name")
    else:
        pipe_name = "test-pipe"
    mock_mgr = mock.Mock(outfolder=tmpdir.strpath)
    type(mock_mgr).name = pipe_name  # Circumvent 'name' keyword on Mock.
    return mock_mgr


def test_requires_filename_or_suffix(pl_mgr):
    """Either filename or suffix is required to build a path."""
    with pytest.raises(TypeError):
        pipeline_filepath(pl_mgr)


@pytest.mark.parametrize(argnames="pipe_name", argvalues=PIPELINE_NAMES)
@pytest.mark.parametrize(argnames="suffix", argvalues=SUFFICES)
@pytest.mark.parametrize(
    argnames="test_type", argvalues=["has_pipe_name", "has_suffix", "full_path"]
)
def test_uses_pipeline_name_if_no_filename(
    pipe_name, suffix, test_type, pl_mgr, tmpdir
):
    """Pipeline name is proxy for filename if just suffix is given."""

    observed = pipeline_filepath(pl_mgr, suffix=suffix)

    # Allow test type to determine assertion.
    if test_type == "has_pipe_name":
        assert pipe_name in observed
    elif test_type == "has_suffix":
        assert observed.endswith(suffix)
    elif test_type == "full_path":
        try:
            expected = os.path.join(tmpdir.strpath, pipe_name + suffix)
            assert expected == observed
        except AssertionError:
            # Aid debugging: show where the mocked manager points.
            print("OUTFOLDER: {}".format(pl_mgr.outfolder))
            raise
    else:
        raise ValueError("Unrecognized test type: '{}'".format(test_type))


@pytest.mark.parametrize(
    argnames="filename", argvalues=["testfile" + suffix for suffix in SUFFICES]
)
@pytest.mark.parametrize(argnames="test_type", argvalues=["filename", "filepath"])
def test_direct_filename(tmpdir, filename, pl_mgr, test_type):
    """When given, filename is used instead of pipeline name."""
    fullpath = pipeline_filepath(pl_mgr, filename=filename)
    if test_type == "filename":
        _, observed = os.path.split(fullpath)
        assert filename == observed
    elif test_type == "filepath":
        expected = os.path.join(tmpdir.strpath, filename)
        assert expected == fullpath
    else:
        raise ValueError("Unrecognized test type: '{}'".format(test_type))


@pytest.mark.parametrize(argnames="filename", argvalues=["output", "testfile"])
@pytest.mark.parametrize(argnames="suffix", argvalues=SUFFICES)
def test_suffix_is_appended_to_filename_if_both_are_provided(pl_mgr, filename, suffix):
    """Suffix is appended to filename if both are provided."""
    expected = filename + suffix
    fullpath = pipeline_filepath(pl_mgr, filename=filename, suffix=suffix)
    _, observed = os.path.split(fullpath)
    assert expected == observed


# ---------------------------------------------------------------------------
# tests/utils_tests/test_check_command_callability.py
# ---------------------------------------------------------------------------

""" Tests for checking a collection of commands for callability """

import os

import mock
import pytest
from ubiquerg import powerset
from veracitools import ExpectContext

from pypiper import utils as piper_utils

__author__ = "Vince Reuter"
__email__ = "vreuter@virginia.edu"


# File extensions used to fabricate command names of many flavors.
EXTENSIONS = [
    ".py",
    ".rb",
    ".sh",
    ".java",
    ".jar",
    ".pl",
    ".o",
    ".R",
    ".r",
    ".cpp",
    ".c",
    ".hs",
    ".scala",
    ".class",
]


def _touch(f):
    """'touch' the given file.

    :param str f: filepath to create
    """
    with open(f, "w"):
        print("touch: {}".format(f))


def _make_exec(f):
    """
    'touch' a file and set exec bit.

    :param str f: path to create
    """
    import stat

    _touch(f)
    # FIX: set the executable bits with os.chmod rather than shelling out to
    # the external `chmod` binary via subprocess -- no process spawn, no
    # dependency on chmod being on PATH.
    mode = os.stat(f).st_mode
    os.chmod(f, mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)


def pytest_generate_tests(metafunc):
    """Dynamic test case generation and parameterization for this module"""
    if "str_list_monad" in metafunc.fixturenames:
        # Each command may be passed bare or wrapped in a singleton list.
        metafunc.parametrize("str_list_monad", [lambda s: s, lambda s: [s]])


@pytest.mark.skip(reason="test is broken")
@pytest.mark.parametrize("filename", ["testfile" + x for x in EXTENSIONS])
@pytest.mark.parametrize(
    ["setup", "pretest", "exp_miss"],
    [
        # No file at all: always expected to be uncallable.
        (lambda _: None, lambda f: not os.path.exists(f), lambda _: True),
        # File exists but is not executable: uncallable unless it's a .jar.
        (
            _touch,
            lambda f: os.path.isfile(f) and not os.access(f, os.X_OK),
            lambda f: not f.endswith(".jar"),
        ),
        # File exists and is executable: always callable.
        (
            _make_exec,
            lambda f: os.path.isfile(f) and os.access(f, os.X_OK),
            lambda _: False,
        ),
    ],
)
def test_callability_checker_defaults(tmpdir, filename, setup, pretest, exp_miss):
    """Verify behavior of callability checker with default parameterization."""
    cmd = os.path.join(tmpdir.strpath, filename)
    setup(cmd)
    assert pretest(cmd)
    extra_commands = ["this-is-not-a-program", "man", "ls"]
    expected = ["this-is-not-a-program"]
    if exp_miss(cmd):
        expected.append(cmd)
    observed = [c for c, _ in piper_utils.determine_uncallable([cmd] + extra_commands)]
    print("expected: {}".format(expected))
    print("observed: {}".format(observed))
    assert len(expected) == len(observed)
    assert set(expected) == set(observed)


@pytest.mark.parametrize(
    ["uncall_result", "expectation"], [([], True), ([("noncmd", "noncmd")], TypeError)]
)
@pytest.mark.parametrize("handler", [lambda: True, "not-a-function"])
def test_check_all_bad_handler_is_type_error_iff_uncallability_exists(
    uncall_result, str_list_monad, handler, expectation
):
    """Invalid handler evaluation is conditional having >= 1 uncallable command."""
    cmd = "noncmd"
    with mock.patch.object(
        piper_utils, "determine_uncallable", return_value=uncall_result
    ), ExpectContext(expectation, piper_utils.check_all_commands) as check:
        check(cmds=str_list_monad(cmd), handle=handler)


@pytest.mark.parametrize(
    ["create_result", "expected"],
    [
        (
            lambda bads: Exception("{} bad commands: {}".format(len(bads), bads)),
            Exception,
        ),
        (lambda bads: "{} bad commands: {}".format(len(bads), bads), False),
    ],
)
def test_check_all_result_is_conjunctive(create_result, expected, str_list_monad):
    """Even one uncallable means result is False or an Exception occurs."""
    cmd = "noncmd"
    with mock.patch.object(
        piper_utils, "determine_uncallable", return_value=[(cmd, cmd)]
    ), ExpectContext(expected, piper_utils.check_all_commands) as check:
        check(cmds=str_list_monad(cmd), get_bad_result=create_result)


@pytest.mark.parametrize("commands", ["man", "ls", ["man", "ls"]])
@pytest.mark.parametrize(
    ["transforms", "expectation"],
    [(arg, lambda res: isinstance(res, list)) for arg in [None, []]]
    + [(arg, TypeError) for arg in [1, "a"]],
)
def test_check_all_requires_iterable_transformations_argument(
    commands, transforms, expectation
):
    """If transformations arg is non-null, it must be iterable."""

    def call():
        return piper_utils.determine_uncallable(commands, transformations=transforms)

    if isinstance(expectation, type) and issubclass(expectation, Exception):
        with pytest.raises(expectation):
            call()
    else:
        assert expectation(call())


@pytest.mark.parametrize(
    "commands", powerset(["ls", "picard.jar", "$ENVVAR"], nonempty=True)
)
def test_transformation_accumulation(commands):
    """Accumulation of transformations works as expected"""

    # FIX (PEP 8 E731): named function rather than a lambda bound to a name.
    def mapjar(c):
        return "java -jar {}".format(c)

    envjar = "env.jar"
    # Ordered transformations: env-var substitution first, then jar-wrapping,
    # so "$ENVVAR" accumulates both effects.
    transforms = [
        (lambda c: c == "$ENVVAR", lambda _: envjar),
        (lambda c: c.endswith(".jar"), mapjar),
    ]
    exps = {"ls": "ls", "picard.jar": mapjar("picard.jar"), "$ENVVAR": mapjar(envjar)}
    with mock.patch.object(piper_utils, "is_command_callable", return_value=False):
        res = piper_utils.determine_uncallable(
            commands, transformations=transforms, accumulate=True
        )
    expectation = [(c, exps[c]) for c in commands]
    print("EXPECTED: {}".format(expectation))
    print("OBSERVED: {}".format(res))
    assert expectation == res


@pytest.mark.parametrize(
    "transforms",
    [
        # Unordered collections of (predicate, transform) pairs.
        {(lambda _: True, lambda c: c), (lambda _: False, lambda c: c)},
        {
            "id": (lambda _: True, lambda c: c),
            "java": (lambda c: c.endswith(".jar"), lambda c: "java -jar {}".format(c)),
        },
    ],
)
def test_non_accumulative_but_unordered_transformation_is_exceptional(transforms):
    """Unordered transformations without accumulation must be rejected."""
    with pytest.raises(Exception) as err_ctx:
        piper_utils.determine_uncallable("ls", transformations=transforms)
    exp_msg = (
        "If transformations are unordered, non-accumulation of "
        "effects may lead to nondeterministic behavior."
    )
    assert str(err_ctx.value) == exp_msg


# ---------------------------------------------------------------------------
# tests/utils_tests/test_head_util.py
# ---------------------------------------------------------------------------

""" Tests for the head() utility function """

import random
import string

import pytest
from hypothesis import given
from hypothesis import strategies as st

from pypiper.utils import head

__author__ = "Vince Reuter"
__email__ = "vreuter@virginia.edu"


NUMBERS_AND_LETTERS = list(string.ascii_letters) + list(range(-9, 10))

# Strategy for generating a pretty arbitrary atomic
ATOMICS = st.deferred(
    lambda: st.booleans()
    | st.characters()
    | st.integers()
    | st.floats(allow_nan=False)
    | st.text()
)


def pytest_generate_tests(metafunc):
    """Test case generation/parameterization for this module."""
    if "seqtype" in metafunc.fixturenames:
        metafunc.parametrize("seqtype", [tuple, list])
    if "iter_cast" in metafunc.fixturenames:
        # Pass the collection as-is, or as a plain iterator over it.
        metafunc.parametrize("iter_cast", [lambda c: c, lambda c: iter(c)])
    if "h" in metafunc.fixturenames and "xs" in metafunc.fixturenames:
        # Ten random (head, tail) pairs drawn from letters and small ints.
        metafunc.parametrize(
            ["h", "xs"],
            [
                (
                    random.choice(NUMBERS_AND_LETTERS),
                    [
                        random.choice(NUMBERS_AND_LETTERS)
                        for _ in range(random.randint(5, 10))
                    ],
                )
                for _ in range(10)
            ],
        )


@given(obj=ATOMICS)
def test_head_atomic(obj):
    """head() of an atomic object is the object itself."""
    assert obj == head(obj)


def test_head_empty_string():
    """Empty string is exception to exceptional-ness of empty collection."""
    assert "" == head("")


@pytest.mark.parametrize("coll", [dict(), set(), tuple(), list()])
def test_head_empty_collection(coll):
    """Request for first element from an empty Iterable is exceptional."""
    with pytest.raises(ValueError):
        head(coll)


def test_head_nonempty_sequential_collection(h, xs, seqtype, iter_cast):
    """Verify accuracy of request for first element from nonempty Iterable."""
    c = seqtype([h]) + seqtype(xs)
    assert h == head(iter_cast(c))


def test_head_nonempty_set():
    """Verify that head of nonempty set is non-exceptional."""
    head({-1, 0, 1})


def test_head_nonempty_dict():
    """Verify that head of nonempty dictionary is non-exceptional."""
    head({"a": 1, "b": 2})