├── .github
└── workflows
│ ├── black.yml
│ ├── python-publish.yml
│ └── run-pytest.yml
├── .gitignore
├── .travis.yml
├── LICENSE.txt
├── MANIFEST.in
├── README.md
├── design-notes.md
├── docs
├── README.md
├── advanced-run-method.md
├── best-practices.md
├── changelog.md
├── clean.md
├── cli.md
├── conf.py
├── configuration.md
├── contributing.md
├── faq.md
├── features.md
├── img
│ ├── error.svg
│ ├── job_status.svg
│ ├── logging.svg
│ ├── memory.svg
│ ├── protection.svg
│ ├── pypiper.svg
│ ├── pypiper_bug.svg
│ ├── pypiper_logo.svg
│ ├── pypiper_logo_dark.svg
│ ├── recovery.svg
│ ├── reports.svg
│ ├── restartability.svg
│ └── simplicity.svg
├── ngstk_intro.md
├── outputs.md
├── philosophy.md
├── pipestat.md
├── report.md
└── support.md
├── docs_jupyter
├── basic-pipeline.ipynb
├── build
│ └── .gitignore
└── hello-world.ipynb
├── example_pipelines
├── basic.py
├── count_reads.py
├── hello_pypiper.py
└── logmuse_example.py
├── init_interactive.py
├── logo_pypiper.svg
├── mkdocs.yml
├── pypiper
├── __init__.py
├── _version.py
├── const.py
├── exceptions.py
├── flags.py
├── folder_context.py
├── manager.py
├── ngstk.py
├── pipeline.py
├── stage.py
└── utils.py
├── requirements
├── requirements-dev-extra.txt
├── requirements-docs.txt
├── requirements-ngstk.txt
├── requirements-plot.txt
├── requirements-pypiper.txt
└── requirements-test.txt
├── setup.cfg
├── setup.py
└── tests
├── Data
├── default_pipestat_output_schema.yaml
└── sample_output_schema.yaml
├── __init__.py
├── conftest.py
├── helpers.py
├── pipeline
├── __init__.py
├── conftest.py
├── test_multi_pipeline_sample.py
├── test_pipeline.py
├── test_pipeline_checkpoint.py
└── test_pipeline_constructor.py
├── pipeline_manager
├── test_halt.py
├── test_manager_constructor.py
├── test_manager_state.py
├── test_pipeline_manager.py
├── test_pipeline_manager_timestamp.py
├── test_pipeline_manager_timestamp_checkpoint_filepath.py
└── test_set_status_flag.py
├── test_packaging.py
├── test_pipeline_filepath.py
└── utils_tests
├── test_check_command_callability.py
└── test_head_util.py
/.github/workflows/black.yml:
--------------------------------------------------------------------------------
1 | name: Lint
2 |
3 | on: [pull_request]
4 |
5 | jobs:
6 | lint:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - uses: actions/checkout@v4
10 | - uses: actions/setup-python@v5
11 | - uses: psf/black@stable
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflows will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 |
4 | name: Upload Python Package
5 |
6 | on:
7 | release:
8 | types: [created]
9 |
10 | jobs:
11 | deploy:
12 | name: upload release to PyPI
13 | runs-on: ubuntu-latest
14 | permissions:
15 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
16 | steps:
17 | - uses: actions/checkout@v4
18 | - name: Set up Python
19 | uses: actions/setup-python@v5
20 | with:
21 | python-version: '3.x'
22 | - name: Install dependencies
23 | run: |
24 | python -m pip install --upgrade pip
25 | pip install setuptools wheel twine
26 | - name: Build and publish
27 | run: |
28 | python setup.py sdist bdist_wheel
29 | - name: Publish package distributions to PyPI
30 | uses: pypa/gh-action-pypi-publish@release/v1
31 |
32 |
--------------------------------------------------------------------------------
/.github/workflows/run-pytest.yml:
--------------------------------------------------------------------------------
1 | name: Run pytests
2 |
3 | on:
4 | pull_request:
5 | branches: [master, dev]
6 | workflow_dispatch:
7 | inputs: null
8 |
9 | jobs:
10 | pytest:
11 | runs-on: ${{ matrix.os }}
12 | strategy:
13 | matrix:
14 | python-version: ["3.8", "3.13"]
15 | os: [ubuntu-latest]
16 |
17 | steps:
18 | - uses: actions/checkout@v4
19 |
20 | - name: Set up Python ${{ matrix.python-version }}
21 | uses: actions/setup-python@v5
22 | with:
23 | python-version: ${{ matrix.python-version }}
24 |
25 | - name: Install dev dependencies
26 | run: if [ -f requirements/requirements-dev.txt ]; then pip install -r requirements/requirements-dev.txt; fi
27 |
28 | - name: Install test dependencies
29 | run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi
30 |
31 | - name: Install package
32 | run: python -m pip install .
33 |
34 | - name: Run pytest tests
35 | run: pytest tests -x -vv --remote-data
36 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # ignore test results
2 | tests/test/*
3 |
4 | # toy/experimental files
5 | *.csv
6 | *.tsv
7 | *.pkl
8 |
9 | # ignore eggs
10 | .eggs/
11 | *.egg
12 |
13 | # ignore built docs
14 | build/*
15 | doc/build/*
16 | docs/autodoc_build/*
17 |
18 | # ignore test results
19 | example_pipelines/hello_pypiper_results/*
20 |
21 | # generic ignore list:
22 | *.lst
23 |
24 | # Compiled source
25 | *.com
26 | *.class
27 | *.dll
28 | *.exe
29 | *.o
30 | *.so
31 | *.pyc
32 |
33 | # Packages
34 | # it's better to unpack these files and commit the raw source
35 | # git has its own built in compression methods
36 | *.7z
37 | *.dmg
38 | *.gz
39 | *.iso
40 | *.jar
41 | *.rar
42 | *.tar
43 | *.zip
44 |
45 | # Logs and databases
46 | *.log
47 | *.sql
48 | *.sqlite
49 |
50 | # OS generated files
51 | .DS_Store
52 | .DS_Store?
53 | ._*
54 | .Spotlight-V100
55 | .Trashes
56 | ehthumbs.db
57 | Thumbs.db
58 |
59 | # Gedit temporary files
60 | *~
61 |
62 | # libreoffice lock files:
63 | .~lock*
64 |
65 | # Default-named test output
66 | microtest/
67 | open_pipelines/
68 |
69 | # IDE-specific items
70 | .idea/
71 |
72 | # pytest-related
73 | .cache/
74 | .coverage
75 | .pytest_cache
76 | .hypothesis
77 |
78 | # Reserved files for comparison
79 | *RESERVE*
80 |
81 | # Build-related stuff
82 | dist/
83 | pypiper.egg-info/
84 | piper.egg-info/
85 |
86 |
87 | *ipynb_checkpoints*
88 | *.egg-info*
89 |
90 |
91 | example_pipelines/pipeline_output
92 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "2.7"
4 | - "3.5"
5 | - "3.6"
6 | os:
7 | - linux
8 | install:
9 | - pip install --upgrade six
10 | - pip install .
11 | - pip install -r requirements/reqs-ngstk.txt
12 | - pip install -r requirements/reqs-test.txt
13 | script: pytest -v --cov=pypiper
14 | after_success:
15 | - coveralls
16 | branches:
17 | only:
18 | - dev
19 | - master
20 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright 2018 Nathan Sheffield
2 |
3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
4 |
5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
6 |
7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
8 |
9 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
10 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements/*
2 | include README.md
3 | include logo_pypiper.svg
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Pypiper
4 |
5 | [](http://pypiper.readthedocs.org/en/latest/?badge=latest)
6 | [](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml)
7 | [](http://pepkit.github.io)
8 | [](https://pypi.org/project/piper)
9 | [](https://github.com/psf/black)
10 |
11 | A lightweight python toolkit for gluing together restartable, robust shell pipelines. Learn more in the [documentation](http://pypiper.databio.org).
12 |
--------------------------------------------------------------------------------
/design-notes.md:
--------------------------------------------------------------------------------
1 | # Design decision notes
2 |
3 | ## Terms
4 | - **Stage** or **phase**: an arbitrarily defined logical processing *step* or
5 | *unit* of operation(s) within a pipeline (e.g., read trimming or peak calling)
6 | - **Checkpoint**: closely tied to the notion of a *stage* or *phase*, a
7 | checkpoint represents a point in a pipeline that the author has deemed
8 | as sufficiently significant to warrant designation.
9 |
10 |
11 | ## Classes
12 |
13 | ### `Pipeline`
14 |
15 | Since a pipeline author determines how to compose logical units, steps, or
16 | phases to define the pipeline, this class is inherently abstract. We
17 | prefer to be able to impose and enforce the requirement for stage definitions up
18 | front. This precludes the definition or creation of a `Pipeline` without stages
19 | as we declare `stages` as an `abc.abstractproperty` in the definition of
20 | `Pipeline`. This also permits us to validate the stage definitions up front, at
21 | time of pipeline creation rather than waiting until invocation of something like
22 | `run`. A further benefit of this design is the ability to store the parsed,
23 | validated form of the stage definitions obtained during instance construction.
24 | This eliminates a potential need to pass the stage definitions among methods for
25 | which they're needed, thereby simplifying our function signatures.
26 |
27 | Not only do we want to provide a simple framework in which processing
28 | stage/phases may be enumerated and defined in sequence, but we also want to
29 | facilitate non-sequential stages to be defined by the pipeline author. In the
30 | context of say, testing multiple alternative ways to do the same conceptual task
31 | (e.g., read trimming or peak calling) within the same pipeline, in early
32 | pipeline development, it's particularly likely that the desire to define
33 | unordered stages may arise.
34 |
35 | Additionally, it would be nice to support varying degrees of expressive power
36 | and simplicity. To some extent, this is likely to present a trade-off, with
37 | greater expressive power coming at the expense of implementation simplicity
38 | for a developer who wishes to implement/extend `Pipeline`. Possibilities
39 | for some of the "levels" of simplicity and power include but are not limited to:
40 |
41 | ### `Stage`
42 |
43 |
44 | ## Checkpointing complexity
45 |
46 | ### Direct pipeline file writes
47 | - In the most basic case, the pipeline may d
48 |
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | # a developer's pipeline framework
2 |
3 | [](http://pepkit.github.io)
4 | [](https://pypi.org/project/piper)
5 | [](http://pypiper.readthedocs.org/en/latest/?badge=latest)
6 | [](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml)
7 | [](https://github.com/psf/black)
8 |
9 | ## What is pypiper?
10 |
11 | `Pypiper` is a **development-oriented** pipeline framework. It is a python package that helps you write robust pipelines directly in python, handling mundane tasks like restartability, monitoring for time and memory use, monitoring job status, copious log output, robust error handling, easy debugging tools, and guaranteed file output integrity.
12 |
13 |
14 |
15 | ## What makes pypiper better?
16 | With Pypiper, **simplicity is paramount**. Prerequisites are few: base python and 2 common packages (`pyyaml` and `psutil`). It should take fewer than 15 minutes to build your first pipeline and only an hour or two to learn the advanced features. Pypiper pipelines are:
17 |
18 | 1. written in pure python, so they do not require learning a new language;
19 | 2. easy to modify, so they are simple to update and maintain;
20 | 3. simple to understand for an outsider, so they can be approached by others.
21 |
22 | These traits make pypiper ideally suited for **pipelines under active development**. Read more about the [pypiper philosophy](philosophy.md).
23 |
24 | ## Installing
25 |
26 | Releases are posted as [GitHub releases](https://github.com/databio/pypiper/releases), or you can install from PyPI using `pip`:
27 |
28 | Global scope for single user:
29 | ```{console}
30 | pip install --user --upgrade piper
31 | ```
32 |
33 | Within an active virtual environment:
34 | ```{console}
35 | pip install --upgrade piper
36 | ```
37 |
38 | ## Quick start
39 |
40 | To employ pypiper, you build something like a shell script, but pass the commands through the `run` method on a `PipelineManager` object. Build your pipeline in **pure python**:
41 |
42 | ```{python}
43 | #!/usr/bin/env python
44 |
45 | import pypiper
46 | outfolder = "hello_pypiper_results" # Choose a folder for your results
47 |
48 | # Create a PipelineManager, the workhorse of pypiper
49 | pm = pypiper.PipelineManager(name="hello_pypiper", outfolder=outfolder)
50 |
51 | # Timestamps to delineate pipeline sections are easy:
52 | pm.timestamp("Hello!")
53 |
54 | # Now build a command and pass it to pm.run()
55 | target_file = "hello_pypiper_results/output.txt"
56 | command = "echo 'Hello, Pypiper!' > " + target_file
57 | pm.run(command, target_file)
58 |
59 | pm.stop_pipeline()
60 | ```
61 |
62 | Then invoke your pipeline via the command-line:
63 |
64 | ```{console}
65 | python my_pipeline.py --help
66 | ```
67 |
68 | ## Pypiper strengths
69 |
70 | Pypiper differs from existing frameworks in its focus on **simplicity**. Pypiper requires learning no new language, as **pipelines are written in pure python**. Pypiper is geared toward **developing pipelines** that are contained in a single file, easy to update, and easy to understand.
71 |
--------------------------------------------------------------------------------
/docs/advanced-run-method.md:
--------------------------------------------------------------------------------
1 | # Run method options
2 |
3 | The `PipelineManager.run()` function is the core of `pypiper`. In its simplest case, all you need to provide is a command to run, but it can be much more powerful with additional arguments.
4 |
5 | ## The `cmd` argument
6 |
7 | Normally you just pass a string, but you can also pass a list of commands to `run`, like this:
8 |
9 | ```
10 | pm.run([cmd1, cmd2, cmd3])
11 | ```
12 |
13 | Pypiper will treat these commands as a group, running each one in turn (and monitoring them individually for time and memory use). The difference in doing it this way, rather than 3 separate calls to `run()` is that if the series does not complete, the entire series will be re-run. This is therefore useful to piece together commands that must all be run together.
14 |
15 | ## The `target` and `lock_name` arguments
16 |
17 | If you provide a `target` file, then `pypiper` will first check to see if that target exists, and only run the `command` if the `target` does not exist. To prevent two pipelines from running commands on the same target, `pypiper` will automatically derive a lock file name from your target file. You can use the `lock_name` argument to override this default. If you do not provide a `target`, then you will need to provide a `lock_name` argument because `pypiper` will not be able to derive one automatically.
18 |
19 | ## The `nofail` argument
20 |
21 | By default, a command that fails will cause the entire pipeline to halt. If you want to provide a command that *should not* halt the pipeline upon failure, set `nofail=True`. `nofail` can be used to implement non-essential parts of the pipeline.
22 |
23 | ## The `follow` argument
24 |
25 | The `PipelineManager.run` function has an optional argument named `follow` that is useful for checking or reporting results from a command. To the `follow` argument you must pass a python function (which may be either a defined function or a `lambda` function). These *follow functions* are then coupled to the command that is run; the follow function will be called by python **if and only if** the command is run.
26 |
27 | Why is this useful? The major use cases are QC checks and reporting results. We use a follow function to run a QC check to make sure processes did what we expect, and then to report that result to the `stats` file. We only need to check the result and report the statistic once, so it's best to put these kind of checks in a `follow` function. Often, you'd like to run a function to examine the result of a command, but you only want to run that once, *right after the command that produced the result*. For example, counting the number of lines in a file after producing it, or counting the number of reads that aligned right after an alignment step. You want the counting process coupled to the alignment process, and don't need to re-run the counting every time you restart the pipeline. Because pypiper is smart, it will not re-run the alignment once it has been run; so there is no need to re-count the result on every pipeline run!
28 |
29 | *Follow functions* let you avoid running unnecessary processes repeatedly in the event that you restart your pipeline multiple times (for instance, while debugging later steps in the pipeline).
30 |
31 | ## The `container` argument
32 |
33 | If you specify a string here, `pypiper` will wrap the command in a `docker run` call using the given `container` image name.
34 |
35 | ## The `shell` argument: Python subprocess types
36 |
37 | Since Pypiper runs all your commands from within python (using the `subprocess` python module), it's nice to be aware of the two types of processes that `subprocess` allows: **direct processes** and **shell processes**.
38 |
39 | **Direct process**: A direct process is executed and managed by Python, so Python retains control over the process completely. This enables Python to monitor the memory use of the subprocess and keep track of it more efficiently. The disadvantage is that you may not use shell-specific operators; for instance, a shell like `Bash` is what understands an asterisk (`*`) for wildcard expansion, or an angle bracket (`>`) for output redirection, or a pipe (`|`) to string commands together; these therefore cannot be used in direct subprocesses in Python.
40 |
41 | **Shell process**: In a shell process, Python first spawns a shell, and then runs the command in that shell. The spawned shell is the process controlled by Python, but processes in the shell are not. This allows you to use shell operators (*e.g.* `*`, `>`), but at the cost of the ability to monitor each command independently, because Python does not have direct control over subprocesses run inside a subshell.
42 |
43 | Because we'd like to run *direct* subprocesses whenever possible, `pypiper` includes 2 nice provisions that help us deal with shell processes. First, pypiper automatically divides commands with pipes (`|`) and executes them as *direct* processes. This enables you to pass a piped shell command, but still get the benefit of a direct process. Each process in the pipe is monitored for return value and for memory use individually, and this information is reported in the pipeline log. Nice! Second, pypiper uses the `psutil` module to monitor memory of *all child processes*. That means when you use a shell process, we *do* monitor the memory use of that process (and any other processes it spawns), which gives us more accurate memory monitoring -- but not from each task individually.
44 |
45 | You can force Pypiper to use a particular process type by specifying `shell=True` or `shell=False` to the `run` function, but really, you shouldn't have to. By default Pypiper will try to guess: if your command contains `*` or `>`, it will be run in a shell. If it contains a pipe (`|`), it will be split and run as direct, piped subprocesses. Anything else will be run as a direct subprocess.
46 |
--------------------------------------------------------------------------------
/docs/best-practices.md:
--------------------------------------------------------------------------------
1 |
2 | # Best practices
3 |
4 | Here are some guidelines for how you can design the most effective pipelines.
5 |
6 |
7 | * **Compartmentalize output into folders**.
8 | In your output, keep pipeline steps separate by organizing output into subfolders.
9 |
10 | * **Use git for versioning**.
11 | If you develop your pipeline in a git repository, Pypiper will automatically record the commit hash when you run a pipeline, making it easy to figure out **exactly** what code version you ran.
12 |
13 | * **Record stats as you go**.
14 | In other words, don't do all your stats (`report_result()`) and QC at the end; do it along the way. This facilitates monitoring and maximizes availability of statistics even when a pipeline fails.
15 |
16 | * **Use looper args**.
17 | Even if you're not using looper at first, use `looper_args` and your pipeline will be looper-ready when it comes time to run 500 samples.
18 |
19 | * **Use NGSTk early on**.
20 | `NGSTk` has lots of useful functions that you will probably need. We've worked hard to make these robust and universal. For example, using NGSTk, you can easily make your pipeline take flexible input formats (FASTQ or BAM). Right now you may always have the same input type (FASTQ, for example), but later you may want your pipeline to be able to work from `bam` files. We've already written simple functions to handle single or multiple BAM or FASTQ inputs; just use this infrastructure (in `NGSTk`) instead of writing your own, and you'll save yourself future headaches.
21 |
22 | * **Make some important parameters in the pipeline config, instead of hardcoding them**
23 | Pypiper makes it painfully easy to use a config file to make your pipeline configurable. Typically you'll start by hard-coding in those parameters in your pipeline steps. But you can select a few important parameters and make them customizable in the pipeline config. Start from the very beginning by making a `yaml` pipeline config file. See an example of a [pipeline config file](configuration.md).
24 |
--------------------------------------------------------------------------------
/docs/changelog.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | ## [0.14.4] -- 2025-02-25
4 | ### Changed
5 | - Fixed warnings for Python >3.12
6 | - Updated version of Python to 3.13 in pytests
7 |
8 |
9 | ## [0.14.3] -- 2024-10-02
10 | ### Changed
11 | - bump requirements to require pipestat>=0.11.0
12 |
13 | ## [0.14.2] -- 2024-05-07
14 | ### Changed
15 | - Addresses [#218](https://github.com/databio/pypiper/issues/218)
16 |
17 | ## [0.14.1] -- 2024-04-19
18 | ### Changed
19 | - remove pipestat_project_name from PipelineManager parameters
20 | - refactor pipestat_sample_name to pipestat_record_identifier in PipelineManager parameters
21 | - update requirements for pipestat 0.9.0, ubiquerg 0.8.0, and yacman 0.9.3
22 | - set `force_overwrite` to default to true, Issue #209
23 |
24 |
25 | ## [0.14.0] -- 2023-12-22
26 | ### Changed
27 | - refactor for pipestat v0.6.0 release
28 | - drop python 2.7
29 | - updated requirements
30 | - changed message_raw to be a value_dict when reporting to conform to pipestat
31 | ### Fixed
32 | - fixed #196 and #197
33 | ### Added
34 | - added `force_overwrite` to `report_result` and `report_object`
35 | - added pipestat_pipeline_type, defaulting to sample-level
36 |
37 | ## [0.13.2] -- 2023-08-02
38 | ### Fixed
39 | - fixed self.new_start overriding checkpoints.
40 |
41 | ## [0.13.1] -- 2023-07-14
42 | ### Fixed
43 | - added _safe_write_to_file back into pypiper for Pepatac backwards compatibility
44 |
45 | ## [0.13.0] -- 2023-06-29
46 | ### Added
47 |
48 | - [pipestat](http://pipestat.databio.org/en/latest/) support
49 |
50 | ## [0.12.3] -- 2022-01-25
51 |
52 | ### Fixed
53 | - A few bugs with compatibility with Python version 3.9
54 |
55 | ## [0.12.2] -- 2021-12-20
56 |
57 | ### Fixed
58 | - Removed use2to3 for compatibility with setuptools 58
59 |
60 | ## [0.12.1] -- 2019-08-29
61 |
62 | ### Fixed
63 | - Increased requirement for logmuse
64 |
65 | ### Changed
66 | - Sort argument outputs in logs
67 | - Fail messages can now be a string (previously required an Exception).
68 |
69 | ## [0.12.0] -- 2019-08-14
70 |
71 | ### Added
72 | - Use profile to determine total elapsed time
73 | - `logging` functions directly on `PipelineManager`
74 | - Re-export `add_logging_options` from `logmuse`, for direct use by a pipeline author.
75 | - `logger_via_cli` that defaults to the `strict=False` behavior of the same-named function from `logmuse`
76 | - Use logging for pypiper-generated output.
77 |
78 | ### Fixed
79 | - Fix childless processes memory monitoring issue
80 | - Fix problems with runtime reading from pipeline profile TSV formatted according to two styles
81 | - Fix problems running containerized executables that would sometimes hang
82 | - Fix inaccurate elapsed time accumulation
83 | - Fixed a bug that caused hanging when running in singularity containerized executables
84 | - Fixed bugs with merging bamfiles using samtools
85 |
86 | ### Changed
87 | - The hashes in the pipeline profile are produced from the entire original command, even if it is a pipe
88 | - Changed output to simplify and improve log readability
89 |
90 | ## [0.11.3] -- 2019-06-17
91 | ### Fixed
92 | - Fixed a bug that caused an OSError removing lock files for some filesystems.
93 |
94 | ## [0.11.2] -- 2019-06-06
95 | ### Fixed
96 | - Elevate `attmap` dependency bound to require inclusion of improved path expansion behavior.
97 |
98 | ## [0.11.1] -- 2019-05-30
99 | ### Fixed
100 | - Elevate `attmap` dependency bound to require inclusion of a bugfix there.
101 |
102 | ## [0.11.0] -- 2019-05-13
103 | - Improve python3 handling of integers and strings
104 | - Fixed a bug with cleanup scripts in `dirty` mode
105 | - Restructured profile output with hash and processID, and made lock paths relative
106 | - Streamlined some logging outputs
107 | - Allows nested parentheses and braces for piped commands
108 | - Fixed a bug that would have split a pipe within a braced command
109 | - Some performance improvements for ngstk functions
110 | - Allow `ngstk.input_to_fastq` to yield gzipped fastq files
111 |
112 | ## [0.10.0] -- 2019-03-22
113 | - Fixed a bug that raised exception with empty commands
114 | - Fixed the pipeline profiling issues
115 | - Major updates to internal systems: Switch to `attmap`
116 | - Revamped way of handling child subprocesses which should lead to more
117 | efficient memory monitoring of piped subprocesses, and more consistent
118 | handling of rogue subprocesses during pipeline failure.
119 | - Added force mode to ngstk `gzip` and `pigz` use.
120 | - Changed documentation from sphinx to mkdocs.
121 | - Fixed a bug with python3 output buffering
122 | - Implement multi-target commands
123 | - Fixed a bug that had prevented new start mode from working in certain cases.
124 | - Allow user to change units of memory passed in with default pypiper cli.
125 |
126 | ## [0.9.4] -- 2019-01-31
127 |
128 | - Point release to PyPI for README rendering.
129 |
130 | ## [0.9.3] -- 2019-01-31
131 |
132 | - Simple point release update to fix PyPI landing page.
133 |
134 | ## [0.9.2] -- 2019-01-30
135 |
136 | - Never echo protected-looking attribute request.
137 |
138 | ## [0.9.1] -- 2019-01-29
139 |
140 | - Fixed a bug in NGSTk that caused errors for read counting functions on
141 | MACOS. MACOS `wc` returns leading whitespace, which caused these functions
142 | to fail.
143 |
144 | ## [0.9.0] -- 2018-11-19
145 |
146 | - Use `psutil` to track aggregate memory usage for processes that spawn
147 | children. This results in accurate memory records for these processes.
148 | - Individual commands in a string of commands connected by shell pipes are
149 | now treated as individual commands, and monitored individually for
150 | time and memory, and if a single component fails, the entire string will
151 | fail. Previously, only the final return command was recorded, as in `bash`.
152 | - Various other small improvements (like waiting and checking for dynamic recover
153 | flags)
154 |
155 |
156 | ## [0.8.1] -- 2018-09-20
157 |
158 | - Fixed a bug that caused a problem for some pipelines adding groups of pypiper args.
159 | - Improved the `run` waiting method to immediately stop upon job
160 | completion, rather than minute-increment polling. This should improve
161 | performance particularly in pipelines with many, medium-runtime steps, and
162 | improve accuracy of timing profiles.
163 |
164 |
165 | ## [0.8.0] -- 2018-06-15
166 |
167 | - Implemented 'new start' mode.
168 | - Improved error messages and exception handling for missing child software.
169 | - Clarified the built-in required vs. optional args by allowing pipeline authors to specify which of the pypiper args are required. The command-line help UI now displays these correctly as 'required arguments' instead of incorrectly as 'optional arguments'.
170 | - Corrected the sort order of added arguments, so they are listed in the help menu more naturally.
171 | - Fixed a bug that caused an erroneous error message indicating missing pypiper args.
172 | - Clarified the license is BSD2
173 | - Fixed a bug that neglected to list pyyaml as a dependency
174 |
175 | ## [0.7.2] -- 2018-06-05
176 |
177 | - Implemented the 'report object' function.
178 | - Cleanup files are now relative, so a moved folder could still be cleaned.
179 | - Fixed a bug that prevented install if pypandoc was not installed
180 | - Fixed a bug that caused an error in containers where /proc wasn't accessible
181 |
182 |
183 | ## [0.7.1] -- 2018-02-27
184 |
185 | - Package cleanup for Pypi.
186 |
187 | ## [0.7.0] -- 2017-12-12
188 |
189 | - Standardize `NGSTk` function naming.
190 | - Introduce `Stage` as a model for a logically related set of pipeline processing steps.
191 | - Introduce `Pipeline` framework for automated processing phase execution and checkpointing.
192 | - Add ability to start and/or stop a pipeline at arbitrary checkpoints.
193 | - Introduce new state for a paused/halted pipeline.
194 | - Improve spawned process shutdown to avoid zombie processes.
195 |
196 | ## [0.6.0] -- 2017-08-24
197 |
198 | - Adds 'dynamic recovery' capability. For jobs that are terminated by an interrupt, such as a SIGINT or SIGTERM (as opposed to a failed command), pypiper will now set a dynamic recovery flags. These jobs, when restarted, will automatically pick up where they left off, without requiring any user intervention. Previously, the user would have to specify recover mode (`-R`). Now, recover mode forces a recover regardless of failure type, but interrupted pipelines will auto-recover.
199 | - Pypiper now appropriately flags intermediate files for cleanup on failed runs. It adds them to the cleanup script.
200 | - Improves error messages so only a single exception is raised with a more direct relevance to the user.
201 | - Pypiper will automatically remove existing flags when the run starts, eliminating the earlier issue of confusion due to multiple flags present on runs that were restarted.
202 | - Fixes a bug that caused a pipeline to continue if a SIGTERM is given during a process that was marked `nofail`.
203 | - Pypiper now can handle multiple SIGTERMs without one canceling the shutdown procedure begun by the other.
204 | - Major improvements to documentation and tutorials.
205 | - Adds `report_figure` function.
206 |
207 | ## [0.5.0] -- 2017-07-21
208 |
209 | - Adds preliminary support for handling docker containers
210 | - Updates docs, adds Hello World example
211 | - Adds 'waiting' flag
212 | - Eliminates extra spaces in reported results
213 | - Pypiper module is version aware
214 | - Updates Success time format to eliminate space
215 | - Improves efficiency in some ngstk merging functions
216 |
217 | ## [0.4.0] -- 2017-01-23
218 |
219 | - First major public release!
220 | - Revamps pypiper args
221 | - Adds parallel compression/decompression with pigz
222 | - Various small bug fixes and speed improvements
223 |
--------------------------------------------------------------------------------
/docs/clean.md:
--------------------------------------------------------------------------------
1 | # Cleaning up intermediate files
2 |
3 | Many pipelines produce intermediate files along the way. Should you retain these files or delete them?
4 |
5 | On the one hand, you may not necessarily want to delete them *immediately* after creating them, because what if a later pipeline step fails and you need to inspect an intermediate file? On the other hand, you may not want those intermediate files sticking around forever because they waste valuable disk space.
6 |
7 | Pypiper solves this problem with the concept of a *clean list*. The clean list is simply a list of files that are flagged for eventual cleanup. A pipeline developer adds to this list using `pm.clean_add(filename)`. Files on the clean list are *not* cleaned immediately; instead, they are **removed as soon as the pipeline is completed successfully** (in other words, after `pm.complete_pipeline()` is called). The advantage is that intermediate files will always be available as long as a pipeline has not completed successfully.
8 |
9 | In case a user of a pipeline instead wants to retain these files indefinitely, he or she may simply add `--dirty` when invoking the pipeline script. This instructs pypiper to *not* clean the intermediate files, even after a successful pipeline run. In this case, `pypiper` will produce a shell script (`clean.sh`), which can be run to remove all flagged files at a later point.
10 |
--------------------------------------------------------------------------------
/docs/cli.md:
--------------------------------------------------------------------------------
1 | # Command-line arguments
2 |
3 | Your final pypiper pipeline will be a python script that a pipeline user will invoke on the command-line. You will likely need to allow the user to change some parameters on the command line, and to take full advantage of Pypiper (make your pipeline recoverable, etc.), you will need to add command-line options to your pipeline that change pypiper's settings as well. Pypiper uses the typical Python [argparse module](https://docs.python.org/2/library/argparse.html) to define command-line arguments to your pipeline, and offers a series of built-in functions to help you populate your pipeline's `ArgumentParser` with pypiper-specific options.
4 |
5 | You can use an ArgumentParser as usual, adding whatever arguments you like. Then, you add Pypiper args to your parser with the function `add_pypiper_args()`, and pass command-line options and arguments to your `PipelineManager`, like this:
6 |
7 | ```{python}
8 | import pypiper, os, argparse
9 | parser = argparse.ArgumentParser(description='Write a short description here')
10 |
11 | # add any custom args here
12 | # e.g. parser.add_argument('--foo', help='foo help')
13 |
14 | # once you've established all your custom arguments, we can add the default
15 | # pypiper arguments to your parser like this:
16 |
17 | parser = pypiper.add_pypiper_args(parser)
18 |
19 | # Then, pass the args parsed along to the PipelineManager
20 |
21 | args = parser.parse_args()
22 |
23 | pipeline = pypiper.PipelineManager(name="my_pipeline", outfolder="out", \
24 | args=args)
25 | ```
26 |
27 | Once you've added pypiper arguments, your pipeline will then enable a few built-in arguments: `--recover`, `--follow`, and `--dirty`, for example. As a side bonus, all arguments (including any of your custom arguments) will be recorded in the log outputs.
28 |
29 | That's the basics. But you can customize things for more efficiency using a simple set of pre-built args and groups of args in pypiper:
30 |
31 |
32 | # Universal pypiper options
33 |
34 | With that said, there are a few universal (Pypiper-added) options that are frequently (but not necessarily always) honored by pypiper pipelines. These default pypiper arguments are detailed below:
35 |
36 | - `-R, --recover`
37 | Recover mode, overwrite locks. This argument will tell pypiper to recover from a failed previous run. Pypiper will execute commands until it encounters a locked file, at which point it will re-execute the failed command and continue from there.
38 |
39 | - `-F, --follow`
40 | Force run follow-functions. By default, follow-functions are only run if their corresponding `run` command was run; with this option you can force all follow functions to run. This is useful for regenerating QC data on existing output. For more details, see [the follow argument](../advanced-run-method/#the-follow-argument).
41 |
42 | - `-D, --dirty`
43 | Make all cleanups manual. By default, pypiper pipelines will delete any intermediate files. For debugging, you may want to turn this option off -- you can do that by specifying **dirty mode**.
44 |
45 | - `-N, --new-start`
46 | New start mode. This flag will tell pypiper to start over, and run every command, even if its target output already exists.
47 |
48 |
49 | ## Customizing `add_pypiper_args()`
50 |
51 |
52 | There are two ways to modulate the arguments added by the `add_pypiper_args()` function: the `groups` argument, which lets you add argument groups; or the `args` argument, which lets you add arguments individually. By default, `add_pypiper_args()` adds all arguments listed in the `pypiper` group. You may instead pass a list of one or more of these groups of arguments (to `groups`) or individual arguments (to `args`) to customize exactly the set of built-in options your pipeline implements.
53 |
54 | For example, `parser.add_pypiper_args(parser, groups=['pypiper', 'common'])` will add all arguments listed under `pypiper` and `common` below:
55 |
56 |
57 | ## Built-in arguments accessed with `add_pypiper_args()`
58 |
59 | Individual arguments that are understood and used by pypiper:
60 |
61 | - `-R, --recover`: for a failed pipeline run, start off at the last successful step.
62 | - `-N, --new-start`: Just recreate everything, even if it exists.
63 | - `-D, --dirty`: Disables automatic cleaning of temporary files, so all intermediate files will still exist after a pipeline run (either successful or failed). Useful for debugging a pipeline even if it succeeds.
64 | - `-F, --follow`: Runs all `follow-functions`, regardless of whether the accompanying command is run.
65 | - `-C, --config`: Pypiper pipeline config yaml file.
66 |
67 | Individual arguments just provided for convenience and standardization:
68 | - `-S, --sample-name`: name of the sample
69 | - `-I, --input`: primary input file (e.g. read1)
70 | - `-I2, --input2`: secondary input file (e.g. read2)
71 | - `-O, --output-parent`: parent folder for pipeline results (the pipeline will use this as the parent directory for a folder named `sample-name`)
72 | - `-P, --cores`: Number of cores to use
73 | - `-M, --mem`: Amount of memory in megabytes
74 | - `-G, --genome`: Reference genome assembly (e.g. `hg38`)
75 | - `-Q, --single-or-paired`: For sequencing data, is input single-end or paired-end?
76 |
77 | ## Pre-built collections of arguments added via `groups`:
78 |
79 | - pypiper: `recover`, `new-start`, `dirty`, `follow`
80 | - common: `input`, `sample-name`
81 | - config: `config`
82 | - resource: `mem`, `cores`
83 | - looper: `config`, `output-parent`, `mem`, `cores`
84 | - ngs: `input`, `sample-name`, `input2`, `genome`, `single-or-paired`
85 |
86 |
87 | ## Specifying required built-in arguments
88 |
89 | If you're using the built-in arguments, you may want to control which are required and which are not. That way, you can piggyback on how `ArgumentParser` handles required arguments very nicely -- if the user does not specify a required argument, the pipeline will automatically prompt with usage instructions.
90 |
91 | By default, built-in arguments are not flagged as required, but you can pass a list of required built-ins to the `required` parameter, like `add_pypiper_args(parser, args=["sample-name"], required=["sample-name"])`.
92 |
93 |
94 | ## Examples
95 |
96 | import pypiper, os, argparse
97 | parser = argparse.ArgumentParser(description='Write a short description here')
98 |
99 | # add just arguments from group `pypiper`
100 | parser = pypiper.add_pypiper_args(parser, groups=["pypiper"])
101 |
102 | # add just arguments from group `common`
103 | parser = pypiper.add_pypiper_args(parser, groups=["common"])
104 |
105 | # add arguments from two groups
106 | parser = pypiper.add_pypiper_args(parser, groups=["common", "resource"],
107 | required=["sample-name", "output-parent"])
108 |
109 | # add individual argument
110 | parser = pypiper.add_pypiper_args(parser, args=["genome"])
111 |
112 | # add some groups and some individual arguments
113 | parser = pypiper.add_pypiper_args(parser, args=["genome"], groups=["looper", "ngs"])
114 |
--------------------------------------------------------------------------------
/docs/configuration.md:
--------------------------------------------------------------------------------
1 | # Pipeline configuration files
2 |
3 | If you write a pipeline config file in `yaml` format and name it the same thing as the pipeline (but replacing `.py` with `.yaml`), pypiper will automatically load and provide access to these configuration options, and make it possible to pass customized config files on the command line. This is very useful for tweaking a pipeline for a similar project with slightly different parameters, without having to re-write the pipeline.
4 |
5 | It's easy: just load the `PipelineManager` with `args` (as described in [command-line arguments](cli.md)), and you have access to the config file automatically in `pipeline.config`.
6 |
7 | For example, in `myscript.py` you write:
8 |
9 | ```{python}
10 | parser = pypiper.add_pypiper_args(parser, args=["config"])
11 | pipeline = pypiper.PipelineManager(name="my_pipeline", outfolder=outfolder, \
12 |     args = parser.parse_args())
13 | ```
14 |
15 | And in the same folder, you include `myscript.yaml`:
16 |
17 |
18 |
19 | my_section:
20 | setting1: True
21 | setting2: 15
22 |
23 | Then you can access these settings automatically in your script using:
24 |
25 |
26 |
27 | pipeline.config.my_section.setting1
28 | pipeline.config.my_section.setting2
29 |
30 |
31 | This `yaml` file is useful for any parameters *not related to the input Sample* (which should be passed on the command-line). By convention, for consistency across pipelines, we use sections called `tools`, `resources`, and `parameters`, but the developer has the freedom to add other sections/variables as needed.
32 |
33 | Here's a more realistic example pipeline configuration file:
34 |
35 |
36 | ```{yaml}
37 | # paths to required tools
38 | tools:
39 | java: "/home/user/.local/tools/java"
40 | trimmomatic: "/home/user/.local/tools/trimmomatic.jar"
41 | fastqc: "fastqc"
42 | samtools: "samtools"
43 | bsmap: "/home/user/.local/tools/bsmap"
44 | split_reads: "/home/user/.local/tools/split_reads.py"
45 |
46 | # paths to reference genomes, adapter files, and other required shared data
47 | resources:
48 | resources: "/data/groups/lab_bock/shared/resources"
49 | genomes: "/data/groups/lab_bock/shared/resources/genomes/"
50 | adapters: "/data/groups/lab_bock/shared/resources/adapters/"
51 |
52 | # parameters passed to bioinformatic tools, subclassed by tool
53 | parameters:
54 | trimmomatic:
55 | quality_encoding: "phred33"
56 | threads: 30
57 | illuminaclip:
58 | adapter_fasta: "/home/user/.local/tools/resources/cpgseq_adapter.fa"
59 | seed_mismatches: 2
60 | palindrome_clip_threshold: 40
61 | simple_clip_threshold: 7
62 | slidingwindow:
63 | window_size: 4
64 | required_quality: 15
65 | maxinfo:
66 | target_length: 17
67 | strictness: 0.5
68 | minlen:
69 | min_length: 17
70 | bsmap:
71 | seed_size: 12
72 | mismatches_allowed_for_background: 0.10
73 | mismatches_allowed_for_left_splitreads: 0.06
74 | mismatches_allowed_for_right_splitreads: 0.00
75 | equal_best_hits: 100
76 | quality_threshold: 15
77 | quality_encoding: 33
78 | max_number_of_Ns: 3
79 | processors: 8
80 | random_number_seed: 0
81 | map_to_strands: 0
82 | ```
83 |
84 |
85 |
--------------------------------------------------------------------------------
/docs/contributing.md:
--------------------------------------------------------------------------------
1 |
2 | # Contributing
3 |
4 | We welcome contributions in the form of pull requests.
5 |
6 | If proposing changes to package source code, please run the test suite in `python2` and `python3` by running `pytest` or `python setup.py test` from within the repository root.
7 |
8 | If using `pytest` directly, we suggest first activating the appropriate Python version's virtual environment and running `pip install --upgrade ./`.
9 | Otherwise, simply specify the appropriate Python version, i.e. `python2 setup.py test` or `python3 setup.py test`.
--------------------------------------------------------------------------------
/docs/faq.md:
--------------------------------------------------------------------------------
1 | # FAQ
2 |
3 | ## How can I run my pipeline on more than 1 sample?
4 |
5 | Pypiper only handles individual-sample pipelines. To run it on multiple samples, write a loop, or use [looper](http://looper.readthedocs.io/). Dividing multi-sample handling from individual sample handling is a conceptual advantage that allows us to write a nice, universal, generic sample-handler that you only have to learn once.
6 |
7 | ## What cluster resources can pypiper use?
8 |
9 | Pypiper is compute-agnostic. You run it wherever you want. If you want a nice way to submit pipelines for samples to any cluster manager, check out [looper](http://looper.readthedocs.io/), which can run your pipeline on any compute infrastructure using the [divvy python package](http://code.databio.org/divvy).
10 |
11 | ## What does it mean for a sample to be in the "waiting" state?
12 |
13 | Waiting means `pypiper` encountered a file lock, but no recovery flag. So the pipeline thinks a process (from another run or another process) is currently writing that file. It periodically checks for the lock file to disappear, and assumes that the other process will unlock the file when finished. If you are sure there's not another process writing to that file, you can get `pypiper` to continue by deleting the corresponding `lock` file. In the future, you can use `pypiper's` recover mode (`-R`) to automatically restart a process when a `lock` file is found, instead of waiting.
14 |
15 | ## What is the 'elapsed time' in output?
16 |
17 | The "elapsed" time is referring to the amount of time since the preceding timestamp, not since the start of the pipeline. Timestamps are all displayed with a flag: `_TIME_`. The total cumulative time for the pipeline is displayed only at the end.
18 |
19 | ## How should I run a QC step to check results of one of my commands?
20 |
21 | Usually, you only want to run a QC step if the result was created in the same pipeline run. There's no need to re-run that step if you have to restart the pipeline due to an error later on. If you use `run()` for these steps, then they'll need to run each time the pipeline runs. Instead, this is exactly why we created [the follow argument](../advanced-run-method/#the-follow-argument). This option lets you couple a QC step to a `run()` call, so it only gets executed when it is required.
22 |
23 | ## How do I solve installation errors involving `psutil` and/or a compiler like `gcc` or `clang`?
24 |
25 | If you have trouble with installation and it looks like one of these pieces of software is involved, please check the [`psutil` installation guide](https://github.com/giampaolo/psutil/blob/master/INSTALL.rst).
26 |
27 |
--------------------------------------------------------------------------------
/docs/features.md:
--------------------------------------------------------------------------------
1 | # Pypiper features at-a-glance
2 |
3 |  **Simplicity**
4 |
5 | Pipelines are simple both to use and to develop. A pypiper pipeline is nothing more than a python script. You run it on the command line like you would any other python script. The basic documentation is just a few pages. It should only take you 15 minutes to write your first pipeline.
6 |
7 |  **Restartability**
8 |
9 | Commands check for their targets and only run if the target needs to be created. This provides computational advantages, and also means the pipeline will pick up where it left off in case it needs to be restarted or extended.
10 |
11 |  **File integrity protection**
12 |
13 | Pypiper uses automatic file locks. This ensures that tasks complete, and pipelines never continue with half-finished analysis. It also ensures that multiple pipeline runs will not interfere with one another -- even if the steps are identical and produce the same files.
14 |
15 |  **Copious logging**
16 |
17 | Pypiper automatically prints output to screen and also stores it in a log file, so all subprocess output is captured permanently. It also provides copious information on versions, compute host, and easy timestamping.
18 |
19 |  **Memory use monitoring**
20 |
21 | Processes are polled for memory use, allowing you to more accurately gauge your future memory requirements.
22 |
23 |  **Job status monitoring**
24 |
25 | Pypiper automatically creates status flag files, so you can summarize the current state (`running`, `failed`, or `completed`) of hundreds of jobs simultaneously.
26 |
27 |  **Easy result reports**
28 |
29 | Pypiper provides functions to put key-value pairs into an easy-to-parse stats file, making it easy to summarize your pipeline results.
30 |
31 |  **Robust error handling**
32 |
33 | Pypiper closes pipelines gracefully on interrupt or termination signals, converting the status to `failed`. By default, a process that returns a nonzero value halts the pipeline, unlike in bash, where by default the pipeline would continue using an incomplete or failed result. This behavior can be overridden as desired with a single parameter.
34 |
35 |  **Dynamic recovery**
36 |
37 | If a job is interrupted (with SIGINT or SIGTERM), either from a user or by a cluster resource manager, pypiper will set a `dynamic recovery` flag. The next time the run is started, it will automatically pick up where it left off. This makes pypiper pipelines `automatically pre-emption ready`, so they can be immediately deployed on servers where jobs may be pre-empted.
38 |
--------------------------------------------------------------------------------
/docs/img/error.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
294 |
--------------------------------------------------------------------------------
/docs/img/protection.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
219 |
--------------------------------------------------------------------------------
/docs/img/pypiper_bug.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
160 |
--------------------------------------------------------------------------------
/docs/img/recovery.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
240 |
--------------------------------------------------------------------------------
/docs/img/simplicity.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
281 |
--------------------------------------------------------------------------------
/docs/ngstk_intro.md:
--------------------------------------------------------------------------------
1 |
2 | # NGSTk - Next Gen Sequencing Toolkit
3 |
4 | Pypiper functions are generic; they simply accept command-line commands and run them. You could use this to produce a pipeline in any domain. To add to this, it's helpful to build convenience functions specific to your scientific domain. It's really easy to create your own library of python functions by creating a python package. Then, you just need to import your package in your pipeline script and make use of the common functions. We refer to this type of package as a "toolkit".
5 |
6 | Pypiper includes a built-in toolkit called NGSTk (next-generation sequencing toolkit). NGSTk simply provides some convenient helper functions to create common shell commands, like converting from file formats (_e.g._ `bam_to_fastq()`), merging files (_e.g._ `merge_bams()`), counting reads, etc. These make it faster to design bioinformatics pipelines in Pypiper, but are entirely optional.
7 |
8 | Here's how to use `NGSTk`:
9 |
10 | ```{python}
11 | import pypiper
12 | pm = pypiper.PipelineManager(..., args = args)
13 |
14 | # Create a ngstk object (pass the PipelineManager as an argument)
15 | ngstk = pypiper.NGSTk(pm = pm)
16 |
17 | # Now you can use ngstk functions
18 | cmd = ngstk.index_bam("sample.bam")
19 | pm.run(cmd, target="sample.bam")
20 | ```
21 |
22 | A complete list of functions is in the [API](../autodoc_build/pypiper) or in the [source code for NGSTk](https://github.com/databio/pypiper/blob/master/pypiper/ngstk.py).
23 |
--------------------------------------------------------------------------------
/docs/outputs.md:
--------------------------------------------------------------------------------
1 |
2 | # Outputs explained
3 |
4 | Assume you are using a pypiper pipeline named `PIPE` (it passes `name="PIPE"` to the PipelineManager constructor). By default, your `PipelineManager` will produce the following outputs automatically (in addition to any output created by the actual pipeline commands you run):
5 |
6 | * **PIPE_log.md**
7 | The log starts with a bunch of useful information about your run: a starting timestamp, version numbers of the pipeline and pypiper, a declaration of all arguments passed to the pipeline, the compute host, etc. Then, all output sent to screen is automatically logged to this file, providing a complete record of your run.
8 |
9 | * **PIPE_status.flag**
10 | As the pipeline runs, it produces a flag in the output directory, which can be either `PIPE_running.flag`, `PIPE_failed.flag`, or `PIPE_completed.flag`. These flags make it easy to assess the current state of running pipelines for individual samples, and for many samples in a project simultaneously.
11 |
12 | * **stats.yaml**
13 | Any results reported by the pipeline are saved as key-value pairs in this file, for easy parsing.
14 |
15 | * **PIPE_profile.md**
16 | A profile log file that provides, for every process run by the pipeline, 3 items: 1) the process name; 2) the clock time taken by the process; and 3) the memory high water mark used by the process. This file makes it easy to profile pipelines for memory and time resources.
17 |
18 | * **PIPE_commands.md**
19 | Pypiper produces a log file containing all the commands run by the pipeline, verbatim. These are also included in the main log.
20 |
21 | Multiple pipelines can easily be run on the same sample, using the same output folder (and possibly sharing intermediate files), as the result outputs will be identifiable by the `PIPE_` identifier.
22 |
23 | These files are [markdown](https://daringfireball.net/projects/markdown/) making it easy to read either in text format, or to quickly convert to a pretty format like HTML.
24 |
--------------------------------------------------------------------------------
/docs/philosophy.md:
--------------------------------------------------------------------------------
1 | # Pypiper's development philosophy
2 |
3 | ## Who should use Pypiper?
4 |
5 | The target audience for pypiper is an individual who wants to build a basic
6 | pipeline, but **wants to do better job than just writing a shell script, without
7 | learning a new language or system**. Many bioinformatics pipelines are simple
8 | shell scripts that piece together commands, because that seems the most
9 | accessible. Although there has been an explosion of more feature-rich pipeline
10 | development frameworks, these often require substantial training and investment
11 | to write a pipeline that could be more quickly written as a shell script.
12 | Pipelines built using a framework are also harder to understand for users
13 | unfamiliar with the framework, and require more experience to develop and
14 | modify. Pypiper tries to give 80% of the benefits of a professional-scale
15 | pipelining system while requiring very little additional effort.
16 |
17 | If you have a shell script that would benefit from a layer of "handling code",
18 | Pypiper helps you convert that set of shell commands into a production-scale
19 | workflow, automatically handling the annoying details (restartability, file
20 | integrity, logging) to make your pipeline robust and restartable.
21 |
22 | Pypiper's strength is its simplicity. If all you want is a
23 | shell-like script, but now with the power of python, some built-in benefits, and
24 | syntactic sugar, then Pypiper is for you.
25 |
26 | ## What Pypiper does NOT do
27 |
28 | Pypiper tries to exploit the [Pareto principle](https://en.wikipedia.org/wiki/Pareto_principle) -- you'll get 80% of the
29 | features with only 20% of the work of other pipeline management systems. So,
30 | there are a few things Pypiper deliberately doesn't do:
31 |
32 |
33 | - Task dependencies. Pypiper runs sequential pipelines. We view this as an
34 | advantage because it makes the pipeline easier to write, easier to understand,
35 | easier to modify, and easier to debug -- critical things for pipelines that
36 | are still under active development (which is most pipelines in bioinformatics). For
37 | developmental pipelines, the complexity introduced by task dependencies is not
38 | worth the minimal benefit -- read this [post on parallelism in
39 | bioinformatics](http://databio.org/posts/paralellism_in_bioinformatics.html)
40 | for an explanation.
41 |
42 | - Cluster submission. Pypiper pipelines are scripts. You can run them on
43 | whatever computing resources you have. We have divided cluster resource
44 | management into a separate project called
45 | [looper](http://looper.readthedocs.io/). Pypiper builds individual,
46 | single-sample pipelines that can be run one sample at a time.
47 | [Looper](http://looper.readthedocs.io/) then processes groups of samples,
48 | submitting appropriate pipelines to a cluster or server. The two projects are
49 | independent and can be used separately, keeping things simple and modular.
50 |
51 |
52 | ## Yet another pipeline system?
53 |
54 | As I began to put together production-scale pipelines, I found a lot of relevant
55 | pipelining systems, but was universally disappointed. For my needs, they were
56 | all overly complex. I wanted something **simple enough to quickly write and
57 | maintain** a pipeline without having to learn a lot of new functions and
58 | conventions, but robust enough to handle requirements like restartability and
59 | memory usage monitoring. Everything related was either a pre-packaged pipeline
60 | for a defined purpose, or a heavy-duty development environment that was overkill
61 | for a simple pipeline. Both of these seemed to be targeted toward ultra-
62 | efficient uses, and neither fit my needs: I had a set of commands already in
63 | mind -- I just needed a wrapper that could take that code and make it
64 | automatically restartable, logged, robust to crashing, easy to debug, and so
65 | forth.
66 |
67 | Pypiper has evolved over the years and gained lots of cool new features. But its
68 | core principle has remained the same: simplicity. A pypiper pipeline can be
69 | nothing more than a familiar python script that strings together a few shell
70 | commands.
--------------------------------------------------------------------------------
/docs/pipestat.md:
--------------------------------------------------------------------------------
1 | # Pipestat
2 |
3 | Starting with pypiper v0.13.0 [pipestat](http://pipestat.databio.org) is the recommended way of reporting pipeline statistics.
4 | You can browse the pipestat documentation to learn more about it, but briefly pipestat is a tool that standardizes reporting of pipeline results. It provides 1) a standard specification for how pipeline outputs should be stored; and 2) an implementation to easily write results to that format from within Python or from the command line.
5 |
6 | ## Advancements
7 |
8 | There are multiple advantages of using pipestat instead of the current pipeline results reporting system:
9 |
10 | 1. **Database results storage:** the results can be stored either in a database or a YAML-formatted results file. This way a pypiper pipeline running in an ephemeral compute environment can report the results to the database and exit. No need to sync the results with a central results storage.
11 | 2. **Strict and clear results definition:** all the results that can be reported by a pipeline run *must* be pre-defined in a [pipestat results schema](http://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format) that in a simplest case just indicates the result's type. This presents pipestat clients with the possibility to *reliably* gather all the possible results and related metadata.
12 | 3. **On-the-fly results validation:** the schema is used to validate and/or convert the reported result to a strictly determined type, which makes the connection of pypiper with downstream pipeline results processing software seamless.
13 | 4. **Unified, pipeline-agnostic results interface:** other pipelines, possibly created with different pipeline frameworks, can read and write results via Python API or command line interface. This feature significantly increases your pipeline interoperability.
14 |
15 | ## Setup
16 |
17 | In order to start reporting results with pipestat in your pipeline all you need to do is define a [pipestat results schema](http://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format):
18 |
19 | ```yaml
20 | my_int_result:
21 | type: integer
22 | description: "This is my first result"
23 | my_str_result:
24 | type: string
25 | ```
26 |
27 | And in the simplest case... that's it! Now you can use `pipestat` property of the `PipelineManager` object to report/retrieve results.
28 |
29 | Pypiper *by default* will use a YAML-formatted file to store the reported results in the selected `outfolder` and will look for a `pipestat_results_schema.yaml` file in the pipeline Python script directory.
30 |
31 | ### Advanced features
32 |
33 | Pypiper-pipestat integration really shines when more advanced features are used. Here's how to set them up.
34 |
35 | #### Configure custom pipestat options
36 |
37 | You can configure pipestat by passing arguments with custom values to `pypiper.PipelineManager` constructor:
38 |
39 | ```python
40 | pm = pypiper.PipelineManager(
41 | ...,
42 | pipestat_schema="custom_results_schema.yaml",
43 | pipestat_results_file="custom_results_file.yaml",
44 | pipestat_sample_name="my_record",
45 | pipestat_project_name="my_namespace",
46 | pipestat_config="custom_pipestat_config.yaml",
47 | )
48 | ```
49 |
50 | #### Use a database to store reported results
51 |
52 | In order to establish a database connection pipestat requires few pieces of information, which *must* be provided in a [pipestat configuration file](http://pipestat.databio.org/en/latest/config/) passed to the `PipelineManager` constructor.
53 |
54 | This is an example of such a file:
55 |
56 | ```yaml
57 | database:
58 | name: pypiper # database name
59 | user: pypiper # database user name
60 | password: pypiper # database password
61 | host: localhost # database host address
62 | port: 5433 # port the database is running on
  dialect: postgresql # type of the database
64 | driver: psycopg2 # driver to use to communicate
65 | ```
66 |
For reference, here is a Docker command that would run a PostgreSQL instance that could be used to store the pipeline results when configured with the configuration file above:
68 |
69 | ```console
70 | docker volume create postgres-data
71 |
72 | docker run -d --name pypiper-postgres \
  -p 5433:5432 -e POSTGRES_PASSWORD=pypiper \
74 | -e POSTGRES_USER=pypiper -e POSTGRES_DB=pypiper \
75 | -v postgres-data:/var/lib/postgresql/data postgres
76 | ```
77 |
78 | #### Highlight results
79 |
80 | The pipestat results schema can include any number of additional attributes for results. An example of that is *results highlighting*.
81 |
When a `highlight: true` attribute is included under a result identifier in the schema file, the highlighted results can be later retrieved by pipestat clients via the `PipelineManager.pipestat.highlighted_results` property, which simply returns a list of result identifiers to be presented in a special way.
83 |
84 | ### Usage
85 |
86 | Since a pipeline run-specific `PipestatManager` instance is attached to the `PipelineManager` object all the public pipestat API can be used. Please refer to the [pipestat API documentation](http://pipestat.databio.org/en/latest/autodoc_build/pipestat/) to read about all the currently available features.
87 |
88 | Here we present the most commonly used features:
89 |
90 | - results reporting
91 |
92 | *report a result, convert to schema-defined type and overwrite previously reported result*
93 |
94 | ```python
95 | results = {
96 | "my_int_result": 10,
97 | "my_str_result": "test"
98 | }
99 | pm.pipestat.report(
100 | values=results,
101 | strict_type=True,
102 | force_overwrite=True
103 | )
104 | ```
105 |
106 | - results retrieval
107 |
108 | ```python
109 | pm.pipestat.retrieve(result_identifier="my_int_result")
110 | ```
111 |
112 | - results schema exploration
113 |
114 | ```python
115 | pm.pipestat.schema
116 | ```
117 |
118 | - exploration of canonical [jsonschema](https://json-schema.org/) representation of result schemas
119 |
120 | ```python
121 | pm.pipestat.result_schemas
122 | ```
123 |
--------------------------------------------------------------------------------
/docs/report.md:
--------------------------------------------------------------------------------
1 | # Reporting statistics
2 |
3 | One of the most useful features of pypiper is the `report_result` function. This function provides a way to record small-scale results, like summary statistics. It standardizes the output so that universal tools can be built to process all the pipeline results from any pipeline, because the results are all reported in the same way.
4 |
5 | When you call `pm.report_result(key, value)`, pypiper simply writes the key-value pair to a `tsv` file (`stats.tsv`) in the pipeline output folder. These `stats.tsv` files can then later be read and aggregated systematically by other tools, such as `looper summarize`.
6 |
7 | ## Reporting objects
8 |
9 | **Note**: Reporting objects will be deprecated in a future release. It is recommended to use `report_result`.
10 |
11 | Starting in version 0.8, pypiper now implements a second reporting function, `report_object`. This is analogous to the `report_result` function, but instead of reporting simple key-value pairs, it lets you record any produced file as an output. Most commonly, this is used to record figures (PDFs, PNGs, etc.) produced by the pipeline. It can also be used to report other files, like HTML files.
12 |
13 | Pypiper writes results to `objects.tsv`, which can then be aggregated for project-level summaries of plots and other pipeline result files.
14 |
15 |
16 | ## Re-using previously reported results
17 |
18 | We frequently want to use the `report_result` capability in `follow` functions. It's a convenient place to do something like count or assess the result of a long-running command, and then report some summary statistic on it. One potential hangup with this strategy is dealing with secondary results after a pipeline is interrupted and restarted. By secondary result, I mean one that requires knowing the value of an earlier result. For example, if you want to compute the **percentage of reads that aligned**, you need to first know the **total reads** -- but what if your pipeline got interrupted and calculation of **total reads** happened in an earlier pipeline run?
19 |
20 | To solve this issue, Pypiper has a neat function called `get_stat` that lets you retrieve any value you've reported with `report_result` so you could use it to calculate statistics elsewhere in the pipeline. It will retrieve this either from memory, if the calculation of that result happened during the current pipeline run, or from the `stats.tsv` file, if the result was reported by an earlier run (or even another pipeline). So you could, in theory, calculate statistics based on results across pipelines.
21 |
22 | An example for how to use this is how we handle calculating the alignment rate in an NGS pipeline:
23 |
24 | ```{python}
25 | x = myngstk.count_mapped_reads(bamfile, args.paired_end)
26 | pm.report_result("Aligned_reads", x)
27 | rr = float(pm.get_stat("Raw_reads"))
pm.report_result("Alignment_rate", round(float(x) * 100 / rr, 3))
29 | ```
30 |
31 | Here, we use `get_stat` to grab a result that we reported previously (with `report_result`), when we counted the number of `Raw_reads` (earlier in the pipeline). We need this after the alignment to calculate the alignment rate. Later, now that we've reported `Alignment_rate`, you could harvest this stat again for use with `pm.get_stat("Alignment_rate")`. This is useful because you could put this block of code in a `follow` statement so it may not be executed, but you can still grab a reported result like this even if the execution happened outside of the current pipeline run; you'd only have to do the calculation once.
32 |
--------------------------------------------------------------------------------
/docs/support.md:
--------------------------------------------------------------------------------
1 |
2 | # Support
3 |
If you find a bug or want to request a feature, open an issue at https://github.com/databio/pypiper/issues.
5 |
--------------------------------------------------------------------------------
/docs_jupyter/build/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 |
--------------------------------------------------------------------------------
/docs_jupyter/hello-world.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Hello world\n",
8 | "\n",
9 | "This brief tutorial will run your first basic pypiper pipeline to ensure you have everything set up correctly. \n",
10 | "\n",
11 | "Just run these 3 lines of code and you're running your first pypiper pipeline!\n",
12 | "\n",
13 | "### Install the latest version of pypiper\n",
14 | "\n",
15 | "```{console}\n",
16 | "pip install --user piper\n",
17 | "```\n",
18 | "\n",
19 | "\n",
20 | "### Download hello_pypiper.py\n",
21 | "```{console}\n",
22 | "wget https://raw.githubusercontent.com/databio/pypiper/master/example_pipelines/hello_pypiper.py\n",
23 | "```\n",
24 | "\n",
25 | "This is a basic pipeline. Here are the contents:"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 4,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "name": "stdout",
35 | "output_type": "stream",
36 | "text": [
37 | "#!/usr/bin/env python\n",
38 | "\n",
39 | "import pypiper\n",
40 | "outfolder = \"hello_pypiper_results\" # Choose a folder for your results\n",
41 | "\n",
42 | "# Create a PipelineManager, the workhorse of pypiper\n",
43 | "pm = pypiper.PipelineManager(name=\"hello_pypiper\", outfolder=outfolder)\n",
44 | "\n",
45 | "# Timestamps to delineate pipeline sections are easy:\n",
46 | "pm.timestamp(\"Hello!\")\n",
47 | "\n",
48 | "# Now build a command-line command however you like, and pass it to pm.run()\n",
49 | "target_file = \"hello_pypiper_results/output.txt\"\n",
50 | "cmd = \"echo 'Hello, Pypiper!' > \" + target_file\n",
51 | "pm.run(cmd, target_file)\n",
52 | "\n",
53 | "pm.stop_pipeline()\n"
54 | ]
55 | }
56 | ],
57 | "source": [
58 | "cat ../example_pipelines/hello_pypiper.py"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "### Run it!"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 2,
71 | "metadata": {},
72 | "outputs": [
73 | {
74 | "name": "stdout",
75 | "output_type": "stream",
76 | "text": [
77 | "### [Pipeline run code and environment:]\n",
78 | "\n",
79 | "* Command: `../example_pipelines/hello_pypiper.py`\n",
80 | "* Compute host: nox\n",
81 | "* Working dir: /home/sheffien/code/pypiper/docs_jupyter\n",
82 | "* Outfolder: hello_pypiper_results/\n",
83 | "* Pipeline started at: (03-16 23:47:37) elapsed: 0.0 _TIME_\n",
84 | "\n",
85 | "### [Version log:]\n",
86 | "\n",
87 | "* Python version: 3.6.7\n",
88 | "* Pypiper dir: `/home/sheffien/.local/lib/python3.6/site-packages/pypiper`\n",
89 | "* Pypiper version: 0.9.5dev\n",
90 | "* Pipeline dir: `/home/sheffien/code/pypiper/example_pipelines`\n",
91 | "* Pipeline version: None\n",
92 | "* Pipeline hash: b'134e8c8f723da66697ab4f5b204315979b4e1042\\n'\n",
93 | "* Pipeline branch: b'* dev\\n'\n",
94 | "* Pipeline date: b'2019-03-16 11:41:56 -0400\\n'\n",
95 | "* Pipeline diff: b' 1 file changed, 16 insertions(+), 1 deletion(-)\\n'\n",
96 | "\n",
97 | "### [Arguments passed to pipeline:]\n",
98 | "\n",
99 | "\n",
100 | "----------------------------------------\n",
101 | "\n",
102 | "\n",
103 | "Changed status from initializing to running.\n",
104 | "No config file\n",
105 | "Hello! (03-16 23:47:37) elapsed: 0.0 _TIME_\n",
106 | "\n",
107 | "Target to produce: `hello_pypiper_results/output.txt`\n",
108 | "\n",
109 | "\n",
110 | "> `echo 'Hello, Pypiper!' > hello_pypiper_results/output.txt`\n",
111 | "\n",
112 | "
\n",
113 | "
\n",
114 | "Process 128 returned: (0). Elapsed: 0:00:00. Peak memory: (Process: None; Pipeline: 0GB)\n",
115 | "\n",
116 | "Changed status from running to completed.\n",
117 | "\n",
118 | "> `Time`\t0:00:00\thello_pypiper\t_RES_\n",
119 | "\n",
120 | "> `Success`\t03-16-23:47:37\thello_pypiper\t_RES_\n",
121 | "\n",
122 | "##### [Epilogue:]\n",
123 | "* Total elapsed time: 0:00:00\n",
124 | "* Peak memory used: 0 GB\n",
125 | "* Pipeline completed at: (03-16 23:47:37) elapsed: 0.0 _TIME_\n",
126 | "\n",
127 | "Pypiper terminating spawned child process 114...(tee)\n"
128 | ]
129 | }
130 | ],
131 | "source": [
132 | "python3 ../example_pipelines/hello_pypiper.py"
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "This output is printed to your screen and also recorded in a log file (called ``hello_pypiper_log.md``). There are a few other outputs from the pipeline as well. All results are placed in a folder called ``hello_pypiper_results``. Navigate to that folder to observe the output of the pipeline, which will include these files:\n",
140 | "\n",
141 | " * hello_pypiper_commands.sh\n",
142 | " * hello_pypiper_completed.flag\n",
143 | " * hello_pypiper_log.md\n",
144 | " * hello_pypiper_profile.tsv\n",
145 | " * output.txt\n",
146 | " * stats.tsv\n",
147 | "\n",
148 | "These files are explained in more detail in the reference section [outputs explained](outputs). \n",
149 | "\n",
150 | "What's next? That depends on if you're interested in just *running* pypiper pipelines, or if you want to *develop* pypiper pipelines. The next sections are a series of HOW-TO articles that address each of these scenarios.\n"
151 | ]
152 | }
153 | ],
154 | "metadata": {
155 | "kernelspec": {
156 | "display_name": "Bash",
157 | "language": "bash",
158 | "name": "bash"
159 | },
160 | "language_info": {
161 | "codemirror_mode": "shell",
162 | "file_extension": ".sh",
163 | "mimetype": "text/x-sh",
164 | "name": "bash"
165 | }
166 | },
167 | "nbformat": 4,
168 | "nbformat_minor": 2
169 | }
170 |
--------------------------------------------------------------------------------
/example_pipelines/basic.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

"""Getting Started: A simple sample pipeline built using pypiper."""

# This is a runnable example. You can run it to see what the output
# looks like.

# First, make sure you can import the pypiper package

import os

import pypiper

# Create a PipelineManager instance (don't forget to name it!)
# This starts the pipeline.

pm = pypiper.PipelineManager(name="BASIC", outfolder="pipeline_output/")

# Now just build shell command strings, and use the run function
# to execute them in order. run needs 2 things: a command, and the
# target file you are creating.

# First, generate some random data

# specify target file:
tgt = "pipeline_output/test.out"

# build the command
cmd = f"shuf -i 1-500000000 -n 10000000 > {tgt}"

# and run with run().
pm.run(cmd, target=tgt)

# Now copy the data into a new file.
# first specify target file and build command:
tgt = "pipeline_output/copied.out"
cmd = f"cp pipeline_output/test.out {tgt}"
pm.run(cmd, target=tgt)

# You can also string multiple commands together, which will execute
# in order as a group to create the final target.
cmd1 = "sleep 5"
cmd2 = "touch pipeline_output/touched.out"
pm.run([cmd1, cmd2], target="pipeline_output/touched.out")

# A command without a target will run every time.
# Find the biggest line
cmd = "awk 'n < $0 {n=$0} END{print n}' pipeline_output/test.out"
# Bugfix: "lock.max" must be passed as lock_name, not positionally.
# Passed positionally it becomes the *target*, so once a "lock.max" file
# exists the command is skipped -- contradicting the comment above. With
# no target and only a lock name, the command runs on every invocation.
pm.run(cmd, lock_name="lock.max")

# Use checkprint() to get the results of a command, and then use
# report_result() to print and log key-value pairs in the stats file:
last_entry = pm.checkprint("tail -n 1 pipeline_output/copied.out")
pm.report_result("last_entry", last_entry)


# Now, stop the pipeline to complete gracefully.
pm.stop_pipeline()

# Observe your outputs in the pipeline_output folder
# to see what you've created.
--------------------------------------------------------------------------------
/example_pipelines/count_reads.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

"""
Counts reads.
"""

__author__ = "Nathan Sheffield"
__email__ = "nathan@code.databio.org"
__license__ = "GPL3"
__version__ = "0.1"

import os
import re
import subprocess
import sys
from argparse import ArgumentParser

import yaml

import pypiper

parser = ArgumentParser(
    description="A pipeline to count the number of reads and file size. Accepts"
    " BAM, fastq, or fastq.gz files."
)

# Attach pypiper's standard CLI arguments: the "pypiper" group covers the
# framework options, "common" adds --input/--sample-name, and "ngs" adds
# NGS-specific options. See the pypiper docs ("command-line arguments").
parser = pypiper.add_pypiper_args(
    parser,
    groups=["pypiper", "common", "ngs"],
    args=["output-parent", "config"],
    required=["sample-name", "output-parent"],
)

# Pipeline-specific arguments would go here.

args = parser.parse_args()

# Without the required inputs there is nothing to do; show usage and bail.
if not args.input or not args.output_parent:
    parser.print_help()
    raise SystemExit

# Collapse the single/paired choice into a boolean flag used downstream.
args.paired_end = args.single_or_paired == "paired"

# `output_parent` and `sample_name` were added by add_pypiper_args above.
# House each sample's results in its own folder under the parent folder.
outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name))

# Build the PipelineManager, which starts the pipeline.
pm = pypiper.PipelineManager(name="count", outfolder=outfolder, args=args)

# NGSTk is pypiper's bundled toolkit of helpers for genome sequence data.
ngstk = pypiper.NGSTk(pm=pm)

raw_folder = os.path.join(outfolder, "raw/")
fastq_folder = os.path.join(outfolder, "fastq/")

# Merge/Link sample input and Fastq conversion:
# merge (if multiple) or link (if single) the inputs, then convert
# (for bam/fastq/gz formats) to fastq as needed.

# A timestamp delineates this section in the log file.
pm.timestamp("### Merge/link and fastq conversion: ")

# Two NGSTk helpers accept inputs of various types and yield fastq files.
local_input_files = ngstk.merge_or_link(
    [args.input, args.input2], raw_folder, args.sample_name
)

cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq(
    local_input_files, args.sample_name, args.paired_end, fastq_folder
)


# Record the total size of the input files.
pm.report_result("File_mb", ngstk.get_file_size(local_input_files))


# Average the per-file read counts over the number of non-empty inputs.
n_input_files = len([f for f in local_input_files if f])

total_reads = 0
for input_file in local_input_files:
    total_reads += int(ngstk.count_reads(input_file, args.paired_end))
raw_reads = total_reads / n_input_files

# report_result() prints the value and logs the key-value pair in the
# standard stats.tsv file.
pm.report_result("Raw_reads", str(raw_reads))

# Cleanup
pm.stop_pipeline()
--------------------------------------------------------------------------------
/example_pipelines/hello_pypiper.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

import pypiper

# All results for this pipeline land in this folder.
outfolder = "hello_pypiper_results"

# The PipelineManager drives the pipeline from start to finish.
pm = pypiper.PipelineManager(name="hello_pypiper", outfolder=outfolder)

# A timestamp marks a new section of the pipeline in the log.
pm.timestamp("Hello!")

# Assemble a shell command, then hand it to run() with its target file.
target_file = "hello_pypiper_results/output.txt"
cmd = "echo 'Hello, Pypiper!' > " + target_file
pm.run(cmd, target_file)

pm.stop_pipeline()
--------------------------------------------------------------------------------
/example_pipelines/logmuse_example.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """
4 | Counts reads.
5 | """
6 |
7 | __author__ = "Nathan Sheffield"
8 | __email__ = "nathan@code.databio.org"
9 | __license__ = "GPL3"
10 | __version__ = "0.1"
11 |
12 | import os
13 | import re
14 | import subprocess
15 | import sys
16 | from argparse import ArgumentParser
17 |
18 | import yaml
19 |
20 | import pypiper
21 |
22 |
def build_argparser():
    """Assemble and return the command-line parser for this pipeline."""
    p = ArgumentParser(
        description="A pipeline to count the number of reads and file size. Accepts"
        " BAM, fastq, or fastq.gz files."
    )

    # Attach pypiper's standard CLI argument groups: "pypiper" for the
    # framework options, "common" for --input/--sample-name, "ngs" for
    # NGS-specific options, and "logmuse" for logging control. See the
    # pypiper docs (section "command-line arguments") for details.
    #
    # `output_parent` and `sample_name` are added here as well; any
    # pipeline-specific arguments could be appended before returning.
    return pypiper.add_pypiper_args(
        p,
        groups=["pypiper", "common", "ngs", "logmuse"],
        args=["output-parent", "config"],
        required=["sample-name", "output-parent"],
    )
47 |
48 |
def run_pipeline():
    """
    Execute the read-counting pipeline.

    NOTE(review): reads the module-level ``args`` namespace populated in
    the ``__main__`` block below; it must be set before calling this.
    Returns None; results are reported via the PipelineManager.
    """
    # A good practice is to make an output folder for each sample, housed under
    # the parent output folder, like this:
    outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name))

    # Create a PipelineManager object and start the pipeline
    pm = pypiper.PipelineManager(name="logmuse-test", outfolder=outfolder, args=args)
    pm.info("Getting started!")
    # NGSTk is a "toolkit" that comes with pypiper, providing some functions
    # for dealing with genome sequence data. You can read more about toolkits in the
    # documentation

    # Demonstrate a multi-target run with clean=True so the temp files are
    # registered for cleanup at pipeline completion.
    files = [str(x) + ".tmp" for x in range(1, 20)]

    pm.run("touch " + " ".join(files), target=files, clean=True)

    # Create a ngstk object
    ngstk = pypiper.NGSTk(pm=pm)

    raw_folder = os.path.join(outfolder, "raw/")
    fastq_folder = os.path.join(outfolder, "fastq/")

    # Merge/Link sample input and Fastq conversion
    # These commands merge (if multiple) or link (if single) input files,
    # then convert (if necessary, for bam, fastq, or gz format) files to fastq.

    # We'll start with a timestamp that will provide a division for this section
    # in the log file
    pm.timestamp("### Merge/link and fastq conversion: ")

    # Now we'll rely on 2 NGSTk functions that can handle inputs of various types
    # and convert these to fastq files.

    local_input_files = ngstk.merge_or_link(
        [args.input, args.input2], raw_folder, args.sample_name
    )

    cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq(
        local_input_files, args.sample_name, args.paired_end, fastq_folder
    )

    # Now we'll use another NGSTk function to grab the file size from the input files
    #
    pm.report_result("File_mb", ngstk.get_file_size(local_input_files))

    # And then count the number of reads in the file

    # Count only truthy entries (input2 may be None/empty).
    n_input_files = len(list(filter(bool, local_input_files)))

    # Average of per-file read counts across the real input files.
    raw_reads = (
        sum(
            [
                int(ngstk.count_reads(input_file, args.paired_end))
                for input_file in local_input_files
            ]
        )
        / n_input_files
    )

    # Finally, we use the report_result() function to print the output and
    # log the key-value pair in the standard stats.tsv file
    pm.report_result("Raw_reads", str(raw_reads))

    # Cleanup
    pm.stop_pipeline()
114 |
115 |
if __name__ == "__main__":
    try:
        parser = build_argparser()
        args = parser.parse_args()

        # Without the required inputs there is nothing to do; show usage.
        if not (args.input and args.output_parent):
            parser.print_help()
            raise SystemExit

        # Collapse the single/paired choice into a boolean flag.
        args.paired_end = args.single_or_paired == "paired"

        sys.exit(run_pipeline())
    except KeyboardInterrupt:
        sys.exit(1)
133 |
--------------------------------------------------------------------------------
/init_interactive.py:
--------------------------------------------------------------------------------
""" Create dummy PipelineManager and NGSTk instance for interactive session. """

import os

from pypiper import NGSTk, PipelineManager

__author__ = "Vince Reuter"
__email__ = "vreuter@virginia.edu"


# Use the home directory as the output folder so the manager can always
# write its files during an interactive session.
pm = PipelineManager(name="interactive", outfolder=os.path.expanduser("~"))
# Toolkit bound to the manager above, ready for interactive use as `tk`.
tk = NGSTk(pm=pm)
13 |
--------------------------------------------------------------------------------
/logo_pypiper.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
160 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: Pypiper
2 | site_logo: img/pypiper_logo_dark.svg
3 | site_url: http://code.databio.org/pypiper/
4 | repo_url: http://github.com/databio/pypiper
5 | pypi_name: piper
6 |
7 | nav:
8 | - Getting Started:
9 | - Introduction: README.md
10 | - Philosophy: philosophy.md
11 | - Features at-a-glance: features.md
12 | - Hello world: hello-world.md
13 | - Developer guides:
14 | - Building a basic pipeline: basic-pipeline.md
15 | - Using the run method: advanced-run-method.md
16 | - Automatic command-line arguments: cli.md
17 | - Configuring pipelines: configuration.md
18 | - Reporting statistics: report.md
19 | - Reporting statistics with pipestat: pipestat.md
20 | - Cleaning up intermediate files: clean.md
21 | - Best practices: best-practices.md
22 | - Toolkits:
23 | - "NGSTk: the NGS toolkit": ngstk_intro.md
24 | - Reference:
25 | - Catalog of pipeline outputs: outputs.md
26 | - Pypiper API: autodoc_build/pypiper.md
27 | - NGSTk API: autodoc_build/ngstk.md
28 | - FAQ: faq.md
29 | - Support: support.md
30 | - Contributing: contributing.md
31 | - Changelog: changelog.md
32 |
33 | theme: databio
34 |
35 | plugins:
36 | - databio:
37 | autodoc_build: "docs/autodoc_build"
38 | autodoc_package: "pypiper"
39 | no_top_level: true
40 | build_list:
41 | pypiper: [PipelineManager]
42 | ngstk: [NGSTk]
43 | - search
44 |
45 |
--------------------------------------------------------------------------------
/pypiper/__init__.py:
--------------------------------------------------------------------------------
1 | # Implicitly re-export so logmuse usage by pipeline author routes through here.
2 | from logmuse import add_logging_options
3 |
4 | from ._version import __version__
5 | from .exceptions import *
6 | from .manager import *
7 | from .ngstk import *
8 | from .pipeline import *
9 | from .stage import *
10 | from .utils import *
11 |
--------------------------------------------------------------------------------
/pypiper/_version.py:
--------------------------------------------------------------------------------
# Canonical pypiper package version string.
__version__ = "0.14.4"
2 |
--------------------------------------------------------------------------------
/pypiper/const.py:
--------------------------------------------------------------------------------
""" Pypiper constants. """

# File extension used for checkpoint files.
CHECKPOINT_EXTENSION = ".checkpoint"
# Placeholder sample name used when none is provided.
DEFAULT_SAMPLE_NAME = "DEFAULT_SAMPLE_NAME"
# Separator between pipeline name and stage name in checkpoint filenames.
PIPELINE_CHECKPOINT_DELIMITER = "_"
# Replacement for spaces when normalizing a stage name.
STAGE_NAME_SPACE_REPLACEMENT = "-"
# Column names for the profiling table (see the *_profile.tsv output).
PROFILE_COLNAMES = ["pid", "hash", "cid", "runtime", "mem", "cmd", "lock"]
8 |
--------------------------------------------------------------------------------
/pypiper/exceptions.py:
--------------------------------------------------------------------------------
1 | """ Custom pypiper exceptions """
2 |
3 | __author__ = "Vince Reuter"
4 | __email__ = "vreuter@virginia.edu"
5 |
6 |
# Public exception API; re-exported from the package root via `from
# .exceptions import *`.
__all__ = [
    "PipelineError",
    "PipelineHalt",
    "IllegalPipelineDefinitionError",
    "IllegalPipelineExecutionError",
    "MissingCheckpointError",
    "UnknownPipelineStageError",
    "UnsupportedFiletypeException",
    "SubprocessError",
]
17 |
18 |
class PipelineError(Exception):
    """General pipeline error; base class for the Illegal* pipeline errors."""

    pass
23 |
24 |
class SubprocessError(Exception):
    """Error related to a subprocess spawned by a pipeline."""

    pass
27 |
28 |
class IllegalPipelineDefinitionError(PipelineError):
    """Error in how a pipeline is defined/declared."""

    pass
31 |
32 |
class IllegalPipelineExecutionError(PipelineError):
    """Represent cases of illogical start/stop run() declarations."""

    # Raised for an illogical combination of start/stop points in a run.
    pass
37 |
38 |
class MissingCheckpointError(Exception):
    """
    Represent case of expected but absent checkpoint file.

    :param str checkpoint: Name of the checkpoint that was expected.
    :param str filepath: Path at which the checkpoint file was missing.
    """

    def __init__(self, checkpoint, filepath):
        # Message pairs the checkpoint name with the path that was absent.
        # Modernized from Python-2-style super(MissingCheckpointError, self).
        super().__init__("{}: '{}'".format(checkpoint, filepath))
45 |
46 |
class UnknownPipelineStageError(Exception):
    """
    Triggered by use of unknown/undefined name for a pipeline stage.

    :param str stage_name: Name of the stage triggering the exception.
    :param pypiper.Pipeline pipeline: Pipeline for which the stage is
        unknown/undefined; if it can enumerate its stages, they are
        appended to the message for context.
    """

    def __init__(self, stage_name, pipeline=None):
        message = stage_name
        if pipeline is not None:
            try:
                stages = pipeline.stages()
            except AttributeError:
                # Pipeline can't enumerate stages; just don't contextualize
                # the error with the known stages.
                pass
            else:
                message = "{}; defined stages: {}".format(
                    message, ", ".join(map(str, stages))
                )
        # Modernized from Python-2-style super(UnknownPipelineStageError, self).
        super().__init__(message)
68 |
69 |
class PipelineHalt(Exception):
    """
    Execution-stopping exception for halting a pipeline.

    This is useful for stopping execution of a truly script-like pipeline.
    That is, a pipeline that doesn't bundle/define stages or wrap run() calls
    in functions. In this case, we want to be able to stop the Python process
    as it chugs through a pipeline script, and we can do that by having a
    PipelineManager's halt method raise this exception.

    :param str | object checkpoint: Name of the last stage reached, or an
        object carrying it via a ``name`` or ``__name__`` attribute.
    :param bool finished: Whether the named stage completed; None yields a
        bare stage-name message.
    """

    def __init__(self, checkpoint=None, finished=None):
        # Flattened from deeply nested branches; behavior is unchanged:
        # no usable stage name -> empty message, otherwise a message that
        # reflects whether the stage finished.
        msg = None
        if checkpoint is not None:
            if isinstance(checkpoint, str):
                last_stage_done = checkpoint
            else:
                # Accept Stage-like objects (name) or plain callables (__name__).
                last_stage_done = getattr(checkpoint, "name", None) or getattr(
                    checkpoint, "__name__", None
                )
            if last_stage_done:
                if finished is None:
                    msg = last_stage_done
                elif finished:
                    msg = "Finished '{}'".format(last_stage_done)
                else:
                    msg = "Stopped at '{}'".format(last_stage_done)
        # Modernized from Python-2-style super(PipelineHalt, self) calls.
        if msg is None:
            super().__init__()
        else:
            super().__init__(msg)
102 |
103 |
class UnsupportedFiletypeException(Exception):
    """Restrict filetype domain; raised for a file type outside it."""

    # Use superclass ctor to allow file name/path or extension to pass
    # through as the message for why this error is occurring.
    pass
110 |
--------------------------------------------------------------------------------
/pypiper/flags.py:
--------------------------------------------------------------------------------
""" Status flags """

# TODO: ultimately, these should migrate to pep.
# Each value doubles as a pipeline status string and as the suffix of the
# on-disk flag file (e.g. "<name>_completed.flag") a manager writes.
RUN_FLAG = "running"  # pipeline is currently executing
COMPLETE_FLAG = "completed"  # pipeline finished successfully
FAIL_FLAG = "failed"  # pipeline stopped with an error
WAIT_FLAG = "waiting"  # pipeline is waiting -- presumably on a lock; TODO confirm
PAUSE_FLAG = "partial"  # pipeline stopped partway, intentionally
FLAGS = [RUN_FLAG, COMPLETE_FLAG, FAIL_FLAG, WAIT_FLAG, PAUSE_FLAG]

__all__ = ["COMPLETE_FLAG", "FAIL_FLAG", "FLAGS", "PAUSE_FLAG", "RUN_FLAG", "WAIT_FLAG"]
12 |
--------------------------------------------------------------------------------
/pypiper/folder_context.py:
--------------------------------------------------------------------------------
1 | """ Context manager for temporarily changing folder. """
2 |
3 | import os
4 |
5 | __author__ = "Vince Reuter"
6 | __email__ = "vreuter@virginia.edu"
7 |
8 |
class FolderContext(object):
    """Context manager for temporarily changing directory."""

    def __init__(self, folder):
        """
        Remember where we are so we can come back on exit.

        :param str folder: Path to set as new working directory
        :raise ValueError: if the requested path is not an existing folder
        """
        if not os.path.isdir(folder):
            raise ValueError("Requested temp entry to non-folder: {}".format(folder))
        self._target = folder
        self._origin = os.getcwd()

    def __enter__(self):
        """Step into the requested directory."""
        os.chdir(self._target)

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Step back to the original directory, verifying it still exists."""
        if os.path.isdir(self._origin):
            os.chdir(self._origin)
        else:
            raise RuntimeError(
                "Return path is no longer a directory: {}".format(self._origin)
            )
34 |
--------------------------------------------------------------------------------
/pypiper/stage.py:
--------------------------------------------------------------------------------
1 | """ Conceptualize a pipeline processing phase/stage. """
2 |
3 | import copy
4 |
5 | from .utils import translate_stage_name
6 |
7 | __author__ = "Vince Reuter"
8 | __email__ = "vreuter@virginia.edu"
9 |
10 |
11 | __all__ = ["Stage"]
12 |
13 |
class Stage(object):
    """
    Single stage/phase of a pipeline; a logical processing "unit". A stage is a
    collection of commands that is checkpointed.
    """

    def __init__(
        self,
        func,
        f_args=None,
        f_kwargs=None,
        name=None,
        checkpoint=True,
        *,
        nofail=False
    ):
        """
        A function, perhaps with arguments, defines the stage.

        :param callable func: The processing logic that defines the stage
        :param tuple f_args: Positional arguments for func
        :param dict f_kwargs: Keyword arguments for func
        :param str name: name for the phase/stage; defaults to func.__name__
        :param bool checkpoint: whether this stage is checkpointed (i.e.,
            whether it has a checkpoint name)
        :param bool nofail: Allow a failure of this stage to not fail the pipeline
            in which it's running
        :raise TypeError: if func is itself already a Stage
        """
        # Disallow wrapping a Stage in a Stage; func must be the raw callable.
        if isinstance(func, Stage):
            raise TypeError("Cannot create Stage from Stage")
        super(Stage, self).__init__()
        self.f = func
        self.f_args = f_args or tuple()
        self.f_kwargs = f_kwargs or dict()
        self.name = name or func.__name__
        self.checkpoint = checkpoint
        self.nofail = nofail

    @property
    def checkpoint_name(self):
        """
        Determine the checkpoint name for this Stage.

        :return str | NoneType: Checkpoint name for this stage; null if this
            Stage is designated as a non-checkpoint.
        """
        return translate_stage_name(self.name) if self.checkpoint else None

    def run(self, *args, **kwargs):
        """
        Alternate form for direct call; execute stage.

        :return object: whatever the stage's function returns
        """
        # Fixed: previously the function's return value was silently discarded.
        return self(*args, **kwargs)

    def __call__(self, *args, **update_kwargs):
        """
        Execute the stage, allowing updates to args/kwargs.

        :return object: whatever the stage's function returns
        """
        # Deep-copy stored kwargs so per-call updates never mutate the Stage.
        kwargs = copy.deepcopy(self.f_kwargs)
        kwargs.update(update_kwargs)
        # Caller-supplied positional args take precedence over stored ones.
        args = args or self.f_args
        return self.f(*args, **kwargs)

    def __eq__(self, other):
        # Compare functions by name, and all other attributes by value.
        return (
            isinstance(other, Stage)
            and self.f.__name__ == other.f.__name__
            and (
                {k: v for k, v in self.__dict__.items() if k != "f"}
                == {k: v for k, v in other.__dict__.items() if k != "f"}
            )
        )

    def __ne__(self, other):
        return not (self == other)

    def __repr__(self):
        return (
            "{klass} '{n}': f={f}, args={pos}, kwargs={kwd}, "
            "checkpoint={check}".format(
                klass=self.__class__.__name__,
                f=self.f,
                n=self.name,
                pos=self.f_args,
                kwd=self.f_kwargs,
                check=self.checkpoint,
            )
        )

    def __str__(self):
        return "{}: '{}'".format(self.__class__.__name__, self.name)
--------------------------------------------------------------------------------
/requirements/requirements-dev-extra.txt:
--------------------------------------------------------------------------------
1 | black
2 |
--------------------------------------------------------------------------------
/requirements/requirements-docs.txt:
--------------------------------------------------------------------------------
1 | mkdocs>=1.0
2 | markdown-include
3 | pydoc-markdown
4 | piper
5 | pipestat>=0.9.0a1
6 | https://github.com/databio/mkdocs-databio/archive/master.zip
--------------------------------------------------------------------------------
/requirements/requirements-ngstk.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pandas
3 | pysam
4 | yacman
5 | pipestat>=0.1.0
--------------------------------------------------------------------------------
/requirements/requirements-plot.txt:
--------------------------------------------------------------------------------
1 | matplotlib
2 | scipy
3 | seaborn
4 |
--------------------------------------------------------------------------------
/requirements/requirements-pypiper.txt:
--------------------------------------------------------------------------------
1 | logmuse>=0.2.4
2 | psutil
3 | pandas
4 | ubiquerg>=0.8.0
5 | yacman>=0.9.3
6 | pipestat>=0.11.0
7 |
--------------------------------------------------------------------------------
/requirements/requirements-test.txt:
--------------------------------------------------------------------------------
1 | mock==2.0.0
2 | pytest>=4.6.9
3 | pytest-cov>=2.8.1
4 | hypothesis==4.38.0
5 | coveralls
6 | veracitools
7 | pytest-remotedata
8 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [aliases]
2 | test=pytest
3 |
4 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | import os
4 | import sys
5 |
6 | extra = {}
7 |
8 | try:
9 | from setuptools import setup
10 | except ImportError:
11 | from distutils.core import setup
12 |
13 |
def read_reqs_file(reqs_name):
    """
    Read requirements file for given requirements group.

    :param str reqs_name: name of the requirements group, e.g. "test"
    :return list[str]: the package specifications from the file, with
        comment lines and blank lines excluded
    """
    path_reqs_file = os.path.join(
        "requirements", "requirements-{}.txt".format(reqs_name)
    )
    with open(path_reqs_file, "r") as reqs_file:
        # Fixed: blank lines (several requirements files end with one)
        # previously became empty-string entries in the returned list,
        # polluting install_requires/extras_require.
        return [
            line.strip()
            for line in reqs_file
            if line.strip() and not line.lstrip().startswith("#")
        ]
23 |
24 |
# Parse the version out of pypiper/_version.py; the first line is expected
# to look like: __version__ = "X.Y.Z"
with open(os.path.join("pypiper", "_version.py"), "r") as versionfile:
    version = versionfile.readline().split()[-1].strip("\"'\n")


# Core runtime dependencies.
basic_reqs = read_reqs_file("pypiper")

# Requirements for tests
test_reqs = read_reqs_file("test")

# Allow specification of desired features, which implies dependencies.
addl_reqs = {
    bundle_name: read_reqs_file(bundle_name) for bundle_name in ["ngstk", "plot"]
}

# Complete collection of user requirements.
# Set comprehension deduplicates packages shared across bundles.
addl_reqs["all"] = list({pkg for bundle in addl_reqs.values() for pkg in bundle})

# Dev installation is full user + test.
addl_reqs["dev"] = list(set(test_reqs + addl_reqs["all"]))

# Use the README as the long description rendered on PyPI.
with open("README.md") as f:
    long_description = f.read()

setup(
    name="piper",  # NOTE: distribution name differs from the package name "pypiper"
    packages=["pypiper"],
    install_requires=basic_reqs,
    version=version,
    description="A lightweight python toolkit for gluing together restartable, robust command line pipelines",
    long_description=long_description,
    long_description_content_type="text/markdown",
    classifiers=[
        "Development Status :: 4 - Beta",
        "License :: OSI Approved :: BSD License",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Topic :: Scientific/Engineering :: Bio-Informatics",
    ],
    author="Nathan Sheffield, Johanna Klughammer, Andre Rendeiro",
    author_email="nathan@code.databio.org, jklughammer@cemm.oeaw.ac.at, arendeiro@cemm.oeaw.ac.at",
    url="https://github.com/databio/pypiper/",
    license="BSD2",
    test_suite="tests",  # python setup.py test
    tests_require=test_reqs,  # Test-specific package dependencies
    # Extra package if doing `python setup.py test`
    setup_requires=(
        ["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else []
    ),
    extras_require=addl_reqs,
    # Version-specific items
    **extra
)
79 |
--------------------------------------------------------------------------------
/tests/Data/default_pipestat_output_schema.yaml:
--------------------------------------------------------------------------------
1 | #NOTE:
2 | # This output schema can be customized for your specific pipeline.
3 | # See here for more details:
4 | # https://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema-format
5 | pipeline_name: default_pipeline_name
6 | samples:
7 | number_of_things:
8 | type: integer
9 | description: "Number of things"
--------------------------------------------------------------------------------
/tests/Data/sample_output_schema.yaml:
--------------------------------------------------------------------------------
1 | pipeline_name: test_pipe
2 | samples:
3 | number_of_things:
4 | type: integer
5 | description: "Number of things"
6 | percentage_of_things:
7 | type: number
8 | description: "Percentage of things"
9 | name_of_something:
10 | type: string
11 | description: "Name of something"
12 | switch_value:
13 | type: boolean
14 | description: "Is the switch on or off"
15 | output_file:
16 | type: file
17 | description: "This a path to the output file"
18 | output_image:
19 | type: image
20 | description: "This a path to the output image"
21 | md5sum:
22 | type: string
23 | description: "MD5SUM of an object"
24 | highlight: true
25 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/pypiper/7c0e129440509610fb1d476a4076357105aebf8c/tests/__init__.py
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """ Fixtures and configuration visible to all tests """
2 |
3 | import copy
4 | import os
5 | from functools import partial
6 |
7 | import pytest
8 |
9 | from pypiper import Pipeline, PipelineManager, Stage
10 |
11 | __author__ = "Vince Reuter"
12 | __email__ = "vreuter@virginia.edu"
13 |
14 |
# Use a weird suffix for glob specificity.
OUTPUT_SUFFIX = ".testout"

TEST_PIPE_NAME = "test-pipe"

# Content written by each of the three file-writer helpers below.
FILE1_TEXT = "hello there"
FILE2_TEXT = "hello2"
FILE3_TEXT = "third"
CONTENTS = [FILE1_TEXT, FILE2_TEXT, FILE3_TEXT]

# Names of the files the writer helpers create.
FILE1_NAME = "file1" + OUTPUT_SUFFIX
FILE2_NAME = "file2" + OUTPUT_SUFFIX
FILE3_NAME = "file3" + OUTPUT_SUFFIX
FILENAMES = [FILE1_NAME, FILE2_NAME, FILE3_NAME]

# Pair each filename with the content written to it.
FILE_TEXT_PAIRS = [(fname, text) for fname, text in zip(FILENAMES, CONTENTS)]
31 |
32 |
@pytest.fixture
def get_pipe_manager(tmpdir):
    """Provide safe creation of pipeline manager, with multi=True."""

    def get_mgr(**kwargs):
        # Default the output folder to the test's tmpdir unless provided;
        # deep-copy so the caller's kwargs are never mutated.
        if "outfolder" in kwargs:
            effective = kwargs
        else:
            effective = copy.deepcopy(kwargs)
            effective["outfolder"] = tmpdir.strpath
        return PipelineManager(multi=True, **effective)

    return get_mgr
46 |
47 |
@pytest.fixture
def pl_mgr(request, get_pipe_manager):
    """Provide a PipelineManager and ensure that it's stopped."""
    manager = get_pipe_manager(name=TEST_PIPE_NAME)
    # Bound method serves directly as the teardown callback.
    request.addfinalizer(manager.stop_pipeline)
    return manager
58 |
59 |
@pytest.fixture
def dummy_pipe(pl_mgr):
    """Provide a basic Pipeline instance for a test case."""
    pipeline = DummyPipeline(pl_mgr)
    return pipeline
64 |
65 |
def write_file1(folder):
    """Write the first (name, content) test pair into the given folder."""
    fname, text = FILE_TEXT_PAIRS[0]
    _write(fname, text, folder=folder)
68 |
69 |
def write_file2(folder):
    """Write the second (name, content) test pair into the given folder."""
    fname, text = FILE_TEXT_PAIRS[1]
    _write(fname, text, folder=folder)
72 |
73 |
def write_file3(folder):
    """Write the third (name, content) test pair into the given folder."""
    fname, text = FILE_TEXT_PAIRS[2]
    _write(fname, text, folder=folder)
76 |
77 |
def _write(filename, content, folder=None):
    """Write the given content to filename inside folder."""
    target = os.path.join(folder, filename)
    with open(target, "w") as fh:
        fh.write(content)
82 |
83 |
class DummyPipeline(Pipeline):
    """Basic pipeline implementation for tests"""

    def __init__(self, manager):
        super(DummyPipeline, self).__init__(TEST_PIPE_NAME, manager=manager)

    def stages(self):
        """
        Establish the stages/phases for this test pipeline.

        :return list[pypiper.Stage]: Sequence of stages for this pipeline.
        """

        def _bind_folder(writer):
            # Fix the writer's output folder; preserve its name for Stage.
            bound = partial(writer, folder=self.outfolder)
            bound.__name__ = writer.__name__
            return bound

        return [
            Stage(_bind_folder(w)) for w in (write_file1, write_file2, write_file3)
        ]
103 |
--------------------------------------------------------------------------------
/tests/helpers.py:
--------------------------------------------------------------------------------
1 | """ Helpers for tests """
2 |
3 | import glob
4 | import os
5 | from functools import partial
6 |
7 | import pytest
8 |
9 | from pypiper import Pipeline
10 | from pypiper.utils import checkpoint_filepath
11 |
12 | __author__ = "Vince Reuter"
13 | __email__ = "vreuter@virginia.edu"
14 |
15 |
def assert_equal_dirpath(p1, p2):
    """
    Assert that a pair of folder paths has two equal members.

    :param str p1: One path to compare.
    :param str p2: Other path to compare.
    """
    # Ignore any trailing path separator on either side.
    trimmed_1 = p1.rstrip(os.sep)
    trimmed_2 = p2.rstrip(os.sep)
    assert trimmed_1 == trimmed_2
24 |
25 |
def fetch_checkpoint_files(pm):
    """
    Fetch all of a manager's checkpoint file paths.

    :param pypiper.PipelineManager pm: manager for which checkpoint files'
        paths are of interest.
    :return Iterable[str]: collection of all of given manager's checkpoint
        files' paths.
    """
    # A wildcard stage name yields a glob pattern over all checkpoints.
    return glob.glob(checkpoint_filepath("*", pm))
37 |
38 |
def named_param(argnames, argvalues):
    """
    Improve pytest's native labeling of test case parameterization.

    This function thinly wraps the 'parametrize' mark from pytest, adding
    clearer labeling of each individual parameterized test case, overriding
    the index-based labeling that pytest uses by default.

    :param str argnames: Single parameter name, named in the plural only for
        concordance with the native pytest name.
    :param Iterable argvalues: Arguments for the parameter, what define the
        distinct test cases.
    :return functools.partial: Parameterize version of parametrize, with
        values and ids fixed.
    """

    def _label(val):
        # Show "name=value" rather than pytest's positional index.
        return "{}={}".format(argnames, val)

    mark = pytest.mark.parametrize(argnames=argnames, argvalues=argvalues, ids=_label)
    return partial(mark)
61 |
62 |
class SafeTestPipeline(Pipeline):
    """Pipeline for tests that protects against bad file descriptor."""

    def __init__(self, *args, **kwargs):
        # Default to multi=True (like interactive mode) unless caller overrides.
        kwargs.setdefault("multi", True)
        super(SafeTestPipeline, self).__init__(*args, **kwargs)
70 |
--------------------------------------------------------------------------------
/tests/pipeline/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/pypiper/7c0e129440509610fb1d476a4076357105aebf8c/tests/pipeline/__init__.py
--------------------------------------------------------------------------------
/tests/pipeline/conftest.py:
--------------------------------------------------------------------------------
1 | """ Test configuration for Pipeline tests. """
2 |
3 | import os
4 |
5 | import pytest
6 |
7 | from pypiper import Stage
8 | from tests.helpers import SafeTestPipeline
9 |
10 | __author__ = "Vince Reuter"
11 | __email__ = "vreuter@virginia.edu"
12 |
13 |
14 | READ_ALIGNER_FILENAME = "aligner.lst"
15 | PEAK_CALLER_FILENAME = "caller.lst"
16 |
17 |
def pytest_generate_tests(metafunc):
    """Dynamic test case parameterization."""
    if "pl_name" not in metafunc.fixturenames:
        return
    pipeline_names = [read_aligner.__name__, call_peaks.__name__]
    metafunc.parametrize("pl_name", pipeline_names)
22 |
23 |
# Dummy functions used as elements of pipeline stages() collections.
def merge_input():
    """No-op placeholder for an input-merging stage."""


def qc():
    """No-op placeholder for a quality-control stage."""


def align_reads():
    """No-op placeholder for a read-alignment stage."""


def call_peaks():
    """No-op placeholder for a peak-calling stage."""
39 |
40 |
class FunctionNameWriterPipeline(SafeTestPipeline):
    """Basic pipeline that writes to file the names of its functions."""

    def __init__(self, name, outfolder, filename, functions):
        """
        Name and outfolder go to generic pipeline ctor; filename and functions
        collection are used specifically by instances of this class.

        :param str name: Name for this pipeline.
        :param str outfolder: Path to pipeline's output folder.
        :param str filename: Name for file in which to write function names.
        :param Sequence[callable] functions: Functions on which this pipeline
            is to operate (i.e., the functions for which name should be
            written to output file).
        """
        # Set instance-specific variables.
        # NOTE: these must be bound BEFORE delegating to the superclass ctor,
        # which apparently wires up stages() -- and stages() reads
        # self.functions -- TODO confirm against Pipeline's ctor.
        self.name_output_file = filename
        self.functions = functions
        # Get the stages() benefit of superclass extension.
        super(FunctionNameWriterPipeline, self).__init__(name=name, outfolder=outfolder)

    def write_name(self, func):
        """
        Write the name of a function to this pipeline's output file.

        :param callable func: Function whose name should be appended to the
            output file.
        """
        outpath = os.path.join(self.outfolder, self.name_output_file)
        # Append mode: each stage adds one line to the shared output file.
        with open(outpath, "a") as f:
            f.write(func.__name__ + os.linesep)

    def run(self, **kwargs):
        """Start with clean output file, then use superclass method."""
        # Ensure that we start with a clean file since the nature of the
        # operations performed (sequential file writes) creates desire to
        # open output file in append mode rather than write mode.
        output_file = os.path.join(self.outfolder, self.name_output_file)
        if os.path.exists(output_file):
            os.unlink(output_file)
        super(FunctionNameWriterPipeline, self).run(**kwargs)

    def stages(self):
        """Sequence of operations to perform."""
        # One stage per function, named after the function itself.
        return [Stage(self.write_name, (f,), name=f.__name__) for f in self.functions]
85 |
86 |
87 | # Functions and fixtures
88 |
89 |
def get_read_aligner(outfolder):
    """Create a dummy 'read aligner' pipeline."""
    member_functions = [merge_input, qc, align_reads]
    return FunctionNameWriterPipeline(
        "read-aligner", outfolder, READ_ALIGNER_FILENAME, member_functions
    )
95 |
96 |
def get_peak_caller(outfolder):
    """Create a dummy 'peak caller' pipeline."""
    member_functions = [align_reads, call_peaks]
    return FunctionNameWriterPipeline(
        "peak-caller", outfolder, PEAK_CALLER_FILENAME, member_functions
    )
102 |
103 |
def get_pipeline(name, outfolder):
    """
    Build and return pipeline instance associated with given name.

    :param str name: Name of the pipeline to build.
    :param str outfolder: Path to output folder for use by pipeline instance.
    :return SafeTestPipeline: A test-session-safe instance of a Pipeline.
    :raise ValueError: if the given name matches no known pipeline
    """
    # Dispatch on pipeline name via a builder lookup.
    builders = {
        read_aligner.__name__: get_read_aligner,
        call_peaks.__name__: get_peak_caller,
    }
    if name not in builders:
        raise ValueError("Unknown pipeline request: '{}'".format(name))
    return builders[name](outfolder)
118 |
119 |
@pytest.fixture
def read_aligner(tmpdir):
    """Provide test case with a read aligner pipeline instance."""
    return get_read_aligner(tmpdir.strpath)
124 |
125 |
@pytest.fixture
def peak_caller(tmpdir):
    """Provide test case with a 'PeakCaller' pipeline instance."""
    return get_peak_caller(tmpdir.strpath)
130 |
--------------------------------------------------------------------------------
/tests/pipeline/test_multi_pipeline_sample.py:
--------------------------------------------------------------------------------
1 | """ Tests for case in which multiple pipelines process a single sample. """
2 |
3 | import os
4 |
5 | from pypiper.utils import checkpoint_filepath
6 | from tests.helpers import fetch_checkpoint_files, named_param
7 |
8 | from .conftest import get_peak_caller, get_pipeline, get_read_aligner
9 |
10 | __author__ = "Vince Reuter"
11 | __email__ = "vreuter@virginia.edu"
12 |
13 |
def test_checkpoints_are_pipeline_unique(tmpdir):
    """Names of checkpoint files depend on both stage and pipeline."""

    # Note: conceptually, this tests an underlying mechanistic aspect of the
    # checkpointing system.

    # Create two different pipelines.
    align_reads = get_read_aligner(tmpdir.strpath)
    call_peaks = get_peak_caller(tmpdir.strpath)

    # Get the stage names associated with each pipeline.
    alignment_stage_names = set(map(lambda s: s.name, align_reads.stages()))
    peak_call_stage_names = set(map(lambda s: s.name, call_peaks.stages()))

    # Check that we have one specific stage name shared between the pipelines.
    assert {"align_reads"} == alignment_stage_names & peak_call_stage_names
    assert align_reads.outfolder == call_peaks.outfolder

    # We begin with no checkpoint files.
    assert [] == list(fetch_checkpoint_files(align_reads.manager))
    assert [] == list(fetch_checkpoint_files(call_peaks.manager))

    # Run each pipeline.
    align_reads.run()
    call_peaks.run()

    # We expect a different checkpoint file for each stage of each pipeline.
    align_reads_expected = {
        checkpoint_filepath(s.name, align_reads) for s in align_reads.stages()
    }
    call_peaks_expected = {
        checkpoint_filepath(s.name, call_peaks) for s in call_peaks.stages()
    }

    # Pipeline names are unique here, and each checkpoint name includes
    # pipeline name for disambiguation, so even a pair of pipelines with a
    # nonempty stage name intersection has an empty checkpoint filenames
    # intersection, so long as the pipeline names are unique.
    assert set() == (align_reads_expected & call_peaks_expected)

    # When not setting start/stop parameters and beginning with no checkpoint
    # files in place, each pipeline generates its full set of checkpoint files.
    expected_checkpoints = align_reads_expected | call_peaks_expected
    # NOTE(review): here the pipeline objects themselves are passed, whereas
    # the negative-control checks above pass the managers -- presumably
    # fetch_checkpoint_files accepts either; confirm in pypiper.utils.
    observed_checkpoints = set(fetch_checkpoint_files(align_reads)) | set(
        fetch_checkpoint_files(call_peaks)
    )

    # Verify satisfaction of expectation.
    # On mismatch, print the symmetric-difference breakdown to aid debugging.
    try:
        assert expected_checkpoints == observed_checkpoints
    except AssertionError:
        only_exp = expected_checkpoints - observed_checkpoints
        exp_and_obs = expected_checkpoints & observed_checkpoints
        only_obs = observed_checkpoints - expected_checkpoints
        print("Only in expected:\n{}".format("\n".join(only_exp)))
        print("Expected and observed:\n{}".format("\n".join(exp_and_obs)))
        print("Only in observed:\n{}".format("\n".join(only_obs)))
        raise
72 |
73 |
def test_pipeline_checkpoint_respect_sensitivity_and_specificity(tmpdir):
    """Pipeline respects only its own checkpoint(s) for stage skipping."""

    # Note: conceptually, this is more of an effect- or outcome-based test
    # of the checkpointing system with respect to stage skipping.

    align_reads = get_read_aligner(tmpdir.strpath)
    call_peaks = get_peak_caller(tmpdir.strpath)

    # The two pipelines share exactly one stage name: "align_reads".
    align_reads_stage_names = [s.name for s in align_reads.stages()]
    call_peaks_stage_names = [s.name for s in call_peaks.stages()]
    assert {"align_reads"} == set(align_reads_stage_names) & set(call_peaks_stage_names)

    # Set up the checkpoints for the read alignment pipeline by allowing it
    # to execute once.
    align_reads.run()
    # The aligner's checkpoint exists, but the peak caller must not have a
    # checkpoint for its same-named stage, since checkpoints are per-pipeline.
    assert os.path.isfile(checkpoint_filepath("align_reads", align_reads.manager))
    peaks_align_check_fpath = checkpoint_filepath("align_reads", call_peaks.manager)
    assert not os.path.isfile(peaks_align_check_fpath)

    # The peak caller should run ALL of its stages (writing one line per
    # stage), despite the other pipeline's checkpoint for a same-named stage.
    call_peaks.run()
    exp_lines = [func + os.linesep for func in call_peaks_stage_names]
    call_peaks_outpath = os.path.join(call_peaks.outfolder, call_peaks.name_output_file)
    with open(call_peaks_outpath, "r") as f:
        obs_lines = f.readlines()
    assert exp_lines == obs_lines
99 | assert exp_lines == obs_lines
100 |
--------------------------------------------------------------------------------
/tests/pipeline/test_pipeline_checkpoint.py:
--------------------------------------------------------------------------------
1 | """ Tests for a pipeline's ability to checkpoint its stages. """
2 |
3 | import os
4 | import time
5 |
6 | from pypiper.utils import checkpoint_filepath
7 | from tests.helpers import fetch_checkpoint_files, named_param
8 |
9 | from .conftest import get_pipeline
10 |
11 | __author__ = "Vince Reuter"
12 | __email__ = "vreuter@virginia.edu"
13 |
14 |
def test_pipeline_checkpoint_respect_sensitivity_checkpoint_perspective(
    pl_name, tmpdir
):
    """Pipeline can skip past its stage(s) for which checkpoint exists."""

    # Create the pipeline.
    pipeline = get_pipeline(pl_name, tmpdir.strpath)

    # Negative control to start test, that we have no checkpoint files.
    assert [] == fetch_checkpoint_files(pipeline.manager)

    # Generate some checkpoints.
    pipeline.run()

    # Verify that we created each of the checkpoints.
    expected = [
        checkpoint_filepath(f.__name__, pipeline.manager) for f in pipeline.functions
    ]
    observed = fetch_checkpoint_files(pipeline.manager)
    assert set(expected) == set(observed)

    # Collect checkpoint file timestamps for comparison after second run.
    timestamps = {f: os.path.getmtime(f) for f in observed}

    # Remove the checkpoint for the final stage.
    last_aligner_stage = pipeline.functions[-1]
    # NOTE(review): the function object itself is passed here, while the
    # expected paths above are built from f.__name__ -- presumably
    # checkpoint_filepath accepts both forms; confirm in pypiper.utils.
    last_aligner_checkfile = checkpoint_filepath(last_aligner_stage, pipeline.manager)
    os.unlink(last_aligner_checkfile)

    # Verify removal of final stage checkpoint file.
    assert all([os.path.isfile(f) for f in expected[:-1]])
    assert not os.path.exists(last_aligner_checkfile)
    assert set(expected) != set(fetch_checkpoint_files(pipeline.manager))

    # Delay briefly so that we can more reliably compare checkpoint file
    # timestamps after a second pipeline run.
    time.sleep(0.05)

    # Repeat the pipeline's execution, but now with checkpoint file(s) for a
    # subset of its stages in place.
    pipeline.run()

    # Verify that we've restored the full collection of the pipeline's
    # checkpoint files to existence.
    observed = fetch_checkpoint_files(pipeline.manager)
    exp = set(expected)
    obs = set(observed)
    assert set(expected) == set(
        observed
    ), "Expected only:\n{}\nExpected and observed:\n{}\nObserved only:\n{}".format(
        exp - obs, exp & obs, obs - exp
    )

    # Verify that we didn't recreate the checkpoint file for each skipped
    # stage (unchanged mtime means the stage was skipped).
    for f in expected[:-1]:
        expected_timestamp = timestamps[f]
        observed_timestamp = os.path.getmtime(f)
        assert expected_timestamp == observed_timestamp

    # Verify that we did in fact recreate the checkpoint file for the stage
    # that was rerun.
    assert (
        os.path.getmtime(last_aligner_checkfile) > timestamps[last_aligner_checkfile]
    ), "Recreated checkpoint file ('{}') should be newer than original".format(
        last_aligner_checkfile
    )
81 |
82 |
def test_pipeline_checkpoint_sensitivity_effect_perspective(pl_name, tmpdir):
    """The pipeline skips execution of stages with extant checkpoint."""

    # Create the pipeline, then check creation of output file.
    pipeline = get_pipeline(pl_name, tmpdir.strpath)
    output_file = os.path.join(pipeline.outfolder, pipeline.name_output_file)
    assert not os.path.exists(output_file)
    pipeline.run()
    assert os.path.isfile(output_file)

    # Validate pipeline effects (output file content).
    # Each stage appends its own name, so a full run yields every stage name.
    with open(output_file, "r") as f:
        lines = f.readlines()
    assert [s.name + os.linesep for s in pipeline.stages()] == lines

    # Verify presence of checkpoint files to support our expectation about
    # which stages should be skipped and which should be run during the second
    # time through the pipeline's execution.
    exp_cp_fpaths = set(
        checkpoint_filepath(s.name, pipeline.manager) for s in pipeline.stages()
    )
    assert exp_cp_fpaths == set(fetch_checkpoint_files(pipeline.manager))
    # Remove only the final stage's checkpoint so that it alone should rerun.
    final_stage = pipeline.stages()[-1]
    final_stage_fpath = checkpoint_filepath(final_stage.name, pipeline.manager)
    os.unlink(final_stage_fpath)

    # Verify the effect of the second execution of the pipeline.
    # run() starts from a clean output file, so afterwards it should contain
    # only the rerun final stage's name.
    pipeline.run()
    with open(output_file, "r") as f:
        lines = f.readlines()
    assert [final_stage.name + os.linesep] == lines
114 |
115 |
116 | @named_param("overwrite", [False, True])
117 | def test_pipeline_reruns_downstream_stages_according_to_parameterization(
118 | overwrite, pl_name, tmpdir
119 | ):
120 | """Pipeline overwrites downstream stages unless configured otherwise."""
121 |
122 | pl = get_pipeline(pl_name, tmpdir.strpath)
123 |
124 | # Create checkpoint file for each stage.
125 | stage_names = [s.name for s in pl.stages()]
126 | assert 1 < len(
127 | stage_names
128 | ), "Need pipeline with at least two stages to run this test."
129 | for s_name in stage_names:
130 | open(checkpoint_filepath(s_name, pl.manager), "w").close()
131 |
132 | # Remove the checkpoint file for the penultimate stage.
133 | penultimate_stage = stage_names[-2]
134 | os.unlink(checkpoint_filepath(penultimate_stage, pl.manager))
135 |
136 | # Configure the pipeline based on parameterization and run it starting
137 | # from the penultimate stage.
138 | pl.manager.overwrite_checkpoints = overwrite
139 | pl.run(start_point=penultimate_stage)
140 |
141 | # If we're overwriting downstream checkpoints, the last two stages are
142 | # run while otherwise only the penultimate stage is run.
143 | exp_stages = [stage_names[-2]]
144 | if overwrite:
145 | exp_stages.append(stage_names[-1])
146 | exp_lines = [func + os.linesep for func in stage_names[-2:]]
147 | outpath = os.path.join(pl.outfolder, pl.name_output_file)
148 | with open(outpath, "r") as f:
149 | obs_lines = f.readlines()
150 | assert exp_lines == obs_lines
151 |
--------------------------------------------------------------------------------
/tests/pipeline/test_pipeline_constructor.py:
--------------------------------------------------------------------------------
1 | """ Tests for construction of a Pipeline """
2 |
3 | import pytest
4 |
5 | from pypiper import Pipeline, PipelineManager, Stage
6 | from tests.helpers import SafeTestPipeline, assert_equal_dirpath, named_param
7 |
8 | __author__ = "Vince Reuter"
9 | __email__ = "vreuter@virginia.edu"
10 |
11 |
def test_pipeline_requires_stages_definition(tmpdir):
    """To create a pipeline, define stages (execution steps)."""

    # A pipeline subclass that deliberately omits a stages() definition.
    class NoStagesPipeline(SafeTestPipeline):
        pass

    name = "test-pipe"

    # Sensitivity: test exception for bad case.
    with pytest.raises(TypeError):
        NoStagesPipeline(name=name, outfolder=tmpdir.strpath)
    # Specificity: test no exception for good case.
    # (_MinimalPipeline is defined elsewhere in this module.)
    _MinimalPipeline(name=name, outfolder=tmpdir.strpath)
25 |
26 |
class JustManagerArgument:
    """A pipeline can be created with just a manager argument."""

    NAME_HOOK = "pl_mgr_name"

    @pytest.fixture
    def pl_mgr(self, request, get_pipe_manager):
        """Provide each of this class's test cases with pipeline manager."""
        # Prefer a test-specific name when the case parameterizes one.
        name = (
            request.getfixturevalue(self.NAME_HOOK)
            if self.NAME_HOOK in request.fixturenames
            else "test-pipe"
        )
        return get_pipe_manager(name=name)

    @named_param(argnames=NAME_HOOK, argvalues=["arbitrary-pipeline", "DummyPipe"])
    def test_pipeline_adopts_manager_name(self, pl_mgr_name, pl_mgr):
        """If given just a manager, a pipeline uses the manager name."""
        pipeline = Pipeline(manager=pl_mgr)
        assert pl_mgr.name == pl_mgr_name
        assert pipeline.name == pl_mgr_name

    def test_pipeline_adopts_manager_output_folder(self, pl_mgr):
        """Pipeline uses manager output folder if given just manager."""
        pipeline = Pipeline(manager=pl_mgr)
        assert pipeline.outfolder == pl_mgr.outfolder
52 |
53 |
class MinimalArgumentsWithoutManagerTests:
    """Tests for pipeline constructor argument provision without manager."""

    def test_pipeline_creates_manager(self, tmpdir):
        """If not passed a pipeline manager, a pipeline creates one."""
        pipeline = _MinimalPipeline(name="minimal", outfolder=tmpdir.strpath)
        assert isinstance(pipeline.manager, PipelineManager)

    @named_param("pipe_name", ["test-pipe", "DummyPipeline"])
    def test_manager_adopts_pipeline_name(self, pipe_name, tmpdir):
        """Autogenerated pipeline manager uses pipeline's name."""
        pipeline = _MinimalPipeline(name=pipe_name, outfolder=tmpdir.strpath)
        assert pipeline.name == pipe_name
        # The implicitly created manager inherits that same name.
        assert pipeline.manager.name == pipeline.name

    def test_manager_adopts_pipeline_output_folder(self, tmpdir):
        """Autogenerated pipeline manager uses pipeline's output folder."""
        pipeline = _MinimalPipeline(name="test-pipe", outfolder=tmpdir.strpath)
        assert_equal_dirpath(tmpdir.strpath, pipeline.outfolder)
73 |
74 |
class ConceptuallyOverlappingArgumentsTests:
    """
    Test cases in which pipeline's argument space is overspecified.

    There are two minimal ways to construct a pipeline: pass a
    PipelineManager directly, or pass a name together with an output
    folder path. The manager implies both the name and the output folder,
    and the name + output folder pair suffices to autogenerate a manager
    when one is not passed. These cases exercise argument combinations
    that overspecify the space defined by pipeline name, output folder
    path, and pipeline manager.

    """

    def test_same_name_for_manager_and_pipeline(self, tmpdir, get_pipe_manager):
        """Pipeline name and manager with matching name is unproblematic."""
        shared_name = "test-pipe"
        manager = get_pipe_manager(name=shared_name, outfolder=tmpdir.strpath)
        pipeline = _MinimalPipeline(name=shared_name, manager=manager)
        assert shared_name == pipeline.manager.name

    def test_different_name_for_manager_and_pipeline(self, tmpdir, get_pipe_manager):
        """If given, pipeline favors its own name over manager's."""
        manager = get_pipe_manager(name="manager", outfolder=tmpdir.strpath)
        pipeline = _MinimalPipeline(name="pipeline", manager=manager)
        # Each object retains its own name.
        assert "pipeline" == pipeline.name
        assert "manager" == pipeline.manager.name

    @named_param("output_folder", argvalues=["test-output", "testing-output-folder"])
    def test_pipeline_ignores_outfolder_if_manager_is_passed(
        self, output_folder, tmpdir, get_pipe_manager
    ):
        """Manager's output folder trumps explicit output folder."""
        manager = get_pipe_manager(name="test-pipe", outfolder=tmpdir.strpath)
        pipeline = _MinimalPipeline(manager=manager, outfolder=output_folder)
        assert_equal_dirpath(tmpdir.strpath, pipeline.outfolder)

    def test_name_outfolder_and_manager(self, tmpdir, get_pipe_manager):
        """Tests provision of all three primary pipeline arguments."""
        pipe_name = "test-pipe"
        manager = get_pipe_manager(name=pipe_name, outfolder=tmpdir.strpath)
        pipeline = _MinimalPipeline(
            name=pipe_name, manager=manager, outfolder=tmpdir.strpath
        )
        assert pipe_name == pipeline.name
        assert_equal_dirpath(tmpdir.strpath, pipeline.outfolder)
        assert manager == pipeline.manager
124 |
125 |
def test_pipeline_requires_either_manager_or_outfolder():
    """Pipeline must be passed pipeline manager or output folder."""
    # Providing neither a manager nor an outfolder must fail construction.
    with pytest.raises(TypeError):
        _MinimalPipeline()
130 |
131 |
def test_empty_pipeline_manager_name_and_no_explicit_pipeline_name(
    tmpdir, get_pipe_manager
):
    """If no name's passed to pipeline, the manager must have valid name."""
    # A manager with an empty name cannot supply the pipeline's name.
    nameless_manager = get_pipe_manager(name="", outfolder=tmpdir.strpath)
    with pytest.raises(ValueError):
        _MinimalPipeline(manager=nameless_manager)
139 |
140 |
class AnonymousFunctionStageTests:
    """Tests for anonymous function as a pipeline stage."""

    def test_anonymous_stage_without_name_is_prohibited(self, tmpdir):
        """A bare anonymous function cannot serve as a Stage."""
        with pytest.raises(TypeError):
            _AnonymousStageWithoutNamePipeline(
                name="test-pipe", outfolder=tmpdir.strpath
            )

    def test_anonymous_stage_with_name_is_permitted(self, tmpdir):
        """An anonymous function paired with a name is a valid Stage."""
        _AnonymousStageWithNamePipeline(name="test-pipe", outfolder=tmpdir.strpath)
154 |
155 |
class _AnonymousStageWithoutNamePipeline(SafeTestPipeline):
    """Anonymous function as stage is prohibited unless paired with name."""

    def stages(self):
        # A bare lambda carries no usable stage name; construction of this
        # pipeline is expected to fail (see AnonymousFunctionStageTests).
        return [lambda: None]
161 |
162 |
class _AnonymousStageWithNamePipeline(SafeTestPipeline):
    """Anonymous function as Stage is allowed if wrapped with a name."""

    def stages(self):
        # The (name, callable) pair supplies the stage name the lambda lacks.
        return [("NullStage", lambda: None)]
168 |
169 |
@pytest.fixture
def empty_pipeline(request):
    """Provide test case with minimal pipeline instance."""
    # Honor a test-provided name when the requesting test parameterizes one.
    if "pipe_name" in request.fixturenames:
        name = request.getfixturevalue("pipe_name")
    else:
        name = "minimal"
    # NOTE(review): neither manager nor outfolder is passed here, yet
    # test_pipeline_requires_either_manager_or_outfolder expects that to
    # raise TypeError -- confirm this fixture is still used/usable.
    return _MinimalPipeline(name)
178 |
179 |
class _MinimalPipeline(SafeTestPipeline):
    """Simplest concrete pipeline: a single do-nothing stage."""

    def stages(self):
        """Provide the lone no-op stage."""
        only_stage = _do_nothing
        return [only_stage]
186 |
187 |
188 | def _do_nothing():
189 | return
190 |
--------------------------------------------------------------------------------
/tests/pipeline_manager/test_halt.py:
--------------------------------------------------------------------------------
1 | """ Tests for effects of pipeline manager's halt() function. """
2 |
3 | import os
4 |
5 | import pytest
6 |
7 | from pypiper.exceptions import PipelineHalt
8 | from pypiper.flags import COMPLETE_FLAG, PAUSE_FLAG
9 | from tests.helpers import named_param
10 |
11 | __author__ = "Vince Reuter"
12 | __email__ = "vreuter@virginia.edu"
13 |
14 |
def test_halt_state(get_pipe_manager):
    """Requesting a halt alters manager state."""
    manager = get_pipe_manager(name="test-pipe")
    # Freshly constructed manager is active.
    assert manager._active
    manager.halt(raise_error=False)
    # After halting, it reports halted and is no longer active.
    assert manager.halted
    assert not manager._active
22 |
23 |
def test_halt_file(get_pipe_manager):
    """Requesting a halt produces a particular flag file."""
    manager = get_pipe_manager(name="TestPM")
    halt_flag_file = manager._flag_file_path(PAUSE_FLAG)
    # No halt flag before the halt, one after.
    assert not os.path.isfile(halt_flag_file)
    manager.halt(raise_error=False)
    assert os.path.isfile(halt_flag_file)
31 |
32 |
@named_param("raise_error", [False, True, None])
def test_halt_exceptionality(get_pipe_manager, raise_error):
    """Halting is conditionally exceptional"""
    manager = get_pipe_manager(name="halt-error")
    if raise_error is None:
        # Omitting the argument exercises the default: exceptional.
        with pytest.raises(PipelineHalt):
            manager.halt()
    elif raise_error:
        with pytest.raises(PipelineHalt):
            manager.halt(raise_error=True)
    else:
        # Explicitly suppressed: halt() returns without raising.
        manager.halt(raise_error=False)
46 |
47 |
@named_param("raise_error", [False, True])
@named_param("test_type", argvalues=["halt_flag", "complete_flag"])
def test_halt_status_supersedes_completed(get_pipe_manager, raise_error, test_type):
    """Halting pipeline replaces completed flag with halt flag."""

    # Create a manager and mark it completed.
    manager = get_pipe_manager(name="halt-status-flag")
    manager._set_status_flag(COMPLETE_FLAG)
    complete_flag_file = manager._flag_file_path(COMPLETE_FLAG)
    assert os.path.isfile(complete_flag_file)

    # Perform the halt. Exceptionality is irrelevant to this test; only
    # the flag-file adjustment matters, so swallow the expected exception.
    try:
        manager.halt(raise_error=raise_error)
    except PipelineHalt:
        pass

    # Per parameterization, verify either that the halt flag appeared or
    # that the completion flag disappeared.
    if test_type == "halt_flag":
        assert os.path.isfile(manager._flag_file_path(PAUSE_FLAG))
    elif test_type == "complete_flag":
        assert not os.path.isfile(complete_flag_file)
    else:
        raise ValueError("Unknown test type: '{}'".format(test_type))
76 |
--------------------------------------------------------------------------------
/tests/pipeline_manager/test_manager_constructor.py:
--------------------------------------------------------------------------------
1 | """ Test effects of construction of a pipeline manager. """
2 |
3 | import argparse
4 | import os
5 |
6 | import pytest
7 |
8 | from pypiper.manager import CHECKPOINT_SPECIFICATIONS, LOGFILE_SUFFIX
9 | from tests.helpers import named_param
10 |
11 | __author__ = "Vince Reuter"
12 | __email__ = "vreuter@virginia.edu"
13 |
14 |
def pytest_generate_tests(metafunc):
    """Dynamic test case generation for this module's test cases."""
    # Any test that requests 'spec_type' runs once per specification mode:
    # command-line ("cmdl") and constructor keyword ("ctor").
    if "spec_type" in metafunc.fixturenames:
        metafunc.parametrize(argnames="spec_type", argvalues=["cmdl", "ctor"])
19 |
20 |
@named_param("checkpoint_type", argvalues=["curr_checkpoint", "prev_checkpoint"])
def test_manager_starts_in_null_checkpoint_state(get_pipe_manager, checkpoint_type):
    """A pipeline manager begins with null checkpoint states."""
    manager = get_pipe_manager(name="ctor-checkpoint-state")
    # Both current and previous checkpoint attributes start as None.
    assert getattr(manager, checkpoint_type) is None
26 |
27 |
def test_logger_logfile_collision_with_manager_logfile_is_expected_error__issue_212(
    get_pipe_manager, tmpdir
):
    """Regression test (issue 212): a logger logfile colliding with the
    manager's own logfile path must be rejected with ValueError."""
    pipe_name = "test_issue212"
    with pytest.raises(ValueError) as err_ctx:
        get_pipe_manager(
            name=pipe_name,
            logger_kwargs={
                # Same path the manager itself will use for its logfile.
                "logfile": os.path.join(tmpdir.strpath, pipe_name + LOGFILE_SUFFIX)
            },
        )
    # FIX: dropped the spurious f-prefix (literal has no placeholders);
    # the string content is unchanged.
    assert str(err_ctx.value).startswith(
        "The logfile given for the pipeline manager's logger matches that which will be used by the manager itself"
    )
42 |
43 |
class ManagerConstructorCheckpointSpecificationTests:
    """Tests for manager's constructor's ability to parse and set
    checkpoint specifications, which can determine aspects of control flow."""

    @staticmethod
    def _spec_kwargs(spec_type, spec_data):
        """Package raw checkpoint specs for the given specification mode."""
        if spec_type == "cmdl":
            # Command-line mode: wrap the specs in a parsed-args namespace.
            return {"args": argparse.Namespace(**spec_data)}
        # Constructor mode: pass the specs as plain keyword arguments.
        return dict(spec_data)

    def test_no_checkpoint_specifications(self, get_pipe_manager):
        """A manager may be constructed without any checkpoint provision."""
        get_pipe_manager(name="test-pipe")

    @named_param("start_point", ["filter_reads", "align_reads"])
    def test_just_start(self, get_pipe_manager, spec_type, start_point):
        """Starting point may be set from command-line or ctor keyword."""
        kwargs = self._spec_kwargs(spec_type, {"start_point": start_point})
        manager = get_pipe_manager(name="start-test", **kwargs)
        assert start_point == manager.start_point

    @named_param("stop_type", ["stop_before", "stop_after"])
    @named_param("stop_point", ["align_reads", "call_peaks"])
    def test_just_stop(self, get_pipe_manager, spec_type, stop_type, stop_point):
        """Particular stopping type is set correctly."""
        kwargs = self._spec_kwargs(spec_type, {stop_type: stop_point})
        manager = get_pipe_manager(name="stop-test", **kwargs)
        assert stop_point == getattr(manager, stop_type)

    @named_param("start_point", ["merge_input", "filter_reads"])
    @named_param("stop_point", ["align_reads", "calc_stats"])
    @named_param("stop_type", ["stop_before", "stop_after"])
    def test_start_and_stop(
        self, get_pipe_manager, spec_type, stop_type, start_point, stop_point
    ):
        """Specifying both start and stop works just fine."""
        kwargs = self._spec_kwargs(
            spec_type, {"start_point": start_point, stop_type: stop_point}
        )
        manager = get_pipe_manager(name="start-and-stop-test", **kwargs)
        assert start_point == manager.start_point
        assert stop_point == getattr(manager, stop_type)

    @named_param("stop_before", ["align_reads", "call_peaks"])
    @named_param("stop_after", ["fastqc", "align_reads"])
    @named_param("stop_before_type", ["cmdl", "ctor"])
    @named_param("stop_after_type", ["cmdl", "ctor"])
    def test_both_stop_modes_is_prohibited(
        self,
        get_pipe_manager,
        stop_before_type,
        stop_after_type,
        stop_before,
        stop_after,
    ):
        """Provision of both prospective and retrospective stop is bad."""
        ctor_kwargs = {"stop_before": stop_before, "stop_after": stop_after}
        cmdl_kwargs = {}
        # Route each stop spec to command-line or constructor per params.
        for spec, mode in [
            ("stop_before", stop_before_type),
            ("stop_after", stop_after_type),
        ]:
            if mode == "cmdl":
                cmdl_kwargs[spec] = ctor_kwargs.pop(spec)
        args = argparse.Namespace(**cmdl_kwargs)
        with pytest.raises(TypeError):
            get_pipe_manager(name="test-double-stop", args=args, **ctor_kwargs)

    @pytest.mark.parametrize(
        argnames=["start_point", "stop_point"],
        argvalues=[("fastqc", "align_reads"), ("align_reads", "call_peaks")],
    )
    @pytest.mark.parametrize(
        argnames=["start_spec_type", "stop_spec_type"],
        argvalues=[("cmdl", "ctor"), ("ctor", "cmdl")],
    )
    @named_param("stop_type", ["stop_before", "stop_after"])
    def test_complementary_specification_modes(
        self,
        get_pipe_manager,
        start_spec_type,
        stop_spec_type,
        stop_type,
        start_point,
        stop_point,
    ):
        """Command-line and keyword specifications can harmonize."""
        ctor_kwargs = {"start_point": start_point, stop_type: stop_point}
        cmdl_kwargs = {}
        # Split start and stop between the two specification channels.
        for spec, mode in [
            ("start_point", start_spec_type),
            (stop_type, stop_spec_type),
        ]:
            if mode == "cmdl":
                cmdl_kwargs[spec] = ctor_kwargs.pop(spec)
        args = argparse.Namespace(**cmdl_kwargs)
        manager = get_pipe_manager(name="complementary-test", args=args, **ctor_kwargs)
        assert start_point == manager.start_point
        assert stop_point == getattr(manager, stop_type)

    @named_param(
        "check_specs",
        [
            ["start_point"],
            ["stop_before"],
            ["stop_after"],
            ["start_point", "stop_before"],
            ["start_point", "stop_after"],
        ],
    )
    def test_command_line_beats_constructor_keyword(
        self, get_pipe_manager, check_specs
    ):
        """Command-line specification is favored over constructor keyword."""

        # Values for each specification mode; they deliberately differ so
        # the winning mode is observable.
        cmdl_values = {
            "start_point": "merge_input",
            "stop_before": "call_peaks",
            "stop_after": "align_reads",
        }
        ctor_values = {
            "start_point": "fastqc",
            "stop_before": "align_reads",
            "stop_after": "filter_reads",
        }

        # Restrict both modes to just the specs under test.
        cmdl_kwargs = {spec: cmdl_values[spec] for spec in check_specs}
        ctor_kwargs = {spec: ctor_values[spec] for spec in check_specs}

        manager = get_pipe_manager(
            name="cmdl-preference",
            args=argparse.Namespace(**cmdl_kwargs),
            **ctor_kwargs
        )

        # Command-line value wins for every contested spec...
        for spec in check_specs:
            assert cmdl_kwargs[spec] == getattr(manager, spec)
        # ...and every unspecified checkpoint spec remains null.
        for spec in set(CHECKPOINT_SPECIFICATIONS) - set(check_specs):
            assert getattr(manager, spec) is None
187 |
--------------------------------------------------------------------------------
/tests/pipeline_manager/test_manager_state.py:
--------------------------------------------------------------------------------
1 | """ Tests related to pipeline manager state. """
2 |
3 | import os
4 |
5 | import pytest
6 |
7 | from pypiper.utils import checkpoint_filepath, pipeline_filepath
8 | from tests.helpers import named_param
9 |
10 | __author__ = "Vince Reuter"
11 | __email__ = "vreuter@virginia.edu"
12 |
13 |
def test_starts_running(get_pipe_manager):
    """A PipelineManager begins running during its construction."""
    manager = get_pipe_manager(name="TestPM")
    # Construction alone places the manager in the active state.
    assert manager._active
18 |
19 |
# Parameters governing execution:
# 1 -- checkpoint existence
# 2 -- active state (._active)
# 3 -- halt state (.halted)
23 |
24 |
class ExecutionSkippingTests:
    """Tests for cases in which command execution should be skipped."""

    @named_param("start_point", ["align_reads", "make_call"])
    def test_skips_to_start(self, get_pipe_manager, start_point):
        """The pipeline manager can skip to a starting point."""

        # Initialize the manager with a designated starting point.
        pm = get_pipe_manager(name="StartTestPM", start_point=start_point)

        # Make a call that should be skipped on the basis of not yet
        # reaching the start point.
        pm.timestamp(checkpoint="merge_reads")
        path_merge_file = pipeline_filepath(pm, filename="merge.txt")
        assert not os.path.isfile(path_merge_file)
        cmd = "touch {}".format(path_merge_file)
        pm.run(cmd, target=path_merge_file)
        # Skipped run: the target must not have been created.
        assert not os.path.isfile(path_merge_file)

        # Make a call that should also be skipped on the basis of not yet
        # reaching the designated starting/activation point.
        pm.timestamp(checkpoint="fastqc")
        fastqc_folder = os.path.join(pm.outfolder, "fastqc")
        os.makedirs(fastqc_folder)
        fastqc_zipfile = os.path.join(fastqc_folder, "qc.zip")
        fastqc_rawfile = os.path.join(fastqc_folder, "qc.txt")
        cmds = [
            "fastqc",
            "touch {}".format(fastqc_rawfile),
            "touch {}".format(fastqc_zipfile),
        ]
        pm.run(cmds, target=fastqc_zipfile)
        # Neither command in the skipped multi-command call ran.
        assert not os.path.isfile(fastqc_zipfile)
        assert not os.path.isfile(fastqc_rawfile)

        # Make a call that should be the first one executed, on the basis
        # of being associated with the designated starting point.
        pm.timestamp(checkpoint=start_point)
        path_first_file = pipeline_filepath(pm, filename="outfile.bam")
        cmd = "touch {}".format(path_first_file)
        pm.run(cmd, target=path_first_file)
        assert os.path.isfile(path_first_file)

    @named_param("num_skips", argvalues=[1, 2, 3])
    def test_skips_execution_if_in_unstarted_state(self, get_pipe_manager, num_skips):
        """Pipeline manager skips command execution if not in active state."""

        pm = get_pipe_manager(name="skip-execs")
        # Force the inactive state; run() should then be a no-op.
        pm._active = False

        testfile = pipeline_filepath(pm, filename="output.txt")
        assert not os.path.isfile(testfile)

        cmd = "touch {}".format(testfile)
        num_calls = 0

        # Remain inactive for a parameterized number of call-skipping iterations,
        # then adopt active mode.
        while True:
            pm.run(cmd, target=testfile)
            num_calls += 1
            if num_calls == num_skips:
                # Reactivate; the NEXT iteration's run() should execute.
                pm._active = True
            elif num_calls > num_skips:
                # One call has now been made in active mode; stop.
                break
            # If we're still looping, we've not yet made a call in active mode.
            assert not os.path.isfile(testfile)

        # We break the loop once we've made a call in active state.
        assert os.path.isfile(testfile)

    @named_param("num_skips", argvalues=[1, 2, 3])
    def test_respects_checkpoints(self, get_pipe_manager, num_skips):
        """Manager can skip pipeline to where it's not yet checkpointed."""

        pm = get_pipe_manager(name="respect-checkpoints")

        # Control for possibility that skips are due to being in inactive mode.
        assert pm._active

        stages = ["merge", "qc", "filter", "align", "call"]

        # Create checkpoints for just the first num_skips stages.
        for s in stages[:num_skips]:
            pm.timestamp(checkpoint=s)

        # Go through the stages and see that we're skipping checkpoints
        # that exist, then proceeding to execute each subsequent stage.
        for i, s in enumerate(stages):
            outfile = pipeline_filepath(pm, s + ".txt")
            cmd = "touch {}".format(outfile)
            pm.timestamp(checkpoint=s)
            pm.run(cmd, target=outfile)

            if i < num_skips:
                # We should not have created the output file.
                try:
                    assert not os.path.isfile(outfile)
                except AssertionError:
                    # Print context to diagnose a spurious execution.
                    print("Have run {} stage(s) of {} skip(s)".format(i + 1, num_skips))
                    print("Current manager checkpoint: {}".format(pm.curr_checkpoint))
                    raise
            else:
                # We should have created the output file.
                try:
                    assert os.path.isfile(outfile)
                except AssertionError:
                    # Print context to diagnose a spurious skip.
                    print("Have run {} stage(s) of {} skip(s)".format(i + 1, num_skips))
                    print("Current manager checkpoint: {}".format(pm.curr_checkpoint))
                    print("Active? {}".format(pm._active))
                    raise

    @named_param("halt_index", [1, 2, 3])
    def test_respects_halt(self, get_pipe_manager, halt_index):
        """The pipeline manager skips execution if it's in halted state."""
        pm = get_pipe_manager(name="respects-halt")
        targets = ["file{}.txt".format(i) for i in range(1, 5)]
        # Halt just before processing the halt_index-th target, so all
        # subsequent run() calls should be skipped.
        for i, t in enumerate(targets):
            if i == halt_index:
                pm.halt(raise_error=False)
            target = pipeline_filepath(pm, filename=t)
            cmd = "touch {}".format(target)
            pm.run(cmd, target=target)
        # Targets before the halt exist; those at/after the halt do not.
        for i, t in enumerate(targets):
            target = pipeline_filepath(pm, filename=t)
            if i < halt_index:
                assert os.path.isfile(target)
            else:
                assert not os.path.isfile(target)
154 |
--------------------------------------------------------------------------------
/tests/pipeline_manager/test_pipeline_manager_timestamp_checkpoint_filepath.py:
--------------------------------------------------------------------------------
1 | """ Tests for construction of checkpoint filepath """
2 |
3 | import glob
4 | import os
5 | import time
6 |
7 | from pypiper import PipelineManager
8 | from pypiper.const import CHECKPOINT_EXTENSION
9 | from pypiper.stage import Stage
10 | from tests.helpers import named_param
11 |
12 | __author__ = "Vince Reuter"
13 | __email__ = "vreuter@virginia.edu"
14 |
15 |
class DummyPM(PipelineManager):
    """Simple override of true PipelineManager, for __init__ simplicity"""

    def __init__(self, name, outfolder):
        # Deliberately do NOT call PipelineManager.__init__; set only the
        # minimal state these tests exercise (presumably what timestamp()
        # reads -- confirm against the manager implementation).
        self.name = name
        self.outfolder = outfolder
        # Null checkpoint specifications: no start/stop control flow.
        self.start_point = None
        self.stop_before = None
        self.stop_after = None
        self.halt_on_next = False
        self.last_timestamp = time.time()
        # Checkpoint state begins null, as for a freshly built manager.
        self.prev_checkpoint = None
        self.curr_checkpoint = None
29 |
30 |
class PipelineMangerTimestampCheckpointFilePathTests:
    """Tests for determination of checkpoint filepath."""

    @named_param(
        argnames=["name1", "name2"],
        argvalues=[("chipseq", "ATACseq"), ("rnaKallisto", "wgbs")],
    )
    @named_param(argnames="spec_type", argvalues=["stage_name", "stage", "function"])
    def test_distinguishes_pipelines_within_outfolder(
        self, name1, name2, spec_type, tmpdir
    ):
        """
        Checkpoint files within sample folder include pipeline name.

        A single sample's output folder may be shared by multiple
        pipelines (e.g., when a protocol maps to several pipelines). If
        two pipelines used the same stage name, their checkpoint files
        would collide and it would be impossible to tell which pipeline
        produced a given checkpoint. Prefixing the checkpoint filename
        with the pipeline name avoids that, assuming a sample is not
        processed by two identically named pipelines.

        """

        # Dummy callable to serve as the Stage body.
        def trim_reads():
            pass

        def make_spec():
            """Build the checkpoint spec per the test's parameterization."""
            if spec_type == "function":
                return trim_reads
            if spec_type == "stage":
                return Stage(trim_reads)
            if spec_type == "stage_name":
                return Stage(trim_reads).name
            raise ValueError(
                "Unrecognized stage specification type: {}".format(spec_type)
            )

        outfolder = tmpdir.strpath

        # No checkpoints should exist before any timestamp call.
        all_checkpoints_pattern = os.path.join(outfolder, "*" + CHECKPOINT_EXTENSION)
        assert [] == glob.glob(all_checkpoints_pattern)

        manager_a = DummyPM(name1, outfolder)
        manager_b = DummyPM(name2, outfolder)

        # Must match the dummy function's name, from which Stage derives its name.
        checkpoint_name = "trim_reads"
        manager_a.timestamp(checkpoint=make_spec(), finished=True)

        # Exactly one checkpoint exists, bearing the first manager's name.
        first_pattern = os.path.join(
            outfolder, "{}_*{}".format(name1, CHECKPOINT_EXTENSION)
        )
        found_first = glob.glob(first_pattern)
        assert 1 == len(found_first)
        assert 1 == len(glob.glob(all_checkpoints_pattern))
        expected_first = os.path.join(
            outfolder, "{}_{}".format(name1, checkpoint_name + CHECKPOINT_EXTENSION)
        )
        assert expected_first == found_first[0]

        # Same stage, different manager name --> a second, distinct checkpoint.
        manager_b.timestamp(checkpoint=make_spec(), finished=True)
        second_pattern = os.path.join(
            outfolder, "{}_*{}".format(name2, CHECKPOINT_EXTENSION)
        )
        assert 1 == len(glob.glob(second_pattern))
        all_found = glob.glob(all_checkpoints_pattern)
        assert 2 == len(all_found)
        expected_second = os.path.join(
            outfolder, "{}_{}".format(name2, checkpoint_name + CHECKPOINT_EXTENSION)
        )

        assert {expected_first, expected_second} == set(all_found)
115 |
--------------------------------------------------------------------------------
/tests/pipeline_manager/test_set_status_flag.py:
--------------------------------------------------------------------------------
1 | """ Tests for changes to pipepline manager's status flag. """
2 |
3 | import pytest
4 |
5 | from pypiper.flags import *
6 | from pypiper.flags import __all__ as ALL_FLAGS
7 | from tests.helpers import named_param
8 |
9 | __author__ = "Vince Reuter"
10 | __email__ = "vreuter@virginia.edu"
11 |
12 |
@named_param(
    argnames="status",
    argvalues=[
        RUN_FLAG,
        COMPLETE_FLAG,
        FAIL_FLAG,
        PAUSE_FLAG,
        WAIT_FLAG,
    ],
)
def test_set_status_flag_is_idempotent(get_pipe_manager, status):
    """Calls to manager's status flag setter are idempotent."""
    manager = get_pipe_manager(name="TestPM")
    # Setting the same flag twice leaves the same observed status each time.
    for _ in range(2):
        manager._set_status_flag(status)
        assert status == manager.status
30 |
31 |
@pytest.mark.parametrize(
    argnames=["init_state", "new_state"],
    argvalues=[
        (WAIT_FLAG, RUN_FLAG),
        (WAIT_FLAG, COMPLETE_FLAG),
        (WAIT_FLAG, FAIL_FLAG),
        (RUN_FLAG, COMPLETE_FLAG),
        (RUN_FLAG, PAUSE_FLAG),
        (RUN_FLAG, FAIL_FLAG),
        (FAIL_FLAG, RUN_FLAG),
    ],
)
def test_changes_status_state(get_pipe_manager, init_state, new_state):
    """Setting the status flag changes the manager's internal status/state."""
    manager = get_pipe_manager(name="test-pipe")
    # A freshly built manager reports the running status.
    assert RUN_FLAG == manager.status
    # Each successive flag set is reflected in the observed status.
    for state in (init_state, new_state):
        manager._set_status_flag(state)
        assert state == manager.status
52 |
--------------------------------------------------------------------------------
/tests/test_packaging.py:
--------------------------------------------------------------------------------
1 | """ Validate what's available directly on the top-level import. """
2 |
3 | from inspect import isfunction
4 |
5 | import pytest
6 |
7 | __author__ = "Vince Reuter"
8 | __email__ = "vreuter@virginia.edu"
9 |
10 |
@pytest.mark.parametrize(
    ["obj_name", "typecheck"],
    [
        ("add_logging_options", isfunction),
        ("check_all_commands", isfunction),
        ("determine_uncallable", isfunction),
        ("logger_via_cli", isfunction),
    ],
)
def test_top_level_exports(obj_name, typecheck):
    """At package level, validate object availability and type."""
    import pypiper

    # A missing attribute is a test failure with a helpful message,
    # not an error.
    if not hasattr(pypiper, obj_name):
        pytest.fail("Unavailable on {}: {}".format(pypiper.__name__, obj_name))
    assert typecheck(getattr(pypiper, obj_name))
30 |
--------------------------------------------------------------------------------
/tests/test_pipeline_filepath.py:
--------------------------------------------------------------------------------
1 | """ Tests for utility functions """
2 |
3 | import os
4 |
5 | import mock
6 | import pytest
7 |
8 | from pypiper.utils import pipeline_filepath
9 |
10 | __author__ = "Vince Reuter"
11 | __email__ = "vreuter@virginia.edu"
12 |
13 |
14 | PIPELINE_NAMES = ["chiapet", "chipseq", "atacseq", "kallisto", "wgbs"]
15 | SUFFICES = [".txt", "_results.csv", ".stats.tsv", "-data.json"]
16 |
17 |
@pytest.fixture
def pl_mgr(request, tmpdir):
    """
    Provide test case with a mocked PipelineManager instance.

    :param pytest.fixtures.FixtureRequest request: test case requesting the
        setup fixture / parameterization
    :param py.path.local.LocalPath tmpdir: Test case temporary path object.
    :return mock.MagicMock: Mocked PipelineManager, sufficient for test.
    """
    # Use the parameterized pipeline name when one is requested.
    pipe_name = (
        request.getfixturevalue("pipe_name")
        if "pipe_name" in request.fixturenames
        else "test-pipe"
    )

    # Mock only the attributes the tests read: outfolder and name.
    mock_mgr = mock.Mock(outfolder=tmpdir.strpath)
    type(mock_mgr).name = pipe_name  # Circumvent 'name' keyword on Mock.
    return mock_mgr
39 |
40 |
def test_requires_filename_or_suffix(pl_mgr):
    """Either filename or suffix is required to build a path."""
    # Supplying neither component of the file name must fail.
    with pytest.raises(TypeError):
        pipeline_filepath(pl_mgr)
45 |
46 |
@pytest.mark.parametrize(argnames="pipe_name", argvalues=PIPELINE_NAMES)
@pytest.mark.parametrize(argnames="suffix", argvalues=SUFFICES)
@pytest.mark.parametrize(
    argnames="test_type", argvalues=["has_pipe_name", "has_suffix", "full_path"]
)
def test_uses_pipeline_name_if_no_filename(
    pipe_name, suffix, test_type, pl_mgr, tmpdir
):
    """Pipeline name is proxy for filename if just suffix is given."""

    result = pipeline_filepath(pl_mgr, suffix=suffix)

    # The parameterized test type selects which property to assert.
    if test_type == "has_pipe_name":
        assert pipe_name in result
    elif test_type == "has_suffix":
        assert result.endswith(suffix)
    elif test_type == "full_path":
        expected = os.path.join(tmpdir.strpath, pipe_name + suffix)
        try:
            assert expected == result
        except AssertionError:
            # Show where the path was rooted to aid debugging.
            print("OUTFOLDER: {}".format(pl_mgr.outfolder))
            raise
    else:
        raise ValueError("Unrecognized test type: '{}'".format(test_type))
73 |
74 |
@pytest.mark.parametrize(
    argnames="filename", argvalues=["testfile" + ext for ext in SUFFICES]
)
@pytest.mark.parametrize(argnames="test_type", argvalues=["filename", "filepath"])
def test_direct_filename(tmpdir, filename, pl_mgr, test_type):
    """An explicitly given filename takes precedence over the pipeline name."""
    result = pipeline_filepath(pl_mgr, filename=filename)
    if test_type == "filename":
        # Only the basename matters for this assertion.
        assert filename == os.path.split(result)[1]
    elif test_type == "filepath":
        assert os.path.join(tmpdir.strpath, filename) == result
    else:
        raise ValueError("Unrecognized test type: '{}'".format(test_type))
90 |
91 |
@pytest.mark.parametrize(argnames="filename", argvalues=["output", "testfile"])
@pytest.mark.parametrize(argnames="suffix", argvalues=SUFFICES)
def test_suffix_is_appended_to_filename_if_both_are_provided(pl_mgr, filename, suffix):
    """When both filename and suffix are given, the result concatenates them."""
    path = pipeline_filepath(pl_mgr, filename=filename, suffix=suffix)
    basename = os.path.split(path)[1]
    assert filename + suffix == basename
100 |
--------------------------------------------------------------------------------
/tests/utils_tests/test_check_command_callability.py:
--------------------------------------------------------------------------------
1 | """ Tests for checking a collection of commands for callability """
2 |
3 | import os
4 |
5 | import mock
6 | import pytest
7 | from ubiquerg import powerset
8 | from veracitools import ExpectContext
9 |
10 | from pypiper import utils as piper_utils
11 |
12 | __author__ = "Vince Reuter"
13 | __email__ = "vreuter@virginia.edu"
14 |
15 |
# File extensions spanning several languages/runtimes; used to build
# candidate command filenames for the callability-check tests below.
EXTENSIONS = [
    ".py",
    ".rb",
    ".sh",
    ".java",
    ".jar",
    ".pl",
    ".o",
    ".R",
    ".r",
    ".cpp",
    ".c",
    ".hs",
    ".scala",
    ".class",
]
32 |
33 |
34 | def _touch(f):
35 | """'touch' the given file.
36 |
37 | :param str f: filepath to create
38 | """
39 | with open(f, "w"):
40 | print("touch: {}".format(f))
41 |
42 |
def _make_exec(f):
    """
    'touch' a file and set its executable bit.

    :param str f: path to create
    """
    _touch(f)
    # Set the exec bits directly with os.chmod instead of shelling out to
    # the external 'chmod' binary via subprocess: no process spawn, and it
    # works on systems where that binary is unavailable. Preserve the
    # file's existing mode bits, adding u+x, g+x, o+x (0o111).
    os.chmod(f, os.stat(f).st_mode | 0o111)
53 |
54 |
def pytest_generate_tests(metafunc):
    """Dynamic test case generation and parameterization for this module"""
    if "str_list_monad" not in metafunc.fixturenames:
        return
    # Exercise each test with both a raw command string and its singleton list.
    metafunc.parametrize("str_list_monad", [lambda s: s, lambda s: [s]])
59 |
60 |
@pytest.mark.skip(reason="test is broken")
@pytest.mark.parametrize("filename", ["testfile" + x for x in EXTENSIONS])
@pytest.mark.parametrize(
    ["setup", "pretest", "exp_miss"],
    [
        (lambda _: None, lambda f: not os.path.exists(f), lambda _: True),
        (
            _touch,
            lambda f: os.path.isfile(f) and not os.access(f, os.X_OK),
            lambda f: not f.endswith(".jar"),
        ),
        (
            _make_exec,
            lambda f: os.path.isfile(f) and os.access(f, os.X_OK),
            lambda _: False,
        ),
    ],
)
def test_callability_checker_defaults(tmpdir, filename, setup, pretest, exp_miss):
    """Verify behavior of callability checker with default parameterization."""
    target = os.path.join(tmpdir.strpath, filename)
    setup(target)
    # Sanity-check the fixture state produced by the setup function.
    assert pretest(target)
    extras = ["this-is-not-a-program", "man", "ls"]
    expected = ["this-is-not-a-program"]
    if exp_miss(target):
        expected.append(target)
    observed = [
        name for name, _ in piper_utils.determine_uncallable([target] + extras)
    ]
    print("expected: {}".format(expected))
    print("observed: {}".format(observed))
    assert len(expected) == len(observed)
    assert set(expected) == set(observed)
93 |
94 |
@pytest.mark.parametrize(
    ["uncall_result", "expectation"], [([], True), ([("noncmd", "noncmd")], TypeError)]
)
@pytest.mark.parametrize("handler", [lambda: True, "not-a-function"])
def test_check_all_bad_handler_is_type_error_iff_uncallability_exists(
    uncall_result, str_list_monad, handler, expectation
):
    """A bad handler is a TypeError iff at least one command is uncallable."""
    cmd = "noncmd"
    # Fix the uncallability result so only handler validation is under test.
    patched = mock.patch.object(
        piper_utils, "determine_uncallable", return_value=uncall_result
    )
    with patched, ExpectContext(expectation, piper_utils.check_all_commands) as check:
        check(cmds=str_list_monad(cmd), handle=handler)
108 |
109 |
@pytest.mark.parametrize(
    ["create_result", "expected"],
    [
        (
            lambda bads: Exception("{} bad commands: {}".format(len(bads), bads)),
            Exception,
        ),
        (lambda bads: "{} bad commands: {}".format(len(bads), bads), False),
    ],
)
def test_check_all_result_is_conjunctive(create_result, expected, str_list_monad):
    """A single uncallable command yields a False result or an Exception."""
    cmd = "noncmd"
    # Force one uncallable command so the bad-result factory is exercised.
    patched = mock.patch.object(
        piper_utils, "determine_uncallable", return_value=[(cmd, cmd)]
    )
    with patched, ExpectContext(expected, piper_utils.check_all_commands) as check:
        check(cmds=str_list_monad(cmd), get_bad_result=create_result)
127 |
128 |
@pytest.mark.parametrize("commands", ["man", "ls", ["man", "ls"]])
@pytest.mark.parametrize(
    ["transforms", "expectation"],
    [(arg, lambda res: isinstance(res, list)) for arg in [None, []]]
    + [(arg, TypeError) for arg in [1, "a"]],
)
def test_check_all_requires_iterable_transformations_argument(
    commands, transforms, expectation
):
    """If transformations arg is non-null, it must be iterable."""

    def invoke():
        # Defer the call so exception-expecting branches can wrap it.
        return piper_utils.determine_uncallable(commands, transformations=transforms)

    if isinstance(expectation, type) and issubclass(expectation, Exception):
        with pytest.raises(expectation):
            invoke()
    else:
        assert expectation(invoke())
148 |
149 |
@pytest.mark.parametrize(
    "commands", powerset(["ls", "picard.jar", "$ENVVAR"], nonempty=True)
)
def test_transformation_accumulation(commands):
    """Accumulation of transformations works as expected"""

    # PEP 8 (E731): define a named function rather than assigning a lambda.
    def mapjar(c):
        # Wrap a jar path in a java invocation.
        return "java -jar {}".format(c)

    envjar = "env.jar"
    # Two ordered transformations: env-var substitution, then jar wrapping;
    # with accumulate=True the second sees the first's output.
    transforms = [
        (lambda c: c == "$ENVVAR", lambda _: envjar),
        (lambda c: c.endswith(".jar"), mapjar),
    ]
    exps = {"ls": "ls", "picard.jar": mapjar("picard.jar"), "$ENVVAR": mapjar(envjar)}
    # Make every command look uncallable so each appears in the result.
    with mock.patch.object(piper_utils, "is_command_callable", return_value=False):
        res = piper_utils.determine_uncallable(
            commands, transformations=transforms, accumulate=True
        )
    expectation = [(c, exps[c]) for c in commands]
    print("EXPECTED: {}".format(expectation))
    print("OBSERVED: {}".format(res))
    assert expectation == res
170 |
171 |
@pytest.mark.parametrize(
    "transforms",
    [
        {(lambda _: True, lambda c: c), (lambda _: False, lambda c: c)},
        {
            "id": (lambda _: True, lambda c: c),
            "java": (lambda c: c.endswith(".jar"), lambda c: "java -jar {}".format(c)),
        },
    ],
)
def test_non_accumulative_but_unordered_transformation_is_exceptional(transforms):
    """Unordered transformations without accumulation must be rejected."""
    exp_msg = (
        "If transformations are unordered, non-accumulation of "
        "effects may lead to nondeterministic behavior."
    )
    with pytest.raises(Exception) as err_ctx:
        piper_utils.determine_uncallable("ls", transformations=transforms)
    assert str(err_ctx.value) == exp_msg
190 |
--------------------------------------------------------------------------------
/tests/utils_tests/test_head_util.py:
--------------------------------------------------------------------------------
1 | """ Tests for the head() utility function """
2 |
3 | import random
4 | import string
5 |
6 | import pytest
7 | from hypothesis import given
8 | from hypothesis import strategies as st
9 |
10 | from pypiper.utils import head
11 |
12 | __author__ = "Vince Reuter"
13 | __email__ = "vreuter@virginia.edu"
14 |
15 |
# Mixed pool of single characters and small integers from which parameterized
# head/tail values are drawn.
NUMBERS_AND_LETTERS = list(string.ascii_letters) + list(range(-9, 10))

# Strategy for generating a pretty arbitrary atomic.
# allow_nan=False because test_head_atomic asserts obj == head(obj), and
# NaN != NaN would make that assertion fail spuriously.
ATOMICS = st.deferred(
    lambda: st.booleans()
    | st.characters()
    | st.integers()
    | st.floats(allow_nan=False)
    | st.text()
)
26 |
27 |
def pytest_generate_tests(metafunc):
    """Test case generation/parameterization for this module."""
    fixtures = metafunc.fixturenames
    if "seqtype" in fixtures:
        metafunc.parametrize("seqtype", [tuple, list])
    if "iter_cast" in fixtures:
        metafunc.parametrize("iter_cast", [lambda c: c, lambda c: iter(c)])
    if "h" in fixtures and "xs" in fixtures:
        # Pair a random head element with a random tail of 5-10 elements,
        # repeated for 10 independent cases.
        cases = []
        for _ in range(10):
            head_elem = random.choice(NUMBERS_AND_LETTERS)
            tail = [
                random.choice(NUMBERS_AND_LETTERS)
                for _ in range(random.randint(5, 10))
            ]
            cases.append((head_elem, tail))
        metafunc.parametrize(["h", "xs"], cases)
48 |
49 |
@given(obj=ATOMICS)
def test_head_atomic(obj):
    """For an atomic value, head() acts as the identity."""
    assert head(obj) == obj
54 |
55 |
def test_head_empty_string():
    """The empty string, unlike empty collections, is its own head."""
    assert head("") == ""
59 |
60 |
@pytest.mark.parametrize("coll", [dict(), set(), tuple(), list()])
def test_head_empty_collection(coll):
    """Asking for the first element of an empty Iterable raises ValueError."""
    with pytest.raises(ValueError):
        head(coll)
66 |
67 |
def test_head_nonempty_sequential_collection(h, xs, seqtype, iter_cast):
    """head() yields the first element of a nonempty sequential Iterable."""
    seq = seqtype([h]) + seqtype(xs)
    assert h == head(iter_cast(seq))
72 |
73 |
def test_head_nonempty_set():
    """Taking the head of a nonempty set does not raise."""
    head({-1, 0, 1})
77 |
78 |
def test_head_nonempty_dict():
    """Taking the head of a nonempty dictionary does not raise."""
    head({"a": 1, "b": 2})
82 |
--------------------------------------------------------------------------------