├── .github
    └── workflows
    │   ├── conventional-prs.yml
    │   ├── main.yml
    │   └── release-please.yml
├── .gitignore
├── .snakemake-workflow-catalog.yml
├── .test
    └── config
    │   ├── config.yml
    │   └── samples.tsv
├── CHANGELOG.md
├── LICENSE
├── README.md
├── config
    ├── README.md
    ├── config.yml
    └── schemas
    │   ├── config.schema.yml
    │   └── samples.schema.yml
└── workflow
    ├── Snakefile
    ├── envs
        ├── get_genome.yml
        ├── simulate_reads.yml
        └── validate_genome.yml
    ├── rules
        ├── common.smk
        └── process_reads.smk
    └── scripts
        └── validate_fasta.py


/.github/workflows/conventional-prs.yml:
--------------------------------------------------------------------------------
 1 | # Checks that pull request titles follow the conventional commit format.
 2 | name: Lint PR
 3 | 
 4 | on:
 5 |   pull_request_target:
 6 |     types: [opened, reopened, edited, synchronize]
 7 | 
 8 | # the title check only reads pull request metadata
 9 | permissions:
10 |   pull-requests: read
11 | 
12 | jobs:
13 |   main:
14 |     name: Validate PR title
15 |     runs-on: ubuntu-latest
16 |     steps:
17 |       - uses: amannn/action-semantic-pull-request@v5
18 |         env:
19 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
20 | 


--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | # Continuous integration: formatting, linting and a test run of the workflow.
 2 | name: Tests
 3 | 
 4 | on:
 5 |   push:
 6 |     branches:
 7 |       - main
 8 |   pull_request:
 9 |     branches:
10 |       - main
11 | 
12 | jobs:
13 |   # formatting of Snakemake files (snakefmt) and YAML files (prettier)
14 |   Formatting:
15 |     runs-on: ubuntu-latest
16 |     steps:
17 |       - uses: actions/checkout@v4
18 |         with:
19 |           fetch-depth: 0
20 |       - name: Formatting
21 |         uses: super-linter/super-linter@v7
22 |         env:
23 |           VALIDATE_ALL_CODEBASE: false
24 |           DEFAULT_BRANCH: main
25 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
26 |           VALIDATE_SNAKEMAKE_SNAKEFMT: true
27 |           VALIDATE_YAML_PRETTIER: true
28 | 
29 |   # static checks of the workflow definition via `snakemake --lint`
30 |   Linting:
31 |     runs-on: ubuntu-latest
32 |     steps:
33 |       - uses: actions/checkout@v4
34 |       - name: Lint workflow
35 |         uses: snakemake/snakemake-github-action@v2
36 |         with:
37 |           directory: .
38 |           snakefile: workflow/Snakefile
39 |           args: "--lint"
40 | 
41 |   # runs the workflow in .test and then builds a report; waits for both checks above
42 |   Testing:
43 |     runs-on: ubuntu-latest
44 |     needs:
45 |       - Linting
46 |       - Formatting
47 |     steps:
48 |       - uses: actions/checkout@v4
49 | 
50 |       - name: Test workflow
51 |         uses: snakemake/snakemake-github-action@v2
52 |         with:
53 |           directory: .test
54 |           snakefile: workflow/Snakefile
55 |           args: "--sdm conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache --all-temp"
56 | 
57 |       - name: Test report
58 |         uses: snakemake/snakemake-github-action@v2
59 |         with:
60 |           directory: .test
61 |           snakefile: workflow/Snakefile
62 |           args: "--report report.zip"
63 | 


--------------------------------------------------------------------------------
/.github/workflows/release-please.yml:
--------------------------------------------------------------------------------
 1 | # Runs the release-please action on every push to main.
 2 | name: release-please
 3 | 
 4 | on:
 5 |   push:
 6 |     branches: [main]
 7 | 
 8 | permissions:
 9 |   contents: write
10 |   pull-requests: write
11 | 
12 | jobs:
13 |   release-please:
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |       - uses: googleapis/release-please-action@v4
17 |         with:
18 |           token: ${{ secrets.GITHUB_TOKEN }}
19 |           release-type: go # just keep a changelog, no version anywhere outside of git tags
20 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | results/**
 2 | resources/**
 3 | logs/**
 4 | .snakemake
 5 | .snakemake/**
 6 | .test/results/*
 7 | workflow/notebooks/.ipynb_checkpoints/**
 8 | **/.Rhistory
 9 | **/*.Rproj
10 | **/.Rproj.user/**
11 | **/.RData
12 | **/Rplots.pdf
13 | 


--------------------------------------------------------------------------------
/.snakemake-workflow-catalog.yml:
--------------------------------------------------------------------------------
 1 | # configuration of display in snakemake workflow catalog: https://snakemake.github.io/snakemake-workflow-catalog
 2 | 
 3 | usage:
 4 |   mandatory-flags:
 5 |     desc: null # describe your flags here in a few sentences
 6 |     flags: null # put your flags here
 7 |   software-stack-deployment:
 8 |     conda: true # whether pipeline works with '--sdm conda'
 9 |     apptainer: true # whether pipeline works with '--sdm apptainer/singularity'
10 |     apptainer+conda: true # whether pipeline works with '--sdm conda apptainer/singularity'
11 |     report: true # whether creation of reports using 'snakemake --report report.zip' is supported
12 | 


--------------------------------------------------------------------------------
/.test/config/config.yml:
--------------------------------------------------------------------------------
1 | samplesheet: "config/samples.tsv" # sample sheet (TSV), relative to the working directory
2 | 
3 | get_genome:
4 |   ncbi_ftp: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.fna.gz"
5 | 
6 | simulate_reads:
7 |   read_length: 100 # length of target reads in bp
8 |   read_number: 10000 # number of total reads to be simulated
9 | 


--------------------------------------------------------------------------------
/.test/config/samples.tsv:
--------------------------------------------------------------------------------
1 | sample	condition	replicate	read1	read2
2 | sample1	wild_type	1	sample1.bwa.read1.fastq.gz	sample1.bwa.read2.fastq.gz
3 | sample2	wild_type	2	sample2.bwa.read1.fastq.gz	sample2.bwa.read2.fastq.gz
4 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | ## 1.0.0 (2025-05-07)
 4 | 
 5 | 
 6 | ### Features
 7 | 
 8 | * complete minimal workflow as template ([2348055](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/234805535a6353a3db59d5bba0a4b38fe8194d97))
 9 | * complete, reproducible example workflow ([1dfa7ad](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/1dfa7adb0120880ae5e85c57551d5e698a057497))
10 | * larger update to feature fully-functional example and github actions ([93c08fc](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/93c08fc9db2f8619af7b90784db83d18ed656f25))
11 | * major simplification of rules, replacement of others by wrappers ([3811ef7](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/3811ef796df4fe38fb7161f9a1b06fac9db86d5b))
12 | * major simplification of template and update docs ([81ee089](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/81ee08989857366893593a333615523f05295f87))
13 | * replaced get genome script with simple shell command ([9208995](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/9208995b78433ce3680a0b0e453ddcf5915abcef))
14 | * update github actions workflow in linting part ([27d53ee](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/27d53eecfad935f50bc62a30248141891a4329ee))
15 | * update github actions workflow. check formatting of yaml files using prettier ([9f5131b](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/9f5131bf0eeaf1eb7fb0937b2840f73db2a02724))
16 | * updated all GH actions to latest versions ([4d7b3a2](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/4d7b3a2b143c304b6dcf487664c392c4a5e98f74))
17 | * updated github actions workflow ([fd36648](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/fd3664841b830ae670549aabb214eb6004aa696d))
18 | * updated github actions workflow ([7a3a40e](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/7a3a40e62df01b37a802a085e7210014eb3fba82))
19 | 
20 | 
21 | ### Bug Fixes
22 | 
23 | * 2nd attempt to fix release please wf ([f81847f](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/f81847fdfd39d99e795006da4f84701ee6ba8ddc))
24 | * added usage docs ([776b97e](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/776b97e3d0e928d98f4c48e619090b47f702dcab))
25 | * all-temp needs explicit input of multiqc zip dirs ([026c35a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/026c35aebfb140746bc823ce06327e25c9a40cf1))
26 | * change release type to 'go', fixes release please wf ([658c784](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/658c784ab5d70b117ce9dd386f5b07f8e4ff782d))
27 | * change release type to 'go', fixes release please wf ([a81ab9d](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/a81ab9def05667e23c5e59ac881c7a57b9f1b767))
28 | * code review issues ([97faf1a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/97faf1ae8bde189094e6b46568f3911f01b625fd))
29 | * dont remove temp files for test runs ([0c2c8d1](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/0c2c8d19c51648872d09a8f697826b9445bafc81))
30 | * formatting, logging ([d6c819e](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/d6c819efcadde1ad4af342152d3aef2a982983d0))
31 | * lint error and docs update ([cf59f11](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/cf59f11acc11c01866ad56971fd132661f4f32be))
32 | * removed unused templates, update catalog yml ([b5c292f](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/b5c292ff4b476441d8068ca8013e3b931d30fc04))
33 | * snakefmt error ([70d670a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/70d670a91c79c0a9d89c59fff6add3f1036753a3))
34 | * update release-please GH workflow ([1dad25d](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/1dad25da5de222982b0cdf35a91be6ecc5a81a42))
35 | * update release-please GH workflow ([0ea4df2](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/0ea4df2f746e0fc760c06a3b902e2ee8bdf2ff42))
36 | * update snakemake action ([fac8662](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/fac8662193fa501fdfc2f3bb94e7549b96dec500))
37 | * updated schemas and params docs ([facf377](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/facf377a7cc107b3e8db0793b21027a9f3df0eeb))
38 | * updates to enable release-please action again ([8d9552b](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/8d9552b8369ca6b115ee00777f45cf641312dde3))
39 | * various changes to formatting and example rules ([b9b2366](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/b9b236645ad961cd7a8886c1697b27f3694ee047))
40 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021, AUTHORS
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Snakemake workflow: `<name>`
 2 | 
 3 | [![Snakemake](https://img.shields.io/badge/snakemake-≥8.0.0-brightgreen.svg)](https://snakemake.github.io)
 4 | [![GitHub actions status](https://github.com/snakemake-workflows/snakemake-workflow-template/actions/workflows/main.yml/badge.svg?branch=main)](https://github.com/snakemake-workflows/snakemake-workflow-template/actions/workflows/main.yml)
 5 | [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
 6 | [![workflow catalog](https://img.shields.io/badge/Snakemake%20workflow%20catalog-darkgreen)](https://snakemake.github.io/snakemake-workflow-catalog/docs/workflows/<owner>/<repo>)
 7 | 
 8 | A Snakemake workflow for `<description>`
 9 | 
10 | - [Snakemake workflow: `<name>`](#snakemake-workflow-name)
11 |   - [Usage](#usage)
12 |   - [Deployment options](#deployment-options)
13 |   - [Authors](#authors)
14 |   - [References](#references)
15 |   - [TODO](#todo)
16 | 
17 | ## Usage
18 | 
19 | The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/docs/workflows/<owner>/<repo>).
20 | 
21 | Detailed information about input data and workflow configuration can also be found in the [`config/README.md`](config/README.md).
22 | 
23 | If you use this workflow in a paper, don't forget to give credit to the authors by citing the URL of this repository or its DOI.
24 | 
25 | ## Deployment options
26 | 
27 | To run the workflow from the command line, change the working directory.
28 | 
29 | ```bash
30 | cd path/to/snakemake-workflow-name
31 | ```
32 | 
33 | Adjust options in the default config file `config/config.yml`.
34 | Before running the complete workflow, you can perform a dry run using:
35 | 
36 | ```bash
37 | snakemake --dry-run
38 | ```
39 | 
40 | To run the workflow with test files using **conda**:
41 | 
42 | ```bash
43 | snakemake --cores 2 --sdm conda --directory .test
44 | ```
45 | 
46 | To run the workflow with **apptainer** / **singularity**, add a link to a container registry in the `Snakefile`, for example `container: "oras://ghcr.io/<user>/<repository>:<version>"` for GitHub's container registry.
47 | Run the workflow with:
48 | 
49 | ```bash
50 | snakemake --cores 2 --sdm conda apptainer --directory .test
51 | ```
52 | 
53 | ## Authors
54 | 
55 | - Firstname Lastname
56 |   - Affiliation
57 |   - ORCID profile
58 |   - home page
59 | 
60 | ## References
61 | 
62 | > Köster, J., Mölder, F., Jablonski, K. P., Letcher, B., Hall, M. B., Tomkins-Tinch, C. H., Sochat, V., Forster, J., Lee, S., Twardziok, S. O., Kanitz, A., Wilm, A., Holtgrewe, M., Rahmann, S., & Nahnsen, S. _Sustainable data analysis with Snakemake_. F1000Research, 10:33, **2021**. https://doi.org/10.12688/f1000research.29032.2.
63 | 
64 | ## TODO
65 | 
66 | - Replace `<owner>` and `<repo>` everywhere in the template with the correct user name/organization, and the repository name. The workflow will be automatically added to the [snakemake workflow catalog](https://snakemake.github.io/snakemake-workflow-catalog/index.html) once it is publicly available on GitHub.
67 | - Replace `<name>` with the workflow name (can be the same as `<repo>`).
68 | - Replace `<description>` with a description of what the workflow does.
69 | - Update the [deployment](#deployment-options), [authors](#authors) and [references](#references) sections.
70 | - Update the `README.md` badges. Add or remove badges for `conda`/`singularity`/`apptainer` usage depending on the workflow's [deployment](#deployment-options) options.
71 | - Do not forget to also adjust the configuration-specific `config/README.md` file.
72 | 


--------------------------------------------------------------------------------
/config/README.md:
--------------------------------------------------------------------------------
 1 | ## Workflow overview
 2 | 
 3 | This workflow is a best-practice workflow for `<detailed description>`.
 4 | The workflow is built using [snakemake](https://snakemake.readthedocs.io/en/stable/) and consists of the following steps:
 5 | 
 6 | 1. Download genome reference from NCBI
 7 | 2. Validate downloaded genome (`python` script)
 8 | 3. Simulate short read sequencing data on the fly (`dwgsim`)
 9 | 4. Check quality of input read data (`FastQC`)
10 | 5. Collect statistics from tool output (`MultiQC`)
11 | 
12 | ## Running the workflow
13 | 
14 | ### Input data
15 | 
16 | This template workflow creates artificial sequencing data in `*.fastq.gz` format.
17 | It does not contain actual input data.
18 | The simulated input files are nevertheless created based on a mandatory table linked in the `config.yml` file (default: `.test/config/samples.tsv`).
19 | The sample sheet has the following layout:
20 | 
21 | | sample  | condition | replicate | read1                      | read2                      |
22 | | ------- | --------- | --------- | -------------------------- | -------------------------- |
23 | | sample1 | wild_type | 1         | sample1.bwa.read1.fastq.gz | sample1.bwa.read2.fastq.gz |
24 | | sample2 | wild_type | 2         | sample2.bwa.read1.fastq.gz | sample2.bwa.read2.fastq.gz |
25 | 
26 | ### Parameters
27 | 
28 | This table lists all parameters that can be used to run the workflow.
29 | 
30 | | parameter          | type | details                               | default                        |
31 | | ------------------ | ---- | ------------------------------------- | ------------------------------ |
32 | | **samplesheet**    |      |                                       |                                |
33 | path               | str  | path to samplesheet, mandatory        | ".test/config/samples.tsv"     |
34 | | **get_genome**     |      |                                       |                                |
35 | | ncbi_ftp           | str  | link to a genome on NCBI's FTP server | link to _S. cerevisiae_ genome |
36 | | **simulate_reads** |      |                                       |                                |
37 | | read_length        | num  | length of target reads in bp          | 100                            |
38 | | read_number        | num  | number of total reads to be simulated | 10000                          |
39 | 


--------------------------------------------------------------------------------
/config/config.yml:
--------------------------------------------------------------------------------
1 | samplesheet: ".test/config/samples.tsv" # sample sheet (TSV), relative to the working directory
2 | 
3 | get_genome:
4 |   ncbi_ftp: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.fna.gz"
5 | 
6 | simulate_reads:
7 |   read_length: 100 # length of target reads in bp
8 |   read_number: 10000 # number of total reads to be simulated
9 | 


--------------------------------------------------------------------------------
/config/schemas/config.schema.yml:
--------------------------------------------------------------------------------
 1 | $schema: "http://json-schema.org/draft-07/schema#"
 2 | description: main configuration schema for the workflow
 3 | properties:
 4 |   samplesheet:
 5 |     type: string
 6 |     description: path to sample-sheet TSV file
 7 | 
 8 |   get_genome:
 9 |     type: object
10 |     properties:
11 |       ncbi_ftp:
12 |         type: string
13 |         description: URL for genome retrieval
14 |     required: ["ncbi_ftp"]
15 | 
16 |   simulate_reads:
17 |     type: object
18 |     properties:
19 |       # both values end up on the dwgsim command line, so they must be positive integers
20 |       read_length:
21 |         type: integer
22 |         minimum: 1
23 |         description: length of target reads in bp
24 |       read_number:
25 |         type: integer
26 |         minimum: 1
27 |         description: number of total reads to be simulated
28 |     # rule simulate_reads looks up both keys unconditionally
29 |     required: ["read_length", "read_number"]
30 | 
31 | required:
32 |   - samplesheet
33 |   - get_genome
34 |   - simulate_reads
35 | 


--------------------------------------------------------------------------------
/config/schemas/samples.schema.yml:
--------------------------------------------------------------------------------
 1 | # schema for the sample sheet, applied via validate() in workflow/rules/common.smk
 2 | $schema: "http://json-schema.org/draft-07/schema#"
 3 | description: entries for the sample sheet
 4 | properties:
 5 |   sample:
 6 |     description: sample name/identifier
 7 |     type: string
 8 |   condition:
 9 |     description: sample condition that will be compared during differential analysis
10 |     type: string
11 |   replicate:
12 |     description: consecutive numbers representing multiple replicates of one condition
13 |     type: integer
14 |     default: 1
15 |   read1:
16 |     description: names of fastq.gz files, read 1
17 |     type: string
18 |   read2:
19 |     description: names of fastq.gz files, read 2 (optional)
20 |     type: string
21 | 
22 | # read2 is intentionally not listed here
23 | required: [sample, condition, replicate, read1]
24 | 


--------------------------------------------------------------------------------
/workflow/Snakefile:
--------------------------------------------------------------------------------
 1 | # Main entrypoint of the workflow.
 2 | # Please follow the best practices:
 3 | # https://snakemake.readthedocs.io/en/stable/snakefiles/best_practices.html,
 4 | # in particular regarding the standardized folder structure mentioned there.
 5 | 
 6 | from snakemake.utils import min_version
 7 | 
 8 | 
 9 | # minimum required Snakemake version
10 | # -----------------------------------------------------
11 | # 8.0.0 is the version advertised in the README badge; fail early with a
12 | # clear message instead of an obscure error on older releases.
13 | # NOTE(review): confirm that all helpers used in the rules exist in 8.0.0,
14 | # and raise this bound if they do not.
15 | min_version("8.0.0")
16 | 
17 | 
18 | # load configuration
19 | # -----------------------------------------------------
20 | configfile: "config/config.yml"
21 | 
22 | 
23 | # load rules
24 | # -----------------------------------------------------
25 | include: "rules/common.smk"
26 | include: "rules/process_reads.smk"
27 | 
28 | 
29 | # optional messages, log and error handling
30 | # -----------------------------------------------------
31 | onstart:
32 |     print("\n--- Analysis started ---\n")
33 | 
34 | 
35 | onsuccess:
36 |     print("\n--- Workflow finished! ---\n")
37 | 
38 | 
39 | onerror:
40 |     print("\n--- An error occurred! ---\n")
41 | 
42 | 
43 | # target rules
44 | # -----------------------------------------------------
45 | # the MultiQC report is the final product; requesting it pulls in all other rules
46 | rule all:
47 |     input:
48 |         "results/multiqc/multiqc_report.html",
49 |     default_target: True
50 | 


--------------------------------------------------------------------------------
/workflow/envs/get_genome.yml:
--------------------------------------------------------------------------------
1 | name: get_genome
2 | channels:
3 |   - conda-forge
4 |   - bioconda
5 |   - nodefaults
6 | dependencies:
7 |   - gzip=1.14 # provides gunzip, used by rule get_genome to decompress the download
8 |   - wget=1.21.4 # used by rule get_genome to download the genome archive
9 | 


--------------------------------------------------------------------------------
/workflow/envs/simulate_reads.yml:
--------------------------------------------------------------------------------
1 | name: simulate_reads
2 | channels:
3 |   - conda-forge
4 |   - bioconda
5 |   - nodefaults
6 | dependencies:
7 |   - dwgsim=1.1.14 # read simulator called by rule simulate_reads
8 | 


--------------------------------------------------------------------------------
/workflow/envs/validate_genome.yml:
--------------------------------------------------------------------------------
1 | name: validate_genome
2 | channels:
3 |   - conda-forge
4 |   - bioconda
5 |   - nodefaults
6 | dependencies:
7 |   - python=3.12 # interpreter for workflow/scripts/validate_fasta.py
8 |   - biopython=1.85 # provides Bio.SeqIO, imported by validate_fasta.py
9 | 


--------------------------------------------------------------------------------
/workflow/rules/common.smk:
--------------------------------------------------------------------------------
 1 | # import basic packages
 2 | import pandas as pd
 3 | from snakemake.utils import validate
 4 | 
 5 | 
 6 | # validate config file first, so that a missing or malformed entry
 7 | # (e.g. no "samplesheet" key) is reported by the schema check and
 8 | # not as a bare KeyError while reading the sample sheet
 9 | validate(config, schema="../../config/schemas/config.schema.yml")
10 | 
11 | 
12 | # read sample sheet; sample names are forced to str and used as sorted index
13 | samples = (
14 |     pd.read_csv(config["samplesheet"], sep="\t", dtype={"sample": str})
15 |     .set_index("sample", drop=False)
16 |     .sort_index()
17 | )
18 | 
19 | 
20 | # validate sample sheet
21 | validate(samples, schema="../../config/schemas/samples.schema.yml")
22 | 


--------------------------------------------------------------------------------
/workflow/rules/process_reads.smk:
--------------------------------------------------------------------------------
  1 | # ----------------------------------------------------- #
  2 | # EXAMPLE WORKFLOW                                      #
  3 | # ----------------------------------------------------- #
  4 | 
  5 | 
  6 | # fetch genome sequence from NCBI (download archive next to the output, then decompress)
  7 | # -----------------------------------------------------
  8 | rule get_genome:
  9 |     output:
 10 |         fasta="results/get_genome/genome.fna",
 11 |     conda:
 12 |         "../envs/get_genome.yml"
 13 |     message:
 14 |         """--- Downloading genome sequence."""
 15 |     params:
 16 |         ncbi_ftp=lookup(within=config, dpath="get_genome/ncbi_ftp"),
 17 |     log:
 18 |         "results/get_genome/genome.log",
 19 |     shell:
 20 |         "wget -O {output.fasta}.gz {params.ncbi_ftp:q} > {log} 2>&1 && "
 21 |         "gunzip {output.fasta}.gz >> {log} 2>&1"
 22 | 
 23 | 
 24 | # validate genome sequence file
 25 | # -----------------------------------------------------
 26 | rule validate_genome:
 27 |     input:
 28 |         fasta=rules.get_genome.output.fasta,
 29 |     output:
 30 |         fasta="results/validate_genome/genome.fna",
 31 |     log:
 32 |         "results/validate_genome/genome.log",
 33 |     message:
 34 |         "--- Validating genome sequence file."
 35 |     conda:
 36 |         "../envs/validate_genome.yml"
 37 |     script:
 38 |         "../scripts/validate_fasta.py"
 39 | 
 40 | 
 41 | # simulate read data using DWGSIM; the output prefix is an explicit param (safe for names with dots)
 42 | # -----------------------------------------------------
 43 | rule simulate_reads:
 44 |     input:
 45 |         fasta=rules.validate_genome.output.fasta,
 46 |     output:
 47 |         multiext(
 48 |             "results/simulate_reads/{sample}",
 49 |             read1=".bwa.read1.fastq.gz",
 50 |             read2=".bwa.read2.fastq.gz",
 51 |         ),
 52 |     conda:
 53 |         "../envs/simulate_reads.yml"
 54 |     message:
 55 |         """--- Simulating read data with DWGSIM."""
 56 |     params:
 57 |         output_type=1,
 58 |         output_prefix="results/simulate_reads/{sample}",
 59 |         read_length=lookup(within=config, dpath="simulate_reads/read_length"),
 60 |         read_number=lookup(within=config, dpath="simulate_reads/read_number"),
 61 |     log:
 62 |         "results/simulate_reads/{sample}.log",
 63 |     shell:
 64 |         "dwgsim "
 65 |         " -1 {params.read_length}"
 66 |         " -2 {params.read_length}"
 67 |         " -N {params.read_number}"
 68 |         " -o {params.output_type}"
 69 |         " {input.fasta}"
 70 |         " {params.output_prefix}"
 71 |         " > {log} 2>&1"
 72 | 
 73 | 
 74 | # make QC report
 75 | # -----------------------------------------------------
 76 | rule fastqc:
 77 |     input:
 78 |         fastq="results/simulate_reads/{sample}.bwa.{read}.fastq.gz",
 79 |     output:
 80 |         html="results/fastqc/{sample}.bwa.{read}_fastqc.html",
 81 |         zip="results/fastqc/{sample}.bwa.{read}_fastqc.zip",
 82 |     log:
 83 |         "results/fastqc/{sample}.bwa.{read}.log",
 84 |     message:
 85 |         "--- Checking fastq files with FastQC."
 86 |     params:
 87 |         extra="--quiet",
 88 |     threads: 1
 89 |     wrapper:
 90 |         "v6.0.0/bio/fastqc"
 91 | 
 92 | 
 93 | # run multiQC on tool output
 94 | # -----------------------------------------------------
 95 | rule multiqc:
 96 |     input:
 97 |         expand(
 98 |             "results/fastqc/{sample}.bwa.{read}_fastqc.{ext}",
 99 |             sample=samples.index,
100 |             read=["read1", "read2"],
101 |             ext=["html", "zip"],
102 |         ),
103 |     output:
104 |         report="results/multiqc/multiqc_report.html",
105 |     log:
106 |         "results/multiqc/multiqc.log",
107 |     message:
108 |         "--- Generating MultiQC report for seq data."
109 |     params:
110 |         extra="--verbose --dirs",
111 |     wrapper:
112 |         "v6.0.0/bio/multiqc"
113 | 


--------------------------------------------------------------------------------
/workflow/scripts/validate_fasta.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from Bio import SeqIO
 3 | 
 4 | sys.stderr = open(snakemake.log[0], "w", buffering=1)
 5 | 
 6 | 
 7 | def validate_fasta(input_fasta, output_fasta):
 8 |     try:
 9 |         with open(input_fasta, "r") as fasta_file:
10 |             records = list(SeqIO.parse(fasta_file, "fasta"))
11 |             if not records:
12 |                 raise ValueError("FASTA file is empty or improperly formatted.")
13 |             else:
14 |                 summary = [f"Validated sequence records for {output_fasta}:"]
15 |                 summary += [f"{i.name}: {i.description}" for i in records]
16 |         with open(output_fasta, "w") as validated_file:
17 |             SeqIO.write(records, validated_file, "fasta")
18 |         sys.stderr.write("\n".join(summary))
19 |     except Exception as e:
20 |         sys.stderr.write(f"Validation failed: {e}\n")
21 |         raise
22 | 
23 | 
24 | validate_fasta(snakemake.input["fasta"], snakemake.output["fasta"])
25 | 


--------------------------------------------------------------------------------