├── .github └── workflows │ ├── conventional-prs.yml │ ├── main.yml │ └── release-please.yml ├── .gitignore ├── .snakemake-workflow-catalog.yml ├── .test └── config │ ├── config.yml │ └── samples.tsv ├── CHANGELOG.md ├── LICENSE ├── README.md ├── config ├── README.md ├── config.yml └── schemas │ ├── config.schema.yml │ └── samples.schema.yml └── workflow ├── Snakefile ├── envs ├── get_genome.yml ├── simulate_reads.yml └── validate_genome.yml ├── rules ├── common.smk └── process_reads.smk └── scripts └── validate_fasta.py /.github/workflows/conventional-prs.yml: -------------------------------------------------------------------------------- 1 | name: Lint PR 2 | on: 3 | pull_request_target: 4 | types: 5 | - opened 6 | - reopened 7 | - edited 8 | - synchronize 9 | 10 | permissions: 11 | pull-requests: read 12 | 13 | jobs: 14 | main: 15 | name: Validate PR title 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: amannn/action-semantic-pull-request@v5 19 | env: 20 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 21 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | jobs: 10 | Formatting: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | with: 15 | fetch-depth: 0 16 | - name: Formatting 17 | uses: super-linter/super-linter@v7 18 | env: 19 | VALIDATE_ALL_CODEBASE: false 20 | DEFAULT_BRANCH: main 21 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 22 | VALIDATE_SNAKEMAKE_SNAKEFMT: true 23 | VALIDATE_YAML_PRETTIER: true 24 | 25 | Linting: 26 | runs-on: ubuntu-latest 27 | steps: 28 | - uses: actions/checkout@v4 29 | - name: Lint workflow 30 | uses: snakemake/snakemake-github-action@v2 31 | with: 32 | directory: . 
33 | snakefile: workflow/Snakefile 34 | args: "--lint" 35 | 36 | Testing: 37 | runs-on: ubuntu-latest 38 | needs: 39 | - Linting 40 | - Formatting 41 | steps: 42 | - uses: actions/checkout@v4 43 | 44 | - name: Test workflow 45 | uses: snakemake/snakemake-github-action@v2 46 | with: 47 | directory: .test 48 | snakefile: workflow/Snakefile 49 | args: "--sdm conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache --all-temp" 50 | 51 | - name: Test report 52 | uses: snakemake/snakemake-github-action@v2 53 | with: 54 | directory: .test 55 | snakefile: workflow/Snakefile 56 | args: "--report report.zip" 57 | -------------------------------------------------------------------------------- /.github/workflows/release-please.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | 6 | permissions: 7 | contents: write 8 | pull-requests: write 9 | 10 | name: release-please 11 | 12 | jobs: 13 | release-please: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: googleapis/release-please-action@v4 17 | with: 18 | token: ${{ secrets.GITHUB_TOKEN }} 19 | release-type: go # just keep a changelog, no version anywhere outside of git tags 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | results/** 2 | resources/** 3 | logs/** 4 | .snakemake 5 | .snakemake/** 6 | .test/results/* 7 | workflow/notebooks/.ipynb_checkpoints/** 8 | **/.Rhistory 9 | **/*.Rproj 10 | **/.Rproj.user/** 11 | **/.RData 12 | **/Rplots.pdf 13 | -------------------------------------------------------------------------------- /.snakemake-workflow-catalog.yml: -------------------------------------------------------------------------------- 1 | # configuration of display in snakemake workflow catalog: https://snakemake.github.io/snakemake-workflow-catalog 2 | 3 | usage: 4 | mandatory-flags: 5 | desc: # 
describe your flags here in a few sentences 6 | flags: # put your flags here 7 | software-stack-deployment: 8 | conda: true # whether pipeline works with '--sdm conda' 9 | apptainer: true # whether pipeline works with '--sdm apptainer/singularity' 10 | apptainer+conda: true # whether pipeline works with '--sdm conda apptainer/singularity' 11 | report: true # whether creation of reports using 'snakemake --report report.zip' is supported 12 | -------------------------------------------------------------------------------- /.test/config/config.yml: -------------------------------------------------------------------------------- 1 | samplesheet: "config/samples.tsv" 2 | 3 | get_genome: 4 | ncbi_ftp: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.fna.gz 5 | 6 | simulate_reads: 7 | read_length: 100 8 | read_number: 10000 9 | -------------------------------------------------------------------------------- /.test/config/samples.tsv: -------------------------------------------------------------------------------- 1 | sample condition replicate read1 read2 2 | sample1 wild_type 1 sample1.bwa.read1.fastq.gz sample1.bwa.read2.fastq.gz 3 | sample2 wild_type 2 sample2.bwa.read1.fastq.gz sample2.bwa.read2.fastq.gz 4 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 1.0.0 (2025-05-07) 4 | 5 | 6 | ### Features 7 | 8 | * complete minimal workflow as template ([2348055](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/234805535a6353a3db59d5bba0a4b38fe8194d97)) 9 | * complete, reproducible example workflow ([1dfa7ad](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/1dfa7adb0120880ae5e85c57551d5e698a057497)) 10 | * larger update to feature fully-functional example and github actions 
([93c08fc](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/93c08fc9db2f8619af7b90784db83d18ed656f25)) 11 | * major simplification of rules, replacement of others by wrappers ([3811ef7](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/3811ef796df4fe38fb7161f9a1b06fac9db86d5b)) 12 | * major simplification of template and update docs ([81ee089](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/81ee08989857366893593a333615523f05295f87)) 13 | * replaced get genome script with simple shell command ([9208995](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/9208995b78433ce3680a0b0e453ddcf5915abcef)) 14 | * update github actions workflow in linting part ([27d53ee](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/27d53eecfad935f50bc62a30248141891a4329ee)) 15 | * update github actions workflow. check formatting of yaml files using prettier ([9f5131b](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/9f5131bf0eeaf1eb7fb0937b2840f73db2a02724)) 16 | * updated all GH actions to latest versions ([4d7b3a2](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/4d7b3a2b143c304b6dcf487664c392c4a5e98f74)) 17 | * updated github actions workflow ([fd36648](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/fd3664841b830ae670549aabb214eb6004aa696d)) 18 | * updated github actions workflow ([7a3a40e](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/7a3a40e62df01b37a802a085e7210014eb3fba82)) 19 | 20 | 21 | ### Bug Fixes 22 | 23 | * 2nd attempt to fix release please wf ([f81847f](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/f81847fdfd39d99e795006da4f84701ee6ba8ddc)) 24 | * added usage docs ([776b97e](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/776b97e3d0e928d98f4c48e619090b47f702dcab)) 25 | * all-temp needs explicit 
input of multiqc zip dirs ([026c35a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/026c35aebfb140746bc823ce06327e25c9a40cf1)) 26 | * change release type to 'go', fixes release please wf ([658c784](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/658c784ab5d70b117ce9dd386f5b07f8e4ff782d)) 27 | * change release type to 'go', fixes release please wf ([a81ab9d](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/a81ab9def05667e23c5e59ac881c7a57b9f1b767)) 28 | * code review issues ([97faf1a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/97faf1ae8bde189094e6b46568f3911f01b625fd)) 29 | * dont remove temp files for test runs ([0c2c8d1](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/0c2c8d19c51648872d09a8f697826b9445bafc81)) 30 | * formatting, logging ([d6c819e](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/d6c819efcadde1ad4af342152d3aef2a982983d0)) 31 | * lint error and docs update ([cf59f11](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/cf59f11acc11c01866ad56971fd132661f4f32be)) 32 | * removed unused templates, update catalog yml ([b5c292f](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/b5c292ff4b476441d8068ca8013e3b931d30fc04)) 33 | * snakefmt error ([70d670a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/70d670a91c79c0a9d89c59fff6add3f1036753a3)) 34 | * update release-please GH workflow ([1dad25d](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/1dad25da5de222982b0cdf35a91be6ecc5a81a42)) 35 | * update release-please GH workflow ([0ea4df2](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/0ea4df2f746e0fc760c06a3b902e2ee8bdf2ff42)) 36 | * update snakemake action ([fac8662](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/fac8662193fa501fdfc2f3bb94e7549b96dec500)) 
37 | * updated schemas and params docs ([facf377](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/facf377a7cc107b3e8db0793b21027a9f3df0eeb)) 38 | * updates to enable release-please action again ([8d9552b](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/8d9552b8369ca6b115ee00777f45cf641312dde3)) 39 | * various changes to formatting and example rules ([b9b2366](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/b9b236645ad961cd7a8886c1697b27f3694ee047)) 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021, AUTHORS 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Snakemake workflow: `<name>` 2 | 3 | [![Snakemake](https://img.shields.io/badge/snakemake-≥8.0.0-brightgreen.svg)](https://snakemake.github.io) 4 | [![GitHub actions status](https://github.com/snakemake-workflows/snakemake-workflow-template/actions/workflows/main.yml/badge.svg?branch=main)](https://github.com/snakemake-workflows/snakemake-workflow-template/actions/workflows/main.yml) 5 | [![run with conda](https://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) 6 | [![workflow catalog](https://img.shields.io/badge/Snakemake%20workflow%20catalog-darkgreen)](https://snakemake.github.io/snakemake-workflow-catalog/docs/workflows/<owner>/<repo>) 7 | 8 | A Snakemake workflow for `<description>` 9 | 10 | - [Snakemake workflow: `<name>`](#snakemake-workflow-name) 11 | - [Usage](#usage) 12 | - [Deployment options](#deployment-options) 13 | - [Authors](#authors) 14 | - [References](#references) 15 | - [TODO](#todo) 16 | 17 | ## Usage 18 | 19 | The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/docs/workflows/<owner>/<repo>). 20 | 21 | Detailed information about input data and workflow configuration can also be found in the [`config/README.md`](config/README.md). 22 | 23 | If you use this workflow in a paper, don't forget to give credit to the authors by citing the URL of this repository or its DOI. 24 | 25 | ## Deployment options 26 | 27 | To run the workflow from the command line, change the working directory. 28 | 29 | ```bash 30 | cd path/to/snakemake-workflow-name 31 | ``` 32 | 33 | Adjust options in the default config file `config/config.yml`. 
34 | Before running the complete workflow, you can perform a dry run using: 35 | 36 | ```bash 37 | snakemake --dry-run 38 | ``` 39 | 40 | To run the workflow with test files using **conda**: 41 | 42 | ```bash 43 | snakemake --cores 2 --sdm conda --directory .test 44 | ``` 45 | 46 | To run the workflow with **apptainer** / **singularity**, add a link to a container registry in the `Snakefile`, for example `container: "oras://ghcr.io/<user>/<repository>:<version>"` for GitHub's container registry. 47 | Run the workflow with: 48 | 49 | ```bash 50 | snakemake --cores 2 --sdm conda apptainer --directory .test 51 | ``` 52 | 53 | ## Authors 54 | 55 | - Firstname Lastname 56 | - Affiliation 57 | - ORCID profile 58 | - home page 59 | 60 | ## References 61 | 62 | > Köster, J., Mölder, F., Jablonski, K. P., Letcher, B., Hall, M. B., Tomkins-Tinch, C. H., Sochat, V., Forster, J., Lee, S., Twardziok, S. O., Kanitz, A., Wilm, A., Holtgrewe, M., Rahmann, S., & Nahnsen, S. _Sustainable data analysis with Snakemake_. F1000Research, 10:33, **2021**. https://doi.org/10.12688/f1000research.29032.2. 63 | 64 | ## TODO 65 | 66 | - Replace `<owner>` and `<repo>` everywhere in the template with the correct user name/organization, and the repository name. The workflow will be automatically added to the [snakemake workflow catalog](https://snakemake.github.io/snakemake-workflow-catalog/index.html) once it is publicly available on GitHub. 67 | - Replace `<name>` with the workflow name (can be the same as `<repo>`). 68 | - Replace `<description>` with a description of what the workflow does. 69 | - Update the [deployment](#deployment-options), [authors](#authors) and [references](#references) sections. 70 | - Update the `README.md` badges. Add or remove badges for `conda`/`singularity`/`apptainer` usage depending on the workflow's [deployment](#deployment-options) options. 71 | - Do not forget to also adjust the configuration-specific `config/README.md` file. 
72 | -------------------------------------------------------------------------------- /config/README.md: -------------------------------------------------------------------------------- 1 | ## Workflow overview 2 | 3 | This workflow is a best-practice workflow for `<description>`. 4 | The workflow is built using [snakemake](https://snakemake.readthedocs.io/en/stable/) and consists of the following steps: 5 | 6 | 1. Download genome reference from NCBI 7 | 2. Validate downloaded genome (`python` script) 8 | 3. Simulate short read sequencing data on the fly (`dwgsim`) 9 | 4. Check quality of input read data (`FastQC`) 10 | 5. Collect statistics from tool output (`MultiQC`) 11 | 12 | ## Running the workflow 13 | 14 | ### Input data 15 | 16 | This template workflow creates artificial sequencing data in `*.fastq.gz` format. 17 | It does not contain actual input data. 18 | The simulated input files are nevertheless created based on a mandatory sample sheet whose path is set by the `samplesheet` entry of the `config.yml` file (default: `.test/config/samples.tsv`). 19 | The sample sheet has the following layout: 20 | 21 | | sample | condition | replicate | read1 | read2 | 22 | | ------- | --------- | --------- | -------------------------- | -------------------------- | 23 | | sample1 | wild_type | 1 | sample1.bwa.read1.fastq.gz | sample1.bwa.read2.fastq.gz | 24 | | sample2 | wild_type | 2 | sample2.bwa.read1.fastq.gz | sample2.bwa.read2.fastq.gz | 25 | 26 | ### Parameters 27 | 28 | This table lists all parameters that can be used to run the workflow. 29 | 30 | | parameter | type | details | default | 31 | | ------------------ | ---- | ------------------------------------- | ------------------------------ | 32 | | **samplesheet** | | | | 33 | | path | str | path to samplesheet, mandatory | ".test/config/samples.tsv" | 34 | | **get_genome** | | | | 35 | | ncbi_ftp | str | link to a genome on NCBI's FTP server | link to _S. 
cerevisiae_ genome | 36 | | **simulate_reads** | | | | 37 | | read_length | num | length of target reads in bp | 100 | 38 | | read_number | num | number of total reads to be simulated | 10000 | 39 | -------------------------------------------------------------------------------- /config/config.yml: -------------------------------------------------------------------------------- 1 | samplesheet: ".test/config/samples.tsv" 2 | 3 | get_genome: 4 | ncbi_ftp: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.fna.gz 5 | 6 | simulate_reads: 7 | read_length: 100 8 | read_number: 10000 9 | -------------------------------------------------------------------------------- /config/schemas/config.schema.yml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-07/schema#" 2 | description: main configuration schema for the workflow 3 | properties: 4 | samplesheet: 5 | type: string 6 | description: path to sample-sheet TSV file 7 | 8 | get_genome: 9 | type: object 10 | properties: 11 | ncbi_ftp: 12 | type: string 13 | description: URL for genome retrieval 14 | required: ["ncbi_ftp"] 15 | 16 | simulate_reads: 17 | type: object 18 | properties: 19 | read_length: 20 | type: number 21 | description: length of target reads in bp 22 | read_number: 23 | type: number 24 | description: number of total reads to be simulated 25 | 26 | required: 27 | - samplesheet 28 | - get_genome 29 | - simulate_reads 30 | -------------------------------------------------------------------------------- /config/schemas/samples.schema.yml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-07/schema#" 2 | description: entries for the sample sheet 3 | properties: 4 | sample: 5 | type: string 6 | description: sample name/identifier 7 | condition: 8 | type: string 9 | description: sample condition that will be 
compared during differential analysis 10 | replicate: 11 | type: integer 12 | default: 1 13 | description: consecutive numbers representing multiple replicates of one condition 14 | read1: 15 | type: string 16 | description: names of fastq.gz files, read 1 17 | read2: 18 | type: string 19 | description: names of fastq.gz files, read 2 (optional) 20 | 21 | required: 22 | - sample 23 | - condition 24 | - replicate 25 | - read1 26 | -------------------------------------------------------------------------------- /workflow/Snakefile: -------------------------------------------------------------------------------- 1 | # Main entrypoint of the workflow. 2 | # Please follow the best practices: 3 | # https://snakemake.readthedocs.io/en/stable/snakefiles/best_practices.html, 4 | # in particular regarding the standardized folder structure mentioned there. 5 | 6 | 7 | # load configuration 8 | # ----------------------------------------------------- 9 | configfile: "config/config.yml" 10 | 11 | 12 | # load rules 13 | # ----------------------------------------------------- 14 | include: "rules/common.smk" 15 | include: "rules/process_reads.smk" 16 | 17 | 18 | # optional messages, log and error handling 19 | # ----------------------------------------------------- 20 | onstart: 21 | print("\n--- Analysis started ---\n") 22 | 23 | 24 | onsuccess: 25 | print("\n--- Workflow finished! ---\n") 26 | 27 | 28 | onerror: 29 | print("\n--- An error occurred! 
---\n") 30 | 31 | 32 | # target rules 33 | # ----------------------------------------------------- 34 | rule all: 35 | input: 36 | "results/multiqc/multiqc_report.html", 37 | default_target: True 38 | -------------------------------------------------------------------------------- /workflow/envs/get_genome.yml: -------------------------------------------------------------------------------- 1 | name: get_genome 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - nodefaults 6 | dependencies: 7 | - gzip=1.14 8 | - wget=1.21.4 9 | -------------------------------------------------------------------------------- /workflow/envs/simulate_reads.yml: -------------------------------------------------------------------------------- 1 | name: simulate_reads 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - nodefaults 6 | dependencies: 7 | - dwgsim=1.1.14 8 | -------------------------------------------------------------------------------- /workflow/envs/validate_genome.yml: -------------------------------------------------------------------------------- 1 | name: validate_genome 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - nodefaults 6 | dependencies: 7 | - python=3.12 8 | - biopython=1.85 9 | -------------------------------------------------------------------------------- /workflow/rules/common.smk: -------------------------------------------------------------------------------- 1 | # import basic packages 2 | import pandas as pd 3 | from snakemake.utils import validate 4 | 5 | 6 | # read sample sheet 7 | samples = ( 8 | pd.read_csv(config["samplesheet"], sep="\t", dtype={"sample": str}) 9 | .set_index("sample", drop=False) 10 | .sort_index() 11 | ) 12 | 13 | 14 | # validate sample sheet and config file 15 | validate(samples, schema="../../config/schemas/samples.schema.yml") 16 | validate(config, schema="../../config/schemas/config.schema.yml") 17 | -------------------------------------------------------------------------------- /workflow/rules/process_reads.smk: 
--------------------------------------------------------------------------------
# ----------------------------------------------------- #
# EXAMPLE WORKFLOW                                      #
# ----------------------------------------------------- #


# fetch genome sequence from NCBI
# -----------------------------------------------------
# Downloads the gzip-compressed FASTA named by config["get_genome"]["ncbi_ftp"]
# next to the declared output and decompresses it in place. All paths in the
# command are derived from `output.fasta`, so moving the output only requires
# editing one line. The URL is shell-quoted (`:q`) because it comes from user
# configuration and may contain characters such as `&` or `?`.
rule get_genome:
    output:
        fasta="results/get_genome/genome.fna",
    conda:
        "../envs/get_genome.yml"
    message:
        """--- Downloading genome sequence."""
    params:
        ncbi_ftp=lookup(within=config, dpath="get_genome/ncbi_ftp"),
    log:
        "results/get_genome/genome.log",
    shell:
        "wget -O {output.fasta}.gz {params.ncbi_ftp:q} > {log} 2>&1 && "
        "gunzip {output.fasta}.gz >> {log} 2>&1"


# validate genome sequence file
# -----------------------------------------------------
# Runs scripts/validate_fasta.py, which parses the downloaded FASTA and
# re-writes it to a new location; the job fails if no record can be parsed.
rule validate_genome:
    input:
        fasta=rules.get_genome.output.fasta,
    output:
        fasta="results/validate_genome/genome.fna",
    conda:
        "../envs/validate_genome.yml"
    message:
        """--- Validating genome sequence file."""
    log:
        "results/validate_genome/genome.log",
    script:
        "../scripts/validate_fasta.py"


# simulate read data using DWGSIM
# -----------------------------------------------------
# DWGSIM takes an output *prefix* and appends the file suffixes itself.
# The prefix is computed by stripping the known read1 suffix from the declared
# output, instead of cutting the path at its first dot: the latter silently
# produces a wrong prefix as soon as a sample name (or any directory in the
# path) contains a `.`.
rule simulate_reads:
    input:
        fasta=rules.validate_genome.output.fasta,
    output:
        multiext(
            "results/simulate_reads/{sample}",
            read1=".bwa.read1.fastq.gz",
            read2=".bwa.read2.fastq.gz",
        ),
    conda:
        "../envs/simulate_reads.yml"
    message:
        """--- Simulating read data with DWGSIM."""
    params:
        output_type=1,
        read_length=lookup(within=config, dpath="simulate_reads/read_length"),
        read_number=lookup(within=config, dpath="simulate_reads/read_number"),
        output_prefix=lambda wildcards, output: output.read1.removesuffix(
            ".bwa.read1.fastq.gz"
        ),
    log:
        "results/simulate_reads/{sample}.log",
    shell:
        "dwgsim "
        " -1 {params.read_length}"
        " -2 {params.read_length}"
        " -N {params.read_number}"
        " -o {params.output_type}"
        " {input.fasta}"
        " {params.output_prefix}"
        " > {log} 2>&1"


# make QC report
# -----------------------------------------------------
# One FastQC job per sample and read file; `{read}` matches the `read1` /
# `read2` part of the simulated file names.
rule fastqc:
    input:
        fastq="results/simulate_reads/{sample}.bwa.{read}.fastq.gz",
    output:
        html="results/fastqc/{sample}.bwa.{read}_fastqc.html",
        zip="results/fastqc/{sample}.bwa.{read}_fastqc.zip",
    params:
        extra="--quiet",
    message:
        """--- Checking fastq files with FastQC."""
    log:
        "results/fastqc/{sample}.bwa.{read}.log",
    threads: 1
    wrapper:
        "v6.0.0/bio/fastqc"


# run multiQC on tool output
# -----------------------------------------------------
# Collects the FastQC html and zip outputs of every sample listed in the
# sample sheet (`samples.index`, defined in rules/common.smk) for both reads.
rule multiqc:
    input:
        expand(
            "results/fastqc/{sample}.bwa.{read}_fastqc.{ext}",
            sample=samples.index,
            read=["read1", "read2"],
            ext=["html", "zip"],
        ),
    output:
        report="results/multiqc/multiqc_report.html",
    params:
        extra="--verbose --dirs",
    message:
        """--- Generating MultiQC report for seq data."""
    log:
        "results/multiqc/multiqc.log",
    wrapper:
        "v6.0.0/bio/multiqc"

--------------------------------------------------------------------------------
/workflow/scripts/validate_fasta.py:
--------------------------------------------------------------------------------
import sys
from Bio import SeqIO

# Send everything written to stderr (summary, error messages, tracebacks) to
# the rule's log file; line buffering keeps the log current if the job dies.
sys.stderr = open(snakemake.log[0], "w", buffering=1)


def validate_fasta(input_fasta, output_fasta):
    """Parse a FASTA file and write its records to a new FASTA file.

    Args:
        input_fasta: path of the FASTA file to read.
        output_fasta: path of the FASTA file to write.

    Raises:
        ValueError: if no sequence record could be parsed from `input_fasta`.
        Exception: any other error raised while reading, parsing or writing;
            the message is written to stderr before the error is re-raised.
    """
    try:
        with open(input_fasta, "r") as fasta_file:
            records = list(SeqIO.parse(fasta_file, "fasta"))
        if not records:
            raise ValueError("FASTA file is empty or improperly formatted.")
        summary = [f"Validated sequence records for {output_fasta}:"]
        summary += [f"{i.name}: {i.description}" for i in records]
        with open(output_fasta, "w") as validated_file:
            SeqIO.write(records, validated_file, "fasta")
        # terminate the last summary line so later log output starts on a new line
        sys.stderr.write("\n".join(summary) + "\n")
    except Exception as e:
        sys.stderr.write(f"Validation failed: {e}\n")
        raise


validate_fasta(snakemake.input["fasta"], snakemake.output["fasta"])

--------------------------------------------------------------------------------