├── .editorconfig ├── .gitattributes ├── .github └── workflows │ ├── conventional-prs.yaml │ ├── main.yml │ └── release-please.yaml ├── .gitignore ├── .gitmodules ├── .snakemake-workflow-catalog.yml ├── .test ├── config_basic │ ├── config.yaml │ ├── samples.tsv │ └── units.tsv ├── config_complex │ ├── config.yaml │ ├── samples.tsv │ └── units.tsv └── ngs-test-data │ └── reads │ ├── a.chr21.1.fq │ ├── a.chr21.2.fq │ ├── a.scerevisiae.1.fq │ ├── a.scerevisiae.2.fq │ ├── b.chr21.1.fq │ ├── b.chr21.2.fq │ ├── b.scerevisiae.1.fq │ ├── b.scerevisiae.2.fq │ ├── c.scerevisiae.1.fq │ └── c.scerevisiae.2.fq ├── CHANGELOG.md ├── LICENSE ├── README.md ├── config ├── README.md ├── config.yaml ├── samples.tsv └── units.tsv └── workflow ├── Snakefile ├── envs ├── biomart.yaml ├── deseq2.yaml ├── gffutils.yaml ├── pandas.yaml └── rseqc.yaml ├── report ├── diffexp.rst ├── ma.rst ├── pca.rst └── workflow.rst ├── rules ├── align.smk ├── common.smk ├── diffexp.smk ├── qc.smk ├── ref.smk └── trim.smk ├── schemas ├── config.schema.yaml ├── samples.schema.yaml └── units.schema.yaml └── scripts ├── common └── __init__.py ├── count-matrix.py ├── deseq2-init.R ├── deseq2.R ├── gene2symbol.R ├── gtf2bed.py └── plot-pca.R /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig is awesome: http://EditorConfig.org 2 | 3 | # top-most EditorConfig file 4 | root = true 5 | 6 | [*] 7 | end_of_line = lf 8 | insert_final_newline = true 9 | charset = utf-8 10 | indent_style = space 11 | indent_size = 4 12 | 13 | [*.{yml,yaml}] 14 | indent_style = space 15 | indent_size = 2 16 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.smk linguist-language=Python 2 | Snakefile linguist-language=Python 3 | .test/* linguist-vendored=false 4 | .test/report.html linguist-generated=true 5 | -------------------------------------------------------------------------------- /.github/workflows/conventional-prs.yaml: -------------------------------------------------------------------------------- 1 | name: "Lint PR for conventional commits: https://www.conventionalcommits.org" 2 | 3 | on: 4 | pull_request_target: 5 | types: 6 | - opened 7 | - reopened 8 | - edited 9 | - synchronize 10 | 11 | jobs: 12 | main: 13 | name: Validate PR title 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: amannn/action-semantic-pull-request@v5 17 | env: 18 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 19 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches_ignore: [] 9 | 10 | jobs: 11 | formatting: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout with submodules 15 | uses: actions/checkout@v3 16 | with: 17 | submodules: recursive 18 | fetch-depth: 0 19 | - name: Formatting 20 | uses: github/super-linter@v5 21 | env: 22 | VALIDATE_ALL_CODEBASE: false 23 | DEFAULT_BRANCH: master 24 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 25 | VALIDATE_SNAKEMAKE_SNAKEFMT: true 26 | linting: 27 | runs-on: ubuntu-latest 28 | steps: 29 | - uses: actions/checkout@v3 30 | - name: Linting 31 | uses: snakemake/snakemake-github-action@v1.22.0 32 | with: 33 | directory: .test 34 | snakefile: workflow/Snakefile 35 | args: "--configfile 
.test/config_complex/config.yaml --lint" 36 | 37 | run-workflow: 38 | runs-on: ubuntu-latest 39 | needs: 40 | - linting 41 | - formatting 42 | steps: 43 | - name: Checkout repository with submodules 44 | uses: actions/checkout@v3 45 | with: 46 | submodules: recursive 47 | - name: Test workflow (basic model, no batch_effects) 48 | uses: snakemake/snakemake-github-action@v1.22.0 49 | with: 50 | directory: .test 51 | snakefile: workflow/Snakefile 52 | args: "--configfile .test/config_basic/config.yaml --use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache" 53 | - name: Test report (basic model, no batch_effects) 54 | uses: snakemake/snakemake-github-action@v1.22.0 55 | with: 56 | directory: .test 57 | snakefile: workflow/Snakefile 58 | args: "--configfile .test/config_basic/config.yaml --report report.zip" 59 | - name: Test workflow (multiple variables_of_interest, include batch_effects) 60 | uses: snakemake/snakemake-github-action@v1.22.0 61 | with: 62 | directory: .test 63 | snakefile: workflow/Snakefile 64 | args: "--configfile .test/config_complex/config.yaml --use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache" 65 | - name: Test report (multiple variables_of_interest, include batch_effects) 66 | uses: snakemake/snakemake-github-action@v1.22.0 67 | with: 68 | directory: .test 69 | snakefile: workflow/Snakefile 70 | args: "--configfile .test/config_complex/config.yaml --report report.zip" 71 | -------------------------------------------------------------------------------- /.github/workflows/release-please.yaml: -------------------------------------------------------------------------------- 1 | name: "release-please, see: https://github.com/marketplace/actions/release-please-action" 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | permissions: 9 | contents: write 10 | pull-requests: write 11 | 12 | jobs: 13 | release-please: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: google-github-actions/release-please-action@v3 17 | with: 18 | release-type: simple 19 | token: ${{ secrets.GITHUB_TOKEN }} 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .test/benchmarks/** 2 | .test/logs/** 3 | .test/results/** 4 | .test/resources/** 5 | .test/.snakemake/** 6 | benchmarks/** 7 | logs/** 8 | resources/** 9 | results/** 10 | .snakemake/** -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule ".test/data"] 2 | path = .test/data 3 | url = https://github.com/snakemake-workflows/ngs-test-data.git 4 | -------------------------------------------------------------------------------- /.snakemake-workflow-catalog.yml: -------------------------------------------------------------------------------- 1 | usage: 2 | software-stack-deployment: 3 | conda: true 4 | report: true -------------------------------------------------------------------------------- /.test/config_basic/config.yaml: -------------------------------------------------------------------------------- 1 | # path or URL to sample sheet (TSV format, columns: sample, condition, ...) 2 | samples: config_basic/samples.tsv 3 | # path or URL to sequencing unit sheet (TSV format, columns: sample, unit, fq1, fq2) 4 | # Units are technical replicates (e.g. lanes, or resequencing of the same biological 5 | # sample). 
6 | units: config_basic/units.tsv 7 | 8 | 9 | ref: 10 | # Ensembl species name 11 | species: saccharomyces_cerevisiae 12 | # Ensembl release 13 | release: 100 14 | # Genome build 15 | build: R64-1-1 16 | 17 | 18 | trimming: 19 | # If you activate trimming by setting this to `True`, you will have to 20 | # specify the respective cutadapt adapter trimming flag for each unit 21 | # in the `units.tsv` file's `adapters` column 22 | activate: True 23 | 24 | mergeReads: 25 | activate: False 26 | 27 | pca: 28 | activate: True 29 | # Per default, a separate PCA plot is generated for each of the 30 | # `variables_of_interest` and the `batch_effects`, coloring according to 31 | # that variables groups. 32 | # If you want PCA plots for further columns in the samples.tsv sheet, you 33 | # can request them under labels as a list, for example: 34 | # - relatively_uninteresting_variable_X 35 | # - possible_batch_effect_Y 36 | labels: 37 | - condition 38 | 39 | diffexp: 40 | # variables where you are interested in whether they have 41 | # an effect on expression levels 42 | variables_of_interest: 43 | condition: 44 | # any fold change will be relative to this factor level 45 | base_level: untreated 46 | batch_effects: "" 47 | # contrasts for the deseq2 results method to determine fold changes 48 | contrasts: 49 | treated-vs-untreated: 50 | # must be one of the variables_of_interest 51 | variable_of_interest: condition 52 | level_of_interest: treated 53 | # The default model includes all interactions among variables_of_interest 54 | # and batch_effects added on. For the example above this implicitly is: 55 | # model: ~condition 56 | # For the default model to be used, simply specify an empty `model: ""` 57 | # With more variables_of_interest or batch_effects, you could introduce different 58 | # assumptions into your model, by specicifying a different model here. 59 | model: ~condition 60 | 61 | params: 62 | cutadapt-pe: "" 63 | cutadapt-se: "" 64 | star: "" 65 | -------------------------------------------------------------------------------- /.test/config_basic/samples.tsv: -------------------------------------------------------------------------------- 1 | sample_name condition 2 | A1 treated 3 | B1 untreated 4 | A2 treated 5 | B2 untreated 6 | -------------------------------------------------------------------------------- /.test/config_basic/units.tsv: -------------------------------------------------------------------------------- 1 | sample_name unit_name fq1 fq2 sra adapters strandedness 2 | A1 1 ngs-test-data/reads/a.scerevisiae.1.fq ngs-test-data/reads/a.scerevisiae.2.fq -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA yes 3 | B1 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA none 4 | A2 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA none 5 | B2 1 ngs-test-data/reads/b.scerevisiae.1.fq ngs-test-data/reads/b.scerevisiae.2.fq -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA reverse 6 | -------------------------------------------------------------------------------- /.test/config_complex/config.yaml: -------------------------------------------------------------------------------- 1 | # path or URL to sample sheet (TSV format, columns: sample, condition, ...) 2 | samples: config_complex/samples.tsv 3 | # path or URL to sequencing unit sheet (TSV format, columns: sample, unit, fq1, fq2) 4 | # Units are technical replicates (e.g. lanes, or resequencing of the same biological 5 | # sample). 
6 | units: config_complex/units.tsv 7 | 8 | 9 | ref: 10 | # Ensembl species name 11 | species: saccharomyces_cerevisiae 12 | # Ensembl release 13 | release: 100 14 | # Genome build 15 | build: R64-1-1 16 | 17 | 18 | trimming: 19 | # If you activate trimming by setting this to `True`, you will have to 20 | # specify the respective cutadapt adapter trimming flag for each unit 21 | # in the `units.tsv` file's `adapters` column 22 | activate: False 23 | 24 | mergeReads: 25 | activate: False 26 | 27 | pca: 28 | activate: True 29 | # Per default, a separate PCA plot is generated for each of the 30 | # `variables_of_interest` and the `batch_effects`, coloring according to 31 | # that variables groups. 32 | # If you want PCA plots for further columns in the samples.tsv sheet, you 33 | # can request them under labels as a list, for example: 34 | # - relatively_uninteresting_variable_X 35 | # - possible_batch_effect_Y 36 | labels: 37 | # columns of sample sheet to use for PCA 38 | - jointly_handled 39 | 40 | diffexp: 41 | # variables where you are interested in whether they have 42 | # an effect on expression levels 43 | variables_of_interest: 44 | treatment_1: 45 | # any fold change will be relative to this factor level 46 | base_level: untreated 47 | treatment_2: 48 | # any fold change will be relative to this factor level 49 | base_level: untreated 50 | batch_effects: 51 | - jointly_handled 52 | # contrasts for the deseq2 results method to determine fold changes 53 | contrasts: 54 | treatment_1_alone: 55 | # must be one of the variables_of_interest 56 | variable_of_interest: treatment_1 57 | # the variable's level to test against the base_level 58 | level_of_interest: treated 59 | treatment_2_alone: 60 | # must be one of the variables_of_interest 61 | variable_of_interest: treatment_2 62 | # the variable's level to test against the base_level 63 | level_of_interest: treated 64 | # Must be a valid expression for option two in the contrasts description 65 | # of ?results in the DESeq2 package. For a more detailed intro, also see: 66 | # https://github.com/tavareshugo/tutorial_DESeq2_contrasts/blob/main/DESeq2_contrasts.md 67 | both_treatments: 'list(c("treatment_1_treated_vs_untreated", "treatment_2_treated_vs_untreated", "treatment_1treated.treatment_2treated"))' 68 | # The default model includes all interactions among variables_of_interest, 69 | # and batch_effects added on. For the example above this implicitly is: 70 | # model: ~jointly_handled + treatment_1 * treatment_2 71 | # For the default model to be used, simply specify an empty `model: ""` below. 
72 | # If you want to introduce different assumptions into your model, you can 73 | # specify a different model to use, for example skipping the interaction: 74 | # model: ~jointly_handled + treatment_1 + treatment_2 75 | model: "" 76 | 77 | params: 78 | cutadapt-pe: "" 79 | cutadapt-se: "" 80 | star: "" 81 | -------------------------------------------------------------------------------- /.test/config_complex/samples.tsv: -------------------------------------------------------------------------------- 1 | sample_name treatment_1 treatment_2 jointly_handled 2 | A1 treated treated 1 3 | A2 treated treated 2 4 | A3 treated untreated 1 5 | A4 treated untreated 2 6 | B1 untreated treated 1 7 | B2 untreated treated 2 8 | B3 untreated untreated 1 9 | B4 untreated untreated 2 10 | -------------------------------------------------------------------------------- /.test/config_complex/units.tsv: -------------------------------------------------------------------------------- 1 | sample_name unit_name fq1 fq2 sra adapters strandedness 2 | A1 1 ngs-test-data/reads/a.scerevisiae.1.fq ngs-test-data/reads/a.scerevisiae.2.fq 3 | A2 1 ngs-test-data/reads/a.scerevisiae.1.fq ngs-test-data/reads/a.scerevisiae.2.fq 4 | A3 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq 5 | A4 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq 6 | B1 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq 7 | B2 1 ngs-test-data/reads/b.scerevisiae.1.fq ngs-test-data/reads/b.scerevisiae.2.fq 8 | B3 1 ngs-test-data/reads/b.scerevisiae.1.fq ngs-test-data/reads/b.scerevisiae.2.fq 9 | B4 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq 10 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [2.1.2](https://github.com/snakemake-workflows/rna-seq-star-deseq2/compare/v2.1.1...v2.1.2) (2024-06-05) 4 | 5 | 6 | ### Bug Fixes 7 | 8 | * use derived input for star_index ([#81](https://github.com/snakemake-workflows/rna-seq-star-deseq2/issues/81)) ([87fffe6](https://github.com/snakemake-workflows/rna-seq-star-deseq2/commit/87fffe6a1beaa86e95c3564061d2720cc73308c7)) 9 | 10 | ## [2.1.1](https://github.com/snakemake-workflows/rna-seq-star-deseq2/compare/v2.1.0...v2.1.1) (2024-03-25) 11 | 12 | 13 | ### Bug Fixes 14 | 15 | * release-please branch to `master` and set permissions ([#79](https://github.com/snakemake-workflows/rna-seq-star-deseq2/issues/79)) ([4b781cf](https://github.com/snakemake-workflows/rna-seq-star-deseq2/commit/4b781cfa14fb5474108594fbaefa0ac8519f19dc)) 16 | * remove unused ftp RemoteProvider and require recent snakemake 8 ([#76](https://github.com/snakemake-workflows/rna-seq-star-deseq2/issues/76)) ([0f18be7](https://github.com/snakemake-workflows/rna-seq-star-deseq2/commit/0f18be7618a8dfb998455edf1da89b7cfb2e1301)) 17 | 18 | 19 | ### Performance Improvements 20 | 21 | * update all wrapper to latest v3.5.3 ([#78](https://github.com/snakemake-workflows/rna-seq-star-deseq2/issues/78)) ([bc9ab71](https://github.com/snakemake-workflows/rna-seq-star-deseq2/commit/bc9ab713f7c11b04bae296a27970aceeb12ab1ae)) 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017, Johannes Köster 4 | 5 | Permission is hereby 
granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Snakemake workflow: rna-seq-star-deseq2 2 | 3 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4737358.svg)](https://doi.org/10.5281/zenodo.4737358) 4 | [![Snakemake](https://img.shields.io/badge/snakemake-≥6.1.0-brightgreen.svg)](https://snakemake.github.io) 5 | [![GitHub actions status](https://github.com/snakemake-workflows/rna-seq-star-deseq2/workflows/Tests/badge.svg?branch=master)](https://github.com/snakemake-workflows/rna-seq-star-deseq2/actions?query=branch%3Amaster+workflow%3ATests) 6 | [![Conventional Commits](https://img.shields.io/badge/Conventional%20Commits-1.0.0-%23FE5196?logo=conventionalcommits&logoColor=white)](https://conventionalcommits.org) 7 | 8 | This workflow performs a differential gene expression analysis with STAR and Deseq2. 9 | 10 | ## Usage 11 | 12 | The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/?usage=snakemake-workflows%2Frna-seq-star-deseq2). 13 | 14 | If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository and its DOI (see above). 15 | -------------------------------------------------------------------------------- /config/README.md: -------------------------------------------------------------------------------- 1 | # General configuration 2 | 3 | To configure this workflow, modify `config/config.yaml` according to your needs, following the explanations provided in the file. 4 | 5 | ## `DESeq2` differential expression analysis configuration 6 | 7 | To successfully run the differential expression analysis, you will need to tell DESeq2 which sample annotations to use (annotations are columns in the `samples.tsv` file described below). 8 | This is done in the `config.yaml` file with the entries under `diffexp:`. 9 | The comments for the entries should give all the necessary infos and linkouts. 10 | But if in doubt, please also consult the [`DESeq2` manual](https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html). 11 | 12 | # Sample and unit setup 13 | 14 | The sample and unit setup is specified via tab-separated tabular files (`.tsv`). 15 | Missing values can be specified by empty columns or by writing `NA`. 
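For orientation, the sketch below shows roughly how the workflow reads and indexes these two sheets (adapted from `workflow/rules/common.smk`); the default paths from `config/config.yaml` are used purely for illustration.

```python
import pandas as pd

# sample sheet, indexed by sample_name (mirrors workflow/rules/common.smk)
samples = (
    pd.read_csv("config/samples.tsv", sep="\t", dtype={"sample_name": str})
    .set_index("sample_name", drop=False)
    .sort_index()
)

# unit sheet, indexed by (sample_name, unit_name);
# empty fq2/sra/adapters cells simply become NaN
units = (
    pd.read_csv(
        "config/units.tsv", sep="\t", dtype={"sample_name": str, "unit_name": str}
    )
    .set_index(["sample_name", "unit_name"], drop=False)
    .sort_index()
)
```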
16 | 17 | ## sample sheet 18 | 19 | The default sample sheet is `config/samples.tsv` (as configured in `config/config.yaml`). 20 | Each sample refers to an actual physical sample, and replicates (both biological and technical) may be specified as separate samples. 21 | For each sample, you will always have to specify a `sample_name`. 22 | In addition, all `variables_of_interest` and `batch_effects` specified in the `config/config.yaml` under the `diffexp:` entry, will have to have corresponding columns in the `config/samples.tsv`. 23 | Finally, the sample sheet can contain any number of additional columns. 24 | So if in doubt about whether you might at some point need some metadata you already have at hand, just put it into the sample sheet already---your future self will thank you. 25 | 26 | ## unit sheet 27 | 28 | The default unit sheet is `config/units.tsv` (as configured in `config/config.yaml`). 29 | For each sample, add one or more sequencing units (for example if you have several runs or lanes per sample). 30 | 31 | ### `.fastq` file source 32 | 33 | For each unit, you will have to define a source for your `.fastq` files. 34 | This can be done via the columns `fq1`, `fq2` and `sra`, with either of: 35 | 1. A single `.fastq` file for single-end reads (`fq1` column only; `fq2` and `sra` columns present, but empty). 36 | The entry can be any path on your system, but we suggest something like a `raw/` data directory within your analysis directory. 37 | 2. Two `.fastq` files for paired-end reads (columns `fq1` and `fq2`; column `sra` present, but empty). 38 | As for the `fq1` column, the `fq2` column can also point to anywhere on your system. 39 | 3. A sequence read archive (SRA) accession number (`sra` column only; `fq1` and `fq2` columns present, but empty). 40 | The workflow will automatically download the corresponding `.fastq` data (currently assumed to be paired-end). 41 | The accession numbers usually start with SRR or ERR and you can find accession numbers for studies of interest with the [SRA Run Selector](https://trace.ncbi.nlm.nih.gov/Traces/study/). 42 | If both local files and an SRA accession are specified for the same unit, the local files will be used. 43 | 44 | ### adapter trimming 45 | 46 | If you set `trimming: activate:` in the `config/config.yaml` to `True`, you will have to provide at least one `cutadapt` adapter argument for each unit in the `adapters` column of the `units.tsv` file. 47 | You will need to find out the adapters used in the sequencing protocol that generated a unit: from your sequencing provider, or for published data from the study's metadata (or its authors). 48 | Then, enter the adapter sequences into the `adapters` column of that unit, preceded by the [correct `cutadapt` adapter argument](https://cutadapt.readthedocs.io/en/stable/guide.html#adapter-types). 49 | 50 | ### strandedness of library preparation protocol 51 | 52 | To get the correct `geneCounts` from `STAR` output, you can provide information on the strandedness of the library preparation protocol used for a unit. 53 | `STAR` can produce counts for unstranded (`none` - this is the default), forward oriented (`yes`) and reverse oriented (`reverse`) protocols. 54 | Enter the respective value into a `strandedness` column in the `units.tsv` file. 
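As a minimal illustration of how this value is used downstream: the workflow's `workflow/scripts/count-matrix.py` selects the matching count column from STAR's `ReadsPerGene.out.tab` based on the `strandedness` entry, roughly as sketched here (the file path is just a placeholder).

```python
import pandas as pd

def get_count_column(strandedness):
    # STAR's ReadsPerGene.out.tab: column 1 = unstranded counts,
    # column 2 = forward-stranded ("yes"), column 3 = reverse-stranded ("reverse")
    if pd.isnull(strandedness) or strandedness == "none":
        return 1
    if strandedness == "yes":
        return 2
    if strandedness == "reverse":
        return 3
    raise ValueError(f"unsupported strandedness value: {strandedness!r}")

# keep only the gene ids and the column matching this unit's protocol;
# the first four lines of the STAR output are summary statistics
counts = pd.read_table(
    "ReadsPerGene.out.tab",  # placeholder path to one unit's STAR output
    index_col=0,
    usecols=[0, get_count_column("reverse")],
    header=None,
    skiprows=4,
)
```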
55 | -------------------------------------------------------------------------------- /config/config.yaml: -------------------------------------------------------------------------------- 1 | # path or URL to sample sheet (TSV format, columns: sample, condition, ...) 2 | samples: config/samples.tsv 3 | # path or URL to sequencing unit sheet (TSV format, columns: sample, unit, fq1, fq2) 4 | # Units are technical replicates (e.g. lanes, or resequencing of the same biological 5 | # sample). 6 | units: config/units.tsv 7 | 8 | 9 | ref: 10 | # Ensembl species name 11 | species: homo_sapiens 12 | # Ensembl release (make sure to take one where snpeff data is available, check 'snpEff databases' output) 13 | release: 100 14 | # Genome build 15 | build: GRCh38 16 | 17 | trimming: 18 | # If you activate trimming by setting this to `True`, you will have to 19 | # specify the respective cutadapt adapter trimming flag for each unit 20 | # in the `units.tsv` file's `adapters` column 21 | activate: False 22 | 23 | pca: 24 | activate: True 25 | # Per default, a separate PCA plot is generated for each of the 26 | # `variables_of_interest` and the `batch_effects`, coloring according to 27 | # that variables groups. 28 | # If you want PCA plots for further columns in the samples.tsv sheet, you 29 | # can request them under labels as a list, for example: 30 | # - relatively_uninteresting_variable_X 31 | # - possible_batch_effect_Y 32 | labels: "" 33 | 34 | diffexp: 35 | # variables for whome you are interested in whether they have an effect on 36 | # expression levels 37 | variables_of_interest: 38 | treatment_1: 39 | # any fold change will be relative to this factor level 40 | base_level: untreated 41 | treatment_2: 42 | # any fold change will be relative to this factor level 43 | base_level: untreated 44 | # variables whose effect you want to model to separate them from your 45 | # variables_of_interest 46 | batch_effects: 47 | - jointly_handled 48 | # contrasts for the deseq2 results method to determine fold changes 49 | contrasts: 50 | treatment_1: 51 | # must be one of the variables_of_interest, for details see: 52 | # https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#contrasts 53 | variable_of_interest: treatment_1 54 | # must be a level present in the variable_of_interest that is not the 55 | # base_level specified above 56 | level_of_interest: treated 57 | # The default model includes all interactions among variables_of_interest 58 | # and batch_effects added on. For the example above this implicitly is: 59 | # model: ~jointly_handled + treatment_1 * treatment_2 60 | # For the default model to be used, simply specify an empty `model: ""` below. 
61 | # If you want to introduce different assumptions into your model, you can 62 | # specify a different model to use, for example skipping the interaction: 63 | # model: ~jointly_handled + treatment_1 + treatment_2 64 | model: "" 65 | 66 | 67 | params: 68 | cutadapt-pe: "" 69 | cutadapt-se: "" 70 | star: "" 71 | -------------------------------------------------------------------------------- /config/samples.tsv: -------------------------------------------------------------------------------- 1 | sample_name treatment_1 treatment_2 jointly_handled 2 | A untreated untreated 1 3 | B untreated treated 1 4 | C treated untreated 1 5 | D treated untreated 2 6 | E treated treated 2 7 | -------------------------------------------------------------------------------- /config/units.tsv: -------------------------------------------------------------------------------- 1 | sample_name unit_name fq1 fq2 sra adapters strandedness 2 | A lane1 A.1.fq.gz A.2.fq.gz 3 | A lane2 A2.1.fq.gz A2.2.fq.gz 4 | B lane1 B.1.fq.gz B.2.fq.gz 5 | C lane1 C.1.fq.gz C.2.fq.gz 6 | D lane1 D.1.fq.gz D.2.fq.gz 7 | E lane1 E.1.fq.gz E.2.fq.gz 8 | -------------------------------------------------------------------------------- /workflow/Snakefile: -------------------------------------------------------------------------------- 1 | from snakemake.utils import min_version 2 | 3 | ##### set minimum snakemake version ##### 4 | min_version("8.8.0") 5 | 6 | 7 | ##### setup report ##### 8 | configfile: "config/config.yaml" 9 | 10 | 11 | report: "report/workflow.rst" 12 | 13 | 14 | ##### setup singularity ##### 15 | 16 | 17 | # this container defines the underlying OS for each job when using the workflow 18 | # with --use-conda --use-singularity 19 | container: "docker://continuumio/miniconda3" 20 | 21 | 22 | ##### load rules ##### 23 | 24 | 25 | include: "rules/common.smk" 26 | include: "rules/ref.smk" 27 | include: "rules/trim.smk" 28 | include: "rules/qc.smk" 29 | include: "rules/align.smk" 30 | include: "rules/diffexp.smk" 31 | 32 | 33 | ##### target rules ##### 34 | 35 | 36 | rule all: 37 | input: 38 | get_final_output(), 39 | -------------------------------------------------------------------------------- /workflow/envs/biomart.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - bioconductor-biomart =2.56 7 | - r-tidyverse =2.0 8 | # remove once we can update to bioconductor-biomart of the 3.18 release, which will 9 | # include this proper fix for the underlying compatibility issue: 10 | # https://github.com/Bioconductor/BiocFileCache/pull/50 11 | - r-dbplyr=2.3.4 12 | -------------------------------------------------------------------------------- /workflow/envs/deseq2.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - bioconductor-deseq2 =1.38 7 | - r-stringr =1.5 8 | - r-ashr =2.2_54 9 | -------------------------------------------------------------------------------- /workflow/envs/gffutils.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - gffutils =0.12 7 | -------------------------------------------------------------------------------- /workflow/envs/pandas.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 
| - conda-forge 3 | - nodefaults 4 | dependencies: 5 | - pandas =1.5 6 | -------------------------------------------------------------------------------- /workflow/envs/rseqc.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - rseqc =5.0 7 | -------------------------------------------------------------------------------- /workflow/report/diffexp.rst: -------------------------------------------------------------------------------- 1 | Table of differential expression results per gene, calculated with DESeq2. 2 | -------------------------------------------------------------------------------- /workflow/report/ma.rst: -------------------------------------------------------------------------------- 1 | `MA plot `_ of log fold change vs. mean of normalized counts for each gene when calculating differential expression for contrast {{ snakemake.wildcards.contrast }}. 2 | -------------------------------------------------------------------------------- /workflow/report/pca.rst: -------------------------------------------------------------------------------- 1 | Principal component analysis over the normalized counts of all genes. 2 | -------------------------------------------------------------------------------- /workflow/report/workflow.rst: -------------------------------------------------------------------------------- 1 | This workflow performs differential expression analysis on single- or paired-end RNA-seq data. 2 | After adapter removal with `Cutadapt `_, reads were mapped and gene counts were generated with `STAR `_. 3 | Gene counts of replicates were summed up. 4 | Integrated normalization and differential expression analysis was conducted with `DESeq2 `_ following standard procedure as outlined in the manual.
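A toy sketch of the replicate handling mentioned above (the real implementation is `workflow/scripts/count-matrix.py`): count columns that carry the same sample name, i.e. technical replicates/units, are summed into one column per sample.

```python
import pandas as pd

# toy count matrix: sample "A" was sequenced in two units, sample "B" in one
counts = pd.DataFrame(
    [[10, 12, 7], [0, 1, 5], [3, 2, 4]],
    index=["gene1", "gene2", "gene3"],
    columns=["A", "A", "B"],  # both units of "A" carry the same sample name
)

# collapse technical replicates by summing columns with identical sample names,
# as done in workflow/scripts/count-matrix.py (pandas 1.5 is pinned in
# workflow/envs/pandas.yaml; newer pandas versions deprecate groupby(axis=1))
collapsed = counts.groupby(counts.columns, axis=1, sort=False).sum()
print(collapsed)  # one column per sample: A = 22, 1, 5 per gene; B unchanged
```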
5 | -------------------------------------------------------------------------------- /workflow/rules/align.smk: -------------------------------------------------------------------------------- 1 | rule align: 2 | input: 3 | unpack(get_fq), 4 | index="resources/star_genome", 5 | gtf="resources/genome.gtf", 6 | output: 7 | aln="results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam", 8 | reads_per_gene="results/star/{sample}_{unit}/ReadsPerGene.out.tab", 9 | log: 10 | "logs/star/{sample}_{unit}.log", 11 | params: 12 | idx=lambda wc, input: input.index, 13 | extra=lambda wc, input: f'--outSAMtype BAM SortedByCoordinate --quantMode GeneCounts --sjdbGTFfile {input.gtf} {config["params"]["star"]}', 14 | threads: 24 15 | wrapper: 16 | "v3.5.3/bio/star/align" 17 | -------------------------------------------------------------------------------- /workflow/rules/common.smk: -------------------------------------------------------------------------------- 1 | import glob 2 | 3 | import pandas as pd 4 | from snakemake.utils import validate 5 | 6 | validate(config, schema="../schemas/config.schema.yaml") 7 | 8 | samples = ( 9 | pd.read_csv(config["samples"], sep="\t", dtype={"sample_name": str}) 10 | .set_index("sample_name", drop=False) 11 | .sort_index() 12 | ) 13 | 14 | 15 | def get_final_output(): 16 | final_output = expand( 17 | "results/diffexp/{contrast}.diffexp.symbol.tsv", 18 | contrast=config["diffexp"]["contrasts"], 19 | ) 20 | final_output.append("results/deseq2/normcounts.symbol.tsv") 21 | final_output.append("results/counts/all.symbol.tsv") 22 | final_output.append("results/qc/multiqc_report.html") 23 | 24 | if config["pca"]["activate"]: 25 | # get all the variables to plot a PCA for 26 | pca_variables = list(config["diffexp"]["variables_of_interest"]) 27 | if config["diffexp"]["batch_effects"]: 28 | pca_variables.extend(config["diffexp"]["batch_effects"]) 29 | if config["pca"]["labels"]: 30 | pca_variables.extend(config["pca"]["labels"]) 31 | final_output.extend( 32 | expand("results/pca.{variable}.svg", variable=pca_variables) 33 | ) 34 | return final_output 35 | 36 | 37 | validate(samples, schema="../schemas/samples.schema.yaml") 38 | 39 | units = ( 40 | pd.read_csv(config["units"], sep="\t", dtype={"sample_name": str, "unit_name": str}) 41 | .set_index(["sample_name", "unit_name"], drop=False) 42 | .sort_index() 43 | ) 44 | validate(units, schema="../schemas/units.schema.yaml") 45 | 46 | 47 | wildcard_constraints: 48 | sample="|".join(samples["sample_name"]), 49 | unit="|".join(units["unit_name"]), 50 | 51 | 52 | def get_cutadapt_input(wildcards): 53 | unit = units.loc[wildcards.sample].loc[wildcards.unit] 54 | 55 | if pd.isna(unit["fq1"]): 56 | # SRA sample (always paired-end for now) 57 | accession = unit["sra"] 58 | return expand("sra/{accession}_{read}.fastq", accession=accession, read=[1, 2]) 59 | 60 | if unit["fq1"].endswith("gz"): 61 | ending = ".gz" 62 | else: 63 | ending = "" 64 | 65 | if pd.isna(unit["fq2"]): 66 | # single end local sample 67 | return "pipe/cutadapt/{S}/{U}.fq1.fastq{E}".format( 68 | S=unit.sample_name, U=unit.unit_name, E=ending 69 | ) 70 | else: 71 | # paired end local sample 72 | return expand( 73 | "pipe/cutadapt/{S}/{U}.{{read}}.fastq{E}".format( 74 | S=unit.sample_name, U=unit.unit_name, E=ending 75 | ), 76 | read=["fq1", "fq2"], 77 | ) 78 | 79 | 80 | def get_cutadapt_pipe_input(wildcards): 81 | files = list( 82 | sorted(glob.glob(units.loc[wildcards.sample].loc[wildcards.unit, wildcards.fq])) 83 | ) 84 | assert len(files) > 0 85 | return files 86 | 
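# Note on the helper below: a unit is treated as paired-end if it has a
# non-empty `fq2` or `sra` entry, and all units of a sample must be uniformly
# paired-end or uniformly single-end; mixed layouts fail the assertion.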
87 | 88 | def is_paired_end(sample): 89 | sample_units = units.loc[sample] 90 | fq2_null = sample_units["fq2"].isnull() 91 | sra_null = sample_units["sra"].isnull() 92 | paired = ~fq2_null | ~sra_null 93 | all_paired = paired.all() 94 | all_single = (~paired).all() 95 | assert ( 96 | all_single or all_paired 97 | ), "invalid units for sample {}, must be all paired end or all single end".format( 98 | sample 99 | ) 100 | return all_paired 101 | 102 | 103 | def get_fq(wildcards): 104 | if config["trimming"]["activate"]: 105 | # activated trimming, use trimmed data 106 | if is_paired_end(wildcards.sample): 107 | # paired-end sample 108 | return dict( 109 | zip( 110 | ["fq1", "fq2"], 111 | expand( 112 | "results/trimmed/{sample}_{unit}_{group}.fastq.gz", 113 | group=["R1", "R2"], 114 | **wildcards, 115 | ), 116 | ) 117 | ) 118 | # single end sample 119 | return { 120 | "fq1": "results/trimmed/{sample}_{unit}_single.fastq.gz".format(**wildcards) 121 | } 122 | else: 123 | # no trimming, use raw reads 124 | u = units.loc[(wildcards.sample, wildcards.unit)] 125 | if pd.isna(u["fq1"]): 126 | # SRA sample (always paired-end for now) 127 | accession = u["sra"] 128 | return dict( 129 | zip( 130 | ["fq1", "fq2"], 131 | expand( 132 | "sra/{accession}_{group}.fastq", 133 | accession=accession, 134 | group=["R1", "R2"], 135 | ), 136 | ) 137 | ) 138 | if not is_paired_end(wildcards.sample): 139 | return {"fq1": f"{u.fq1}"} 140 | else: 141 | return {"fq1": f"{u.fq1}", "fq2": f"{u.fq2}"} 142 | 143 | 144 | def get_strandedness(units): 145 | if "strandedness" in units.columns: 146 | return units["strandedness"].tolist() 147 | else: 148 | strand_list = ["none"] 149 | return strand_list * units.shape[0] 150 | 151 | 152 | def get_deseq2_threads(wildcards=None): 153 | # https://twitter.com/mikelove/status/918770188568363008 154 | few_coeffs = False if wildcards is None else len(get_contrast(wildcards)) < 10 155 | return 1 if len(samples) < 100 or few_coeffs else 6 156 | 157 | 158 | def is_activated(xpath): 159 | c = config 160 | for entry in xpath.split("/"): 161 | c = c.get(entry, {}) 162 | return bool(c.get("activate", False)) 163 | 164 | 165 | def get_bioc_species_name(): 166 | first_letter = config["ref"]["species"][0] 167 | subspecies = config["ref"]["species"].split("_")[1] 168 | return first_letter + subspecies 169 | 170 | 171 | def get_fastqs(wc): 172 | if config["trimming"]["activate"]: 173 | return expand( 174 | "results/trimmed/{sample}/{unit}_{read}.fastq.gz", 175 | unit=units.loc[wc.sample, "unit_name"], 176 | sample=wc.sample, 177 | read=wc.read, 178 | ) 179 | unit = units.loc[wc.sample] 180 | if all(pd.isna(unit["fq1"])): 181 | # SRA sample (always paired-end for now) 182 | accession = unit["sra"] 183 | return expand( 184 | "sra/{accession}_{read}.fastq", accession=accession, read=wc.read[-1] 185 | ) 186 | fq = "fq{}".format(wc.read[-1]) 187 | return units.loc[wc.sample, fq].tolist() 188 | 189 | 190 | def get_contrast(wildcards): 191 | return config["diffexp"]["contrasts"][wildcards.contrast] 192 | -------------------------------------------------------------------------------- /workflow/rules/diffexp.smk: -------------------------------------------------------------------------------- 1 | rule count_matrix: 2 | input: 3 | expand( 4 | "results/star/{unit.sample_name}_{unit.unit_name}/ReadsPerGene.out.tab", 5 | unit=units.itertuples(), 6 | ), 7 | output: 8 | "results/counts/all.tsv", 9 | log: 10 | "logs/count-matrix.log", 11 | params: 12 | samples=units["sample_name"].tolist(), 13 | 
strand=get_strandedness(units), 14 | conda: 15 | "../envs/pandas.yaml" 16 | script: 17 | "../scripts/count-matrix.py" 18 | 19 | 20 | rule gene_2_symbol: 21 | input: 22 | counts="{prefix}.tsv", 23 | output: 24 | symbol="{prefix}.symbol.tsv", 25 | params: 26 | species=get_bioc_species_name(), 27 | log: 28 | "logs/gene2symbol/{prefix}.log", 29 | conda: 30 | "../envs/biomart.yaml" 31 | script: 32 | "../scripts/gene2symbol.R" 33 | 34 | 35 | rule deseq2_init: 36 | input: 37 | counts="results/counts/all.tsv", 38 | output: 39 | "results/deseq2/all.rds", 40 | "results/deseq2/normcounts.tsv", 41 | conda: 42 | "../envs/deseq2.yaml" 43 | log: 44 | "logs/deseq2/init.log", 45 | threads: get_deseq2_threads() 46 | script: 47 | "../scripts/deseq2-init.R" 48 | 49 | 50 | rule pca: 51 | input: 52 | "results/deseq2/all.rds", 53 | output: 54 | report("results/pca.{variable}.svg", "../report/pca.rst"), 55 | conda: 56 | "../envs/deseq2.yaml" 57 | log: 58 | "logs/pca.{variable}.log", 59 | script: 60 | "../scripts/plot-pca.R" 61 | 62 | 63 | rule deseq2: 64 | input: 65 | "results/deseq2/all.rds", 66 | output: 67 | table=report("results/diffexp/{contrast}.diffexp.tsv", "../report/diffexp.rst"), 68 | ma_plot=report("results/diffexp/{contrast}.ma-plot.svg", "../report/ma.rst"), 69 | params: 70 | contrast=get_contrast, 71 | conda: 72 | "../envs/deseq2.yaml" 73 | log: 74 | "logs/deseq2/{contrast}.diffexp.log", 75 | threads: get_deseq2_threads() 76 | script: 77 | "../scripts/deseq2.R" 78 | -------------------------------------------------------------------------------- /workflow/rules/qc.smk: -------------------------------------------------------------------------------- 1 | ## RSEQC 2 | 3 | 4 | rule rseqc_gtf2bed: 5 | input: 6 | "resources/genome.gtf", 7 | output: 8 | bed="results/qc/rseqc/annotation.bed", 9 | db=temp("results/qc/rseqc/annotation.db"), 10 | log: 11 | "logs/rseqc_gtf2bed.log", 12 | conda: 13 | "../envs/gffutils.yaml" 14 | script: 15 | "../scripts/gtf2bed.py" 16 | 17 | 18 | rule rseqc_junction_annotation: 19 | input: 20 | bam="results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam", 21 | bed="results/qc/rseqc/annotation.bed", 22 | output: 23 | "results/qc/rseqc/{sample}_{unit}.junctionanno.junction.bed", 24 | priority: 1 25 | log: 26 | "logs/rseqc/rseqc_junction_annotation/{sample}_{unit}.log", 27 | params: 28 | extra=r"-q 255", # STAR uses 255 as a score for unique mappers 29 | prefix=lambda w, output: output[0].replace(".junction.bed", ""), 30 | conda: 31 | "../envs/rseqc.yaml" 32 | shell: 33 | "junction_annotation.py {params.extra} -i {input.bam} -r {input.bed} -o {params.prefix} " 34 | "> {log[0]} 2>&1" 35 | 36 | 37 | rule rseqc_junction_saturation: 38 | input: 39 | bam="results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam", 40 | bed="results/qc/rseqc/annotation.bed", 41 | output: 42 | "results/qc/rseqc/{sample}_{unit}.junctionsat.junctionSaturation_plot.pdf", 43 | priority: 1 44 | log: 45 | "logs/rseqc/rseqc_junction_saturation/{sample}_{unit}.log", 46 | params: 47 | extra=r"-q 255", 48 | prefix=lambda w, output: output[0].replace(".junctionSaturation_plot.pdf", ""), 49 | conda: 50 | "../envs/rseqc.yaml" 51 | shell: 52 | "junction_saturation.py {params.extra} -i {input.bam} -r {input.bed} -o {params.prefix} " 53 | "> {log} 2>&1" 54 | 55 | 56 | rule rseqc_stat: 57 | input: 58 | "results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam", 59 | output: 60 | "results/qc/rseqc/{sample}_{unit}.stats.txt", 61 | priority: 1 62 | log: 63 | "logs/rseqc/rseqc_stat/{sample}_{unit}.log", 64 | conda: 65 
| "../envs/rseqc.yaml" 66 | shell: 67 | "bam_stat.py -i {input} > {output} 2> {log}" 68 | 69 | 70 | rule rseqc_infer: 71 | input: 72 | bam="results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam", 73 | bed="results/qc/rseqc/annotation.bed", 74 | output: 75 | "results/qc/rseqc/{sample}_{unit}.infer_experiment.txt", 76 | priority: 1 77 | log: 78 | "logs/rseqc/rseqc_infer/{sample}_{unit}.log", 79 | conda: 80 | "../envs/rseqc.yaml" 81 | shell: 82 | "infer_experiment.py -r {input.bed} -i {input.bam} > {output} 2> {log}" 83 | 84 | 85 | rule rseqc_innerdis: 86 | input: 87 | bam="results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam", 88 | bed="results/qc/rseqc/annotation.bed", 89 | output: 90 | "results/qc/rseqc/{sample}_{unit}.inner_distance_freq.inner_distance.txt", 91 | priority: 1 92 | log: 93 | "logs/rseqc/rseqc_innerdis/{sample}_{unit}.log", 94 | params: 95 | prefix=lambda w, output: output[0].replace(".inner_distance.txt", ""), 96 | conda: 97 | "../envs/rseqc.yaml" 98 | shell: 99 | "inner_distance.py -r {input.bed} -i {input.bam} -o {params.prefix} > {log} 2>&1" 100 | 101 | 102 | rule rseqc_readdis: 103 | input: 104 | bam="results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam", 105 | bed="results/qc/rseqc/annotation.bed", 106 | output: 107 | "results/qc/rseqc/{sample}_{unit}.readdistribution.txt", 108 | priority: 1 109 | log: 110 | "logs/rseqc/rseqc_readdis/{sample}_{unit}.log", 111 | conda: 112 | "../envs/rseqc.yaml" 113 | shell: 114 | "read_distribution.py -r {input.bed} -i {input.bam} > {output} 2> {log}" 115 | 116 | 117 | rule rseqc_readdup: 118 | input: 119 | "results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam", 120 | output: 121 | "results/qc/rseqc/{sample}_{unit}.readdup.DupRate_plot.pdf", 122 | priority: 1 123 | log: 124 | "logs/rseqc/rseqc_readdup/{sample}_{unit}.log", 125 | params: 126 | prefix=lambda w, output: output[0].replace(".DupRate_plot.pdf", ""), 127 | conda: 128 | "../envs/rseqc.yaml" 129 | shell: 130 | "read_duplication.py -i {input} -o {params.prefix} > {log} 2>&1" 131 | 132 | 133 | rule rseqc_readgc: 134 | input: 135 | "results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam", 136 | output: 137 | "results/qc/rseqc/{sample}_{unit}.readgc.GC_plot.pdf", 138 | priority: 1 139 | log: 140 | "logs/rseqc/rseqc_readgc/{sample}_{unit}.log", 141 | params: 142 | prefix=lambda w, output: output[0].replace(".GC_plot.pdf", ""), 143 | conda: 144 | "../envs/rseqc.yaml" 145 | shell: 146 | "read_GC.py -i {input} -o {params.prefix} > {log} 2>&1" 147 | 148 | 149 | rule multiqc: 150 | input: 151 | expand( 152 | "results/star/{unit.sample_name}_{unit.unit_name}/Aligned.sortedByCoord.out.bam", 153 | unit=units.itertuples(), 154 | ), 155 | expand( 156 | "results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.junctionanno.junction.bed", 157 | unit=units.itertuples(), 158 | ), 159 | expand( 160 | "results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.junctionsat.junctionSaturation_plot.pdf", 161 | unit=units.itertuples(), 162 | ), 163 | expand( 164 | "results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.infer_experiment.txt", 165 | unit=units.itertuples(), 166 | ), 167 | expand( 168 | "results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.stats.txt", 169 | unit=units.itertuples(), 170 | ), 171 | expand( 172 | "results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.inner_distance_freq.inner_distance.txt", 173 | unit=units.itertuples(), 174 | ), 175 | expand( 176 | "results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.readdistribution.txt", 177 | unit=units.itertuples(), 178 | 
), 179 | expand( 180 | "results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.readdup.DupRate_plot.pdf", 181 | unit=units.itertuples(), 182 | ), 183 | expand( 184 | "results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.readgc.GC_plot.pdf", 185 | unit=units.itertuples(), 186 | ), 187 | expand( 188 | "logs/rseqc/rseqc_junction_annotation/{unit.sample_name}_{unit.unit_name}.log", 189 | unit=units.itertuples(), 190 | ), 191 | output: 192 | "results/qc/multiqc_report.html", 193 | log: 194 | "logs/multiqc.log", 195 | wrapper: 196 | "v3.5.3/bio/multiqc" 197 | -------------------------------------------------------------------------------- /workflow/rules/ref.smk: -------------------------------------------------------------------------------- 1 | rule get_genome: 2 | output: 3 | "resources/genome.fasta", 4 | log: 5 | "logs/get-genome.log", 6 | params: 7 | species=config["ref"]["species"], 8 | datatype="dna", 9 | build=config["ref"]["build"], 10 | release=config["ref"]["release"], 11 | cache: True 12 | wrapper: 13 | "v3.5.3/bio/reference/ensembl-sequence" 14 | 15 | 16 | rule get_annotation: 17 | output: 18 | "resources/genome.gtf", 19 | params: 20 | species=config["ref"]["species"], 21 | fmt="gtf", 22 | build=config["ref"]["build"], 23 | release=config["ref"]["release"], 24 | flavor="", 25 | cache: True 26 | log: 27 | "logs/get_annotation.log", 28 | wrapper: 29 | "v3.5.3/bio/reference/ensembl-annotation" 30 | 31 | 32 | rule genome_faidx: 33 | input: 34 | "resources/genome.fasta", 35 | output: 36 | "resources/genome.fasta.fai", 37 | log: 38 | "logs/genome-faidx.log", 39 | cache: True 40 | wrapper: 41 | "v3.5.3/bio/samtools/faidx" 42 | 43 | 44 | rule bwa_index: 45 | input: 46 | "resources/genome.fasta", 47 | output: 48 | multiext("resources/genome.fasta", ".amb", ".ann", ".bwt", ".pac", ".sa"), 49 | log: 50 | "logs/bwa_index.log", 51 | resources: 52 | mem_mb=369000, 53 | cache: True 54 | wrapper: 55 | "v3.5.3/bio/bwa/index" 56 | 57 | 58 | rule star_index: 59 | input: 60 | fasta="resources/genome.fasta", 61 | annotation="resources/genome.gtf", 62 | output: 63 | directory("resources/star_genome"), 64 | threads: 4 65 | params: 66 | extra=lambda wc, input: f"--sjdbGTFfile {input.annotation} --sjdbOverhang 100", 67 | log: 68 | "logs/star_index_genome.log", 69 | cache: True 70 | wrapper: 71 | "v3.5.3/bio/star/index" 72 | -------------------------------------------------------------------------------- /workflow/rules/trim.smk: -------------------------------------------------------------------------------- 1 | rule get_sra: 2 | output: 3 | "sra/{accession}_1.fastq", 4 | "sra/{accession}_2.fastq", 5 | log: 6 | "logs/get-sra/{accession}.log", 7 | wrapper: 8 | "v3.5.3/bio/sra-tools/fasterq-dump" 9 | 10 | 11 | rule cutadapt_pipe: 12 | input: 13 | get_cutadapt_pipe_input, 14 | output: 15 | pipe("pipe/cutadapt/{sample}/{unit}.{fq}.{ext}"), 16 | log: 17 | "logs/pipe-fastqs/catadapt/{sample}_{unit}.{fq}.{ext}.log", 18 | wildcard_constraints: 19 | ext=r"fastq|fastq\.gz", 20 | threads: 0 21 | shell: 22 | "cat {input} > {output} 2> {log}" 23 | 24 | 25 | rule cutadapt_pe: 26 | input: 27 | get_cutadapt_input, 28 | output: 29 | fastq1="results/trimmed/{sample}_{unit}_R1.fastq.gz", 30 | fastq2="results/trimmed/{sample}_{unit}_R2.fastq.gz", 31 | qc="results/trimmed/{sample}_{unit}.paired.qc.txt", 32 | log: 33 | "logs/cutadapt/{sample}_{unit}.log", 34 | params: 35 | extra=config["params"]["cutadapt-pe"], 36 | adapters=lambda w: str(units.loc[w.sample].loc[w.unit, "adapters"]), 37 | threads: 8 38 | wrapper: 39 | 
"v3.5.3/bio/cutadapt/pe" 40 | 41 | 42 | rule cutadapt_se: 43 | input: 44 | get_cutadapt_input, 45 | output: 46 | fastq="results/trimmed/{sample}_{unit}_single.fastq.gz", 47 | qc="results/trimmed/{sample}_{unit}_single.qc.txt", 48 | log: 49 | "logs/cutadapt/{sample}_{unit}.log", 50 | params: 51 | extra=config["params"]["cutadapt-se"], 52 | adapters=lambda w: str(units.loc[w.sample].loc[w.unit, "adapters"]), 53 | threads: 8 54 | wrapper: 55 | "v3.5.3/bio/cutadapt/se" 56 | -------------------------------------------------------------------------------- /workflow/schemas/config.schema.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-06/schema#" 2 | 3 | description: snakemake configuration file 4 | 5 | type: object 6 | 7 | properties: 8 | samples: 9 | type: string 10 | units: 11 | type: string 12 | 13 | ref: 14 | type: object 15 | properties: 16 | species: 17 | type: string 18 | release: 19 | type: integer 20 | build: 21 | type: string 22 | required: 23 | - species 24 | - release 25 | - build 26 | 27 | trimming: 28 | type: object 29 | properties: 30 | activate: 31 | type: boolean 32 | required: 33 | - activate 34 | 35 | pca: 36 | type: object 37 | properties: 38 | activate: 39 | type: boolean 40 | labels: 41 | type: 42 | - array 43 | - string 44 | items: 45 | type: string 46 | required: 47 | - activate 48 | 49 | diffexp: 50 | type: object 51 | properties: 52 | contrasts: 53 | type: object 54 | model: 55 | type: string 56 | required: 57 | - contrasts 58 | 59 | params: 60 | type: object 61 | properties: 62 | cutadapt-pe: 63 | type: string 64 | cutadapt-se: 65 | type: string 66 | star: 67 | type: string 68 | required: 69 | - cutadapt-pe 70 | - cutadapt-se 71 | - star 72 | 73 | required: 74 | - samples 75 | - units 76 | - ref 77 | - pca 78 | - diffexp 79 | - params 80 | - trimming 81 | -------------------------------------------------------------------------------- /workflow/schemas/samples.schema.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-06/schema#" 2 | 3 | description: an entry in the sample sheet 4 | properties: 5 | sample_name: 6 | type: string 7 | description: sample name/identifier 8 | 9 | required: 10 | - sample_name 11 | -------------------------------------------------------------------------------- /workflow/schemas/units.schema.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | description: row of the units.tsv, representing a sequencing unit, i.e. 
single-end or paired-end data 3 | type: object 4 | properties: 5 | sample_name: 6 | type: string 7 | description: sample name/id the unit has been sequenced from 8 | unit_name: 9 | type: string 10 | description: unit id 11 | fq1: 12 | type: string 13 | description: path to FASTQ file 14 | fq2: 15 | type: string 16 | description: path to second FASTQ file (leave empty in case of single-end) 17 | sra: 18 | type: string 19 | description: SRA id for automatic download of unit 20 | adapters: 21 | type: string 22 | description: adapter trimming settings to use (for cutadapt) 23 | strandedness: 24 | type: string 25 | description: one of the values 'none', 'yes' or 'reverse' according to protocol strandedness 26 | 27 | required: 28 | - sample_name 29 | - unit_name 30 | -------------------------------------------------------------------------------- /workflow/scripts/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Any Python script in the scripts folder will be able to import from this module and beyond. 2 | -------------------------------------------------------------------------------- /workflow/scripts/count-matrix.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | # logging 4 | sys.stderr = open(snakemake.log[0], "w") 5 | 6 | import pandas as pd 7 | 8 | 9 | def get_column(strandedness): 10 | if pd.isnull(strandedness) or strandedness == "none": 11 | return 1 # non stranded protocol 12 | elif strandedness == "yes": 13 | return 2 # 3rd column 14 | elif strandedness == "reverse": 15 | return 3 # 4th column, usually for Illumina truseq 16 | else: 17 | raise ValueError( 18 | ( 19 | "'strandedness' column should be empty or have the " 20 | "value 'none', 'yes' or 'reverse', instead has the " 21 | "value {}" 22 | ).format(repr(strandedness)) 23 | ) 24 | 25 | 26 | counts = [ 27 | pd.read_table( 28 | f, index_col=0, usecols=[0, get_column(strandedness)], header=None, skiprows=4 29 | ) 30 | for f, strandedness in zip(snakemake.input, snakemake.params.strand) 31 | ] 32 | 33 | for t, sample in zip(counts, snakemake.params.samples): 34 | t.columns = [sample] 35 | 36 | matrix = pd.concat(counts, axis=1) 37 | matrix.index.name = "gene" 38 | # collapse technical replicates 39 | matrix = matrix.groupby(matrix.columns, axis=1, sort=False).sum() 40 | matrix.to_csv(snakemake.output[0], sep="\t") 41 | -------------------------------------------------------------------------------- /workflow/scripts/deseq2-init.R: -------------------------------------------------------------------------------- 1 | log <- file(snakemake@log[[1]], open = "wt") 2 | sink(log) 3 | sink(log, type="message") 4 | 5 | library(stringr) 6 | library("DESeq2") 7 | 8 | parallel <- FALSE 9 | if (snakemake@threads > 1) { 10 | library("BiocParallel") 11 | # setup parallelization 12 | register(MulticoreParam(snakemake@threads)) 13 | parallel <- TRUE 14 | } 15 | 16 | counts_data <- read.table( 17 | snakemake@input[["counts"]], 18 | header = TRUE, 19 | row.names = "gene", 20 | check.names = FALSE 21 | ) 22 | counts_data <- counts_data[, order(names(counts_data))] 23 | 24 | col_data <- read.table( 25 | snakemake@config[["samples"]], 26 | header = TRUE, 27 | row.names = "sample_name", 28 | check.names = FALSE 29 | ) 30 | col_data <- col_data[order(row.names(col_data)), , drop = FALSE] 31 | 32 | # properly set the base level to the configuration in config.yaml, avoiding 33 | # the default behaviour of choosing the alphabetical minimum level 
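# For example, with `variables_of_interest: condition: base_level: untreated`
# in the config, the loop below relevels col_data$condition so that "untreated"
# becomes the reference level and all reported fold changes are relative to it.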
34 | for (vof in names(snakemake@config[["diffexp"]][["variables_of_interest"]])) { 35 | snakemake@config[["diffexp"]][["variables_of_interest"]][[vof]] 36 | base_level <- snakemake@config[["diffexp"]][["variables_of_interest"]][[vof]][["base_level"]] 37 | col_data[[vof]] <- relevel( 38 | factor(col_data[[vof]]), base_level 39 | ) 40 | } 41 | 42 | # properly turn all batch effects into factors, even if they are numeric 43 | batch_effects <- snakemake@config[["diffexp"]][["batch_effects"]] 44 | for (effect in batch_effects) { 45 | if (str_length(effect) > 0) { 46 | col_data[[effect]] <- factor(col_data[[effect]]) 47 | } 48 | } 49 | 50 | # build up formula with additive batch_effects and all interactions between the 51 | # variables_of_interes 52 | 53 | design_formula <- snakemake@config[["diffexp"]][["model"]] 54 | 55 | if (str_length(design_formula) == 0) { 56 | batch_effects <- str_flatten(batch_effects, " + ") 57 | if (str_length(batch_effects) > 0) { 58 | batch_effects <- str_c(batch_effects, " + ") 59 | } 60 | vof_interactions <- str_flatten( 61 | names(snakemake@config[["diffexp"]][["variables_of_interest"]]), 62 | " * " 63 | ) 64 | design_formula <- str_c("~", batch_effects, vof_interactions) 65 | } 66 | 67 | dds <- DESeqDataSetFromMatrix( 68 | countData = counts_data, 69 | colData = col_data, 70 | design = as.formula(design_formula) 71 | ) 72 | 73 | # remove uninformative columns 74 | dds <- dds[rowSums(counts(dds)) > 1, ] 75 | # normalization and preprocessing 76 | dds <- DESeq(dds, parallel = parallel) 77 | 78 | # Write dds object as RDS 79 | saveRDS(dds, file = snakemake@output[[1]]) 80 | # Write normalized counts 81 | norm_counts <- counts(dds, normalized = TRUE) 82 | write.table( 83 | data.frame( 84 | "gene" = rownames(norm_counts), 85 | norm_counts 86 | ), 87 | file = snakemake@output[[2]], 88 | sep = "\t", 89 | row.names = FALSE 90 | ) 91 | -------------------------------------------------------------------------------- /workflow/scripts/deseq2.R: -------------------------------------------------------------------------------- 1 | log <- file(snakemake@log[[1]], open = "wt") 2 | sink(log) 3 | sink(log, type = "message") 4 | 5 | library("cli") 6 | library("DESeq2") 7 | 8 | parallel <- FALSE 9 | if (snakemake@threads > 1) { 10 | library("BiocParallel") 11 | # setup parallelization 12 | register(MulticoreParam(snakemake@threads)) 13 | parallel <- TRUE 14 | } 15 | 16 | dds <- readRDS(snakemake@input[[1]]) 17 | 18 | contrast_config <- snakemake@config[["diffexp"]][["contrasts"]][[ 19 | snakemake@wildcards[["contrast"]] 20 | ]] 21 | 22 | # basic case of contrast specification, see: 23 | # https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#contrasts 24 | if (length(contrast_config) == 2 && typeof(contrast_config) == "list") { 25 | if ( 26 | # check for existence contrast's variable_of_interest to 27 | # provide useful error message 28 | !(contrast_config[["variable_of_interest"]] %in% 29 | names(snakemake@config[["diffexp"]][["variables_of_interest"]]) 30 | ) 31 | ) { 32 | cli_abort( 33 | c( 34 | "config.yaml: All variable_of_interest entries under `diffexp: contrasts:`", 35 | " " = "must also exist under `diffexp: variables_of_interest:`.", 36 | "x" = "Could not find variable_of_interest: {contrast_config[['variable_of_interest']]}", 37 | " " = "It was not among the `diffexp: variables_of_interest:`", 38 | " " = "{names(snakemake@config[['diffexp']][['variables_of_interest']])}", 39 | "i" = "Are there any typos in the contrasts' 
    contrast <- c(
        contrast_config[["variable_of_interest"]],
        contrast_config[["level_of_interest"]],
        snakemake@config[["diffexp"]][["variables_of_interest"]][[
            contrast_config[["variable_of_interest"]]
        ]][["base_level"]]
    )
# more complex contrast specification via list(c(), c()), see the ?results docs of
# the DESeq2 package and this tutorial (plus the linked seqanswers thread):
# https://github.com/tavareshugo/tutorial_DESeq2_contrasts/blob/main/DESeq2_contrasts.md
} else if (
    length(contrast_config) == 1 &&
        typeof(contrast_config) == "character"
) {
    # the configured string is evaluated as R code, so it can specify any
    # contrast that results() accepts
    contrast <- eval(parse(text = contrast_config))
}

res <- results(
    dds,
    contrast = contrast,
    parallel = parallel
)
# shrink fold changes for lowly expressed genes
# use ashr, so that we can keep using `contrast`, as conversion to a coef is not trivial, see:
# https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#extended-section-on-shrinkage-estimators
res <- lfcShrink(
    dds,
    contrast = contrast,
    res = res,
    type = "ashr"
)

# sort by adjusted p-value
res <- res[order(res$padj), ]
# TODO explore IHW usage


# store results
svg(snakemake@output[["ma_plot"]])
plotMA(res, ylim = c(-2, 2))
dev.off()

write.table(
    data.frame(
        "gene" = rownames(res),
        res
    ),
    file = snakemake@output[["table"]],
    row.names = FALSE,
    sep = "\t"
)
--------------------------------------------------------------------------------
/workflow/scripts/gene2symbol.R:
--------------------------------------------------------------------------------
library(biomaRt)
library(tidyverse)
# useful error messages upon aborting
library("cli")

# this variable holds a mirror name until
# useEnsembl succeeds ("www" is last, because
# of very frequent "Internal Server Error"s)
mart <- "useast"
rounds <- 0
while (class(mart)[[1]] != "Mart") {
    mart <- tryCatch(
        {
            # done here, because the error function does not
            # modify outer scope variables (I tried)
            if (mart == "www") rounds <- rounds + 1
            # equivalent to useMart, but you can choose
            # the mirror instead of specifying a host
            biomaRt::useEnsembl(
                biomart = "ENSEMBL_MART_ENSEMBL",
                dataset = str_c(snakemake@params[["species"]], "_gene_ensembl"),
                mirror = mart
            )
        },
        error = function(e) {
            # change or make configurable if you want more or
            # fewer rounds of tries of all the mirrors
            if (rounds >= 3) {
                cli_abort(
                    str_c(
                        "Have tried all 4 available Ensembl biomaRt mirrors ",
                        rounds,
                        " times. You might have a connection problem, or no mirror is responsive.\n",
                        "The last error message was:\n",
                        conditionMessage(e)
                    )
                )
            }
            # hop to next mirror
            mart <- switch(mart,
                useast = "uswest",
                uswest = "asia",
                asia = "www",
                www = {
                    # wait before starting another round through the mirrors,
                    # hoping that intermittent problems disappear
                    Sys.sleep(30)
                    "useast"
                }
            )
        }
    )
}


df <- read.table(snakemake@input[["counts"]], sep = "\t", header = TRUE)

g2g <- biomaRt::getBM(
    attributes = c(
        "ensembl_gene_id",
        "external_gene_name"
    ),
    filters = "ensembl_gene_id",
    values = df$gene,
    mart = mart
)

# use the gene symbol where available, otherwise keep the Ensembl gene id
annotated <- merge(df, g2g, by.x = "gene", by.y = "ensembl_gene_id")
annotated$gene <- ifelse(annotated$external_gene_name == "", annotated$gene, annotated$external_gene_name)
annotated$external_gene_name <- NULL
write.table(annotated, snakemake@output[["symbol"]], sep = "\t", row.names = FALSE)
--------------------------------------------------------------------------------
/workflow/scripts/gtf2bed.py:
--------------------------------------------------------------------------------
import gffutils

db = gffutils.create_db(
    snakemake.input[0],
    dbfn=snakemake.output.db,
    force=True,
    keep_order=True,
    merge_strategy="merge",
    sort_attribute_values=True,
    disable_infer_genes=True,
    disable_infer_transcripts=True,
)

with open(snakemake.output.bed, "w") as outfileobj:
    for tx in db.features_of_type("transcript", order_by="start"):
        bed = [s.strip() for s in db.bed12(tx).split("\t")]
        bed[3] = tx.id
        outfileobj.write("{}\n".format("\t".join(bed)))
--------------------------------------------------------------------------------
/workflow/scripts/plot-pca.R:
--------------------------------------------------------------------------------
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

library("DESeq2")

# load deseq2 data
dds <- readRDS(snakemake@input[[1]])

# obtain regularized log-transformed counts
counts <- rlog(dds, blind = FALSE)
svg(snakemake@output[[1]])
plotPCA(counts, intgroup = snakemake@wildcards[["variable"]])
dev.off()
--------------------------------------------------------------------------------