├── .editorconfig ├── .gitattributes ├── .github └── workflows │ ├── conventional-prs.yaml │ ├── main.yml │ └── release-please.yaml ├── .gitignore ├── .gitmodules ├── .snakemake-workflow-catalog.yml ├── .test ├── config_basic │ ├── config.yaml │ ├── samples.tsv │ └── units.tsv ├── config_complex │ ├── config.yaml │ ├── samples.tsv │ └── units.tsv └── ngs-test-data │ └── reads │ ├── a.chr21.1.fq │ ├── a.chr21.2.fq │ ├── a.scerevisiae.1.fq │ ├── a.scerevisiae.2.fq │ ├── b.chr21.1.fq │ ├── b.chr21.2.fq │ ├── b.scerevisiae.1.fq │ ├── b.scerevisiae.2.fq │ ├── c.scerevisiae.1.fq │ └── c.scerevisiae.2.fq ├── CHANGELOG.md ├── LICENSE ├── README.md ├── config ├── README.md ├── config.yaml ├── samples.tsv └── units.tsv └── workflow ├── Snakefile ├── envs ├── biomart.yaml ├── deseq2.yaml ├── gffutils.yaml ├── pandas.yaml └── rseqc.yaml ├── report ├── diffexp.rst ├── ma.rst ├── pca.rst └── workflow.rst ├── rules ├── align.smk ├── common.smk ├── diffexp.smk ├── qc.smk ├── ref.smk └── trim.smk ├── schemas ├── config.schema.yaml ├── samples.schema.yaml └── units.schema.yaml └── scripts ├── common └── __init__.py ├── count-matrix.py ├── deseq2-init.R ├── deseq2.R ├── gene2symbol.R ├── gtf2bed.py └── plot-pca.R /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig is awesome: http://EditorConfig.org 2 | 3 | # top-most EditorConfig file 4 | root = true 5 | 6 | [*] 7 | end_of_line = lf 8 | insert_final_newline = true 9 | charset = utf-8 10 | indent_style = space 11 | indent_size = 4 12 | 13 | [*.{yml,yaml}] 14 | indent_style = space 15 | indent_size = 2 16 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.smk linguist-language=Python 2 | Snakefile linguist-language=Python 3 | .test/* linguist-vendored=false 4 | .test/report.html linguist-generated=true 5 | -------------------------------------------------------------------------------- /.github/workflows/conventional-prs.yaml: -------------------------------------------------------------------------------- 1 | name: "Lint PR for conventional commits: https://www.conventionalcommits.org" 2 | 3 | on: 4 | pull_request_target: 5 | types: 6 | - opened 7 | - reopened 8 | - edited 9 | - synchronize 10 | 11 | jobs: 12 | main: 13 | name: Validate PR title 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: amannn/action-semantic-pull-request@v5 17 | env: 18 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 19 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches_ignore: [] 9 | 10 | jobs: 11 | formatting: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout with submodules 15 | uses: actions/checkout@v3 16 | with: 17 | submodules: recursive 18 | fetch-depth: 0 19 | - name: Formatting 20 | uses: github/super-linter@v5 21 | env: 22 | VALIDATE_ALL_CODEBASE: false 23 | DEFAULT_BRANCH: master 24 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 25 | VALIDATE_SNAKEMAKE_SNAKEFMT: true 26 | linting: 27 | runs-on: ubuntu-latest 28 | steps: 29 | - uses: actions/checkout@v3 30 | - name: Linting 31 | uses: snakemake/snakemake-github-action@v1.22.0 32 | with: 33 | directory: .test 34 | snakefile: workflow/Snakefile 35 | args: "--configfile 
.test/config_complex/config.yaml --lint" 36 | 37 | run-workflow: 38 | runs-on: ubuntu-latest 39 | needs: 40 | - linting 41 | - formatting 42 | steps: 43 | - name: Checkout repository with submodules 44 | uses: actions/checkout@v3 45 | with: 46 | submodules: recursive 47 | - name: Test workflow (basic model, no batch_effects) 48 | uses: snakemake/snakemake-github-action@v1.22.0 49 | with: 50 | directory: .test 51 | snakefile: workflow/Snakefile 52 | args: "--configfile .test/config_basic/config.yaml --use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache" 53 | - name: Test report (basic model, no batch_effects) 54 | uses: snakemake/snakemake-github-action@v1.22.0 55 | with: 56 | directory: .test 57 | snakefile: workflow/Snakefile 58 | args: "--configfile .test/config_basic/config.yaml --report report.zip" 59 | - name: Test workflow (multiple variables_of_interest, include batch_effects) 60 | uses: snakemake/snakemake-github-action@v1.22.0 61 | with: 62 | directory: .test 63 | snakefile: workflow/Snakefile 64 | args: "--configfile .test/config_complex/config.yaml --use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache" 65 | - name: Test report (multiple variables_of_interest, include batch_effects) 66 | uses: snakemake/snakemake-github-action@v1.22.0 67 | with: 68 | directory: .test 69 | snakefile: workflow/Snakefile 70 | args: "--configfile .test/config_complex/config.yaml --report report.zip" 71 | -------------------------------------------------------------------------------- /.github/workflows/release-please.yaml: -------------------------------------------------------------------------------- 1 | name: "release-please, see: https://github.com/marketplace/actions/release-please-action" 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | permissions: 9 | contents: write 10 | pull-requests: write 11 | 12 | jobs: 13 | release-please: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: google-github-actions/release-please-action@v3 17 | with: 18 | release-type: simple 19 | token: ${{ secrets.GITHUB_TOKEN }} 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .test/benchmarks/** 2 | .test/logs/** 3 | .test/results/** 4 | .test/resources/** 5 | .test/.snakemake/** 6 | benchmarks/** 7 | logs/** 8 | resources/** 9 | results/** 10 | .snakemake/** -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule ".test/data"] 2 | path = .test/data 3 | url = https://github.com/snakemake-workflows/ngs-test-data.git 4 | -------------------------------------------------------------------------------- /.snakemake-workflow-catalog.yml: -------------------------------------------------------------------------------- 1 | usage: 2 | software-stack-deployment: 3 | conda: true 4 | report: true -------------------------------------------------------------------------------- /.test/config_basic/config.yaml: -------------------------------------------------------------------------------- 1 | # path or URL to sample sheet (TSV format, columns: sample, condition, ...) 2 | samples: config_basic/samples.tsv 3 | # path or URL to sequencing unit sheet (TSV format, columns: sample, unit, fq1, fq2) 4 | # Units are technical replicates (e.g. lanes, or resequencing of the same biological 5 | # sample). 
6 | units: config_basic/units.tsv 7 | 8 | 9 | ref: 10 | # Ensembl species name 11 | species: saccharomyces_cerevisiae 12 | # Ensembl release 13 | release: 100 14 | # Genome build 15 | build: R64-1-1 16 | 17 | 18 | trimming: 19 | # If you activate trimming by setting this to `True`, you will have to 20 | # specify the respective cutadapt adapter trimming flag for each unit 21 | # in the `units.tsv` file's `adapters` column 22 | activate: True 23 | 24 | mergeReads: 25 | activate: False 26 | 27 | pca: 28 | activate: True 29 | # Per default, a separate PCA plot is generated for each of the 30 | # `variables_of_interest` and the `batch_effects`, coloring according to 31 | # that variables groups. 32 | # If you want PCA plots for further columns in the samples.tsv sheet, you 33 | # can request them under labels as a list, for example: 34 | # - relatively_uninteresting_variable_X 35 | # - possible_batch_effect_Y 36 | labels: 37 | - condition 38 | 39 | diffexp: 40 | # variables where you are interested in whether they have 41 | # an effect on expression levels 42 | variables_of_interest: 43 | condition: 44 | # any fold change will be relative to this factor level 45 | base_level: untreated 46 | batch_effects: "" 47 | # contrasts for the deseq2 results method to determine fold changes 48 | contrasts: 49 | treated-vs-untreated: 50 | # must be one of the variables_of_interest 51 | variable_of_interest: condition 52 | level_of_interest: treated 53 | # The default model includes all interactions among variables_of_interest 54 | # and batch_effects added on. For the example above this implicitly is: 55 | # model: ~condition 56 | # For the default model to be used, simply specify an empty `model: ""` 57 | # With more variables_of_interest or batch_effects, you could introduce different 58 | # assumptions into your model, by specicifying a different model here. 59 | model: ~condition 60 | 61 | params: 62 | cutadapt-pe: "" 63 | cutadapt-se: "" 64 | star: "" 65 | -------------------------------------------------------------------------------- /.test/config_basic/samples.tsv: -------------------------------------------------------------------------------- 1 | sample_name condition 2 | A1 treated 3 | B1 untreated 4 | A2 treated 5 | B2 untreated 6 | -------------------------------------------------------------------------------- /.test/config_basic/units.tsv: -------------------------------------------------------------------------------- 1 | sample_name unit_name fq1 fq2 sra adapters strandedness 2 | A1 1 ngs-test-data/reads/a.scerevisiae.1.fq ngs-test-data/reads/a.scerevisiae.2.fq -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA yes 3 | B1 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA none 4 | A2 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA none 5 | B2 1 ngs-test-data/reads/b.scerevisiae.1.fq ngs-test-data/reads/b.scerevisiae.2.fq -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA reverse 6 | -------------------------------------------------------------------------------- /.test/config_complex/config.yaml: -------------------------------------------------------------------------------- 1 | # path or URL to sample sheet (TSV format, columns: sample, condition, ...) 2 | samples: config_complex/samples.tsv 3 | # path or URL to sequencing unit sheet (TSV format, columns: sample, unit, fq1, fq2) 4 | # Units are technical replicates (e.g. lanes, or resequencing of the same biological 5 | # sample). 
6 | units: config_complex/units.tsv 7 | 8 | 9 | ref: 10 | # Ensembl species name 11 | species: saccharomyces_cerevisiae 12 | # Ensembl release 13 | release: 100 14 | # Genome build 15 | build: R64-1-1 16 | 17 | 18 | trimming: 19 | # If you activate trimming by setting this to `True`, you will have to 20 | # specify the respective cutadapt adapter trimming flag for each unit 21 | # in the `units.tsv` file's `adapters` column 22 | activate: False 23 | 24 | mergeReads: 25 | activate: False 26 | 27 | pca: 28 | activate: True 29 | # Per default, a separate PCA plot is generated for each of the 30 | # `variables_of_interest` and the `batch_effects`, coloring according to 31 | # that variables groups. 32 | # If you want PCA plots for further columns in the samples.tsv sheet, you 33 | # can request them under labels as a list, for example: 34 | # - relatively_uninteresting_variable_X 35 | # - possible_batch_effect_Y 36 | labels: 37 | # columns of sample sheet to use for PCA 38 | - jointly_handled 39 | 40 | diffexp: 41 | # variables where you are interested in whether they have 42 | # an effect on expression levels 43 | variables_of_interest: 44 | treatment_1: 45 | # any fold change will be relative to this factor level 46 | base_level: untreated 47 | treatment_2: 48 | # any fold change will be relative to this factor level 49 | base_level: untreated 50 | batch_effects: 51 | - jointly_handled 52 | # contrasts for the deseq2 results method to determine fold changes 53 | contrasts: 54 | treatment_1_alone: 55 | # must be one of the variables_of_interest 56 | variable_of_interest: treatment_1 57 | # the variable's level to test against the base_level 58 | level_of_interest: treated 59 | treatment_2_alone: 60 | # must be one of the variables_of_interest 61 | variable_of_interest: treatment_2 62 | # the variable's level to test against the base_level 63 | level_of_interest: treated 64 | # Must be a valid expression for option two in the contrasts description 65 | # of ?results in the DESeq2 package. For a more detailed intro, also see: 66 | # https://github.com/tavareshugo/tutorial_DESeq2_contrasts/blob/main/DESeq2_contrasts.md 67 | both_treatments: 'list(c("treatment_1_treated_vs_untreated", "treatment_2_treated_vs_untreated", "treatment_1treated.treatment_2treated"))' 68 | # The default model includes all interactions among variables_of_interest, 69 | # and batch_effects added on. For the example above this implicitly is: 70 | # model: ~jointly_handled + treatment_1 * treatment_2 71 | # For the default model to be used, simply specify an empty `model: ""` below. 
72 | # If you want to introduce different assumptions into your model, you can 73 | # specify a different model to use, for example skipping the interaction: 74 | # model: ~jointly_handled + treatment_1 + treatment_2 75 | model: "" 76 | 77 | params: 78 | cutadapt-pe: "" 79 | cutadapt-se: "" 80 | star: "" 81 | -------------------------------------------------------------------------------- /.test/config_complex/samples.tsv: -------------------------------------------------------------------------------- 1 | sample_name treatment_1 treatment_2 jointly_handled 2 | A1 treated treated 1 3 | A2 treated treated 2 4 | A3 treated untreated 1 5 | A4 treated untreated 2 6 | B1 untreated treated 1 7 | B2 untreated treated 2 8 | B3 untreated untreated 1 9 | B4 untreated untreated 2 10 | -------------------------------------------------------------------------------- /.test/config_complex/units.tsv: -------------------------------------------------------------------------------- 1 | sample_name unit_name fq1 fq2 sra adapters strandedness 2 | A1 1 ngs-test-data/reads/a.scerevisiae.1.fq ngs-test-data/reads/a.scerevisiae.2.fq 3 | A2 1 ngs-test-data/reads/a.scerevisiae.1.fq ngs-test-data/reads/a.scerevisiae.2.fq 4 | A3 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq 5 | A4 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq 6 | B1 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq 7 | B2 1 ngs-test-data/reads/b.scerevisiae.1.fq ngs-test-data/reads/b.scerevisiae.2.fq 8 | B3 1 ngs-test-data/reads/b.scerevisiae.1.fq ngs-test-data/reads/b.scerevisiae.2.fq 9 | B4 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq 10 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [2.1.2](https://github.com/snakemake-workflows/rna-seq-star-deseq2/compare/v2.1.1...v2.1.2) (2024-06-05) 4 | 5 | 6 | ### Bug Fixes 7 | 8 | * use derived input for star_index ([#81](https://github.com/snakemake-workflows/rna-seq-star-deseq2/issues/81)) ([87fffe6](https://github.com/snakemake-workflows/rna-seq-star-deseq2/commit/87fffe6a1beaa86e95c3564061d2720cc73308c7)) 9 | 10 | ## [2.1.1](https://github.com/snakemake-workflows/rna-seq-star-deseq2/compare/v2.1.0...v2.1.1) (2024-03-25) 11 | 12 | 13 | ### Bug Fixes 14 | 15 | * release-please branch to `master` and set permissions ([#79](https://github.com/snakemake-workflows/rna-seq-star-deseq2/issues/79)) ([4b781cf](https://github.com/snakemake-workflows/rna-seq-star-deseq2/commit/4b781cfa14fb5474108594fbaefa0ac8519f19dc)) 16 | * remove unused ftp RemoteProvider and require recent snakemake 8 ([#76](https://github.com/snakemake-workflows/rna-seq-star-deseq2/issues/76)) ([0f18be7](https://github.com/snakemake-workflows/rna-seq-star-deseq2/commit/0f18be7618a8dfb998455edf1da89b7cfb2e1301)) 17 | 18 | 19 | ### Performance Improvements 20 | 21 | * update all wrapper to latest v3.5.3 ([#78](https://github.com/snakemake-workflows/rna-seq-star-deseq2/issues/78)) ([bc9ab71](https://github.com/snakemake-workflows/rna-seq-star-deseq2/commit/bc9ab713f7c11b04bae296a27970aceeb12ab1ae)) 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017, Johannes Köster 4 | 5 | Permission is hereby 
granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Snakemake workflow: rna-seq-star-deseq2 2 | 3 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4737358.svg)](https://doi.org/10.5281/zenodo.4737358) 4 | [![Snakemake](https://img.shields.io/badge/snakemake-≥6.1.0-brightgreen.svg)](https://snakemake.github.io) 5 | [![GitHub actions status](https://github.com/snakemake-workflows/rna-seq-star-deseq2/workflows/Tests/badge.svg?branch=master)](https://github.com/snakemake-workflows/rna-seq-star-deseq2/actions?query=branch%3Amaster+workflow%3ATests) 6 | [![Conventional Commits](https://img.shields.io/badge/Conventional%20Commits-1.0.0-%23FE5196?logo=conventionalcommits&logoColor=white)](https://conventionalcommits.org) 7 | 8 | This workflow performs a differential gene expression analysis with STAR and Deseq2. 9 | 10 | ## Usage 11 | 12 | The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/?usage=snakemake-workflows%2Frna-seq-star-deseq2). 13 | 14 | If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository and its DOI (see above). 15 | -------------------------------------------------------------------------------- /config/README.md: -------------------------------------------------------------------------------- 1 | # General configuration 2 | 3 | To configure this workflow, modify `config/config.yaml` according to your needs, following the explanations provided in the file. 4 | 5 | ## `DESeq2` differential expression analysis configuration 6 | 7 | To successfully run the differential expression analysis, you will need to tell DESeq2 which sample annotations to use (annotations are columns in the `samples.tsv` file described below). 8 | This is done in the `config.yaml` file with the entries under `diffexp:`. 9 | The comments for the entries should give all the necessary infos and linkouts. 10 | But if in doubt, please also consult the [`DESeq2` manual](https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html). 11 | 12 | # Sample and unit setup 13 | 14 | The sample and unit setup is specified via tab-separated tabular files (`.tsv`). 15 | Missing values can be specified by empty columns or by writing `NA`. 
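For orientation, the sketch below shows roughly how the workflow reads and indexes these two sheets (adapted from `workflow/rules/common.smk`); the default paths from `config/config.yaml` are used purely for illustration.

```python
import pandas as pd

# sample sheet, indexed by sample_name (mirrors workflow/rules/common.smk)
samples = (
    pd.read_csv("config/samples.tsv", sep="\t", dtype={"sample_name": str})
    .set_index("sample_name", drop=False)
    .sort_index()
)

# unit sheet, indexed by (sample_name, unit_name);
# empty fq2/sra/adapters cells simply become NaN
units = (
    pd.read_csv(
        "config/units.tsv", sep="\t", dtype={"sample_name": str, "unit_name": str}
    )
    .set_index(["sample_name", "unit_name"], drop=False)
    .sort_index()
)
```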
16 | 17 | ## sample sheet 18 | 19 | The default sample sheet is `config/samples.tsv` (as configured in `config/config.yaml`). 20 | Each sample refers to an actual physical sample, and replicates (both biological and technical) may be specified as separate samples. 21 | For each sample, you will always have to specify a `sample_name`. 22 | In addition, all `variables_of_interest` and `batch_effects` specified in the `config/config.yaml` under the `diffexp:` entry, will have to have corresponding columns in the `config/samples.tsv`. 23 | Finally, the sample sheet can contain any number of additional columns. 24 | So if in doubt about whether you might at some point need some metadata you already have at hand, just put it into the sample sheet already---your future self will thank you. 25 | 26 | ## unit sheet 27 | 28 | The default unit sheet is `config/units.tsv` (as configured in `config/config.yaml`). 29 | For each sample, add one or more sequencing units (for example if you have several runs or lanes per sample). 30 | 31 | ### `.fastq` file source 32 | 33 | For each unit, you will have to define a source for your `.fastq` files. 34 | This can be done via the columns `fq1`, `fq2` and `sra`, with either of: 35 | 1. A single `.fastq` file for single-end reads (`fq1` column only; `fq2` and `sra` columns present, but empty). 36 | The entry can be any path on your system, but we suggest something like a `raw/` data directory within your analysis directory. 37 | 2. Two `.fastq` files for paired-end reads (columns `fq1` and `fq2`; column `sra` present, but empty). 38 | As for the `fq1` column, the `fq2` column can also point to anywhere on your system. 39 | 3. A sequence read archive (SRA) accession number (`sra` column only; `fq1` and `fq2` columns present, but empty). 40 | The workflow will automatically download the corresponding `.fastq` data (currently assumed to be paired-end). 41 | The accession numbers usually start with SRR or ERR and you can find accession numbers for studies of interest with the [SRA Run Selector](https://trace.ncbi.nlm.nih.gov/Traces/study/). 42 | If both local files and an SRA accession are specified for the same unit, the local files will be used. 43 | 44 | ### adapter trimming 45 | 46 | If you set `trimming: activate:` in the `config/config.yaml` to `True`, you will have to provide at least one `cutadapt` adapter argument for each unit in the `adapters` column of the `units.tsv` file. 47 | You will need to find out the adapters used in the sequencing protocol that generated a unit: from your sequencing provider, or for published data from the study's metadata (or its authors). 48 | Then, enter the adapter sequences into the `adapters` column of that unit, preceded by the [correct `cutadapt` adapter argument](https://cutadapt.readthedocs.io/en/stable/guide.html#adapter-types). 49 | 50 | ### strandedness of library preparation protocol 51 | 52 | To get the correct `geneCounts` from `STAR` output, you can provide information on the strandedness of the library preparation protocol used for a unit. 53 | `STAR` can produce counts for unstranded (`none` - this is the default), forward oriented (`yes`) and reverse oriented (`reverse`) protocols. 54 | Enter the respective value into a `strandedness` column in the `units.tsv` file. 
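As a minimal illustration of how this value is used downstream: the workflow's `workflow/scripts/count-matrix.py` selects the matching count column from STAR's `ReadsPerGene.out.tab` based on the `strandedness` entry, roughly as sketched here (the file path is just a placeholder).

```python
import pandas as pd

def get_count_column(strandedness):
    # STAR's ReadsPerGene.out.tab: column 1 = unstranded counts,
    # column 2 = forward-stranded ("yes"), column 3 = reverse-stranded ("reverse")
    if pd.isnull(strandedness) or strandedness == "none":
        return 1
    if strandedness == "yes":
        return 2
    if strandedness == "reverse":
        return 3
    raise ValueError(f"unsupported strandedness value: {strandedness!r}")

# keep only the gene ids and the column matching this unit's protocol;
# the first four lines of the STAR output are summary statistics
counts = pd.read_table(
    "ReadsPerGene.out.tab",  # placeholder path to one unit's STAR output
    index_col=0,
    usecols=[0, get_count_column("reverse")],
    header=None,
    skiprows=4,
)
```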
55 | -------------------------------------------------------------------------------- /config/config.yaml: -------------------------------------------------------------------------------- 1 | # path or URL to sample sheet (TSV format, columns: sample, condition, ...) 2 | samples: config/samples.tsv 3 | # path or URL to sequencing unit sheet (TSV format, columns: sample, unit, fq1, fq2) 4 | # Units are technical replicates (e.g. lanes, or resequencing of the same biological 5 | # sample). 6 | units: config/units.tsv 7 | 8 | 9 | ref: 10 | # Ensembl species name 11 | species: homo_sapiens 12 | # Ensembl release (make sure to take one where snpeff data is available, check 'snpEff databases' output) 13 | release: 100 14 | # Genome build 15 | build: GRCh38 16 | 17 | trimming: 18 | # If you activate trimming by setting this to `True`, you will have to 19 | # specify the respective cutadapt adapter trimming flag for each unit 20 | # in the `units.tsv` file's `adapters` column 21 | activate: False 22 | 23 | pca: 24 | activate: True 25 | # Per default, a separate PCA plot is generated for each of the 26 | # `variables_of_interest` and the `batch_effects`, coloring according to 27 | # that variables groups. 28 | # If you want PCA plots for further columns in the samples.tsv sheet, you 29 | # can request them under labels as a list, for example: 30 | # - relatively_uninteresting_variable_X 31 | # - possible_batch_effect_Y 32 | labels: "" 33 | 34 | diffexp: 35 | # variables for whome you are interested in whether they have an effect on 36 | # expression levels 37 | variables_of_interest: 38 | treatment_1: 39 | # any fold change will be relative to this factor level 40 | base_level: untreated 41 | treatment_2: 42 | # any fold change will be relative to this factor level 43 | base_level: untreated 44 | # variables whose effect you want to model to separate them from your 45 | # variables_of_interest 46 | batch_effects: 47 | - jointly_handled 48 | # contrasts for the deseq2 results method to determine fold changes 49 | contrasts: 50 | treatment_1: 51 | # must be one of the variables_of_interest, for details see: 52 | # https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#contrasts 53 | variable_of_interest: treatment_1 54 | # must be a level present in the variable_of_interest that is not the 55 | # base_level specified above 56 | level_of_interest: treated 57 | # The default model includes all interactions among variables_of_interest 58 | # and batch_effects added on. For the example above this implicitly is: 59 | # model: ~jointly_handled + treatment_1 * treatment_2 60 | # For the default model to be used, simply specify an empty `model: ""` below. 
61 | # If you want to introduce different assumptions into your model, you can 62 | # specify a different model to use, for example skipping the interaction: 63 | # model: ~jointly_handled + treatment_1 + treatment_2 64 | model: "" 65 | 66 | 67 | params: 68 | cutadapt-pe: "" 69 | cutadapt-se: "" 70 | star: "" 71 | -------------------------------------------------------------------------------- /config/samples.tsv: -------------------------------------------------------------------------------- 1 | sample_name treatment_1 treatment_2 jointly_handled 2 | A untreated untreated 1 3 | B untreated treated 1 4 | C treated untreated 1 5 | D treated untreated 2 6 | E treated treated 2 7 | -------------------------------------------------------------------------------- /config/units.tsv: -------------------------------------------------------------------------------- 1 | sample_name unit_name fq1 fq2 sra adapters strandedness 2 | A lane1 A.1.fq.gz A.2.fq.gz 3 | A lane2 A2.1.fq.gz A2.2.fq.gz 4 | B lane1 B.1.fq.gz B.2.fq.gz 5 | C lane1 C.1.fq.gz C.2.fq.gz 6 | D lane1 D.1.fq.gz D.2.fq.gz 7 | E lane1 E.1.fq.gz E.2.fq.gz 8 | -------------------------------------------------------------------------------- /workflow/Snakefile: -------------------------------------------------------------------------------- 1 | from snakemake.utils import min_version 2 | 3 | ##### set minimum snakemake version ##### 4 | min_version("8.8.0") 5 | 6 | 7 | ##### setup report ##### 8 | configfile: "config/config.yaml" 9 | 10 | 11 | report: "report/workflow.rst" 12 | 13 | 14 | ##### setup singularity ##### 15 | 16 | 17 | # this container defines the underlying OS for each job when using the workflow 18 | # with --use-conda --use-singularity 19 | container: "docker://continuumio/miniconda3" 20 | 21 | 22 | ##### load rules ##### 23 | 24 | 25 | include: "rules/common.smk" 26 | include: "rules/ref.smk" 27 | include: "rules/trim.smk" 28 | include: "rules/qc.smk" 29 | include: "rules/align.smk" 30 | include: "rules/diffexp.smk" 31 | 32 | 33 | ##### target rules ##### 34 | 35 | 36 | rule all: 37 | input: 38 | get_final_output(), 39 | -------------------------------------------------------------------------------- /workflow/envs/biomart.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - bioconductor-biomart =2.56 7 | - r-tidyverse =2.0 8 | # remove once we can update to bioconductor-biomart of the 3.18 release, which will 9 | # include this proper fix for the underlying compatibility issue: 10 | # https://github.com/Bioconductor/BiocFileCache/pull/50 11 | - r-dbplyr=2.3.4 12 | -------------------------------------------------------------------------------- /workflow/envs/deseq2.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - bioconductor-deseq2 =1.38 7 | - r-stringr =1.5 8 | - r-ashr =2.2_54 9 | -------------------------------------------------------------------------------- /workflow/envs/gffutils.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - gffutils =0.12 7 | -------------------------------------------------------------------------------- /workflow/envs/pandas.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 
| - conda-forge 3 | - nodefaults 4 | dependencies: 5 | - pandas =1.5 6 | -------------------------------------------------------------------------------- /workflow/envs/rseqc.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - rseqc =5.0 7 | -------------------------------------------------------------------------------- /workflow/report/diffexp.rst: -------------------------------------------------------------------------------- 1 | Table of differential expression results per gene, calculated with DESeq2. 2 | -------------------------------------------------------------------------------- /workflow/report/ma.rst: -------------------------------------------------------------------------------- 1 | `MA plot `_ of log fold change vs. mean of normalized counts for each gene when calculating differential expression for contrast {{ snakemake.wildcards.contrast }}. 2 | -------------------------------------------------------------------------------- /workflow/report/pca.rst: -------------------------------------------------------------------------------- 1 | Principal component analysis over the normalized counts of all genes. 2 | -------------------------------------------------------------------------------- /workflow/report/workflow.rst: -------------------------------------------------------------------------------- 1 | This workflow performs differential expression analysis on single- or paired-end RNA-seq data. 2 | After adapter removal with `Cutadapt `_, reads were mapped and gene counts were generated with `STAR `_. 3 | Gene counts of replicates were summed up. 4 | Integrated normalization and differential expression analysis was conducted with `DESeq2 `_ following standard procedure as outlined in the manual.
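A toy sketch of the replicate handling mentioned above (the real implementation is `workflow/scripts/count-matrix.py`): count columns that carry the same sample name, i.e. technical replicates/units, are summed into one column per sample.

```python
import pandas as pd

# toy count matrix: sample "A" was sequenced in two units, sample "B" in one
counts = pd.DataFrame(
    [[10, 12, 7], [0, 1, 5], [3, 2, 4]],
    index=["gene1", "gene2", "gene3"],
    columns=["A", "A", "B"],  # both units of "A" carry the same sample name
)

# collapse technical replicates by summing columns with identical sample names,
# as done in workflow/scripts/count-matrix.py (pandas 1.5 is pinned in
# workflow/envs/pandas.yaml; newer pandas versions deprecate groupby(axis=1))
collapsed = counts.groupby(counts.columns, axis=1, sort=False).sum()
print(collapsed)  # one column per sample: A = 22, 1, 5 per gene; B unchanged
```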
5 | -------------------------------------------------------------------------------- /workflow/rules/align.smk: -------------------------------------------------------------------------------- 1 | rule align: 2 | input: 3 | unpack(get_fq), 4 | index="resources/star_genome", 5 | gtf="resources/genome.gtf", 6 | output: 7 | aln="results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam", 8 | reads_per_gene="results/star/{sample}_{unit}/ReadsPerGene.out.tab", 9 | log: 10 | "logs/star/{sample}_{unit}.log", 11 | params: 12 | idx=lambda wc, input: input.index, 13 | extra=lambda wc, input: f'--outSAMtype BAM SortedByCoordinate --quantMode GeneCounts --sjdbGTFfile {input.gtf} {config["params"]["star"]}', 14 | threads: 24 15 | wrapper: 16 | "v3.5.3/bio/star/align" 17 | -------------------------------------------------------------------------------- /workflow/rules/common.smk: -------------------------------------------------------------------------------- 1 | import glob 2 | 3 | import pandas as pd 4 | from snakemake.utils import validate 5 | 6 | validate(config, schema="../schemas/config.schema.yaml") 7 | 8 | samples = ( 9 | pd.read_csv(config["samples"], sep="\t", dtype={"sample_name": str}) 10 | .set_index("sample_name", drop=False) 11 | .sort_index() 12 | ) 13 | 14 | 15 | def get_final_output(): 16 | final_output = expand( 17 | "results/diffexp/{contrast}.diffexp.symbol.tsv", 18 | contrast=config["diffexp"]["contrasts"], 19 | ) 20 | final_output.append("results/deseq2/normcounts.symbol.tsv") 21 | final_output.append("results/counts/all.symbol.tsv") 22 | final_output.append("results/qc/multiqc_report.html") 23 | 24 | if config["pca"]["activate"]: 25 | # get all the variables to plot a PCA for 26 | pca_variables = list(config["diffexp"]["variables_of_interest"]) 27 | if config["diffexp"]["batch_effects"]: 28 | pca_variables.extend(config["diffexp"]["batch_effects"]) 29 | if config["pca"]["labels"]: 30 | pca_variables.extend(config["pca"]["labels"]) 31 | final_output.extend( 32 | expand("results/pca.{variable}.svg", variable=pca_variables) 33 | ) 34 | return final_output 35 | 36 | 37 | validate(samples, schema="../schemas/samples.schema.yaml") 38 | 39 | units = ( 40 | pd.read_csv(config["units"], sep="\t", dtype={"sample_name": str, "unit_name": str}) 41 | .set_index(["sample_name", "unit_name"], drop=False) 42 | .sort_index() 43 | ) 44 | validate(units, schema="../schemas/units.schema.yaml") 45 | 46 | 47 | wildcard_constraints: 48 | sample="|".join(samples["sample_name"]), 49 | unit="|".join(units["unit_name"]), 50 | 51 | 52 | def get_cutadapt_input(wildcards): 53 | unit = units.loc[wildcards.sample].loc[wildcards.unit] 54 | 55 | if pd.isna(unit["fq1"]): 56 | # SRA sample (always paired-end for now) 57 | accession = unit["sra"] 58 | return expand("sra/{accession}_{read}.fastq", accession=accession, read=[1, 2]) 59 | 60 | if unit["fq1"].endswith("gz"): 61 | ending = ".gz" 62 | else: 63 | ending = "" 64 | 65 | if pd.isna(unit["fq2"]): 66 | # single end local sample 67 | return "pipe/cutadapt/{S}/{U}.fq1.fastq{E}".format( 68 | S=unit.sample_name, U=unit.unit_name, E=ending 69 | ) 70 | else: 71 | # paired end local sample 72 | return expand( 73 | "pipe/cutadapt/{S}/{U}.{{read}}.fastq{E}".format( 74 | S=unit.sample_name, U=unit.unit_name, E=ending 75 | ), 76 | read=["fq1", "fq2"], 77 | ) 78 | 79 | 80 | def get_cutadapt_pipe_input(wildcards): 81 | files = list( 82 | sorted(glob.glob(units.loc[wildcards.sample].loc[wildcards.unit, wildcards.fq])) 83 | ) 84 | assert len(files) > 0 85 | return files 86 | 
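# Note on the helper below: a unit is treated as paired-end if it has a
# non-empty `fq2` or `sra` entry, and all units of a sample must be uniformly
# paired-end or uniformly single-end; mixed layouts fail the assertion.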
87 | 88 | def is_paired_end(sample): 89 | sample_units = units.loc[sample] 90 | fq2_null = sample_units["fq2"].isnull() 91 | sra_null = sample_units["sra"].isnull() 92 | paired = ~fq2_null | ~sra_null 93 | all_paired = paired.all() 94 | all_single = (~paired).all() 95 | assert ( 96 | all_single or all_paired 97 | ), "invalid units for sample {}, must be all paired end or all single end".format( 98 | sample 99 | ) 100 | return all_paired 101 | 102 | 103 | def get_fq(wildcards): 104 | if config["trimming"]["activate"]: 105 | # activated trimming, use trimmed data 106 | if is_paired_end(wildcards.sample): 107 | # paired-end sample 108 | return dict( 109 | zip( 110 | ["fq1", "fq2"], 111 | expand( 112 | "results/trimmed/{sample}_{unit}_{group}.fastq.gz", 113 | group=["R1", "R2"], 114 | **wildcards, 115 | ), 116 | ) 117 | ) 118 | # single end sample 119 | return { 120 | "fq1": "results/trimmed/{sample}_{unit}_single.fastq.gz".format(**wildcards) 121 | } 122 | else: 123 | # no trimming, use raw reads 124 | u = units.loc[(wildcards.sample, wildcards.unit)] 125 | if pd.isna(u["fq1"]): 126 | # SRA sample (always paired-end for now) 127 | accession = u["sra"] 128 | return dict( 129 | zip( 130 | ["fq1", "fq2"], 131 | expand( 132 | "sra/{accession}_{group}.fastq", 133 | accession=accession, 134 | group=["R1", "R2"], 135 | ), 136 | ) 137 | ) 138 | if not is_paired_end(wildcards.sample): 139 | return {"fq1": f"{u.fq1}"} 140 | else: 141 | return {"fq1": f"{u.fq1}", "fq2": f"{u.fq2}"} 142 | 143 | 144 | def get_strandedness(units): 145 | if "strandedness" in units.columns: 146 | return units["strandedness"].tolist() 147 | else: 148 | strand_list = ["none"] 149 | return strand_list * units.shape[0] 150 | 151 | 152 | def get_deseq2_threads(wildcards=None): 153 | # https://twitter.com/mikelove/status/918770188568363008 154 | few_coeffs = False if wildcards is None else len(get_contrast(wildcards)) < 10 155 | return 1 if len(samples) < 100 or few_coeffs else 6 156 | 157 | 158 | def is_activated(xpath): 159 | c = config 160 | for entry in xpath.split("/"): 161 | c = c.get(entry, {}) 162 | return bool(c.get("activate", False)) 163 | 164 | 165 | def get_bioc_species_name(): 166 | first_letter = config["ref"]["species"][0] 167 | subspecies = config["ref"]["species"].split("_")[1] 168 | return first_letter + subspecies 169 | 170 | 171 | def get_fastqs(wc): 172 | if config["trimming"]["activate"]: 173 | return expand( 174 | "results/trimmed/{sample}/{unit}_{read}.fastq.gz", 175 | unit=units.loc[wc.sample, "unit_name"], 176 | sample=wc.sample, 177 | read=wc.read, 178 | ) 179 | unit = units.loc[wc.sample] 180 | if all(pd.isna(unit["fq1"])): 181 | # SRA sample (always paired-end for now) 182 | accession = unit["sra"] 183 | return expand( 184 | "sra/{accession}_{read}.fastq", accession=accession, read=wc.read[-1] 185 | ) 186 | fq = "fq{}".format(wc.read[-1]) 187 | return units.loc[wc.sample, fq].tolist() 188 | 189 | 190 | def get_contrast(wildcards): 191 | return config["diffexp"]["contrasts"][wildcards.contrast] 192 | -------------------------------------------------------------------------------- /workflow/rules/diffexp.smk: -------------------------------------------------------------------------------- 1 | rule count_matrix: 2 | input: 3 | expand( 4 | "results/star/{unit.sample_name}_{unit.unit_name}/ReadsPerGene.out.tab", 5 | unit=units.itertuples(), 6 | ), 7 | output: 8 | "results/counts/all.tsv", 9 | log: 10 | "logs/count-matrix.log", 11 | params: 12 | samples=units["sample_name"].tolist(), 13 | 
strand=get_strandedness(units), 14 | conda: 15 | "../envs/pandas.yaml" 16 | script: 17 | "../scripts/count-matrix.py" 18 | 19 | 20 | rule gene_2_symbol: 21 | input: 22 | counts="{prefix}.tsv", 23 | output: 24 | symbol="{prefix}.symbol.tsv", 25 | params: 26 | species=get_bioc_species_name(), 27 | log: 28 | "logs/gene2symbol/{prefix}.log", 29 | conda: 30 | "../envs/biomart.yaml" 31 | script: 32 | "../scripts/gene2symbol.R" 33 | 34 | 35 | rule deseq2_init: 36 | input: 37 | counts="results/counts/all.tsv", 38 | output: 39 | "results/deseq2/all.rds", 40 | "results/deseq2/normcounts.tsv", 41 | conda: 42 | "../envs/deseq2.yaml" 43 | log: 44 | "logs/deseq2/init.log", 45 | threads: get_deseq2_threads() 46 | script: 47 | "../scripts/deseq2-init.R" 48 | 49 | 50 | rule pca: 51 | input: 52 | "results/deseq2/all.rds", 53 | output: 54 | report("results/pca.{variable}.svg", "../report/pca.rst"), 55 | conda: 56 | "../envs/deseq2.yaml" 57 | log: 58 | "logs/pca.{variable}.log", 59 | script: 60 | "../scripts/plot-pca.R" 61 | 62 | 63 | rule deseq2: 64 | input: 65 | "results/deseq2/all.rds", 66 | output: 67 | table=report("results/diffexp/{contrast}.diffexp.tsv", "../report/diffexp.rst"), 68 | ma_plot=report("results/diffexp/{contrast}.ma-plot.svg", "../report/ma.rst"), 69 | params: 70 | contrast=get_contrast, 71 | conda: 72 | "../envs/deseq2.yaml" 73 | log: 74 | "logs/deseq2/{contrast}.diffexp.log", 75 | threads: get_deseq2_threads() 76 | script: 77 | "../scripts/deseq2.R" 78 | -------------------------------------------------------------------------------- /workflow/rules/qc.smk: -------------------------------------------------------------------------------- 1 | ## RSEQC 2 | 3 | 4 | rule rseqc_gtf2bed: 5 | input: 6 | "resources/genome.gtf", 7 | output: 8 | bed="results/qc/rseqc/annotation.bed", 9 | db=temp("results/qc/rseqc/annotation.db"), 10 | log: 11 | "logs/rseqc_gtf2bed.log", 12 | conda: 13 | "../envs/gffutils.yaml" 14 | script: 15 | "../scripts/gtf2bed.py" 16 | 17 | 18 | rule rseqc_junction_annotation: 19 | input: 20 | bam="results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam", 21 | bed="results/qc/rseqc/annotation.bed", 22 | output: 23 | "results/qc/rseqc/{sample}_{unit}.junctionanno.junction.bed", 24 | priority: 1 25 | log: 26 | "logs/rseqc/rseqc_junction_annotation/{sample}_{unit}.log", 27 | params: 28 | extra=r"-q 255", # STAR uses 255 as a score for unique mappers 29 | prefix=lambda w, output: output[0].replace(".junction.bed", ""), 30 | conda: 31 | "../envs/rseqc.yaml" 32 | shell: 33 | "junction_annotation.py {params.extra} -i {input.bam} -r {input.bed} -o {params.prefix} " 34 | "> {log[0]} 2>&1" 35 | 36 | 37 | rule rseqc_junction_saturation: 38 | input: 39 | bam="results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam", 40 | bed="results/qc/rseqc/annotation.bed", 41 | output: 42 | "results/qc/rseqc/{sample}_{unit}.junctionsat.junctionSaturation_plot.pdf", 43 | priority: 1 44 | log: 45 | "logs/rseqc/rseqc_junction_saturation/{sample}_{unit}.log", 46 | params: 47 | extra=r"-q 255", 48 | prefix=lambda w, output: output[0].replace(".junctionSaturation_plot.pdf", ""), 49 | conda: 50 | "../envs/rseqc.yaml" 51 | shell: 52 | "junction_saturation.py {params.extra} -i {input.bam} -r {input.bed} -o {params.prefix} " 53 | "> {log} 2>&1" 54 | 55 | 56 | rule rseqc_stat: 57 | input: 58 | "results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam", 59 | output: 60 | "results/qc/rseqc/{sample}_{unit}.stats.txt", 61 | priority: 1 62 | log: 63 | "logs/rseqc/rseqc_stat/{sample}_{unit}.log", 64 | conda: 65 
| "../envs/rseqc.yaml" 66 | shell: 67 | "bam_stat.py -i {input} > {output} 2> {log}" 68 | 69 | 70 | rule rseqc_infer: 71 | input: 72 | bam="results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam", 73 | bed="results/qc/rseqc/annotation.bed", 74 | output: 75 | "results/qc/rseqc/{sample}_{unit}.infer_experiment.txt", 76 | priority: 1 77 | log: 78 | "logs/rseqc/rseqc_infer/{sample}_{unit}.log", 79 | conda: 80 | "../envs/rseqc.yaml" 81 | shell: 82 | "infer_experiment.py -r {input.bed} -i {input.bam} > {output} 2> {log}" 83 | 84 | 85 | rule rseqc_innerdis: 86 | input: 87 | bam="results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam", 88 | bed="results/qc/rseqc/annotation.bed", 89 | output: 90 | "results/qc/rseqc/{sample}_{unit}.inner_distance_freq.inner_distance.txt", 91 | priority: 1 92 | log: 93 | "logs/rseqc/rseqc_innerdis/{sample}_{unit}.log", 94 | params: 95 | prefix=lambda w, output: output[0].replace(".inner_distance.txt", ""), 96 | conda: 97 | "../envs/rseqc.yaml" 98 | shell: 99 | "inner_distance.py -r {input.bed} -i {input.bam} -o {params.prefix} > {log} 2>&1" 100 | 101 | 102 | rule rseqc_readdis: 103 | input: 104 | bam="results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam", 105 | bed="results/qc/rseqc/annotation.bed", 106 | output: 107 | "results/qc/rseqc/{sample}_{unit}.readdistribution.txt", 108 | priority: 1 109 | log: 110 | "logs/rseqc/rseqc_readdis/{sample}_{unit}.log", 111 | conda: 112 | "../envs/rseqc.yaml" 113 | shell: 114 | "read_distribution.py -r {input.bed} -i {input.bam} > {output} 2> {log}" 115 | 116 | 117 | rule rseqc_readdup: 118 | input: 119 | "results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam", 120 | output: 121 | "results/qc/rseqc/{sample}_{unit}.readdup.DupRate_plot.pdf", 122 | priority: 1 123 | log: 124 | "logs/rseqc/rseqc_readdup/{sample}_{unit}.log", 125 | params: 126 | prefix=lambda w, output: output[0].replace(".DupRate_plot.pdf", ""), 127 | conda: 128 | "../envs/rseqc.yaml" 129 | shell: 130 | "read_duplication.py -i {input} -o {params.prefix} > {log} 2>&1" 131 | 132 | 133 | rule rseqc_readgc: 134 | input: 135 | "results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam", 136 | output: 137 | "results/qc/rseqc/{sample}_{unit}.readgc.GC_plot.pdf", 138 | priority: 1 139 | log: 140 | "logs/rseqc/rseqc_readgc/{sample}_{unit}.log", 141 | params: 142 | prefix=lambda w, output: output[0].replace(".GC_plot.pdf", ""), 143 | conda: 144 | "../envs/rseqc.yaml" 145 | shell: 146 | "read_GC.py -i {input} -o {params.prefix} > {log} 2>&1" 147 | 148 | 149 | rule multiqc: 150 | input: 151 | expand( 152 | "results/star/{unit.sample_name}_{unit.unit_name}/Aligned.sortedByCoord.out.bam", 153 | unit=units.itertuples(), 154 | ), 155 | expand( 156 | "results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.junctionanno.junction.bed", 157 | unit=units.itertuples(), 158 | ), 159 | expand( 160 | "results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.junctionsat.junctionSaturation_plot.pdf", 161 | unit=units.itertuples(), 162 | ), 163 | expand( 164 | "results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.infer_experiment.txt", 165 | unit=units.itertuples(), 166 | ), 167 | expand( 168 | "results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.stats.txt", 169 | unit=units.itertuples(), 170 | ), 171 | expand( 172 | "results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.inner_distance_freq.inner_distance.txt", 173 | unit=units.itertuples(), 174 | ), 175 | expand( 176 | "results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.readdistribution.txt", 177 | unit=units.itertuples(), 178 | 
), 179 | expand( 180 | "results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.readdup.DupRate_plot.pdf", 181 | unit=units.itertuples(), 182 | ), 183 | expand( 184 | "results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.readgc.GC_plot.pdf", 185 | unit=units.itertuples(), 186 | ), 187 | expand( 188 | "logs/rseqc/rseqc_junction_annotation/{unit.sample_name}_{unit.unit_name}.log", 189 | unit=units.itertuples(), 190 | ), 191 | output: 192 | "results/qc/multiqc_report.html", 193 | log: 194 | "logs/multiqc.log", 195 | wrapper: 196 | "v3.5.3/bio/multiqc" 197 | -------------------------------------------------------------------------------- /workflow/rules/ref.smk: -------------------------------------------------------------------------------- 1 | rule get_genome: 2 | output: 3 | "resources/genome.fasta", 4 | log: 5 | "logs/get-genome.log", 6 | params: 7 | species=config["ref"]["species"], 8 | datatype="dna", 9 | build=config["ref"]["build"], 10 | release=config["ref"]["release"], 11 | cache: True 12 | wrapper: 13 | "v3.5.3/bio/reference/ensembl-sequence" 14 | 15 | 16 | rule get_annotation: 17 | output: 18 | "resources/genome.gtf", 19 | params: 20 | species=config["ref"]["species"], 21 | fmt="gtf", 22 | build=config["ref"]["build"], 23 | release=config["ref"]["release"], 24 | flavor="", 25 | cache: True 26 | log: 27 | "logs/get_annotation.log", 28 | wrapper: 29 | "v3.5.3/bio/reference/ensembl-annotation" 30 | 31 | 32 | rule genome_faidx: 33 | input: 34 | "resources/genome.fasta", 35 | output: 36 | "resources/genome.fasta.fai", 37 | log: 38 | "logs/genome-faidx.log", 39 | cache: True 40 | wrapper: 41 | "v3.5.3/bio/samtools/faidx" 42 | 43 | 44 | rule bwa_index: 45 | input: 46 | "resources/genome.fasta", 47 | output: 48 | multiext("resources/genome.fasta", ".amb", ".ann", ".bwt", ".pac", ".sa"), 49 | log: 50 | "logs/bwa_index.log", 51 | resources: 52 | mem_mb=369000, 53 | cache: True 54 | wrapper: 55 | "v3.5.3/bio/bwa/index" 56 | 57 | 58 | rule star_index: 59 | input: 60 | fasta="resources/genome.fasta", 61 | annotation="resources/genome.gtf", 62 | output: 63 | directory("resources/star_genome"), 64 | threads: 4 65 | params: 66 | extra=lambda wc, input: f"--sjdbGTFfile {input.annotation} --sjdbOverhang 100", 67 | log: 68 | "logs/star_index_genome.log", 69 | cache: True 70 | wrapper: 71 | "v3.5.3/bio/star/index" 72 | -------------------------------------------------------------------------------- /workflow/rules/trim.smk: -------------------------------------------------------------------------------- 1 | rule get_sra: 2 | output: 3 | "sra/{accession}_1.fastq", 4 | "sra/{accession}_2.fastq", 5 | log: 6 | "logs/get-sra/{accession}.log", 7 | wrapper: 8 | "v3.5.3/bio/sra-tools/fasterq-dump" 9 | 10 | 11 | rule cutadapt_pipe: 12 | input: 13 | get_cutadapt_pipe_input, 14 | output: 15 | pipe("pipe/cutadapt/{sample}/{unit}.{fq}.{ext}"), 16 | log: 17 | "logs/pipe-fastqs/catadapt/{sample}_{unit}.{fq}.{ext}.log", 18 | wildcard_constraints: 19 | ext=r"fastq|fastq\.gz", 20 | threads: 0 21 | shell: 22 | "cat {input} > {output} 2> {log}" 23 | 24 | 25 | rule cutadapt_pe: 26 | input: 27 | get_cutadapt_input, 28 | output: 29 | fastq1="results/trimmed/{sample}_{unit}_R1.fastq.gz", 30 | fastq2="results/trimmed/{sample}_{unit}_R2.fastq.gz", 31 | qc="results/trimmed/{sample}_{unit}.paired.qc.txt", 32 | log: 33 | "logs/cutadapt/{sample}_{unit}.log", 34 | params: 35 | extra=config["params"]["cutadapt-pe"], 36 | adapters=lambda w: str(units.loc[w.sample].loc[w.unit, "adapters"]), 37 | threads: 8 38 | wrapper: 39 | 
"v3.5.3/bio/cutadapt/pe" 40 | 41 | 42 | rule cutadapt_se: 43 | input: 44 | get_cutadapt_input, 45 | output: 46 | fastq="results/trimmed/{sample}_{unit}_single.fastq.gz", 47 | qc="results/trimmed/{sample}_{unit}_single.qc.txt", 48 | log: 49 | "logs/cutadapt/{sample}_{unit}.log", 50 | params: 51 | extra=config["params"]["cutadapt-se"], 52 | adapters=lambda w: str(units.loc[w.sample].loc[w.unit, "adapters"]), 53 | threads: 8 54 | wrapper: 55 | "v3.5.3/bio/cutadapt/se" 56 | -------------------------------------------------------------------------------- /workflow/schemas/config.schema.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-06/schema#" 2 | 3 | description: snakemake configuration file 4 | 5 | type: object 6 | 7 | properties: 8 | samples: 9 | type: string 10 | units: 11 | type: string 12 | 13 | ref: 14 | type: object 15 | properties: 16 | species: 17 | type: string 18 | release: 19 | type: integer 20 | build: 21 | type: string 22 | required: 23 | - species 24 | - release 25 | - build 26 | 27 | trimming: 28 | type: object 29 | properties: 30 | activate: 31 | type: boolean 32 | required: 33 | - activate 34 | 35 | pca: 36 | type: object 37 | properties: 38 | activate: 39 | type: boolean 40 | labels: 41 | type: 42 | - array 43 | - string 44 | items: 45 | type: string 46 | required: 47 | - activate 48 | 49 | diffexp: 50 | type: object 51 | properties: 52 | contrasts: 53 | type: object 54 | model: 55 | type: string 56 | required: 57 | - contrasts 58 | 59 | params: 60 | type: object 61 | properties: 62 | cutadapt-pe: 63 | type: string 64 | cutadapt-se: 65 | type: string 66 | star: 67 | type: string 68 | required: 69 | - cutadapt-pe 70 | - cutadapt-se 71 | - star 72 | 73 | required: 74 | - samples 75 | - units 76 | - ref 77 | - pca 78 | - diffexp 79 | - params 80 | - trimming 81 | -------------------------------------------------------------------------------- /workflow/schemas/samples.schema.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-06/schema#" 2 | 3 | description: an entry in the sample sheet 4 | properties: 5 | sample_name: 6 | type: string 7 | description: sample name/identifier 8 | 9 | required: 10 | - sample_name 11 | -------------------------------------------------------------------------------- /workflow/schemas/units.schema.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | description: row of the units.tsv, representing a sequencing unit, i.e. 
single-end or paired-end data 3 | type: object 4 | properties: 5 | sample_name: 6 | type: string 7 | description: sample name/id the unit has been sequenced from 8 | unit_name: 9 | type: string 10 | description: unit id 11 | fq1: 12 | type: string 13 | description: path to FASTQ file 14 | fq2: 15 | type: string 16 | description: path to second FASTQ file (leave empty in case of single-end) 17 | sra: 18 | type: string 19 | description: SRA id for automatic download of unit 20 | adapters: 21 | type: string 22 | description: adapter trimming settings to use (for cutadapt) 23 | strandedness: 24 | type: string 25 | description: one of the values 'none', 'yes' or 'reverse' according to protocol strandedness 26 | 27 | required: 28 | - sample_name 29 | - unit_name 30 | -------------------------------------------------------------------------------- /workflow/scripts/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Any Python script in the scripts folder will be able to import from this module and beyond. 2 | -------------------------------------------------------------------------------- /workflow/scripts/count-matrix.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | # logging 4 | sys.stderr = open(snakemake.log[0], "w") 5 | 6 | import pandas as pd 7 | 8 | 9 | def get_column(strandedness): 10 | if pd.isnull(strandedness) or strandedness == "none": 11 | return 1 # non stranded protocol 12 | elif strandedness == "yes": 13 | return 2 # 3rd column 14 | elif strandedness == "reverse": 15 | return 3 # 4th column, usually for Illumina truseq 16 | else: 17 | raise ValueError( 18 | ( 19 | "'strandedness' column should be empty or have the " 20 | "value 'none', 'yes' or 'reverse', instead has the " 21 | "value {}" 22 | ).format(repr(strandedness)) 23 | ) 24 | 25 | 26 | counts = [ 27 | pd.read_table( 28 | f, index_col=0, usecols=[0, get_column(strandedness)], header=None, skiprows=4 29 | ) 30 | for f, strandedness in zip(snakemake.input, snakemake.params.strand) 31 | ] 32 | 33 | for t, sample in zip(counts, snakemake.params.samples): 34 | t.columns = [sample] 35 | 36 | matrix = pd.concat(counts, axis=1) 37 | matrix.index.name = "gene" 38 | # collapse technical replicates 39 | matrix = matrix.groupby(matrix.columns, axis=1, sort=False).sum() 40 | matrix.to_csv(snakemake.output[0], sep="\t") 41 | -------------------------------------------------------------------------------- /workflow/scripts/deseq2-init.R: -------------------------------------------------------------------------------- 1 | log <- file(snakemake@log[[1]], open = "wt") 2 | sink(log) 3 | sink(log, type="message") 4 | 5 | library(stringr) 6 | library("DESeq2") 7 | 8 | parallel <- FALSE 9 | if (snakemake@threads > 1) { 10 | library("BiocParallel") 11 | # setup parallelization 12 | register(MulticoreParam(snakemake@threads)) 13 | parallel <- TRUE 14 | } 15 | 16 | counts_data <- read.table( 17 | snakemake@input[["counts"]], 18 | header = TRUE, 19 | row.names = "gene", 20 | check.names = FALSE 21 | ) 22 | counts_data <- counts_data[, order(names(counts_data))] 23 | 24 | col_data <- read.table( 25 | snakemake@config[["samples"]], 26 | header = TRUE, 27 | row.names = "sample_name", 28 | check.names = FALSE 29 | ) 30 | col_data <- col_data[order(row.names(col_data)), , drop = FALSE] 31 | 32 | # properly set the base level to the configuration in config.yaml, avoiding 33 | # the default behaviour of choosing the alphabetical minimum level 
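# For example, with `variables_of_interest: condition: base_level: untreated`
# in the config, the loop below relevels col_data$condition so that "untreated"
# becomes the reference level and all reported fold changes are relative to it.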
34 | for (vof in names(snakemake@config[["diffexp"]][["variables_of_interest"]])) { 35 | snakemake@config[["diffexp"]][["variables_of_interest"]][[vof]] 36 | base_level <- snakemake@config[["diffexp"]][["variables_of_interest"]][[vof]][["base_level"]] 37 | col_data[[vof]] <- relevel( 38 | factor(col_data[[vof]]), base_level 39 | ) 40 | } 41 | 42 | # properly turn all batch effects into factors, even if they are numeric 43 | batch_effects <- snakemake@config[["diffexp"]][["batch_effects"]] 44 | for (effect in batch_effects) { 45 | if (str_length(effect) > 0) { 46 | col_data[[effect]] <- factor(col_data[[effect]]) 47 | } 48 | } 49 | 50 | # build up formula with additive batch_effects and all interactions between the 51 | # variables_of_interes 52 | 53 | design_formula <- snakemake@config[["diffexp"]][["model"]] 54 | 55 | if (str_length(design_formula) == 0) { 56 | batch_effects <- str_flatten(batch_effects, " + ") 57 | if (str_length(batch_effects) > 0) { 58 | batch_effects <- str_c(batch_effects, " + ") 59 | } 60 | vof_interactions <- str_flatten( 61 | names(snakemake@config[["diffexp"]][["variables_of_interest"]]), 62 | " * " 63 | ) 64 | design_formula <- str_c("~", batch_effects, vof_interactions) 65 | } 66 | 67 | dds <- DESeqDataSetFromMatrix( 68 | countData = counts_data, 69 | colData = col_data, 70 | design = as.formula(design_formula) 71 | ) 72 | 73 | # remove uninformative columns 74 | dds <- dds[rowSums(counts(dds)) > 1, ] 75 | # normalization and preprocessing 76 | dds <- DESeq(dds, parallel = parallel) 77 | 78 | # Write dds object as RDS 79 | saveRDS(dds, file = snakemake@output[[1]]) 80 | # Write normalized counts 81 | norm_counts <- counts(dds, normalized = TRUE) 82 | write.table( 83 | data.frame( 84 | "gene" = rownames(norm_counts), 85 | norm_counts 86 | ), 87 | file = snakemake@output[[2]], 88 | sep = "\t", 89 | row.names = FALSE 90 | ) 91 | -------------------------------------------------------------------------------- /workflow/scripts/deseq2.R: -------------------------------------------------------------------------------- 1 | log <- file(snakemake@log[[1]], open = "wt") 2 | sink(log) 3 | sink(log, type = "message") 4 | 5 | library("cli") 6 | library("DESeq2") 7 | 8 | parallel <- FALSE 9 | if (snakemake@threads > 1) { 10 | library("BiocParallel") 11 | # setup parallelization 12 | register(MulticoreParam(snakemake@threads)) 13 | parallel <- TRUE 14 | } 15 | 16 | dds <- readRDS(snakemake@input[[1]]) 17 | 18 | contrast_config <- snakemake@config[["diffexp"]][["contrasts"]][[ 19 | snakemake@wildcards[["contrast"]] 20 | ]] 21 | 22 | # basic case of contrast specification, see: 23 | # https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#contrasts 24 | if (length(contrast_config) == 2 && typeof(contrast_config) == "list") { 25 | if ( 26 | # check for existence contrast's variable_of_interest to 27 | # provide useful error message 28 | !(contrast_config[["variable_of_interest"]] %in% 29 | names(snakemake@config[["diffexp"]][["variables_of_interest"]]) 30 | ) 31 | ) { 32 | cli_abort( 33 | c( 34 | "config.yaml: All variable_of_interest entries under `diffexp: contrasts:`", 35 | " " = "must also exist under `diffexp: variables_of_interest:`.", 36 | "x" = "Could not find variable_of_interest: {contrast_config[['variable_of_interest']]}", 37 | " " = "It was not among the `diffexp: variables_of_interest:`", 38 | " " = "{names(snakemake@config[['diffexp']][['variables_of_interest']])}", 39 | "i" = "Are there any typos in the contrasts' 
    contrast <- c(
        contrast_config[["variable_of_interest"]],
        contrast_config[["level_of_interest"]],
        snakemake@config[["diffexp"]][["variables_of_interest"]][[
            contrast_config[["variable_of_interest"]]
        ]][["base_level"]]
    )
# more complex contrast specification via list(c(), c()), see the ?results docs of
# the DESeq2 package and this tutorial (plus the linked seqanswers thread):
# https://github.com/tavareshugo/tutorial_DESeq2_contrasts/blob/main/DESeq2_contrasts.md
} else if (
    length(contrast_config) == 1 &&
        typeof(contrast_config) == "character"
) {
    # the configured string is evaluated as R code, so it can specify any
    # contrast that results() accepts
    contrast <- eval(parse(text = contrast_config))
}

res <- results(
    dds,
    contrast = contrast,
    parallel = parallel
)
# shrink fold changes for lowly expressed genes
# use ashr, so that we can keep using `contrast`, as conversion to a coef is not trivial, see:
# https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#extended-section-on-shrinkage-estimators
res <- lfcShrink(
    dds,
    contrast = contrast,
    res = res,
    type = "ashr"
)

# sort by adjusted p-value
res <- res[order(res$padj), ]
# TODO explore IHW usage


# store results
svg(snakemake@output[["ma_plot"]])
plotMA(res, ylim = c(-2, 2))
dev.off()

write.table(
    data.frame(
        "gene" = rownames(res),
        res
    ),
    file = snakemake@output[["table"]],
    row.names = FALSE,
    sep = "\t"
)
--------------------------------------------------------------------------------
/workflow/scripts/gene2symbol.R:
--------------------------------------------------------------------------------
library(biomaRt)
library(tidyverse)
# useful error messages upon aborting
library("cli")

# this variable holds a mirror name until
# useEnsembl succeeds ("www" is last, because
# of very frequent "Internal Server Error"s)
mart <- "useast"
rounds <- 0
while (class(mart)[[1]] != "Mart") {
    mart <- tryCatch(
        {
            # done here, because the error function does not
            # modify outer scope variables (I tried)
            if (mart == "www") rounds <- rounds + 1
            # equivalent to useMart, but you can choose
            # the mirror instead of specifying a host
            biomaRt::useEnsembl(
                biomart = "ENSEMBL_MART_ENSEMBL",
                dataset = str_c(snakemake@params[["species"]], "_gene_ensembl"),
                mirror = mart
            )
        },
        error = function(e) {
            # change or make configurable if you want more or
            # fewer rounds of tries of all the mirrors
            if (rounds >= 3) {
                cli_abort(
                    str_c(
                        "Have tried all 4 available Ensembl biomaRt mirrors ",
                        rounds,
                        " times. You might have a connection problem, or no mirror is responsive.\n",
                        "The last error message was:\n",
                        conditionMessage(e)
                    )
                )
            }
            # hop to next mirror
            mart <- switch(mart,
                useast = "uswest",
                uswest = "asia",
                asia = "www",
                www = {
                    # wait before starting another round through the mirrors,
                    # hoping that intermittent problems disappear
                    Sys.sleep(30)
                    "useast"
                }
            )
        }
    )
}


df <- read.table(snakemake@input[["counts"]], sep = "\t", header = TRUE)

g2g <- biomaRt::getBM(
    attributes = c(
        "ensembl_gene_id",
        "external_gene_name"
    ),
    filters = "ensembl_gene_id",
    values = df$gene,
    mart = mart
)

# use the gene symbol where available, otherwise keep the Ensembl gene id
annotated <- merge(df, g2g, by.x = "gene", by.y = "ensembl_gene_id")
annotated$gene <- ifelse(annotated$external_gene_name == "", annotated$gene, annotated$external_gene_name)
annotated$external_gene_name <- NULL
write.table(annotated, snakemake@output[["symbol"]], sep = "\t", row.names = FALSE)
--------------------------------------------------------------------------------
/workflow/scripts/gtf2bed.py:
--------------------------------------------------------------------------------
import gffutils

db = gffutils.create_db(
    snakemake.input[0],
    dbfn=snakemake.output.db,
    force=True,
    keep_order=True,
    merge_strategy="merge",
    sort_attribute_values=True,
    disable_infer_genes=True,
    disable_infer_transcripts=True,
)

with open(snakemake.output.bed, "w") as outfileobj:
    for tx in db.features_of_type("transcript", order_by="start"):
        bed = [s.strip() for s in db.bed12(tx).split("\t")]
        bed[3] = tx.id
        outfileobj.write("{}\n".format("\t".join(bed)))
--------------------------------------------------------------------------------
/workflow/scripts/plot-pca.R:
--------------------------------------------------------------------------------
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

library("DESeq2")

# load deseq2 data
dds <- readRDS(snakemake@input[[1]])

# obtain regularized log-transformed counts
counts <- rlog(dds, blind = FALSE)
svg(snakemake@output[[1]])
plotPCA(counts, intgroup = snakemake@wildcards[["variable"]])
dev.off()
--------------------------------------------------------------------------------